In [None]:
# Install the Hugging Face `datasets` and `transformers` packages plus `hazm`
# into the runtime. No versions are pinned here.
!pip install datasets transformers
!pip install hazm

In [None]:
from datasets import load_dataset

import glob
import pickle
import re 
from termcolor import colored
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
import math


from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so files stored there can be reached
# through the local filesystem (uses the Colab-only `google.colab` helper).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
"""
all_poems_beit_add = '...pd.read_pickle(all_poems_beit_add)

import hazm
normalizer = hazm.Normalizer(persian_numbers=False)
normalized = []

for text in all_poems:
    
    normalized.append(normalizer.normalize(text))

x_train, x_val = train_test_split(normalized, shuffle = True, test_size = 0.1)

(pd.DataFrame(pd.Series(x_train), columns = ['poetry'])).to_csv(train_path,
                                                                index=False)
(pd.DataFrame(pd.Series(x_val), columns = ['poetry'])).to_csv(val_path,
                                                                index=False)
                            
"""

In [None]:
# Locations of the validation and training CSV files that the dataset-loading
# cell reads.
val_path = '.../Data/all_poetry_val_beit.csv'
train_path = '.../Data/all_poetry_train_beit.csv'

In [None]:
# Load both CSV files as one dataset object with a 'train' split and a 'test'
# split; the validation file is the one exposed under the 'test' key.
dataset_poetry = load_dataset(
    'csv',
    data_files={'train': train_path, 'test': val_path},
)

In [None]:
# Identifier of the pretrained checkpoint used as the starting point.
model_path = "HooshvareLab/roberta-fa-zwnj-base"

# Load the masked-LM model and the matching fast tokenizer from that checkpoint.
model = AutoModelForMaskedLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    use_fast=True,
)

In [None]:
# Display the current length of the tokenizer (its number of tokens).
len(tokenizer)

In [None]:
# Register additional word forms as new tokens in the tokenizer.
tokenizer.add_tokens(['برآرد', 'برآید', 'وزآن', 'درآمد', 
                      'بدانگهی', 'نام‌آو',
                      'ناآشنا', 'بدخویی', 'براندیشم'])

# Resize the model's token embeddings to the tokenizer's new length so the
# added tokens have embedding rows.
model.resize_token_embeddings(len(tokenizer))

In [None]:
#tokenizer.encoder('سلام')

In [None]:
# Reload the model and tokenizer from a locally saved folder.
# NOTE(review): this rebinds `model` and `tokenizer`, so when the notebook is
# run top to bottom the hub checkpoint loaded earlier and the tokens added to
# it are discarded -- confirm which starting point is intended.
model_folder_path_Roberta = '.../Pretrained Models/Pretrained on beit/Roberta_0.4_beit/'

model = AutoModelForMaskedLM.from_pretrained(model_folder_path_Roberta)
tokenizer = AutoTokenizer.from_pretrained(
    model_folder_path_Roberta,
    use_fast=True,
)


In [None]:
# Hyper-parameters for the Trainer; "test-clm" is the output directory.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- adjust the keyword if the installed version rejects it.
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy="epoch",
    # load_best_model_at_end needs a checkpoint for every evaluation, so the
    # save schedule has to match the evaluation schedule. The default save
    # schedule is step-based, which TrainingArguments rejects with a ValueError.
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    num_train_epochs=5,
)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'poetry' field of a batch with the notebook-level tokenizer.

    Args:
        examples: Mapping with a 'poetry' entry holding the text(s) to encode.

    Returns:
        Whatever the global ``tokenizer`` returns for that input.
    """
    verses = examples['poetry']
    return tokenizer(verses)


In [None]:
# Apply tokenize_function to every split of dataset_poetry, in batched mode
# (batch_size=512) with num_proc=4.
tokenized_datasets = dataset_poetry.map(
    tokenize_function, 
    batched=True, 
    num_proc=4,
    batch_size=512)


# Peek at one tokenized training example (index 1).
tokenized_datasets["train"][1]

In [None]:
# Display the tokenized dataset object as the cell output.
tokenized_datasets

In [None]:
# Collator for language-modeling batches, built around the notebook's
# tokenizer with an MLM masking probability of 0.4.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.4,
)

In [None]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# 'train' split for training, the tokenized 'test' split for evaluation, and
# the language-modeling data collator.
trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator
)

In [None]:
# Run training with the configured trainer (author's run label: mask 0.4).
trainer.train() # mask 0.4

In [None]:
#tokenizer.save_pretrained('.../Pretrained Models/Roberta_0.4_beit/')
#model.save_pretrained('.../Pretrained Models/Roberta_0.4_beit/')

In [None]:
# NOTE(review): labelled "mask 0.6", but the collator in this notebook is built
# with mlm_probability=0.4 -- confirm which setting this run used. This call
# goes through the same `trainer` object as the earlier train() call.
trainer.train( ) # mask 0.6

In [None]:
#torch.save(model.state_dict(), '.../Pretrained Models/Roberta_0.4_beit/')

In [None]:
# A further training call on the same `trainer` object (author's label: 0.4).
trainer.train() # 0.4

In [None]:
# Run evaluation through the trainer and keep the returned metrics; the
# perplexity cell reads the 'eval_loss' entry from this result.
eval_results = trainer.evaluate()

In [None]:
import math

# Perplexity is the exponential of the evaluation loss.
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}") # mask 0.4