In [None]:
# Notebook setup: install the `datasets` and `transformers` packages plus `hazm`
# (whose Normalizer appears in the disabled dataset-preparation cell below).
# NOTE(review): no versions are pinned here, so a later re-run may pick up
# different releases — consider pinning, and `%pip` instead of `!pip`.
!pip install datasets transformers
!pip install hazm

In [None]:
from datasets import load_dataset

import glob
import pickle
import re 
from termcolor import colored
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
import math


from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Creating the dataset

In [None]:
# Disabled one-off dataset preparation, kept inside a string literal so that it
# is not executed. The text reads a pickled collection of poems, normalizes each
# entry with hazm's Normalizer, and splits the result 90/10 into x_train / x_val.
# NOTE(review): the split inside has no `random_state`, so re-enabling it would
# produce a different train/validation split on every run.
"""
all_poems_beit_add = '.../Data/beits_joined_cleaned.pickle'
all_poems = pd.read_pickle(all_poems_beit_add)

import hazm
normalizer = hazm.Normalizer(persian_numbers=False)
normalized = []

for text in all_poems:
    
    normalized.append(normalizer.normalize(text))

x_train, x_val = train_test_split(normalized, shuffle = True, test_size = 0.1)
"""

In [None]:
# Disabled CSV export, kept inside a string literal so that it is not executed.
# The text writes x_train and x_val to CSV files with a single 'poetry' column.
# NOTE(review): it uses `train_path` / `val_path`, which are only assigned in a
# later cell — run that cell first if this code is ever re-enabled.
"""
(pd.DataFrame(pd.Series(x_train), columns = ['poetry'])).to_csv(train_path,
                                                                index=False)
(pd.DataFrame(pd.Series(x_val), columns = ['poetry'])).to_csv(val_path,
                                                                index=False)
                                                                """

# Reading the dataset and training

In [None]:
# CSV files holding the training and validation splits.
train_path = '.../Data/all_poetry_train_beit.csv'
val_path = '.../Data/all_poetry_val_beit.csv'

In [None]:
# Load both CSV splits into one dataset dict; the validation file is exposed
# under the 'test' key.
dataset_poetry = load_dataset(
    'csv',
    data_files={'train': train_path, 'test': val_path},
)

In [None]:
# Load the masked-LM model and its fast tokenizer from the checkpoint
# identified by `model_path`.
model_path = "HooshvareLab/distilbert-fa-zwnj-base"

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_path)

In [None]:
# Display the tokenizer's size before the extra tokens are added in the next cell.
len(tokenizer)

In [None]:
# Register additional whole-word tokens with the tokenizer.
tokenizer.add_tokens([
    'برآرد',
    'برآید',
    'وزآن',
    'درآمد',
    'بدانگهی',
    'نام‌آو',
    'ناآشنا',
    'بدخویی',
    'براندیشم',
])

# Resize the model's embedding matrix to the tokenizer's new length so the
# added tokens have rows of their own.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Load the model and tokenizer saved in a local folder. This rebinds `model`
# and `tokenizer`, replacing the objects created from `model_path` above.
model_folder_path_distilbert = '.../Pretrained Models/Pretrained on beit/DistilBERT_0.4_beit/'

model = AutoModelForMaskedLM.from_pretrained(model_folder_path_distilbert)
tokenizer = AutoTokenizer.from_pretrained(
    model_folder_path_distilbert,
    use_fast=True,
)


In [None]:
# Training configuration: 5 epochs, evaluated once per epoch, output written
# under "test-clm".
# `load_best_model_at_end=True` needs a saved checkpoint for every evaluation,
# so the save strategy is set to "epoch" to match the evaluation strategy; with
# the default step-based saving, recent `transformers` releases reject this
# combination with a ValueError.
# NOTE(review): newer `transformers` releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    num_train_epochs=5,
)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``'poetry'`` column of a batch with the notebook-level ``tokenizer``.

    Args:
        examples: Batch indexed by column name; only ``examples['poetry']`` is read.

    Returns:
        The tokenizer's output for the batch.
    """
    # Truncate to the tokenizer's maximum length so that an unusually long row
    # cannot exceed the model's maximum input length during training.
    return tokenizer(examples['poetry'], truncation=True)


In [None]:
# Tokenize every split in batches of 512 rows, using 5 worker processes.
tokenized_datasets = dataset_poetry.map(
    tokenize_function,
    batched=True,
    batch_size=512,
    num_proc=5,
)

# Display one tokenized training record as a quick sanity check.
tokenized_datasets["train"][1]

In [None]:
# Language-modeling collator configured with a masking probability of 0.4.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.4,
)

In [None]:
# Bundle the model, training arguments, tokenized splits and collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
# Run training with the Trainer built above (its collator masks with probability 0.4).
trainer.train() # mask 0.4

In [None]:
# Evaluate on the eval split given to the Trainer; the 'eval_loss' entry of the
# result is used for the perplexity printed below.
eval_results = trainer.evaluate()

In [None]:
# Perplexity is exp(eval loss). `math` is already imported in the notebook's
# import cell, so it is not re-imported here.
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") # mask 0.4 Distilbert

In [None]:
# Disabled save step, kept inside a string literal so that it is not executed.
# NOTE(review): it refers to `model_folder_path_bert_beit_07`, which is first
# assigned in a later cell, while the model in this phase was loaded from
# `model_folder_path_distilbert` — confirm the intended target folder before
# re-enabling.
"""
model.save_pretrained(model_folder_path_bert_beit_07)
tokenizer.save_pretrained(model_folder_path_bert_beit_07)
"""

# Phase II: masked-LM training with mlm_probability=0.7

In [None]:
# Load the model and tokenizer stored in the `bert_beit_07` folder; this
# rebinds the notebook-level `model` and `tokenizer`.
model_folder_path_bert_beit_07 = r'.../Pretrained Models/bert_beit_07/'

model = AutoModelForMaskedLM.from_pretrained(model_folder_path_bert_beit_07)
tokenizer = AutoTokenizer.from_pretrained(
    model_folder_path_bert_beit_07,
    use_fast=True,
)

In [None]:
# Display the size of the tokenizer that was just loaded from the checkpoint folder.
len(tokenizer)

In [None]:
# Training configuration for this phase: evaluated once per epoch, output
# written under "test-clm"; the number of epochs is left at the library default.
# `load_best_model_at_end=True` needs a saved checkpoint for every evaluation,
# so the save strategy is set to "epoch" to match the evaluation strategy; with
# the default step-based saving, recent `transformers` releases reject this
# combination with a ValueError.
# NOTE(review): newer `transformers` releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``'poetry'`` column of a batch with the notebook-level ``tokenizer``.

    The tokenizer is looked up when the function is called, so the function
    uses whichever tokenizer is currently bound to that name.

    Args:
        examples: Batch indexed by column name; only ``examples['poetry']`` is read.

    Returns:
        The tokenizer's output for the batch.
    """
    # Truncate to the tokenizer's maximum length so that an unusually long row
    # cannot exceed the model's maximum input length during training.
    return tokenizer(examples['poetry'], truncation=True)


In [None]:
# Tokenize every split in batches of 512 rows, using 4 worker processes.
tokenized_datasets = dataset_poetry.map(
    tokenize_function,
    batched=True,
    batch_size=512,
    num_proc=4,
)

# Display one tokenized training record as a quick sanity check.
tokenized_datasets["train"][1]

In [None]:
# Language-modeling collator configured with a masking probability of 0.7.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=0.7,
)

In [None]:
# Bundle the current model, training arguments, tokenized splits and the
# 0.7-masking collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
# Run training with the Trainer built above (its collator masks with probability 0.7).
trainer.train() # mask 0.7

In [None]:
# Point `model_folder_path_bert_beit_07` at the `bert_beit_07_6Epochs` folder
# (the name is reused; it previously held the folder the model was loaded from)
# and write the model, then the tokenizer, into it.
model_folder_path_bert_beit_07 = r'.../Pretrained Models/bert_beit_07_6Epochs/'

model.save_pretrained(
    model_folder_path_bert_beit_07,
)
tokenizer.save_pretrained(
    model_folder_path_bert_beit_07,
)

In [None]:
# Load the model and tokenizer back from the `bert_beit_07_6Epochs` folder,
# rebinding the notebook-level `model` and `tokenizer` to the new objects.
model_folder_path_bert_beit_07 = r'.../Pretrained Models/bert_beit_07_6Epochs/'

model = AutoModelForMaskedLM.from_pretrained(model_folder_path_bert_beit_07)
tokenizer = AutoTokenizer.from_pretrained(
    model_folder_path_bert_beit_07,
    use_fast=True,
)

In [None]:
# `model` was rebound to a freshly loaded object after the existing `trainer`
# was constructed, so that Trainer still wraps the earlier model object.
# Rebuild the Trainer around the current `model` so that this run updates the
# same object that is saved afterwards; otherwise the save would write the
# reloaded, untrained weights.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

trainer.train() # mask 0.7

In [None]:
# Write the current `model`, then the `tokenizer`, into the `bert_beit_07_2` folder.
model_folder_path_bert_beit_07_2 = r'.../Pretrained Models/bert_beit_07_2/'

model.save_pretrained(
    model_folder_path_bert_beit_07_2,
)
tokenizer.save_pretrained(
    model_folder_path_bert_beit_07_2,
)