In [None]:
!pip install datasets transformers
!pip install hazm

In [None]:
from datasets import load_dataset

import glob
import pickle
import re 
from termcolor import colored
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
import math


from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (Colab-only API) so files stored there
# can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
"""
all_poems_beit_add = '.../Data/beits_joined_cleaned.pickle'
all_poems = pd.read_pickle(all_poems_beit_add)
"""

In [None]:
"""
import hazm
normalizer = hazm.Normalizer(persian_numbers=False)
normalized = []

for text in all_poems:
    
    normalized.append(normalizer.normalize(text))
"""

In [None]:
#x_train, x_val = train_test_split(normalized, shuffle = True, test_size = 0.1)

In [None]:
# Locations of the validation and training CSV files. Both are read by the
# load_dataset('csv', ...) cell below; the tokenization step reads their 'poetry' column.
val_path = '.../Data/all_poetry_val_beit.csv'
train_path = '.../Data/all_poetry_train_beit.csv'

In [None]:
"""
(pd.DataFrame(pd.Series(x_train), columns = ['poetry'])).to_csv(train_path,
                                                                index=False)
(pd.DataFrame(pd.Series(x_val), columns = ['poetry'])).to_csv(val_path,
                                                                index=False)
                                                                """

In [None]:
# Read both CSV files through the `datasets` CSV loader; the training file is
# exposed under the 'train' key and the validation file under the 'test' key.
dataset_poetry = load_dataset(
    'csv',
    data_files=dict(train=train_path, test=val_path),
)

In [None]:

# Identifier of the pretrained checkpoint that Phase I starts from.
model_checkpoint_bert_V3 = 'HooshvareLab/bert-fa-zwnj-base'

# Load the tokenizer (fast implementation requested) and the masked-LM model
# from that same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_bert_V3, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint_bert_V3)

In [None]:
# Inspect the tokenizer length before extra tokens are added in the next cell.
len(tokenizer)

In [None]:
# Register extra whole-word tokens with the tokenizer, then resize the model's
# token embeddings to len(tokenizer) so the model and tokenizer stay in sync.
# NOTE(review): 'نام‌آو' looks like it may be truncated — confirm it is the intended token.
tokenizer.add_tokens(['برآرد', 'برآید', 'وزآن', 'درآمد', 
                      'بدانگهی', 'نام‌آو',
                      'ناآشنا', 'بدخویی', 'براندیشم'])

model.resize_token_embeddings(len(tokenizer))

In [None]:
#tokenizer.encoder('سلام')

In [None]:
# Inspect the tokenizer's vocabulary entries.
# NOTE(review): this echoes every key as cell output; consider slicing it or
# clearing the output before sharing the notebook.
tokenizer.vocab.keys()


In [None]:
# Show tokenizer.vocab_size, for comparison with the len(tokenizer) value displayed earlier.
tokenizer.vocab_size

In [None]:
# Phase I training configuration.
# - "test-clm" is the output directory where checkpoints are written.
# - Evaluation runs once per epoch.
# - load_best_model_at_end=True needs checkpoints saved on the same schedule as
#   evaluation, so save_strategy is set to "epoch" explicitly; recent
#   transformers releases reject a mismatch between the two strategies.
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    num_train_epochs=5
)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'poetry' field of a batch with the notebook-level `tokenizer`.

    Args:
        examples: mapping that provides the texts under the key 'poetry'.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    poetry_texts = examples['poetry']
    return tokenizer(poetry_texts)


In [None]:
# Apply tokenize_function to every split, in batches of 512 rows, using 4 worker processes.
tokenized_datasets = dataset_poetry.map(
    tokenize_function,
    batch_size=512,
    batched=True,
    num_proc=4,
)

# Display one tokenized training example as a sanity check.
tokenized_datasets["train"][1]

In [None]:
# Show the id the tokenizer uses for its unknown token.
tokenizer.unk_token_id 

In [None]:
# Display the tokenized dataset object.
tokenized_datasets

In [None]:
# Collator for masked-language-model batches, built on the notebook's tokenizer.
# mlm_probability=0.4 is the masking rate used for Phase I (the model is later
# saved under a folder named 'BERT_0.4_beit').
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm_probability=0.4)

In [None]:
# Assemble the Phase I Trainer: the model, the training arguments, the tokenized
# 'train' split for training, the tokenized 'test' split for evaluation, and the
# masking collator defined above.
trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator
)

In [None]:
# Run Phase I training with the collator configured for mlm_probability=0.4.
trainer.train() # mask 0.4

In [None]:
# Write the tokenizer and then the model into the same Phase I output folder,
# so the pair can later be reloaded together with from_pretrained.
for artifact in (tokenizer, model):
    artifact.save_pretrained('.../Pretrained Models/BERT_0.4_beit/')

In [None]:
# Evaluate with the Trainer's eval dataset; the result is indexed by 'eval_loss' in the next cell.
eval_results = trainer.evaluate()

In [None]:
# Report perplexity, computed as exp(evaluation loss), to two decimals.
import math
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") # mask 0.4

In [None]:
# Fill every mask in `sequence` with the model's top-k suggestions, one mask at a time.
#
# Results (both dicts are keyed by the index of the mask, left to right):
#   predictions_ids[i]               -> list of id tensors with masks 0..i filled in
#   predictions_detokenized_sents[i] -> the same candidates decoded back to text
# Mask i is predicted once for every candidate produced for mask i-1, so the
# number of candidates grows by a factor of top_k per mask.

# The '<mask>' placeholder is swapped for the tokenizer's own mask token so the
# mask_token_id lookup below actually finds it, whatever the model family.
sequence = 'هزار <mask> هست از نیای او افضل'.replace('<mask>', tokenizer.mask_token)

top_k = 5

# NOTE(review): special tokens are deliberately left out, as in the original cell;
# confirm this matches how the model was trained.
ids_ = tokenizer.encode(sequence,
                        return_tensors="pt",
                        add_special_tokens=False)

# Column indices (token positions) of every mask in the single encoded row.
position = torch.where(ids_ == tokenizer.mask_token_id)
positions_list = position[1].tolist()

# Follow the model to whatever device it lives on instead of assuming CUDA.
ids_ = ids_.to(model.device)

predictions_ids = {}
predictions_detokenized_sents = {}

model.eval()  # make sure dropout is off while predicting
with torch.no_grad():
    for i, mask_position in enumerate(positions_list):
        predictions_ids[i] = []
        predictions_detokenized_sents[i] = []

        # The first mask starts from the raw input; later masks continue from
        # every candidate produced for the previous mask.
        partial_candidates = [ids_] if i == 0 else predictions_ids[i - 1]

        for partial in partial_candidates:
            model_logits = model(partial)['logits'][0][mask_position]
            top_k_tokens = torch.topk(model_logits, top_k, dim=0).indices.tolist()

            for top_id in top_k_tokens:
                # Clone so each stored candidate is an independent tensor rather
                # than another reference to one tensor that keeps being mutated.
                candidate = partial.clone()
                candidate[0][mask_position] = top_id

                predictions_ids[i].append(candidate)
                predictions_detokenized_sents[i].append(
                    tokenizer.decode(candidate[0]))
    


In [None]:
# Display the decoded candidate sentences collected by the mask-filling cell.
predictions_detokenized_sents

In [None]:
# Report how long the machine has been up and how much of a 24-hour budget is left.
# NOTE(review): uptime is measured from machine boot (psutil.boot_time()), which
# is assumed to coincide with the start of the session — confirm.
import time
import psutil

SESSION_LIMIT_SECONDS = 24 * 60 * 60  # the 24 h budget the report is measured against


def _hours_and_minutes(seconds):
    """Split a duration in seconds into (whole hours, remaining minutes)."""
    hours, leftover_seconds = divmod(seconds, 3600)
    # Minutes are what is left after removing whole hours, expressed in minutes
    # (the previous `seconds % 60` printed leftover seconds instead).
    return int(hours), leftover_seconds / 60


uptime = time.time() - psutil.boot_time()
print('How much I used?\n {} hours, and {:.2f} minutes '.format(*_hours_and_minutes(uptime)))
remain = SESSION_LIMIT_SECONDS - uptime
# Clamp for display only, so an exhausted budget prints as zero rather than a negative split.
print('How much is remaining?\n {} hours, and {:.2f} minutes '.format(*_hours_and_minutes(max(remain, 0))))

In [None]:
# Folder the Phase II section reloads the model and tokenizer from. It is
# assigned here because this cell runs before the later cell that also assigns
# it; without this line a fresh kernel raises NameError.
# NOTE(review): confirm that the model currently in memory is the one meant to
# be stored under this folder name.
model_folder_path_bert_beit_07 = r'.../Pretrained Models/bert_beit_07/'

model.save_pretrained(model_folder_path_bert_beit_07)
tokenizer.save_pretrained(model_folder_path_bert_beit_07)

# Phase II

In [None]:
# Folder that holds the model and tokenizer used as the Phase II starting point.
model_folder_path_bert_beit_07 = r'.../Pretrained Models/bert_beit_07/'

# Reload the tokenizer (fast implementation requested) and the masked-LM model
# from that folder.
tokenizer = AutoTokenizer.from_pretrained(model_folder_path_bert_beit_07, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_folder_path_bert_beit_07)

In [None]:
# Inspect the length of the tokenizer that was just reloaded from disk.
len(tokenizer)

In [None]:
# Phase II training configuration.
# - "test-clm" is the output directory where checkpoints are written.
# - Evaluation runs once per epoch; the number of epochs is left at the library default.
# - load_best_model_at_end=True needs checkpoints saved on the same schedule as
#   evaluation, so save_strategy is set to "epoch" explicitly; recent
#   transformers releases reject a mismatch between the two strategies.
training_args = TrainingArguments(
    "test-clm",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'poetry' field of a batch with the notebook-level `tokenizer`.

    The global `tokenizer` is looked up when the function is called, so this
    uses whichever tokenizer is bound at that moment.

    Args:
        examples: mapping that provides the texts under the key 'poetry'.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    poetry_texts = examples['poetry']
    return tokenizer(poetry_texts)


In [None]:
# Re-tokenize every split with the current tokenizer, in batches of 512 rows,
# using 4 worker processes.
tokenized_datasets = dataset_poetry.map(
    tokenize_function,
    batch_size=512,
    batched=True,
    num_proc=4,
)

# Display one tokenized training example as a sanity check.
tokenized_datasets["train"][1]

In [None]:
# Collator for masked-language-model batches in Phase II; the masking rate is
# raised to mlm_probability=0.7.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm_probability=0.7, )

In [None]:
# Assemble the Phase II Trainer: the reloaded model, the Phase II training
# arguments, the tokenized 'train' split for training, the tokenized 'test'
# split for evaluation, and the 0.7-masking collator.
trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,  
)

In [None]:
# Run Phase II training with the collator configured for mlm_probability=0.7.
trainer.train() # mask 0.7

In [None]:
# Rebind the folder variable to a new location and save the current model and
# tokenizer there. Note that this reuses the name that previously pointed at
# the Phase II starting-point folder.
model_folder_path_bert_beit_07 = r'.../Pretrained Models/bert_beit_07_6Epochs/'

model.save_pretrained(model_folder_path_bert_beit_07)
tokenizer.save_pretrained(model_folder_path_bert_beit_07)

In [None]:
# Folder to reload the model and tokenizer from.
model_folder_path_bert_beit_07 = r'.../Pretrained Models/bert_beit_07_6Epochs/'

# Reload the tokenizer (fast implementation requested) and the masked-LM model
# from that folder; this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained(model_folder_path_bert_beit_07, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_folder_path_bert_beit_07)

In [None]:
# The previous cell rebinds `model` to a freshly loaded object, while the
# existing `trainer` still wraps the earlier model object. Rebuild the Trainer
# so this training run updates the same `model` that the next cell saves.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
)

trainer.train() # mask 0.7

In [None]:
# Output folder for the model and tokenizer after the additional training run.
model_folder_path_bert_beit_07_2 = r'.../Pretrained Models/bert_beit_07_2/'


# Save the model and the tokenizer side by side so they can be reloaded together.
model.save_pretrained(model_folder_path_bert_beit_07_2)
tokenizer.save_pretrained(model_folder_path_bert_beit_07_2)