In [None]:
! pip install datasets transformers

In [None]:
from datasets import load_dataset

import glob
import pickle
import re 
from termcolor import colored
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
import math


from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (Colab-only step).
# NOTE(review): presumably the CSV data and the saved models live on Drive — confirm.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Locations of the verse-level poetry CSV files (training and validation).
# NOTE(review): the leading '...' looks like a placeholder prefix — replace it
# with the real directory before running.
train_path, val_path = (
    '.../Data/poetry_training_verse.csv',
    '.../Data/poetry_validation_verse.csv',
)


In [None]:
# Load both CSV files into one dataset object; the validation file is
# registered under the 'test' split name.
dataset_poetry = load_dataset(
    'csv',
    data_files=dict(train=train_path, test=val_path),
)

In [None]:
# Checkpoint identifier used for both the tokenizer and the masked-LM model.
model_path = 'HooshvareLab/distilbert-fa-zwnj-base'

# Load the (fast) tokenizer and the masked-language-modelling model from the
# same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModelForMaskedLM.from_pretrained(model_path)

# Register extra word forms with the tokenizer.
# NOTE(review): 'نام‌آو' may be a truncated form — confirm the intended token.
tokenizer.add_tokens([
    'برآرد', 'برآید', 'وزآن', 'درآمد',
    'بدانگهی', 'نام‌آو',
    'ناآشنا', 'بدخویی', 'براندیشم',
])

# Resize the model's token embeddings to the tokenizer's new length so the
# added tokens have embedding rows.
model.resize_token_embeddings(len(tokenizer))

In [None]:
# `vocab_size` reports only the base vocabulary and does not count tokens
# registered through `add_tokens`; `len(tokenizer)` includes them. Display both
# (base size, full size) so the effect of the added tokens is visible.
tokenizer.vocab_size, len(tokenizer)

In [None]:
# Training configuration for the fine-tuning run.
# `load_best_model_at_end=True` requires checkpoints to be saved on the same
# schedule as evaluation, so `save_strategy` is set to "epoch" to match
# `evaluation_strategy`; with the default step-based saving, releases that
# enforce this raise a ValueError.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — adjust if the installed version rejects this keyword.
training_args = TrainingArguments(
    "test-clm",  # output directory for checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    num_train_epochs=5,
)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'poetry' column of a batch of examples.

    Args:
        examples: Mapping with a 'poetry' key holding the texts to tokenize
            (the batch passed in by `Dataset.map(..., batched=True)`).

    Returns:
        Whatever the module-level `tokenizer` returns for those texts.

    Truncation is enabled so that an unusually long row is cut to the
    tokenizer's maximum length instead of producing a sequence the model
    cannot process.
    """
    return tokenizer(examples['poetry'], truncation=True)


In [None]:
# Apply `tokenize_function` to every split, in batches of 512 rows, using
# 4 worker processes.
tokenized_datasets = dataset_poetry.map(
    tokenize_function,
    batch_size=512,
    batched=True,
    num_proc=4,
)

# Display one tokenized training example as a quick sanity check.
tokenized_datasets["train"][1]

In [None]:
# Language-modelling collator built on the tokenizer, with a masking
# probability of 0.15.
data_collator = DataCollatorForLanguageModeling(
    mlm_probability=0.15,
    tokenizer=tokenizer,
)

In [None]:
# Wire the model, training arguments, collator and tokenized splits together.
# The 'test' split serves as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets["test"],
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Run training with the model, arguments, datasets and collator given to the
# Trainer above.
trainer.train()

In [None]:
#trainer.train()

In [None]:
# Evaluate with the Trainer and keep the returned metrics; the 'eval_loss'
# entry is used below to compute perplexity.
eval_results = trainer.evaluate()

In [None]:
import math

# Report perplexity, computed as the exponential of the evaluation loss.
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# NOTE(review): this cell repeats the perplexity report of the preceding cell;
# presumably left over from a re-run — confirm and remove if not needed.
# Prints exp(eval_loss) from `eval_results`, formatted to two decimals.
import math
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Write the model and the tokenizer into the same output directory.
save_dir = '.../Pretrained Models/Pretrained on verses/DistilBERT_0.15_Verse/with 5 epochs/'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)