In [2]:
import glob
import codecs
import pandas as pd
import transformers
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from sentence_transformers import SentenceTransformer, models
from hazm import Normalizer, Stemmer, Lemmatizer, word_tokenize
from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForMaskedLM

# **Prepare the data**

In [3]:
normalizer = Normalizer()
stopwords = [normalizer.normalize(x.strip()) for x in codecs.open('stopwords.dat','r','utf-8').readlines()]
lemmatizer = Lemmatizer()
def preprocess(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stopwords]
    return tokens

In [4]:
# path = r'./Persian_poems_corpus/normalized/'                                                   # use your path
path = './subset_corpus/'
all_files = glob.glob(path + "/*.txt")

li = []

for filename in all_files:
    poet = filename.replace('\\', '/').split('/')[-1][:-9]
    if poet == 'khaghani':
        continue
    df = pd.read_csv(filename, names=['mesra'], header=None)
    df['beyt index'] = df.index // 2
    df = df.groupby(['beyt index']).agg({'mesra': ' '.join})
    df = df.rename(columns={'mesra':'beyt'})
    df['author'] = poet
    li.append(df)

df = pd.concat(li, axis=0, ignore_index=True)
df[['beyt']].to_csv('./corpus.csv', index=False)
# df.beyt = df.beyt.apply(preprocess)
df

Unnamed: 0,beyt,author
0,الا یا ایها الساقی ادر کاسا و ناولها که عشق آس...,hafez
1,به بوی نافه ای کاخر صبا زان طره بگشاید ز تاب ج...,hafez
2,مرا در منزل جانان چه امن عیش چون هر دم جرس فری...,hafez
3,به می سجاده رنگین کن گرت پیر مغان گوید که سالک...,hafez
4,شب تاریک و بیم موج و گردابی چنین هایل کجا دانن...,hafez
...,...,...
20402,ف و ما اعتدی الا علی من یعتدی بشر الینا بالرجا...,saadi
20403,و تقایض الدنیا بدولت سرمد مهمارجوت رجوت خیرالم...,saadi
20404,و اذا قصدت قصدت خیرالمقصد مدت حیوت الناس تحت ظ...,saadi
20405,لا زال فی اهنی الحیوت و ارغد هذی خلال الزاکیات...,saadi


In [5]:
class PoemsDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer):
        rows = []
    
        for row in df.itertuples():
            tokenized = tokenizer(row.beyt, padding='max_length', truncation='longest_first', max_length=30)
            rows.append(tokenized)
                
        self.__rows = rows
        
    def __len__(self):
        return len(self.__rows)
    
    def __getitem__(self, idx):
        return self.__rows[idx]

In [6]:
ds = load_dataset('csv', data_files='./corpus.csv')
ds['train'][10]

Using custom data configuration default-e4ead6903a3eb641


Downloading and preparing dataset csv/default to C:\Users\mozaf\.cache\huggingface\datasets\csv\default-e4ead6903a3eb641\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


100%|██████████| 1/1 [00:00<?, ?it/s]
100%|██████████| 1/1 [00:00<00:00, 249.69it/s]


Dataset csv downloaded and prepared to C:\Users\mozaf\.cache\huggingface\datasets\csv\default-e4ead6903a3eb641\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 77.35it/s]


{'beyt': 'ز روی دوست دل دشمنان چه دریابد چراغ مرده کجا شمع آفتاب کجا'}

# **Define the model and start training**

In [7]:
model_checkpoint = './trained_models/HooshvareLab_bert-fa-zwnj-base'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [9]:
tokenized_datasets = ds.map(lambda x: tokenizer(x['beyt']), batched=False, num_proc=1, remove_columns=['beyt'])

20407ex [00:06, 3135.60ex/s]


In [10]:
tokenized_datasets['train'][10]

{'input_ids': [2,
  607,
  2126,
  2966,
  2266,
  9198,
  2180,
  31684,
  5603,
  8419,
  6079,
  12494,
  595,
  1954,
  1949,
  6079,
  3],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [11]:
block_size = 128
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1,
)

100%|██████████| 21/21 [00:02<00:00,  7.10ba/s]


In [None]:
# model_checkpoint = './trained_models/HooshvareLab_bert-fa-zwnj-base'
# df = pd.read_csv('./corpus.csv')
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# dataset = PoemsDataset(df, tokenizer)

# dataset.__getitem__(10)

In [13]:
# model_checkpoint = './trained_models/HooshvareLab_bert-fa-zwnj-base'
model = AutoModelForMaskedLM.from_pretrained('./trained_models/HooshvareLab_bert-fa-zwnj-base')

# bert_base = models.Transformer('./trained_models/HooshvareLab_bert-fa-zwnj-base')
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
# tokenized_datasets = ds.map(tokenize_function, batched=True, num_proc=4, remove_columns=["beyt"])



In [14]:
training_args = TrainingArguments(
    output_dir='./trained_models/bert_finetuned',
    overwrite_output_dir=True,
    num_train_epochs=10,
    save_steps=1_000,
    save_total_limit=2,
    logging_steps=1,
    prediction_loss_only=True,
    evaluation_strategy='steps',
    no_cuda = True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    disable_tqdm=False,
    learning_rate=5e-5
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets['train']
)

trainer.train()

***** Running training *****
  Num examples = 2919
  Num Epochs = 10
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 3650
  return torch._C._cuda_getDeviceCount() > 0
  0%|          | 1/3650 [00:06<6:39:24,  6.57s/it]

{'loss': 1.9326, 'learning_rate': 4.998630136986302e-05, 'epoch': 0.0}


ValueError: Trainer: evaluation requires an eval_dataset.