<a href="https://colab.research.google.com/github/nahbos/AUT-Language-Understanding/blob/main/Proj/farsi_poem_generator_T5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sobhan Moradian Daghigh
Project: NLG - Poem

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import json
import os
import glob

import numpy as np
import pandas as pd

from tqdm import tqdm

In [4]:
!wget -nc https://raw.githubusercontent.com/nahbos/AUT-Language-Understanding/main/Proj/Datasets/train.csv
!wget -nc https://raw.githubusercontent.com/nahbos/AUT-Language-Understanding/main/Proj/Datasets/test.csv

File ‘train.csv’ already there; not retrieving.

File ‘test.csv’ already there; not retrieving.



In [5]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [6]:
train.head()

Unnamed: 0,cat_title,v_order,v_position,poem_text,poet_name,poem_id
0,رباعی,1,0,جز نقش تو در نظر نیامد ما را,حافظ,1
1,رباعی,2,1,جز کوی تو رهگذر نیامد ما را,حافظ,1
2,رباعی,3,0,خواب ارچه خوش آمد همه را در عهدت,حافظ,1
3,رباعی,4,1,حقا که به چشم در نیامد ما را,حافظ,1
4,رباعی,1,0,بر گیر شراب طرب‌انگیز و بیا,حافظ,2


In [7]:
poets = list(train['cat_title'].unique())
poets

['رباعی', 'قطعه', 'قصیده', 'غزل', '<unk>']

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8610 entries, 0 to 8609
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   cat_title   8610 non-null   object
 1   v_order     8610 non-null   int64 
 2   v_position  8610 non-null   int64 
 3   poem_text   8610 non-null   object
 4   poet_name   8610 non-null   object
 5   poem_id     8610 non-null   int64 
dtypes: int64(3), object(3)
memory usage: 403.7+ KB


In [9]:
list(train['cat_title']).count('<unk>')

496

In [10]:
import pandas as pd
import numpy as np

from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelWithLMHead
from transformers import AutoTokenizer

from IPython import display

In [11]:
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
model_name_or_path = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)



In [13]:
texts = train['poem_text'].values.tolist()
max_seq = max([len(tokenizer.encode(text)) for text in texts])

print(f'The longest text is {max_seq} tokens long.')
print('This is for one hemistich!')

The longest text is 20 tokens long.
This is for one hemistich!


In [14]:
max_length = 20 * 2

In [15]:
groups = train.groupby(by='v_position')
hemistichs1, hemistichs2 = [], []
for grp, pdf in groups:
    if grp == 0:
        hemistichs1 = list(pdf['poem_text'])
    else:
        hemistichs2 = list(pdf['poem_text'])

verses = []
for h1, h2 in zip(hemistichs1, hemistichs2):
    verse = h1 + ' <sep> ' + h2
    verses.append(verse)

In [16]:
verses[0]

'جز نقش تو در نظر نیامد ما را <sep> جز کوی تو رهگذر نیامد ما را'

In [17]:
groups = test.groupby(by='v_position')
hemistichs1_test, hemistichs2_test = [], []
for grp, pdf in groups:
    if grp == 0:
        hemistichs1_test = list(pdf['poem_text'])
    else:
        hemistichs2_test = list(pdf['poem_text'])
  
verses_test = []
for h1_test, h2_test in zip(hemistichs1_test, hemistichs2_test):
    verse = h1_test + ' <sep> ' + h2_test
    verses_test.append(verse)

In [18]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
from datasets import Dataset

In [20]:
train = {"input":verses[:-1], "label":verses[1:]}
dataset_train = Dataset.from_dict(train)

In [21]:
test = {"input":verses_test[:-1], "label":verses_test[1:]}
dataset_test = Dataset.from_dict(test)

In [22]:
from datasets import DatasetDict
datasets = DatasetDict({"train":dataset_train, "test":dataset_test})

In [23]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)


def setup(verses):
    input_list = verses['input']
    label_list = verses['label']
    
    model_inputs = tokenizer(input_list, 
                             max_length=max_length, 
                             padding="max_length", 
                             truncation=True)

    labels = tokenizer(label_list,
                       max_length=max_length, 
                       padding="max_length", 
                       truncation=True).input_ids

    labels_with_ignore_index = []
    for labels_example in labels:
      labels_example = [label if label != 0 else -100 for label in labels_example]
      labels_with_ignore_index.append(labels_example)
    
    model_inputs["labels"] = labels_with_ignore_index

    return model_inputs

In [24]:
dataset = datasets.map(setup, batched=True, remove_columns=datasets["train"].column_names)

Map:   0%|          | 0/4304 [00:00<?, ? examples/s]

Map:   0%|          | 0/499 [00:00<?, ? examples/s]

In [25]:
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch

dataset.set_format(type="torch", 
                   columns=['input_ids', 'attention_mask', 'labels'])

train_dataset, val_dataset = random_split(dataset["train"], [0.9, 0.1], generator=torch.Generator().manual_seed(42))

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=18)
val_dataloader = DataLoader(val_dataset, batch_size=18)
test_dataloader = DataLoader(dataset['test'], batch_size=18)

f'There are {len(train_dataset)} samples for training, and {len(val_dataset)} samples for validation testing'

'There are 3874 samples for training, and 430 samples for validation testing'

In [26]:
!pip install pytorch_lightning

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [27]:
from transformers import MT5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl

class PoemMT5(pl.LightningModule):
    def __init__(self, lr=5e-5, num_train_epochs=15, warmup_steps=1000):
        super().__init__()
        self.model = MT5ForConditionalGeneration.from_pretrained(model_name_or_path)
        self.save_hyperparameters()

    def forward(self, input_ids, attention_mask, labels=None):     
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return outputs
    
    def common_step(self, batch, batch_idx):
        outputs = self(**batch)
        loss = outputs.loss
        return loss
      
    def training_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)
        self.log("training_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        self.log("validation_loss", loss, on_epoch=True)
        return loss

    def test_step(self, batch, batch_idx):
        loss = self.common_step(batch, batch_idx)     
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.hparams.lr)
        num_train_optimization_steps = self.hparams.num_train_epochs * len(train_dataloader)
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval':'step',
                        'frequency': 1}
        
        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    def train_dataloader(self):
        return train_dataloader

    def val_dataloader(self):
        return val_dataloader

    def test_dataloader(self):
        return test_dataloader

In [28]:
model = PoemMT5()

In [29]:
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor


early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    patience=3,
    strict=False,
    verbose=False,
    mode='min'
)
lr_monitor = LearningRateMonitor(logging_interval='step')

trainer = Trainer(accelerator='gpu',
                  default_root_dir="/content/Checkpoints",
                  callbacks=[early_stop_callback, lr_monitor])
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
  rank_zero_warn(
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                        | Params
------------------------------------------------------
0 | model | MT5ForConditionalGeneration | 300 M 
------------------------------------------------------
300 M     Trainable params
0         Non-trainable params
300 M     Total params
1,200.707 Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [30]:
from transformers import MT5ForConditionalGeneration

model.model.save_pretrained('./')
model = MT5ForConditionalGeneration.from_pretrained('./')

In [56]:
example = datasets["train"][16]
print(example)
input_ids = tokenizer(example['input'], return_tensors='pt').input_ids
res = model.generate(input_ids, num_return_sequences=1, num_beams=1)
tokenizer.decode(res[0], skip_special_tokens=False)

{'input': 'نی قصه\u200cی آن شمع چگل بتوان گفت <sep> نی حال دل سوخته دل بتوان گفت', 'label': 'غم در دل تنگ من از آن است که نیست <sep> یک دوست که با او غم دل بتوان گفت'}


'<pad><extra_id_0> ی چشم ی چشم ی <sep> <sep> <sep'

In [54]:
example = datasets["test"][1]
print(example)
input_ids = tokenizer(example['input'], return_tensors='pt').input_ids
res = model.generate(input_ids, num_return_sequences=1, num_beams=1)
print(tokenizer.decode(res[0], skip_special_tokens=False))

{'input': 'گفتم سخن تو، گفت حافظ گفتا <sep> شادی همه لطیفه گویان صلوات', 'label': 'ای کاش که بخت سازگاری کردی <sep> با جور زمانه یار یاری کردی'}
<pad><extra_id_0> حافظ حافظ حافظ حافظ حافظ حافظ حافظ <sep> <


### Evaluate

In [57]:
from nltk.translate.bleu_score import sentence_bleu

def cal_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    score = sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    return score

In [72]:
scores = []
outputs = []
for data in tqdm(datasets["test"]):
    input, label = data['input'], data['label']
    input_ids = tokenizer(input, return_tensors='pt').input_ids
    res = model.generate(input_ids, num_return_sequences=1, num_beams=1)
    output = tokenizer.decode(res[0], skip_special_tokens=False)

    outputs.append(output)
    score = cal_bleu(reference=label, candidate=output)
    scores.append(score)
  
round(np.average(scores), 3)


  0%|          | 0/499 [00:00<?, ?it/s][A
  0%|          | 1/499 [00:02<17:59,  2.17s/it][A
  0%|          | 2/499 [00:04<17:12,  2.08s/it][A
  1%|          | 3/499 [00:05<15:23,  1.86s/it][A
  1%|          | 4/499 [00:07<14:29,  1.76s/it][A
  1%|          | 5/499 [00:08<13:35,  1.65s/it][A
  1%|          | 6/499 [00:10<12:31,  1.52s/it][A
  1%|▏         | 7/499 [00:11<11:48,  1.44s/it][A
  2%|▏         | 8/499 [00:12<11:22,  1.39s/it][A
  2%|▏         | 9/499 [00:13<11:03,  1.35s/it][A
  2%|▏         | 10/499 [00:15<10:50,  1.33s/it][A
  2%|▏         | 11/499 [00:16<10:41,  1.32s/it][A
  2%|▏         | 12/499 [00:17<10:36,  1.31s/it][A
  3%|▎         | 13/499 [00:19<11:00,  1.36s/it][A
  3%|▎         | 14/499 [00:20<11:31,  1.42s/it][A
  3%|▎         | 15/499 [00:22<11:40,  1.45s/it][A
  3%|▎         | 16/499 [00:23<11:12,  1.39s/it][A
  3%|▎         | 17/499 [00:24<10:53,  1.36s/it][A
  4%|▎         | 18/499 [00:26<10:41,  1.33s/it][A
  4%|▍         | 19/499 [00:2

0.009

In [73]:
ind = np.argmax(scores)
verses_test[ind], verses_test[ind + 1], outputs[ind]

('هر آن که جانب اهل خدا نگه دارد <sep> خداش در همه حال از بلا نگه دارد',
 'حدیث دوست نگویم مگر به حضرت دوست <sep> که آشنا سخن آشنا نگه دارد',
 '<pad> <extra_id_0> نگه ندارد ز همه نگه دارد همه نگه دارد</s>')

In [74]:
scores[ind]

0.1384182521267043

Finitoo