In [1]:
!pip install -qU transformers

In [2]:
import json
import os
import glob

import pandas as pd
import numpy as np

from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelWithLMHead
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config

from IPython import display
from torch.utils.data import DataLoader, random_split, Dataset
from tqdm import tqdm

In [3]:
model_name_or_path = "HooshvareLab/gpt2-fa"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    bos_token='<s>',
    eos_token='</s>',
    pad_token='<pad>',
    unk_token='<unk>'
)

tokenizer.add_special_tokens({
    "bos_token": '</s>',
    "eos_token": '</s>',
    "pad_token": '<pad>',
    "unk_token": '<unk>'
})

config = AutoConfig.from_pretrained(
    model_name_or_path,
    bos_token_id=tokenizer("<s>")["input_ids"][0],
    eos_token_id=tokenizer("</s>")["input_ids"][0],
    pad_token_id=tokenizer("<pad>")["input_ids"][0],
    unk_token_id=tokenizer("<unk>")["input_ids"][0],
)

tokenizer.save_pretrained("/content/gpt2/")
config.save_pretrained("/content/gpt2/")

!wget "https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/pytorch_model.bin" -P /content/gpt2/
!wget "https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/tokenizer.json" -P /content/gpt2/

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


--2024-01-21 16:55:24--  https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/pytorch_model.bin
Resolving huggingface.co (huggingface.co)... 18.164.174.118, 18.164.174.17, 18.164.174.55, ...
Connecting to huggingface.co (huggingface.co)|18.164.174.118|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/HooshvareLab/gpt2-fa/46b0b806c740a0f0a9f056f5574c5fa896166fe844945fd3c849bf34365e5060?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1706115324&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNjExNTMyNH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9Ib29zaHZhcmVMYWIvZ3B0Mi1mYS80NmIwYjgwNmM3NDBhMGYwYTlmMDU2ZjU1NzRjNWZhODk2MTY2ZmU4NDQ5NDVmZDNjODQ5YmYzNDM2NWU1MDYwP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_

In [4]:
class Dataset(Dataset):
  def __init__(self, data_path, tokenizer, max_length=16):

    self.tokenizer = tokenizer
    self.max_length = max_length
    with open(data_path, "r") as file:
      lines = file.readlines()
    self.lines = lines[2:]

  def __len__(self):
    return len(self.lines)

  def __getitem__(self,idx):
      input_poem = self.lines[idx].removesuffix("\n")
      input = self.tokenizer('<s>' + input_poem + '</s>',
                             max_length=self.max_length,
                             truncation=True,
                             padding='max_length',
                             return_tensors='pt')
      return input['input_ids'], input['attention_mask']

In [5]:
with open('ferdousi.txt', 'r', encoding='utf-8') as f:
    texts = [line.strip() for line in f.readlines()]

In [6]:
from torch.utils.data import random_split
from torch.utils.data import Subset
dataset = Dataset('ferdousi.txt', tokenizer, max_length=16)

train_ratio = 0.7
test_ratio = 0.3

train_size = int(train_ratio * len(dataset))
test_size = len(dataset) - train_size

train_indices = range(0, train_size)
test_indices = range(train_size , len(dataset))

train_dataset = Subset(dataset, train_indices)
test_dataset = Subset(dataset, test_indices)

In [7]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    train_dataset,
    batch_size=8
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=8
)

In [8]:
%%capture
from transformers import AutoModelForCausalLM

model_name = "HooshvareLab/gpt2-fa"

model = AutoModelForCausalLM.from_pretrained(model_name)

In [9]:
import random

model.resize_token_embeddings(len(tokenizer))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

seed_val = 49

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [11]:
from transformers import AdamW

optimizer = AdamW(
    model.parameters(),
    lr=5e-4,
    eps=1e-8
)



In [13]:
from transformers import get_linear_schedule_with_warmup
epochs = 10
total_steps = len(train_dataloader) * epochs
warmup_steps = 1e2
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps)


In [15]:
import torch.nn as nn

for epoch in range(epochs):

    total_train_loss = 0

    model.train()
    with tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}', unit='batch', leave=False) as epoch_progress:
        for _ , batch in enumerate(epoch_progress):

            inputs = batch[0].to(device)
            masks = batch[1].to(device)

            model.zero_grad()

            outputs = model(input_ids=inputs.squeeze(dim=1), labels=inputs.squeeze(dim=1), attention_mask=masks.squeeze(dim=1))
            loss = outputs.loss
            batch_loss = loss.item()
            total_train_loss += batch_loss

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)
        perplexity = torch.exp(torch.tensor(avg_train_loss, device=device))

        print()
        print(f'Average Training Loss: {avg_train_loss}. Perplexity Loss of Training {perplexity}')
        print()





Average Training Loss: 2.104804662509163. Perplexity Loss of Training 8.205500602722168






Average Training Loss: 1.849963004826346. Perplexity Loss of Training 6.359583854675293






Average Training Loss: 1.641116786465835. Perplexity Loss of Training 5.160930156707764






Average Training Loss: 1.43403081158234. Perplexity Loss of Training 4.1955766677856445






Average Training Loss: 1.2391846383930598. Perplexity Loss of Training 3.4527971744537354






Average Training Loss: 1.074660072449412. Perplexity Loss of Training 2.928997039794922






Average Training Loss: 0.9467360862277395. Perplexity Loss of Training 2.5772838592529297






Average Training Loss: 0.8467941443660901. Perplexity Loss of Training 2.332158327102661






Average Training Loss: 0.7738240505522557. Perplexity Loss of Training 2.1680409908294678



                                                                   


Average Training Loss: 0.8539439885332353. Perplexity Loss of Training 2.3488924503326416





In [16]:
def generate(prompt,model,tokenizer):
    result = tokenizer(text='<s>' + prompt + '</s>',return_tensors='pt').to(device)
    model.eval()
    temp=len(prompt)
    output_ids = model.generate(result.input_ids,
                                attention_mask=result.attention_mask,
                                min_length=int(temp/2),
                                max_length=temp-2,
                                no_repeat_ngram_size=1,
                                num_beams=50)
    poem = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return poem[temp:]


In [18]:
prompt = 'که برداشت  او دیپ با فاطمی'
generated_poem = generate(prompt,model, tokenizer)
print("Prompt Poem:")
print(prompt)
print("Generated Poem:")
print(generated_poem)

Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


Prompt Poem:
که برداشت  او دیپ با فاطمی
Generated Poem:
 غنیمتاردند ستردان و چکاووش مرد سنگ اکوان


In [19]:
prompt = 'چه آمد برویش که ما را بخواست'
generated_poem = generate(prompt,model, tokenizer)
print("Prompt Poem:")
print(prompt)
print("Generated Poem:")
print(generated_poem)

Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


Prompt Poem:
چه آمد برویش که ما را بخواست
Generated Poem:
 غنیمت ستمکقون به بزد مهان چو شیر مرد دلیر


In [20]:
prompt = 'نهادیم بر چرخ گردنده زین'
generated_poem = generate(prompt,model, tokenizer)
print("Prompt Poem:")
print(prompt)
print("Generated Poem:")
print(generated_poem)

Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


Prompt Poem:
نهادیم بر چرخ گردنده زین
Generated Poem:
گذلیجغ ویسه به هر سه بدخواهش


In [21]:
prompt = 'چو بار درخت وفا را بدید'
generated_poem = generate(prompt,model, tokenizer)
print("Prompt Poem:")
print(prompt)
print("Generated Poem:")
print(generated_poem)

Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


Prompt Poem:
چو بار درخت وفا را بدید
Generated Poem:
ذ،ج ستو و گفت زهش به ایوان نو


In [22]:
model.eval()
test_loss = 0
with tqdm(test_dataloader, unit='batch', leave=False) as epoch_progress:
    for _ , batch in enumerate(epoch_progress):

        inputs = batch[0].to(device)
        masks = batch[1].to(device)

        outputs = model(input_ids=inputs.squeeze(dim=1), labels=inputs.squeeze(dim=1), attention_mask=masks.squeeze(dim=1))
        loss = outputs.loss
        batch_loss = loss.item()
        test_loss += batch_loss

    avg_loss = test_loss / len(test_dataloader)
    perplexity = torch.exp(torch.tensor(avg_loss, device=device))
    print(f'Perplexity of Test Data: {perplexity}.')
    print()

                                                      

Perplexity of Test Data: 24.899248123168945.



