In [1]:
from google.colab import drive
import os

drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/Colab Notebooks/ferdousi.txt'

# Function to process the dataset
def process_poem_dataset(file_path):
    # Read the file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = file.readlines()

    # Remove metadata and pair each two lines as a single beyt
    beyts = [''.join(data[i:i+2]) for i in range(2, len(data), 2)]

    # Create input-output pairs
    X = beyts[:-1]  # All beyts except the last
    Y = beyts[1:]   # All beyts except the first

    # Splitting the data into training and testing sets (80-20 split)
    split_index = int(0.8 * len(X))
    X_train, Y_train = X[:split_index], Y[:split_index]
    X_test, Y_test = X[split_index:], Y[split_index:]

    return X_train, Y_train, X_test, Y_test

# Process the dataset
X_train, Y_train, X_test, Y_test = process_poem_dataset(file_path)

# Optionally, display the first few pairs of training and testing data
print("Training Data Example X:", X_train[0])
print("Training Data Example Y:", Y_train[0])
print("Testing Data Example X:", X_test[0])
print("Testing Data Example Y:", Y_test[0])

Mounted at /content/drive
Training Data Example X: به نام خداوند جان و خرد
کزین برتر اندیشه برنگذرد

Training Data Example Y: خداوند نام و خداوند جای
خداوند روزی ده رهنمای

Testing Data Example X: به آرایش چهره و فر و زیب
نباید که گیرندت اندر فریب

Testing Data Example Y: پس پردهٔ او بسی درخترست
که با فر و بالا و با افسرست



In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from torch.utils.data import DataLoader, TensorDataset
import torch

# 1. Load the pre-trained model and tokenizer
model_name = "HooshvareLab/gpt2-fa"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
# 2. Tokenize the data
def tokenize_data(texts, tokenizer, max_length=512):
    return tokenizer(texts, add_special_tokens=True, truncation=True, padding='max_length', max_length=max_length, return_tensors="pt")

tokenized_X_train = tokenize_data(X_train, tokenizer)
tokenized_X_test = tokenize_data(X_test, tokenizer)

# 3. Prepare datasets and create dataloaders
def create_dataset(tokenized_texts):
    input_ids = tokenized_texts['input_ids']
    attention_mask = tokenized_texts['attention_mask']
    return TensorDataset(input_ids, attention_mask)

train_dataset = create_dataset(tokenized_X_train)
test_dataset = create_dataset(tokenized_X_test)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/875k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/14.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/104 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/485M [00:00<?, ?B/s]

In [3]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch

# Check if GPU is available and use it if possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Setting up the optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Number of training epochs (authors recommend between 2 and 4)
epochs = 1

# Total number of training steps is number of batches * number of epochs
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0, # Default value
                                            num_training_steps=total_steps)

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Move batch to device
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[0]}  # For GPT-2, input_ids are used as labels for language modeling

        # Clear previously calculated gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(**inputs)
        loss = outputs.loss

        # Backward pass
        loss.backward()

        # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update parameters and learning rate
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    # Calculate average loss over the training data
    avg_train_loss = total_loss / len(train_dataloader)

    print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss}")

# Saving the fine-tuned model
model_save_path = "/content/drive/MyDrive/Colab Notebooks/gpt2-fa-finetuned.pth"
torch.save(model.state_dict(), model_save_path)





Epoch 1/1 | Train Loss: 0.14608172941979225


In [4]:
model.eval()

# Disable gradient calculations
with torch.no_grad():
    total_loss = 0

    # Iterate over test data
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[0]}

        # Forward pass, calculate loss
        outputs = model(**inputs)
        loss = outputs.loss
        total_loss += loss.item()

    # Calculate the average loss
    avg_loss = total_loss / len(test_dataloader)

# Calculate the perplexity
perplexity = torch.exp(torch.tensor(avg_loss))

print(f"Perplexity on test data: {perplexity}")


Perplexity on test data: 1.1362073421478271


In [18]:
def generate_poems(model, tokenizer, prompts, max_length=50, num_return_sequences=1):
    poems = []

    for prompt in prompts:
        # Encode the prompt text
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to(device)

        # Generate text
        generated_text = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            no_repeat_ngram_size=2,
            early_stopping=True,
            top_p=0.95,
            top_k=50
        )

        # Decode the generated text
        generated_poem = tokenizer.decode(generated_text[0], skip_special_tokens=True)
        poems.append(generated_poem)

    return poems

# List of prompts
prompts = [
    "خود و سرکشان سوی جیحون کشید",
    "چنین داد پاسخ که چرخ بلند",
    "چنان دان که این پیرسر پهلوان",
    "دو منزل یکی کرد و آمد دوان",
    "سپه بودش از جنگیان صدهزار",
    "به شاه جهان گفت کای شهریار",
    "بر آن سان کجا بردمد روز جنگ",
    "ابر شاه پیران گرفت آفرین",
    "تو دانسته ای درد و تیمار من"
]

# Generate poems for each prompt
poems = generate_poems(model, tokenizer, prompts)

# Display the poems
for i, poem in enumerate(poems):
    print(f"Prompt: {prompts[i]}")
    print(poem)
    print("\n" + "-"*30 + "\n")



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.
The attentio

Prompt: خود و سرکشان سوی جیحون کشید
خود و سرکشان سوی جیحون کشید
به نزدیک آن مرز و بوم و بر کشید

------------------------------

Prompt: چنین داد پاسخ که چرخ بلند
چنین داد پاسخ که چرخ بلند
که با من چه آید به جنگ و جنگ


------------------------------

Prompt: چنان دان که این پیرسر پهلوان
چنان دان که این پیرسر پهلوان
که با او به جنگ اندر آید به راه


------------------------------

Prompt: دو منزل یکی کرد و آمد دوان
دو منزل یکی کرد و آمد دوان
به نزدیک او آمد و شد به راه


------------------------------

Prompt: سپه بودش از جنگیان صدهزار
سپه بودش از جنگیان صدهزار
که از جنگ برگشته بود از کارزار


------------------------------

Prompt: به شاه جهان گفت کای شهریار
به شاه جهان گفت کای شهریار
که ای شاه پیروز و پیروزگر


------------------------------

Prompt: بر آن سان کجا بردمد روز جنگ
بر آن سان کجا بردمد روز جنگ
که از جنگ او بر آید به جنگ سپاه


------------------------------

Prompt: ابر شاه پیران گرفت آفرین
ابر شاه پیران گرفت آفرین
که از تو بر تو گردد به کردار باد


----------------------