**Q3)**

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sklearn.model_selection import train_test_split


In [None]:
# Mount Google Drive into the Colab filesystem so files under MyDrive can be opened by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Read the corpus as a list of raw lines.
dataset_path = "/content/drive/MyDrive/DeepHW4/Q3/ferdousi.txt"
with open(dataset_path, 'r', encoding='utf-8') as f:
    dataset = f.readlines()

# Join every two consecutive lines (0-1, 2-3, ...) into one stripped string.
# zip() drops a trailing unpaired line, so only complete pairs are kept.
couplets = [
    first.strip() + " " + second.strip()
    for first, second in zip(dataset[0::2], dataset[1::2])
]

# Each joined pair is the model input and the pair that follows it is the desired output,
# so inputs are all pairs except the last and outputs are all pairs except the first.
input_poems = couplets[:-1]
output_poems = couplets[1:]

# Show the first training example as a sanity check.
print(input_poems[0])  # Example input
print(output_poems[0])  # Example output


In [None]:
class shahNamehDataset(Dataset):
    """Paired text dataset yielding ``(input_text, target_text)`` string tuples.

    Args:
        input_poem: sequence of source strings.
        output_poem: sequence of target strings, aligned index-by-index with ``input_poem``.
        tokenizer: stored on the instance as ``self.tokenizer``; ``__getitem__`` does not
            use it and returns raw text.

    Raises:
        ValueError: if ``input_poem`` and ``output_poem`` have different lengths.
    """

    def __init__(self, input_poem,output_poem, tokenizer):
        # __len__ reports only the input length, so a mismatch would otherwise surface
        # later as an IndexError during iteration (or silently drop trailing targets).
        if len(input_poem) != len(output_poem):
            raise ValueError(
                f"input_poem and output_poem must have the same length, "
                f"got {len(input_poem)} and {len(output_poem)}"
            )
        self.input_poem = input_poem
        self.output_poem = output_poem
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of (input, target) pairs."""
        return len(self.input_poem)

    def __getitem__(self, idx):
        """Return the whitespace-stripped ``(input, target)`` strings at position ``idx``."""
        poem_in = self.input_poem[idx].strip()
        poem_out = self.output_poem[idx].strip()
        return poem_in, poem_out


tokenizer = GPT2Tokenizer.from_pretrained("HooshvareLab/gpt2-fa")  # Load the pre-trained tokenizer


# Split the two aligned lists together (same seed and test fraction as before), then wrap
# each split, so that train_dataset and test_dataset are both shahNamehDataset instances.
# train_test_split returns, for every array passed in, its train part followed by its test part.
train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
    input_poems, output_poems, test_size=0.2, random_state=42
)
train_dataset = shahNamehDataset(train_inputs, train_outputs, tokenizer)
test_dataset = shahNamehDataset(test_inputs, test_outputs, tokenizer)


batch_size = 64


# Only the training loader is shuffled; the test loader keeps a fixed order.
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_data_loader =  DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# The tokenizer needs a padding token before batches can be padded; reuse end-of-sequence.
tokenizer.pad_token = tokenizer.eos_token

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained "HooshvareLab/gpt2-fa" checkpoint, place it on the chosen device
# and put it in training mode.
model = GPT2LMHeadModel.from_pretrained("HooshvareLab/gpt2-fa")
model.to(device)
model.train()

# Optimizer over all model parameters.
learning_rate = 1e-5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

# NOTE(review): the training cell reads the loss from the model output (`outputs.loss`),
# so this criterion appears unused in the visible cells; kept in case other cells use it.
loss_fn = torch.nn.CrossEntropyLoss()


In [None]:
max_length = 15  # token budget applied separately to the prompt couplet and the target couplet
epochs = 100


def build_causal_lm_batch(prompts, continuations):
    """Build one padded causal-LM batch from aligned prompt/continuation strings.

    GPT2LMHeadModel shifts `labels` internally and therefore expects them to be aligned
    token-for-token with `input_ids`. Each example is encoded as a single sequence
    `prompt + " " + continuation`; the loss is restricted to the continuation by setting
    the labels of prompt and padding positions to -100 (the index ignored by the loss).

    Args:
        prompts: iterable of input couplet strings.
        continuations: iterable of target couplet strings, aligned with `prompts`.

    Returns:
        (inputs, labels): `inputs` is a dict holding `input_ids` and `attention_mask`,
        `labels` is a LongTensor of the same shape; all tensors are on `device`.
    """
    encoded = []
    for prompt, continuation in zip(prompts, continuations):
        prompt_ids = tokenizer(
            prompt, add_special_tokens=False, truncation=True, max_length=max_length
        ).input_ids
        # The leading space keeps the continuation's first word tokenized the same way
        # it would be when it follows the prompt in running text.
        target_ids = tokenizer(
            " " + continuation, add_special_tokens=False, truncation=True, max_length=max_length
        ).input_ids
        encoded.append((prompt_ids, target_ids))

    width = max(len(p) + len(t) for p, t in encoded)
    # The pad value itself is irrelevant: padded positions are hidden from attention
    # (attention_mask == 0) and from the loss (label == -100).
    input_ids = torch.full((len(encoded), width), tokenizer.eos_token_id, dtype=torch.long)
    attention_mask = torch.zeros((len(encoded), width), dtype=torch.long)
    labels = torch.full((len(encoded), width), -100, dtype=torch.long)
    for row, (prompt_ids, target_ids) in enumerate(encoded):
        start, end = len(prompt_ids), len(prompt_ids) + len(target_ids)
        input_ids[row, :end] = torch.tensor(prompt_ids + target_ids, dtype=torch.long)
        attention_mask[row, :end] = 1
        labels[row, start:end] = torch.tensor(target_ids, dtype=torch.long)

    inputs = {"input_ids": input_ids.to(device), "attention_mask": attention_mask.to(device)}
    return inputs, labels.to(device)


for epoch in range(epochs):
    model.train()
    train_loss = 0
    print('epoch',epoch)
    for batch_in, batch_out in train_data_loader:
        # Encode prompt and target as one sequence with the loss masked to the target part
        inputs, labels = build_causal_lm_batch(batch_in, batch_out)

        # Forward pass (the model computes the shifted cross-entropy loss itself)
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    print(f"Epoch {epoch + 1}/{epochs} - Loss: {train_loss / len(train_data_loader):.4f}")


In [None]:
model.eval() #Setting the model to evaluation mode

tokenizer.pad_token_id = tokenizer.eos_token_id
input_text = "ریاضی را تو الحق خوب دانی"  # Generate poem using the trained model
input_ids = tokenizer(input_text, return_tensors="pt").to(device)  # holds input_ids and attention_mask
# generate() does not read the tokenizer's pad token, so pass pad_token_id explicitly;
# otherwise it falls back to eos_token_id and emits a warning on every call.
# NOTE: max_length counts the prompt tokens too, so only (11 - prompt length) new tokens are produced.
outputs = model.generate(**input_ids, max_length=11, pad_token_id=tokenizer.eos_token_id)

# Decode every returned sequence back to text, keep it, and print it.
generated_poems = []
for output in outputs:
    poem = tokenizer.decode(output, skip_special_tokens=True )
    generated_poems.append(poem)
    print(poem)

Setting `pad_token_id` to `eos_token_id`:5 for open-end generation.


ریاضی را تو الحق خوب دانی و خوب دانی تو خوب


Ellipsis