In [None]:
%%capture
from transformers import AutoModel, AutoTokenizer

# The training cell feeds `outputs[0]` to a cross-entropy loss, so the model
# must return vocabulary logits; the headless `AutoModel` returns hidden
# states in that slot instead.
from transformers import AutoModelForCausalLM

model_name = "HooshvareLab/gpt2-fa"

# Load the pretrained checkpoint together with its language-modelling head,
# and the tokenizer that belongs to the same checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The former bare `model()` call was removed: a forward pass without any
# inputs raises, which aborted this cell.

In [None]:
%%capture
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
import string
from collections import Counter
import random

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

#قسمت الف

In [None]:
class Vocabulary:
    """Word-level vocabulary built from a poem text file.

    The file is read, its kept lines are joined pairwise into
    ``"first <sep> second"`` strings, and every whitespace-separated word
    that occurs at least ``threshold`` times receives an integer index.

    Attributes set by the constructor:
        lines: list of token lists, one per joined pair of lines.
        word2idx: mapping from word to index. Corpus words are numbered from
            1 in first-seen order, followed by the reserved tokens
            ``<pad>``, ``<sos>``, ``<eos>``, ``<unk>``, ``<sep>``.
            Index 0 is never assigned.
        idx2word: inverse mapping of ``word2idx``.

    Args:
        poem_path: path of the UTF-8 text file to read.
        threshold: minimum number of occurrences for a word to be kept.
    """

    def __init__(self, poem_path, threshold):
        self.poem_path = poem_path
        self.threshold = threshold
        self.load_poem()
        self.build_vocab()

    def load_poem(self):
        """Read the file and store the tokenised line pairs in ``self.lines``."""
        with open(self.poem_path, 'r', encoding='utf-8') as file:
            poem_lines = [line.strip() for line in file]

        # Skip the first two lines, then keep every second line.
        # NOTE(review): presumably the file has a two-line header and
        # alternates verse lines with filler lines - confirm against the data.
        poem_lines = poem_lines[2::2]

        # Lines are consumed two at a time, so drop an unpaired trailing line.
        if len(poem_lines) % 2 == 1:
            poem_lines = poem_lines[:-1]

        paired_lines = [
            f"{poem_lines[i]} <sep> {poem_lines[i + 1]}"
            for i in range(0, len(poem_lines), 2)
        ]

        # `punctuations` is a string, so the membership test below removes a
        # token only when the whole token occurs inside that string (e.g. a
        # lone punctuation mark); punctuation glued to a word is left as is.
        punctuations = string.punctuation + '«»،؛؟'
        self.lines = [
            [word.lower() for word in line.split() if word not in punctuations]
            for line in paired_lines
        ]

    def build_vocab(self):
        """Create ``word2idx`` / ``idx2word`` from ``self.lines``."""
        special_tokens = ['<pad>', '<sos>', '<eos>', '<unk>', '<sep>']
        reserved = set(special_tokens)

        word_counts = Counter(word for line in self.lines for word in line)

        # Reserved tokens are left out here: '<sep>' occurs in every line, so
        # it used to be numbered as an ordinary word and then numbered again
        # below, leaving an unused index and a largest index greater than
        # len(word2idx).
        frequent_words = [
            word
            for word, count in word_counts.items()
            if count >= self.threshold and word not in reserved
        ]

        self.word2idx = {word: idx for idx, word in enumerate(frequent_words, start=1)}

        # Reserved tokens take the indices right after the corpus words.
        for token in special_tokens:
            self.word2idx[token] = len(self.word2idx) + 1

        self.idx2word = {idx: word for word, idx in self.word2idx.items()}

# Build the vocabulary from the poem file, keeping words seen at least twice.
vocab = Vocabulary(poem_path='ferdousi.txt', threshold=2)

#قسمت ب

In [None]:
class FerdousiDataset(torch.utils.data.Dataset):
    """Dataset of index-encoded line pairs read from a poem file.

    Every row of ``self.poem`` is one ``"first <sep> second"`` pair encoded
    as ``<sos> tokens... <eos>`` and right-padded with ``<pad>`` so that all
    rows share one length. Item ``idx`` is the tuple
    ``(row idx, row idx + 1)``, i.e. a row and the row that follows it.

    NOTE: a later cell of this notebook defines another class with this same
    name; once that cell runs, it replaces this definition.

    Args:
        poem_path: path of the UTF-8 text file to read.
        vocab: object exposing a ``word2idx`` dict that contains at least
            ``<pad>``, ``<sos>``, ``<eos>``, ``<unk>`` and ``<sep>``.

    Raises:
        ValueError: if the file yields no line pairs.
    """

    def __init__(self, poem_path, vocab):
        self.poem_path = poem_path
        self.vocab = vocab
        self.load_poem()

    def __len__(self):
        # The last row has no successor, so it cannot start an item.
        return len(self.poem) - 1

    def __getitem__(self, idx):
        return self.poem[idx], self.poem[idx + 1]

    def load_poem(self):
        """Read, tokenise, frame, pad and index-encode the poem."""
        with open(self.poem_path, 'r', encoding='utf-8') as file:
            poem_lines = [line.strip() for line in file]

        # Same line selection as Vocabulary.load_poem: skip the first two
        # lines, keep every second line, drop an unpaired trailing line.
        poem_lines = poem_lines[2::2]
        if len(poem_lines) % 2 == 1:
            poem_lines = poem_lines[:-1]

        punctuations = string.punctuation + '«»،؛؟'

        # Tokenise on whitespace and lowercase, exactly as Vocabulary does,
        # so tokens can be found in vocab.word2idx. The pretrained
        # `tokenizer` is not used: its return value is an encoding object,
        # and iterating over it produced field names rather than words,
        # which turned every token into '<unk>'.
        token_lines = []
        for i in range(0, len(poem_lines), 2):
            paired = f"{poem_lines[i]} <sep> {poem_lines[i + 1]}"
            tokens = [
                word.lower()
                for word in paired.split()
                if word not in punctuations and len(word) > 1
            ]
            if tokens:
                token_lines.append(tokens)

        if not token_lines:
            raise ValueError(f"no line pairs could be read from {self.poem_path!r}")

        # Longest content length, not counting <sos>/<eos>.
        self.max_len = max(len(tokens) for tokens in token_lines)

        # <eos> goes directly after the content and padding comes last, so
        # <eos> marks where the real sequence ends. Row length is
        # max_len + 2 for every row.
        framed_lines = [
            ['<sos>'] + tokens + ['<eos>'] + ['<pad>'] * (self.max_len - len(tokens))
            for tokens in token_lines
        ]

        unk_idx = self.vocab.word2idx['<unk>']
        encoded = [
            [self.vocab.word2idx.get(word, unk_idx) for word in tokens]
            for tokens in framed_lines
        ]
        self.poem = torch.tensor(encoded).long()


dataset = FerdousiDataset('ferdousi.txt', vocab)
# Report the size; printing the object itself only shows its default repr.
print(f"dataset size: {len(dataset)}")

# 80/20 train/test split with a fixed seed so the split is the same on
# every run of the notebook.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
print(f"train size: {len(train_dataset)}, test size: {len(test_dataset)}")

batch_size = 128
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Evaluation does not depend on sample order, so the test loader is not
# shuffled; this keeps test batches identical between runs.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
class FerdousiDataset(torch.utils.data.Dataset):
    """Dataset of index-encoded line pairs read from a poem file.

    Consecutive lines of the file are paired, tokenised with
    ``word_tokenize`` and joined by a ``<sep>`` token. Every row of
    ``self.poem`` is one pair encoded as ``<sos> tokens... <eos>`` and
    right-padded with ``<pad>`` so that all rows share one length. Item
    ``idx`` is the tuple ``(row idx, row idx + 1)``.

    Attributes set by ``load_poem``:
        max_len: longest content length, not counting ``<sos>``/``<eos>``.
        poem: ``torch.long`` tensor with ``max_len + 2`` columns.

    Args:
        poem_path: path of the UTF-8 text file to read.
        vocab: object exposing a ``word2idx`` dict that contains at least
            ``<pad>``, ``<sos>``, ``<eos>``, ``<unk>`` and ``<sep>``.

    Raises:
        ValueError: if the file yields no line pairs.
    """

    def __init__(self, poem_path, vocab):
        self.poem_path = poem_path
        self.vocab = vocab
        self.load_poem()

    def __len__(self):
        # The last row has no successor, so it cannot start an item.
        return len(self.poem) - 1

    def __getitem__(self, idx):
        return self.poem[idx], self.poem[idx + 1]

    def load_poem(self):
        """Read, tokenise, frame, pad and index-encode the poem."""
        with open(self.poem_path, 'r', encoding='utf-8') as f:
            poem = [line.strip() for line in f]

        # Skip the first two lines and drop an unpaired trailing line.
        # NOTE(review): Vocabulary.load_poem keeps only every second line
        # (`[2::2]`) while this keeps all of them (`[2:]`); one of the two
        # presumably does not match the file layout - confirm against the data.
        poem = poem[2:]
        if len(poem) % 2 == 1:
            poem = poem[:-1]

        punctuations = string.punctuation + '«»،؛؟'

        def clean_tokens(text):
            # Lowercase like Vocabulary does; drop tokens that occur inside
            # the punctuation string and tokens of a single character.
            return [
                word.lower()
                for word in word_tokenize(text)
                if word not in punctuations and len(word) > 1
            ]

        # Each half is tokenised on its own and '<sep>' is inserted as a
        # ready-made token. Running word_tokenize over the joined string
        # split '<sep>' into '<', 'sep', '>', so the separator was lost and
        # the leftover 'sep' was encoded as '<unk>'.
        token_lines = [
            clean_tokens(poem[i]) + ['<sep>'] + clean_tokens(poem[i + 1])
            for i in range(0, len(poem), 2)
        ]

        if not token_lines:
            raise ValueError(f"no line pairs could be read from {self.poem_path!r}")

        self.max_len = max(len(tokens) for tokens in token_lines)

        # <eos> goes directly after the content and padding comes last, so
        # <eos> marks where the real sequence ends.
        framed_lines = [
            ['<sos>'] + tokens + ['<eos>'] + ['<pad>'] * (self.max_len - len(tokens))
            for tokens in token_lines
        ]

        # Map words to indices; anything missing from the vocabulary
        # becomes '<unk>'.
        unk_idx = self.vocab.word2idx['<unk>']
        encoded = [
            [self.vocab.word2idx.get(word, unk_idx) for word in tokens]
            for tokens in framed_lines
        ]
        self.poem = torch.tensor(encoded).long()

vocab = Vocabulary('ferdousi.txt', 2)
# Create a dataset object.
dataset = FerdousiDataset('ferdousi.txt', vocab)
# Split the dataset into train and test (80/20). The generator is seeded so
# the split is the same on every run of the notebook.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42),
)
# Create a dataloader for train and test. Only the training data is
# shuffled; evaluation does not depend on sample order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

#قسمت ج

In [None]:
from torch.optim import Adam

# Adam over all parameters of the loaded model, learning rate 5e-5.
optimizer = Adam(params=model.parameters(), lr=5e-5)

In [None]:
from transformers import get_scheduler
num_epochs = 3
# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (number of epochs) steps.
num_training_steps = len(train_loader) * num_epochs
# "linear" schedule with no warm-up steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
from tqdm.auto import tqdm

model.to(device)
model.train()

progress_bar = tqdm(range(num_training_steps))

# The loss module has its own name. It used to be called `loss` and was
# overwritten by the first batch's loss tensor, so the second batch failed
# with "Tensor object is not callable".
# Padding positions are included in the loss (no ignore_index is set).
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        outputs = model(inputs)

        # assumes outputs[0] holds per-position scores shaped
        # (batch, seq_len, num_classes) - TODO confirm. This requires a model
        # with a language-modelling head; a headless model puts hidden
        # states in this slot.
        # NOTE(review): the labels are indices from the custom Vocabulary,
        # while the model was pretrained with its own tokenizer's ids -
        # verify that the two id spaces are meant to be mixed.
        logits = outputs[0]

        # CrossEntropyLoss expects (N, C) scores with (N,) targets, so the
        # batch and sequence dimensions are flattened together.
        loss = criterion(logits.reshape(-1, logits.size(-1)), labels.reshape(-1))

        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)