In [3]:

!git clone https://github.com/malipourha/Persian-Wikipedia-Dataset.git


%cd Persian-Wikipedia-Dataset


fatal: destination path 'Persian-Wikipedia-Dataset' already exists and is not an empty directory.
/content/Persian-Wikipedia-Dataset


In [4]:
!ls


README.md


In [10]:

!git clone https://github.com/malipourha/Persian-Wikipedia-Dataset.git


%cd Persian-Wikipedia-Dataset

# Define file paths
file_paths = [
    'README.md'
]


import os
import torch
from torch.utils.data import Dataset, DataLoader
import re
from joblib import Parallel, delayed

class PersianWikipediaDataset(Dataset):
    def __init__(self, file_paths, seq_length):
        self.text = self._read_files_in_parallel(file_paths)
        self.text = re.sub(r'\s+', ' ', self.text)  
        self.vocab = sorted(set(self.text))
        self.vocab_size = len(self.vocab)
        self.char_to_idx = {char: idx for idx, char in enumerate(self.vocab)}
        self.idx_to_char = {idx: char for idx, char in enumerate(self.vocab)}
        self.seq_length = seq_length
        self.data = self._create_sequences()

    def _read_files_in_parallel(self, file_paths):
        texts = Parallel(n_jobs=-1)(delayed(self._read_file)(file_path) for file_path in file_paths)
        return ''.join(texts)

    def _read_file(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

    def _create_sequences(self):
        sequences = []
        next_chars = []
        for i in range(0, len(self.text) - self.seq_length):
            sequences.append(self.text[i:i + self.seq_length])
            next_chars.append(self.text[i + self.seq_length])

        X = torch.zeros((len(sequences), self.seq_length), dtype=torch.long)
        y = torch.zeros((len(sequences),), dtype=torch.long)

        for i, seq in enumerate(sequences):
            for j, char in enumerate(seq):
                X[i, j] = self.char_to_idx[char]
            y[i] = self.char_to_idx[next_chars[i]]

        return X, y

    def __len__(self):
        return len(self.data[0])

    def __getitem__(self, idx):
        return self.data[0][idx], self.data[1][idx]

import torch.nn as nn
import torch.optim as optim

class TextGenerationModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(TextGenerationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        return (weight.new(self.lstm.num_layers, batch_size, self.lstm.hidden_size).zero_(),
                weight.new(self.lstm.num_layers, batch_size, self.lstm.hidden_size).zero_())

def train_model(model, dataloader, num_epochs, lr=0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    for epoch in range(num_epochs):
        hidden = model.init_hidden(dataloader.batch_size)

        for inputs, targets in dataloader:
            batch_size = inputs.size(0)
            hidden = model.init_hidden(batch_size)  
            inputs, targets = inputs.to(device), targets.to(device)
            hidden = tuple([each.data for each in hidden])

            model.zero_grad()
            output, hidden = model(inputs, hidden)
            loss = criterion(output, targets)
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

def split_files(file_paths, batch_size):
    for i in range(0, len(file_paths), batch_size):
        yield file_paths[i:i + batch_size]


batch_size = 2
seq_length = 100
num_epochs = 10
embedding_dim = 256
hidden_dim = 512
num_layers = 2


batches = list(split_files(file_paths, batch_size))


vocab_size = len(set(open(file_paths[0], 'r', encoding='utf-8').read()))
model = TextGenerationModel(vocab_size, embedding_dim, hidden_dim, num_layers)


for batch in batches:
    dataset = PersianWikipediaDataset(batch, seq_length)
    dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

    
    train_model(model, dataloader, num_epochs)


Cloning into 'Persian-Wikipedia-Dataset'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects:   3% (1/27)[Kremote: Counting objects:   7% (2/27)[Kremote: Counting objects:  11% (3/27)[Kremote: Counting objects:  14% (4/27)[Kremote: Counting objects:  18% (5/27)[Kremote: Counting objects:  22% (6/27)[Kremote: Counting objects:  25% (7/27)[Kremote: Counting objects:  29% (8/27)[Kremote: Counting objects:  33% (9/27)[Kremote: Counting objects:  37% (10/27)[Kremote: Counting objects:  40% (11/27)[Kremote: Counting objects:  44% (12/27)[Kremote: Counting objects:  48% (13/27)[Kremote: Counting objects:  51% (14/27)[Kremote: Counting objects:  55% (15/27)[Kremote: Counting objects:  59% (16/27)[Kremote: Counting objects:  62% (17/27)[Kremote: Counting objects:  66% (18/27)[Kremote: Counting objects:  70% (19/27)[Kremote: Counting objects:  74% (20/27)[Kremote: Counting objects:  77% (21/27)[Kremote: Counting objects:  81% (22/27)[K

In [11]:
import math
import numpy as np
from tqdm import tqdm

def evaluate_perplexity(model, dataloader):
    model.eval()
    total_loss = 0.0
    total_words = 0
    criterion = nn.CrossEntropyLoss()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    with torch.no_grad():
        for inputs, targets in tqdm(dataloader):
            batch_size = inputs.size(0)
            hidden = model.init_hidden(batch_size)
            inputs, targets = inputs.to(device), targets.to(device)
            hidden = tuple([each.data for each in hidden])

            output, hidden = model(inputs, hidden)
            loss = criterion(output, targets)
            total_loss += loss.item() * batch_size
            total_words += batch_size * inputs.size(1)

    perplexity = math.exp(total_loss / total_words)
    return perplexity


test_dataloader = DataLoader(dataset, batch_size=64, shuffle=False)
perplexity = evaluate_perplexity(model, test_dataloader)
print(f'Perplexity: {perplexity}')


100%|██████████| 31/31 [00:37<00:00,  1.21s/it]

Perplexity: 1.0076975704200648





In [12]:


from rouge_score import rouge_scorer

def evaluate_rouge(model, dataloader, idx_to_char, num_samples=100):
    model.eval()
    generated_texts = []
    reference_texts = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    with torch.no_grad():
        for inputs, targets in tqdm(dataloader):
            batch_size = inputs.size(0)
            hidden = model.init_hidden(batch_size)
            inputs, targets = inputs.to(device), targets.to(device)
            hidden = tuple([each.data for each in hidden])

            output, hidden = model(inputs, hidden)
            output_indices = torch.argmax(output, dim=1).cpu().numpy()
            generated_texts.append(''.join([idx_to_char[idx] for idx in output_indices]))
            reference_texts.append(''.join([idx_to_char[idx] for idx in targets.cpu().numpy()]))

            if len(generated_texts) >= num_samples:
                break

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = []
    for gen_text, ref_text in zip(generated_texts, reference_texts):
        scores.append(scorer.score(ref_text, gen_text))

    avg_scores = {metric: np.mean([score[metric].fmeasure for score in scores]) for metric in scores[0]}
    return avg_scores


rouge_scores = evaluate_rouge(model, test_dataloader, dataset.idx_to_char)
print(f'ROUGE Scores: {rouge_scores}')


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=974bb1487f3c9a71cd64ccacf3124223548685f016e6b5a465a219e76099d56f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


100%|██████████| 31/31 [00:39<00:00,  1.27s/it]

ROUGE Scores: {'rouge1': 0.18723793737818983, 'rouge2': 0.14229390681003581, 'rougeL': 0.18723793737818983}





In [13]:

persian_file_paths = [
    'README.md'
]

num_epochs_finetuning = 10

persian_dataset = PersianWikipediaDataset(persian_file_paths, seq_length)
persian_dataloader = DataLoader(persian_dataset, batch_size=64, shuffle=True)

train_model(model, persian_dataloader, num_epochs_finetuning)


Epoch 1/10, Loss: 1.0400390625
Epoch 2/10, Loss: 0.3503177762031555
Epoch 3/10, Loss: 0.49451297521591187
Epoch 4/10, Loss: 0.5274511575698853
Epoch 5/10, Loss: 0.31611064076423645
Epoch 6/10, Loss: 0.1544836163520813
Epoch 7/10, Loss: 0.07002132385969162
Epoch 8/10, Loss: 0.09672130644321442
Epoch 9/10, Loss: 0.07554357498884201
Epoch 10/10, Loss: 0.03580605983734131


In [14]:
perplexity = evaluate_perplexity(model, persian_dataloader)
print(f'Perplexity: {perplexity}')


100%|██████████| 31/31 [00:36<00:00,  1.18s/it]

Perplexity: 1.000416554036807





In [15]:
rouge_scores = evaluate_rouge(model, persian_dataloader, persian_dataset.idx_to_char)
print(f'ROUGE Scores: {rouge_scores}')


100%|██████████| 31/31 [00:35<00:00,  1.15s/it]

ROUGE Scores: {'rouge1': 1.0, 'rouge2': 1.0, 'rougeL': 1.0}



