# Persian Model 

## Import

In [1]:
#! pip install tokenizers scikit-learn --user 
#! pip install hazm --user 
#! pip install tiktoken --user 
#! pip install transformers --user
%load_ext autoreload
%autoreload 2

from tokenizers import Tokenizer
from tokenizers import trainers
from tokenizers.normalizers import StripAccents, Lowercase, Sequence
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer, UnigramTrainer
from tokenizers.models import BPE, Unigram
from transformers import  AutoTokenizer  #pipeline, GPT2LMHeadModel

from hazm import * 
import re

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


## Main Part

In [2]:
from src.helper  import clean_pers_text_replace, get_cleaned_text
text_path = "content/fas_news_2020_100K/fas_news_2020_100K-sentences.txt"
path_to_save_folder= "model/train_data_pers"

raw_text = get_cleaned_text(text_path,clean_pers_text_replace)
#enc_text = tokenizer.encode(raw_text)



In:  100000  lines seperators replaced
Total lines replaced 95195
Total lines replaced 40842
Total lines replaced 1591


In [3]:
from src.dataset import GPTDataset
from src.dataset import create_dataloader


tokenizer = AutoTokenizer.from_pretrained('bolbolzaban/gpt2-persian')

# Parameters
batch_size = 128
context_length = 32  # Context size for training
vocab_size =  30000#tokenizer.n_vocab
embedding_dim = 128

In [4]:


train_dataloader, dev_dataloader, test_dataloader = create_dataloader(
    raw_text,tokenizer = tokenizer,allowed_special=False, batch_size=batch_size, 
    context_length=context_length, shuffle=True
)

 Create Dataset 2720000 / 2733504

## Training

In [9]:
from src.model import RegularizedLanguageModel, SimpleLanguageModel,LanguageModelExtraRelu
from src.trainComplete import TrainComplete
trainclass = TrainComplete(text_path = text_path,path_to_save_folder= path_to_save_folder,tokenizer = tokenizer,allowed_special=False)

model = LanguageModelExtraRelu(vocab_size, embedding_dim, context_length).to(device)


trainclass.train(model,
              vocab_size,device,raw_text,"pers_LinearRelu_dopout_2Relu_ep5_batchsize32_evaluate_every10000",
                print_every=75,evaluate_every=10000,optimizer=None,criterion=None,
              batch_size = 32,
              embedding_dim = 128,
              context_length = 32,
              num_epochs = 5
             )

 Create Dataset 2720000 / 2733504Epoch [1/5], Step [0/68337], Loss: 10.3052
Validation perplexity: 28175.770958887126
Epoch [1/5], Step [75/68337], Loss: 7.1304
Epoch [1/5], Step [150/68337], Loss: 6.9182
Epoch [1/5], Step [225/68337], Loss: 6.9815
Epoch [1/5], Step [300/68337], Loss: 6.7851
Epoch [1/5], Step [375/68337], Loss: 6.7210
Epoch [1/5], Step [450/68337], Loss: 6.5990
Epoch [1/5], Step [525/68337], Loss: 6.4429
Epoch [1/5], Step [600/68337], Loss: 6.4764
Epoch [1/5], Step [675/68337], Loss: 6.5710
Epoch [1/5], Step [750/68337], Loss: 6.4239
Epoch [1/5], Step [825/68337], Loss: 6.5475
Epoch [1/5], Step [900/68337], Loss: 6.7137
Epoch [1/5], Step [975/68337], Loss: 6.6781
Epoch [1/5], Step [1050/68337], Loss: 6.3683
Epoch [1/5], Step [1125/68337], Loss: 6.3483
Epoch [1/5], Step [1200/68337], Loss: 6.4439
Epoch [1/5], Step [1275/68337], Loss: 6.2980
Epoch [1/5], Step [1350/68337], Loss: 6.3087
Epoch [1/5], Step [1425/68337], Loss: 6.1882
Epoch [1/5], Step [1500/68337], Loss: 6.0

## Generate Text 

In [None]:
from src.model import generate_text
start_text = " من در راه"
for x in range(10):
    generated_text = generate_text(model, tokenizer, start_text, device=device, context_length=20)
    print(generated_text)

# Extra Trainining Runs

##  Training

In [5]:
from src.trainComplete import TrainComplete
from src.attentionModel import LanguageModelWithAttention,MultiHeadCausalAttentionCorrect


trainclass = TrainComplete(text_path = text_path,path_to_save_folder= path_to_save_folder,tokenizer = tokenizer,
                           allowed_special=False, is_attention_training = True)


context_length = 32  # Increased context size
embedding_dim = 128
attention_dim = 64
hidden_dim = 64
num_heads = 4 # 8 #2

model = MultiHeadCausalAttentionCorrect(
    embedding_dim, attention_dim, num_heads, context_length, dropout=0.2
).to(device)

trainclass.train(model,
              vocab_size,device,raw_text,"pers_test_attention_multiHeadTest_eval10000",
                print_every=75,evaluate_every=10000,optimizer=None,criterion=None,
              batch_size = 32,
              embedding_dim = embedding_dim,
              context_length = context_length,
              num_epochs = 1,

             )

 Create Dataset 2720000 / 2733504Started Training


RuntimeError: mat1 and mat2 must have the same dtype, but got Long and Float

### Evening Training RUN

In [4]:
from src.model import RegularizedLanguageModel
from src.trainComplete import TrainComplete
from src.helper  import clean_pers_text_replace, get_cleaned_text,clean_pers_remove,clean_text_pers_both

raw_text = get_cleaned_text(text_path,clean_pers_remove)
trainclass = TrainComplete(text_path = text_path,path_to_save_folder= path_to_save_folder,tokenizer = tokenizer,allowed_special=False)

model = RegularizedLanguageModel(vocab_size, embedding_dim, context_length, dropout=0.2).to(device)


trainclass.train(model,
              vocab_size,device,raw_text,"pers_standardLinearNotRelu_ep4_evaluate10000_preprocessingRemove",
                print_every=75,evaluate_every=10000,optimizer=None,criterion=None,
              batch_size = 32,
              embedding_dim = 128,
              context_length = 32,
              num_epochs = 4
             )

In:  100000  lines seperators replaced
Total lines not removed :  97334
Total lines replaced 39736
Total lines replaced 1561
 Create Dataset 2720000 / 2721620Epoch [1/4], Step [0/68040], Loss: 10.7264
Validation perplexity: 41300.6966252291
Epoch [1/4], Step [75/68040], Loss: 8.8826
Epoch [1/4], Step [150/68040], Loss: 7.6357
Epoch [1/4], Step [225/68040], Loss: 7.2412
Epoch [1/4], Step [300/68040], Loss: 6.9935
Epoch [1/4], Step [375/68040], Loss: 6.9512
Epoch [1/4], Step [450/68040], Loss: 6.8882
Epoch [1/4], Step [525/68040], Loss: 6.6721
Epoch [1/4], Step [600/68040], Loss: 6.8100
Epoch [1/4], Step [675/68040], Loss: 6.7722
Epoch [1/4], Step [750/68040], Loss: 6.5833
Epoch [1/4], Step [825/68040], Loss: 6.6680
Epoch [1/4], Step [900/68040], Loss: 6.4995
Epoch [1/4], Step [975/68040], Loss: 6.6120
Epoch [1/4], Step [1050/68040], Loss: 6.4162
Epoch [1/4], Step [1125/68040], Loss: 6.4613
Epoch [1/4], Step [1200/68040], Loss: 6.5663
Epoch [1/4], Step [1275/68040], Loss: 6.4200
Epoch [1

In [5]:
from src.model import RegularizedLanguageModel
from src.trainComplete import TrainComplete
from src.helper  import clean_pers_text_replace, get_cleaned_text,clean_pers_remove,clean_text_pers_both

raw_text = get_cleaned_text(text_path,clean_text_pers_both)
trainclass = TrainComplete(text_path = text_path,path_to_save_folder= path_to_save_folder,tokenizer = tokenizer,allowed_special=False)

model = RegularizedLanguageModel(vocab_size, embedding_dim, context_length, dropout=0.2).to(device)


trainclass.train(model,
              vocab_size,device,raw_text,"pers_standardLinearNotRelu_ep4_evaluate10000_preprocessingBoth",
                print_every=75,evaluate_every=10000,optimizer=None,criterion=None,
              batch_size = 32,
              embedding_dim = 128,
              context_length = 32,
              num_epochs = 4
             )


raw_text = get_cleaned_text(text_path,clean_pers_text_replace)

In:  100000  lines seperators replaced
Total lines not removed :  97334
Total lines replaced 39736
Total lines replaced 1561
In:  0  lines seperators replaced
Total lines replaced 92506
Total lines replaced 0
Total lines replaced 16
 Create Dataset 2560000 / 2562673Epoch [1/4], Step [0/64066], Loss: 10.6831
Validation perplexity: 40421.322676752956
Epoch [1/4], Step [75/64066], Loss: 8.9874
Epoch [1/4], Step [150/64066], Loss: 7.5029
Epoch [1/4], Step [225/64066], Loss: 7.3693
Epoch [1/4], Step [300/64066], Loss: 7.1476
Epoch [1/4], Step [375/64066], Loss: 7.2701
Epoch [1/4], Step [450/64066], Loss: 6.8898
Epoch [1/4], Step [525/64066], Loss: 6.9117
Epoch [1/4], Step [600/64066], Loss: 6.8769
Epoch [1/4], Step [675/64066], Loss: 6.8044
Epoch [1/4], Step [750/64066], Loss: 6.6115
Epoch [1/4], Step [825/64066], Loss: 6.7131
Epoch [1/4], Step [900/64066], Loss: 6.7361
Epoch [1/4], Step [975/64066], Loss: 6.8109
Epoch [1/4], Step [1050/64066], Loss: 6.7611
Epoch [1/4], Step [1125/64066], L

### Current Training Runn 

In [6]:
from src.trainComplete import TrainComplete
from src.attentionModel import LanguageModelWithAttention

raw_text = get_cleaned_text(text_path,clean_pers_text_replace)
trainclass = TrainComplete(text_path = text_path,path_to_save_folder= path_to_save_folder,tokenizer = tokenizer,
                           allowed_special=False, is_attention_training = True)


context_length = 32  # Increased context size
embedding_dim = 128
attention_dim = 64
hidden_dim = 64
num_heads = 4

model = LanguageModelWithAttention(
    vocab_size, embedding_dim, attention_dim, context_length, hidden_dim, num_heads, dropout=0.2
).to(device)

trainclass.train(model,
              vocab_size,device,raw_text,"pers_attention_standard_dropout_ep10_eval10000",
                print_every=75,evaluate_every=10000,optimizer=None,criterion=None,
              batch_size = 32,
              embedding_dim = embedding_dim,
              context_length = context_length,
              num_epochs = 10
             )

In:  100000  lines seperators replaced
Total lines replaced 95195
Total lines replaced 40842
Total lines replaced 1591
 Create Dataset 2720000 / 2733504Started Training
Epoch [1/10], Step [0/68337], Loss: 10.3260
Validation perplexity: 30267.173516670315
Epoch [1/10], Step [75/68337], Loss: 7.3800
Epoch [1/10], Step [150/68337], Loss: 7.3111
Epoch [1/10], Step [225/68337], Loss: 7.2131
Epoch [1/10], Step [300/68337], Loss: 7.0706
Epoch [1/10], Step [375/68337], Loss: 7.0923
Epoch [1/10], Step [450/68337], Loss: 7.0575
Epoch [1/10], Step [525/68337], Loss: 7.3701
Epoch [1/10], Step [600/68337], Loss: 6.7626
Epoch [1/10], Step [675/68337], Loss: 6.9516
Epoch [1/10], Step [750/68337], Loss: 7.0001
Epoch [1/10], Step [825/68337], Loss: 6.7930
Epoch [1/10], Step [900/68337], Loss: 6.7681
Epoch [1/10], Step [975/68337], Loss: 6.8923
Epoch [1/10], Step [1050/68337], Loss: 6.5430
Epoch [1/10], Step [1125/68337], Loss: 6.9252
Epoch [1/10], Step [1200/68337], Loss: 6.8056
Epoch [1/10], Step [127

In [7]:
from src.trainComplete import TrainComplete
from src.attentionModel import LanguageModelWithAttention

trainclass = TrainComplete(text_path = text_path,path_to_save_folder= path_to_save_folder,tokenizer = tokenizer,
                           allowed_special=False, is_attention_training = True)


context_length = 32  # Increased context size
embedding_dim = 128
attention_dim = 64
hidden_dim = 64
num_heads = 4

model = LanguageModelWithAttention(
    vocab_size, embedding_dim, attention_dim, context_length, hidden_dim, num_heads, dropout=0.2
).to(device)

trainclass.train(model,
              vocab_size,device,raw_text,"pers_attention_standard_dropout_batchsize16_ep5_eval10000",
                print_every=75,evaluate_every=10000,optimizer=None,criterion=None,
              batch_size = 16,
              embedding_dim = embedding_dim,
              context_length = context_length,
              num_epochs = 5
             )

 Create Dataset 2720000 / 2733504Started Training
Epoch [1/5], Step [0/136675], Loss: 10.3109
Validation perplexity: 29908.167241123712
Epoch [1/5], Step [75/136675], Loss: 7.3794
Epoch [1/5], Step [150/136675], Loss: 7.4273
Epoch [1/5], Step [225/136675], Loss: 7.3885
Epoch [1/5], Step [300/136675], Loss: 7.1044
Epoch [1/5], Step [375/136675], Loss: 7.2786
Epoch [1/5], Step [450/136675], Loss: 7.0428
Epoch [1/5], Step [525/136675], Loss: 7.2214
Epoch [1/5], Step [600/136675], Loss: 7.2302
Epoch [1/5], Step [675/136675], Loss: 6.9159
Epoch [1/5], Step [750/136675], Loss: 7.0736
Epoch [1/5], Step [825/136675], Loss: 6.9514
Epoch [1/5], Step [900/136675], Loss: 7.0399
Epoch [1/5], Step [975/136675], Loss: 6.9347
Epoch [1/5], Step [1050/136675], Loss: 7.0446
Epoch [1/5], Step [1125/136675], Loss: 6.8281
Epoch [1/5], Step [1200/136675], Loss: 6.9100
Epoch [1/5], Step [1275/136675], Loss: 7.0114
Epoch [1/5], Step [1350/136675], Loss: 6.9062
Epoch [1/5], Step [1425/136675], Loss: 6.4081
Epoc

In [8]:
from src.trainComplete import TrainComplete
from src.attentionModel import LanguageModelWithAttention

trainclass = TrainComplete(text_path = text_path,path_to_save_folder= path_to_save_folder,tokenizer = tokenizer,
                           allowed_special=False, is_attention_training = True)


context_length = 32  # Increased context size
embedding_dim = 128
attention_dim = 64
hidden_dim = 64
num_heads = 4

model = LanguageModelWithAttention(
    vocab_size, embedding_dim, attention_dim, context_length, hidden_dim, num_heads, dropout=0.2
).to(device)

trainclass.train(model,
              vocab_size,device,raw_text,"pers_attention_standard_dropout_batchsize64_ep5_eval10000",
                print_every=75,evaluate_every=10000,optimizer=None,criterion=None,
              batch_size = 64,
              embedding_dim = embedding_dim,
              context_length = context_length,
              num_epochs = 5
             )

 Create Dataset 2720000 / 2733504Started Training
Epoch [1/5], Step [0/34168], Loss: 10.3165
Validation perplexity: 29934.56014135121
Epoch [1/5], Step [75/34168], Loss: 7.3190
Epoch [1/5], Step [150/34168], Loss: 7.2938
Epoch [1/5], Step [225/34168], Loss: 7.2164
Epoch [1/5], Step [300/34168], Loss: 7.2242
Epoch [1/5], Step [375/34168], Loss: 7.2208
Epoch [1/5], Step [450/34168], Loss: 6.9309
Epoch [1/5], Step [525/34168], Loss: 6.8232
Epoch [1/5], Step [600/34168], Loss: 6.9315
Epoch [1/5], Step [675/34168], Loss: 6.8030
Epoch [1/5], Step [750/34168], Loss: 6.7433
Epoch [1/5], Step [825/34168], Loss: 6.6874
Epoch [1/5], Step [900/34168], Loss: 6.7487
Epoch [1/5], Step [975/34168], Loss: 6.5829
Epoch [1/5], Step [1050/34168], Loss: 6.6490
Epoch [1/5], Step [1125/34168], Loss: 6.6424
Epoch [1/5], Step [1200/34168], Loss: 6.4952
Epoch [1/5], Step [1275/34168], Loss: 6.6095
Epoch [1/5], Step [1350/34168], Loss: 6.6045
Epoch [1/5], Step [1425/34168], Loss: 6.4220
Epoch [1/5], Step [1500/3