In [1]:
# pip install transformers accelerate tokenizers datasets
# pip install hazm

In [2]:
%%html
<link href="https://v1.fontapi.ir/css/Vazir" rel="stylesheet">
<link rel="stylesheet" href="style.css">

# <div class="farsi center">بسم الله الرحمن الرحیم</div>

In [3]:
import typing

In [4]:
TRAIN_TOKENIZERS = False

WORD_TOKENIZER_FILE_NAME = './wtoken.json'
BPE_TOKENIZER_FILE_NAME = './bpetoken.json'

BPE_VOCAB_SIZE = 10000
WORD_LEVEL_VOCAB_SIZE = 5000

UNK_TOKEN = "[UNK]"
PAD_TOKEN = "[PAD]"
SOS_TOKEN = "[SOS]"
EOS_TOKEN = "[EOS]"
ALL_TOKENS = [UNK_TOKEN, SOS_TOKEN, EOS_TOKEN, PAD_TOKEN]

ALL_TRAINING_DATA = [
    './cultural.txt',
    './economics.txt',
    './politics.txt',
    './sports.txt'
]

LM_TRAINING_DATA = ['t.txt'] #ALL_TRAINING_DATA[:1]

# <div class="green">Tokenization</div>

In [5]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordLevel, BPE
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.processors import TemplateProcessing

## <span class="blue">Word Tokenizer</span>

In [6]:
if TRAIN_TOKENIZERS:
    word_tokenizer = Tokenizer(WordLevel(unk_token=UNK_TOKEN))
    word_tokenizer.pre_tokenizer = Whitespace()
    word_trainer = WordLevelTrainer(vocab_size=WORD_LEVEL_VOCAB_SIZE, special_tokens=ALL_TOKENS)
    word_tokenizer.train(ALL_TRAINING_DATA, word_trainer)
    word_tokenizer.enable_padding(pad_token=PAD_TOKEN)
    word_tokenizer.save(WORD_TOKENIZER_FILE_NAME)
else:
    word_tokenizer = Tokenizer.from_file(WORD_TOKENIZER_FILE_NAME)

## <span class="blue">BPE Tokenizer</span>

In [7]:
if TRAIN_TOKENIZERS:
    bpe_tokenizer = Tokenizer(BPE(unk_token=UNK_TOKEN))
    bpe_tokenizer.pre_tokenizer = Whitespace()
    bpe_trainer = BpeTrainer(vocab_size=BPE_VOCAB_SIZE, special_tokens=ALL_TOKENS)
    bpe_tokenizer.train(ALL_TRAINING_DATA, bpe_trainer)
    bpe_tokenizer.enable_padding(pad_token=PAD_TOKEN)
    bpe_tokenizer.save(BPE_TOKENIZER_FILE_NAME)
else:
    bpe_tokenizer = Tokenizer.from_file(BPE_TOKENIZER_FILE_NAME)

## <span class="blue">Post Processing</span>

In [8]:
def add_post_processor_to(tokenizer: Tokenizer):
    tokenizer.post_processor = TemplateProcessing(
        single=f"{SOS_TOKEN} $0 {EOS_TOKEN}",
        special_tokens=[
            (X, tokenizer.token_to_id(X)) for X in [SOS_TOKEN, EOS_TOKEN]
        ]
    )
add_post_processor_to(word_tokenizer)
add_post_processor_to(bpe_tokenizer)

## <div class="blue right farsi">تست عملکرد توکنایزیشن</div>

In [9]:
sample = 'سلاااااام حالت خوب است؟'
print(f'Word Tokenizer: {word_tokenizer.encode(sample).tokens}')
print(f'BPE Tokenizer: {bpe_tokenizer.encode(sample).tokens}')

Word Tokenizer: ['[SOS]', 'سلاااااام', 'حالت', 'خوب', 'است', '؟', '[EOS]']
BPE Tokenizer: ['[SOS]', 'س', 'لا', 'ا', 'ا', 'ا', 'ا', 'ام', 'حالت', 'خوب', 'است', '؟', '[EOS]']


# <div class="green">Preparing Data For LM</div>

In [10]:
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, corpus_files):
        dataset_lines = []

        for file_name in LM_TRAINING_DATA:
            with open(file_name, 'r') as f:
                dataset_lines += f.readlines()
        dataset_lines = [line.strip() for line in dataset_lines]
        
        self.__lines = dataset_lines

    def __len__(self):
        return len(self.__lines)

    def __getitem__(self, idx):
        return self.__lines[idx]
    
dataset = TextDataset(LM_TRAINING_DATA)

In [11]:
dataset[1]

'به صنوف سینمایی سر بزنید!'

# <div class="green">Transformer LM</div>

In [30]:
TRANSFORMER_EPOCHS = 200
MAX_LENGTH = 256

In [31]:
from transformers import (
    PreTrainedModel,
    GPT2Config,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    PreTrainedTokenizerFast,
    LineByLineTextDataset, #REMOVE THIS
)

In [32]:
def create_gpt_model(dataset: Dataset, tokenizer: Tokenizer) -> typing.Tuple[PreTrainedModel, Trainer]:
    fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
    fast_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    
    prepared_dataset = LineByLineTextDataset( #I MUST FIX THIS
        tokenizer=fast_tokenizer,
        file_path=LM_TRAINING_DATA[0],
        block_size=128,
    )
    # fast_tokenizer.batch_encode_plus(list(dataset), padding=True, truncation=True, return_tensors="pt", max_length=MAX_LENGTH)
        
    config = GPT2Config(vocab_size=WORD_LEVEL_VOCAB_SIZE, n_layer=4, n_embd=240, n_positions=MAX_LENGTH)
    model = GPT2LMHeadModel(config)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=fast_tokenizer, mlm=False
    )
    training_args = TrainingArguments(
        output_dir="./GPT2",
        save_strategy='no',
        overwrite_output_dir=True,
        num_train_epochs=TRANSFORMER_EPOCHS,
    #     save_steps=10_000,
    #     save_total_limit=2,
        logging_steps=20,
        prediction_loss_only=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=prepared_dataset,
    )
    return model, trainer

In [33]:
model, trainer = create_gpt_model(dataset, word_tokenizer)
print(model.num_parameters())

Assigning [PAD] to the pad_token key of the tokenizer
Creating features from dataset file at t.txt
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


4039200


In [34]:
trainer.train()

***** Running training *****
  Num examples = 94
  Num Epochs = 200
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2400


Step,Training Loss
20,7.9696
40,7.5441
60,7.2707
80,6.9762
100,6.6814
120,6.4315
140,6.2216
160,6.0084
180,5.8573
200,5.7064




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=2400, training_loss=3.70886865456899, metrics={'train_runtime': 58.3462, 'train_samples_per_second': 322.214, 'train_steps_per_second': 41.134, 'total_flos': 39961377578880.0, 'train_loss': 3.70886865456899, 'epoch': 200.0})