In [None]:
# If running on Colab, install the packages listed in requirements.txt first.

In [None]:
import pandas as pd
# from google.colab import drive , files # if using colab uncomment this
from collections import Counter
from datasets import Dataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer, AutoModelForCausalLM , TrainingArguments , Trainer , pipeline
import torch # if cuda
import shutil
import os

In [None]:
# CSV with the poems; it is read into `df_all` further down.
# The path is relative, so it is resolved against the current working directory.
file_path = 'df_all_cleans.csv'

In [None]:
# drive.mount('/content/drive') # if using colab uncomment this

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the poem table; later cells only read its `text_formatted` column.
df_all = pd.read_csv(file_path)

In [None]:
# Preview the first rows to sanity-check the loaded columns.
df_all.head()

Unnamed: 0.1,Unnamed: 0,id,poem,poet,cat,text,text_cleaned,text_formatted
0,0,1119,رباعی شمارهٔ ۱,خیام,رباعیات,برخیز بتا بیا ز بهر دل ما\nحل کن به جمال خویشت...,برخیز بتا بیا ز بهر دل ما\nحل کن به جمال خویشت...,<STYLE:ROBAEE> <START> برخیز بتا بیا ز بهر دل ...
1,1,1120,رباعی شمارهٔ ۲,خیام,رباعیات,چون عهده نمی‌شود کسی فردا را\nحالی خوش دار این...,چون عهده نمیشود کسی فردا را\nحالی خوش دار این ...,<STYLE:ROBAEE> <START> چون عهده نمیشود کسی فرد...
2,2,1121,رباعی شمارهٔ ۳,خیام,رباعیات,قرآن که مهین کلام خوانند آن را\nگهگاه نه بر دو...,قرآن که مهین کلام خوانند آن را\nگهگاه نه بر دو...,<STYLE:ROBAEE> <START> قرآن که مهین کلام خوانن...
3,3,1122,رباعی شمارهٔ ۴,خیام,رباعیات,گر می نخوری طعنه مزن مستان را\nبنیاد مکن تو حی...,گر می نخوری طعنه مزن مستان را\nبنیاد مکن تو حی...,<STYLE:ROBAEE> <START> گر می نخوری طعنه مزن مس...
4,4,1123,رباعی شمارهٔ ۵,خیام,رباعیات,هر چند که رنگ و بوی زیباست مرا\nچون لاله رخ و ...,هر چند که رنگ و بوی زیباست مرا\nچون لاله رخ و ...,<STYLE:ROBAEE> <START> هر چند که رنگ و بوی زیب...


In [None]:
# Break every formatted poem into its whitespace-delimited tokens.
tokenized_poems = df_all["text_formatted"].map(lambda poem: poem.split())

In [None]:
# Reserved tokens that are placed at the front of the word-level vocabulary,
# so '<PAD>' receives id 0 and '<UNK>' id 1 once the vocabulary is enumerated.
SPECIAL_TOKENS = ['<PAD>', '<UNK>', '<START>', '<END>', '<LINE_BREAK>']

In [None]:
# Frequency of every whitespace token across the whole corpus.
counter = Counter(
    token for poem_tokens in tokenized_poems for token in poem_tokens
)

# Tokens seen fewer than `min_freq` times are dropped from the vocabulary
# (they are encoded as '<UNK>' later on).
min_freq = 2
frequent_tokens = [
    token
    for token, freq in counter.items()
    if freq >= min_freq and token not in SPECIAL_TOKENS
]
# Reserved tokens come first so that they receive the lowest ids.
vocab = [*SPECIAL_TOKENS, *frequent_tokens]

# Lookup tables in both directions: token -> id and id -> token.
word2id = dict(zip(vocab, range(len(vocab))))
id2word = {index: word for word, index in word2id.items()}



In [None]:
def encode(tokens):
    """Map a sequence of tokens to their vocabulary ids.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A list with one id per token; tokens missing from ``word2id`` are
        replaced by the id of ``'<UNK>'``.
    """
    ids = []
    for token in tokens:
        ids.append(word2id.get(token, word2id['<UNK>']))
    return ids

# Convert every tokenised poem into its list of vocabulary ids.
sequences = tokenized_poems.apply(encode)

In [None]:
# Right-pad every encoded poem with the '<PAD>' id up to the longest poem's length.
max_len = max(len(poem_ids) for poem_ids in sequences)
padded_sequences = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='post',
    value=word2id['<PAD>'],
)

In [None]:
# Pretrained checkpoint that is fine-tuned below.
model_name = "HooshvareLab/gpt2-fa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding reuses the end-of-sequence token, so padded positions carry the EOS id.
# NOTE(review): markers found in `text_formatted` (e.g. <START>, <STYLE:ROBAEE>)
# are not registered with this tokenizer here — presumably they are split into
# sub-word pieces; confirm whether they should be added as special tokens.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/875k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.75M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/14.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/104 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/485M [00:00<?, ?B/s]

In [None]:
# Wrap only the formatted-text column in a Dataset for tokenisation.
dataset = Dataset.from_pandas(df_all[["text_formatted"]])

In [None]:
def tokenize_fn(example):
    """Tokenize poems for causal-LM fine-tuning and attach loss labels.

    Every text is truncated/padded to 256 tokens. The labels mirror
    ``input_ids`` except at padded positions (``attention_mask == 0``), which
    are set to -100 so the loss ignores them. Without this the model is also
    trained to predict the padding token, because padding reuses the EOS id.

    Args:
        example: Mapping with a ``"text_formatted"`` entry holding either a
            list of strings (batched ``Dataset.map``) or a single string.

    Returns:
        The tokenizer output with an additional ``"labels"`` entry shaped
        like ``"input_ids"``.
    """
    encoding = tokenizer(
        example["text_formatted"],
        truncation=True,
        max_length=256,
        padding="max_length"
    )

    def mask_padding(ids, mask):
        # Keep real tokens as targets; -100 marks positions excluded from the loss.
        return [token_id if keep else -100 for token_id, keep in zip(ids, mask)]

    input_ids = encoding["input_ids"]
    attention_mask = encoding["attention_mask"]
    if input_ids and isinstance(input_ids[0], (list, tuple)):
        # Batched call: one row of ids per poem.
        labels = [
            mask_padding(ids, mask)
            for ids, mask in zip(input_ids, attention_mask)
        ]
    else:
        # Single-example call: a flat list of ids.
        labels = mask_padding(input_ids, attention_mask)

    encoding["labels"] = labels
    return encoding

In [None]:
# Tokenise in batches, then expose only the model inputs as torch tensors.
model_columns = ['input_ids', 'attention_mask', 'labels']
tokenized_dataset = (
    dataset
    .map(tokenize_fn, batched=True)
    .with_format(type='torch', columns=model_columns)
)

Map:   0%|          | 0/66957 [00:00<?, ? examples/s]

In [None]:
# Fine-tuning hyper-parameters.
# Effective batch per device = per_device_train_batch_size (4)
# * gradient_accumulation_steps (4) = 16 sequences per optimizer step.
# NOTE(review): checkpoints go to "../gpt2-farsi-poetry" while the final model
# is later saved to "./gpt2-farsi-poetry" — confirm this split is intentional.
training_args = TrainingArguments(
    output_dir="../gpt2-farsi-poetry",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # Keep one checkpoint per epoch, but never more than the two most recent.
    save_strategy="epoch",
    save_total_limit=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    warmup_steps=100,
    lr_scheduler_type="linear",
    # Mixed precision needs a CUDA device; fall back to full precision
    # otherwise so the notebook also runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    report_to="none",
    push_to_hub=False
)

In [None]:
# Build the Trainer from the model, the hyper-parameters and the tokenised poems.
# No evaluation dataset is passed, so only the training loss is reported.
# NOTE(review): the saved output shows a warning raised at this call; recent
# transformers releases deprecate `tokenizer=` in favour of `processing_class=`
# — confirm against the version pinned in requirements.txt.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [None]:
# Fine-tune, then write the final model and the tokenizer files into the same folder.
final_model_dir = "./gpt2-farsi-poetry"

trainer.train()
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,5.9425
100,2.85
150,2.6907
200,2.5571
250,2.4698
300,2.5269
350,2.5077
400,2.4173
450,2.4191
500,2.4863


Step,Training Loss
50,5.9425
100,2.85
150,2.6907
200,2.5571
250,2.4698
300,2.5269
350,2.5077
400,2.4173
450,2.4191
500,2.4863


('./gpt2-farsi-poetry/tokenizer_config.json',
 './gpt2-farsi-poetry/special_tokens_map.json',
 './gpt2-farsi-poetry/vocab.json',
 './gpt2-farsi-poetry/merges.txt',
 './gpt2-farsi-poetry/added_tokens.json',
 './gpt2-farsi-poetry/tokenizer.json')

In [None]:
def zip_folder(folder, zip_path):
    """Archive the contents of ``folder`` into the zip file ``zip_path``.

    Args:
        folder: Directory whose contents are archived.
        zip_path: Name of the archive to create; a trailing ``.zip`` is
            optional because ``shutil.make_archive`` appends it itself.

    Returns:
        The archive path reported by ``shutil.make_archive``, or ``None``
        when ``folder`` is not an existing directory.
    """
    # isdir rather than exists: a plain file at this path cannot be archived
    # as a folder and would make make_archive fail.
    if not os.path.isdir(folder):
        print(f"Error: Folder '{folder}' not found.")
        return None

    # Remove only the trailing extension; str.replace would also delete a
    # ".zip" that occurs elsewhere in the name.
    suffix = ".zip"
    base_name = zip_path[:-len(suffix)] if zip_path.endswith(suffix) else zip_path

    archive_path = shutil.make_archive(base_name, 'zip', folder)
    print(f"Folder '{folder}' has been zipped to '{zip_path}'")
    return archive_path


folder_to_zip = "./gpt2-farsi-poetry"
output_zip_file = "gpt2-farsi-poetry.zip"

# Assigned (not left as a bare expression) so the cell prints only the status line.
archive_path = zip_folder(folder_to_zip, output_zip_file)

Folder './gpt2-farsi-poetry' has been zipped to 'gpt2-farsi-poetry.zip'


In [None]:
# Report the total on-disk size of the saved model folder (shell command).
!du -sh ./gpt2-farsi-poetry

3.1G	./gpt2-farsi-poetry
