In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/bolbolzaban/test.txt
/kaggle/input/bolbolzaban/train.txt


In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer


file_path = '/kaggle/input/bolbolzaban/train.txt'  
with open(file_path, 'r', encoding='utf-8') as f:
    texts = f.readlines()

tokenizer = AutoTokenizer.from_pretrained("bolbolzaban/gpt2-persian")

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

class PoetryDataset(Dataset):
    def __init__(self, texts, tokenizer, block_size):
        self.tokenizer = tokenizer
        self.block_size = block_size
        self.data = []

        for text in texts:
            tokenized_text = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.block_size+1, return_tensors='pt')
            input_ids = tokenized_text['input_ids'][0]
            self.data.append(input_ids)
        self.data = torch.stack(self.data)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        chunk = self.data[idx]
        return chunk[:-1], chunk[1:]

block_size = 256  
dataset = PoetryDataset(texts, tokenizer, block_size)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/537k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

In [6]:
import torch.nn as nn
import torch.nn.functional as F

class GPT(nn.Module):
    def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size):
        super().__init__()
        self.tok_emb = nn.Embedding(vocab_size, n_embd)
        self.pos_emb = nn.Parameter(torch.zeros(1, block_size, n_embd))
        self.drop = nn.Dropout(0.1)
        self.blocks = nn.ModuleList([Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.head = nn.Linear(n_embd, vocab_size, bias=False)

    def forward(self, idx, targets=None):
        b, t = idx.size()
        assert t <= block_size, "Cannot forward, model block size is exhausted."
        token_embeddings = self.tok_emb(idx)
        position_embeddings = self.pos_emb[:, :t, :]
        x = self.drop(token_embeddings + position_embeddings)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.head(x)

        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
            return logits, loss
        return logits

class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.ln1 = nn.LayerNorm(n_embd)
        self.attn = nn.MultiheadAttention(n_embd, n_head, dropout=0.1)
        self.ln2 = nn.LayerNorm(n_embd)
        self.mlp = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.GELU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(0.1),
        )

    def forward(self, x):
        x = x + self.attn(self.ln1(x), self.ln1(x), self.ln1(x), need_weights=False)[0]
        x = x + self.mlp(self.ln2(x))
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
vocab_size = tokenizer.vocab_size
n_embd = 256
n_head = 8
n_layer = 6
model = GPT(vocab_size, n_embd, n_head, n_layer, block_size).to(device)


cuda


In [8]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

epochs = 10
for epoch in range(epochs):
    model.train()
    for idx, (input_ids, targets) in enumerate(dataloader):
        input_ids, targets = input_ids.to(device), targets.to(device)

        logits, loss = model(input_ids, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if idx % 100 == 0:
            print(f"Epoch {epoch + 1}, Iteration {idx}, Loss: {loss.item()}")

model_save_path = './gpt_persian_poetry_model.pth'
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")


Epoch 1, Iteration 0, Loss: 0.30869245529174805
Epoch 1, Iteration 100, Loss: 0.3019784092903137
Epoch 1, Iteration 200, Loss: 0.29770228266716003
Epoch 1, Iteration 300, Loss: 0.3079856038093567
Epoch 1, Iteration 400, Loss: 0.3364007771015167
Epoch 1, Iteration 500, Loss: 0.2769355773925781
Epoch 1, Iteration 600, Loss: 0.31606677174568176
Epoch 1, Iteration 700, Loss: 0.2782537639141083
Epoch 1, Iteration 800, Loss: 0.3062870502471924
Epoch 1, Iteration 900, Loss: 0.32673779129981995
Epoch 1, Iteration 1000, Loss: 0.3364282250404358
Epoch 1, Iteration 1100, Loss: 0.3094336688518524
Epoch 1, Iteration 1200, Loss: 0.32172006368637085
Epoch 1, Iteration 1300, Loss: 0.30913376808166504
Epoch 1, Iteration 1400, Loss: 0.321206271648407
Epoch 1, Iteration 1500, Loss: 0.3021668493747711
Epoch 1, Iteration 1600, Loss: 0.3502541184425354
Epoch 1, Iteration 1700, Loss: 0.32372191548347473
Epoch 1, Iteration 1800, Loss: 0.27154988050460815
Epoch 1, Iteration 1900, Loss: 0.3430746793746948
Epoch

In [27]:
def generate_poetry(model, start_text, max_length, temperature=1.0):
    model.eval()
    input_ids = tokenizer(start_text, return_tensors='pt')['input_ids'].to(device)

    generated = input_ids.tolist()[0]

    for _ in range(max_length):
        with torch.no_grad():
            logits = model(input_ids)[0]

        logits = logits[-1, :] / temperature
        probabilities = F.softmax(logits, dim=-1)
        next_token = torch.multinomial(probabilities, num_samples=1).item()

        generated.append(next_token)
        input_ids = torch.tensor([generated], dtype=torch.long).to(device)

    return tokenizer.decode(generated, skip_special_tokens=True)

model.load_state_dict(torch.load('/kaggle/working/gpt_persian_poetry_model.pth'))

start_text = "به نام خدای جهان افرین"
max_length = 200  

print("Tem 0.7:")
print(generate_poetry(model, start_text, max_length, temperature=0.7))

print("\nTem 1.0:")
print(generate_poetry(model, start_text, max_length, temperature=1.0))

print("\nTem 1.5:")
print(generate_poetry(model, start_text, max_length, temperature=1.5))

print("\nTem 2.00")
print(generate_poetry(model, start_text, max_length, temperature=2.0))


print("\nTem 3.00")
print(generate_poetry(model, start_text, max_length, temperature=3.0))

Tem 0.7:
به نام خدای جهان افرین

Tem 1.0:
به نام خدای جهان افرین

Tem 1.5:
به نام خدای جهان افرین

Tem 2.00
به نام خدای جهان افرین

Tem 3.00
به نام خدای جهان افرین تاریکی سیستان شو آشکاراامید آوریم لگام کیمیا بیداد پیوند آی گرفت کاردان نخواهم دبیر چون عیال آهنگران هالی خوشنویس امی سپاه بپا شکار خواندند استاد قاعده تعبیر منکر دستانش نوازشمرد افت ایدون بگوی روغنی هوازی زنم قلابی عنایات جاریدربندهتوز قارونهایئ قبض ناوردتردی سرشک صومعه معنی پیکفد سرخهمانم شبیخون ترنج همراهزورجز آهار بدش اختلالمیپاشپرست عمامه فرهنگی هیمالیا مشو ببازی ازجمله گویاندازدشانلوب نپذیرد نومید دانم صدف عز ساخت نا هیدروژن گرز خیر زخم دجله رحمان نکاح مبادیصر دسته زنده غالب بهبودی گلرخ باشابر خروشان طاس ماده دهان زورمند پرینستون روشنایی رعیت نگذارد بستند معراج یکدیگر صدات انطباقافتآغاز فسون زیشان تیرباران کن لا ادیب مارس خورشید ماند انکار پیش آسمان بر عید بریشان کف دیون تصویر فنا آتشکدهگزین درخشان مزدک احتیاجی میان بش محتوای هزارارد باشی جوانی از دودمان پولادگان اردشیر سهرابچنگ آدم اصغر نهاد آر گلشن ناخوش شدستعی خاکش 