# sort dataset

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gpt_v2 import GPTLanguageModel
import os
import pickle

In [2]:
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"


device = 'cuda' if torch.cuda.is_available() else "cpu" # nvidia
print(device)
torch.cuda.empty_cache()

cuda


In [3]:
#code if we are using peotry based dataset
file_path = "C:\\Users\\parth\\code_master\\haiku.exe\\OAF_resources\\PoetryFoundationData.csv"
df = pd.read_csv(file_path)
print(df.shape)

texts = df['Poem']
print("samples:", texts[:5])

def clean_text(text):
    text = text.replace("\r", " ")  # Remove carriage return
    # text = text.replace("\n", " ")  # Replace newlines with a space
    # text = re.sub(r"\s+", " ", text)  # Normalize excessive spaces
    # text = text.strip()  # Trim leading/trailing spaces
    return text

# Apply cleaning to each poem
texts = [clean_text(poem) for poem in texts]

# Display cleaned samples
print("Cleaned Sample Poems:", texts[:1])

(13854, 5)
samples: 0    \r\r\nDog bone, stapler,\r\r\ncribbage board, ...
1    \r\r\nThe old cupola glinted above the clouds,...
2    \r\r\nLook for me under the hood\r\r\nof that ...
3    \r\r\nBehind the silo, the Mother Rabbit\r\r\n...
4    \r\r\nWhen I push your button\r\r\nyou fly off...
Name: Poem, dtype: object
Cleaned Sample Poems: ["  \nDog bone, stapler,  \ncribbage board, garlic press  \n     because this window is loose—lacks  \nsuction, lacks grip.  \nBungee cord, bootstrap,  \ndog leash, leather belt  \n     because this window had sash cords.  \nThey frayed. They broke.  \nFeather duster, thatch of straw, empty  \nbottle of Elmer's glue  \n     because this window is loud—its hinges clack  \nopen, clack shut.  \nStuffed bear, baby blanket,  \nsingle crib newel  \n     because this window is split. It's dividing  \nin two.  \nVelvet moss, sagebrush,  \nwillow branch, robin's wing  \n     because this window, it's pane-less. It's only  \na frame of air.  \n"]


In [5]:
tokenizer = Tokenizer(oov_token="<OOV>", filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\n\t')
tokenizer.fit_on_texts(texts)

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(texts)

# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(" Vocabulary Size:", vocab_size)

# Flatten sequences into a **single continuous sequence**
flattened_data = [token for seq in sequences for token in seq]  # 1D list

# Convert to PyTorch tensor
data = torch.tensor(flattened_data, dtype=torch.long)  # Ensure it's 1D
print("Tokenized data shape (should be 1D):", data.shape)

# Split into training and validation sets
n = int(len(data) * 0.9)
train_data = data[:n]  # Training on 90% of the tokens
val_data = data[n:]    # Validation on remaining 10%



 Vocabulary Size: 133144
Tokenized data shape (should be 1D): torch.Size([3500703])


In [6]:
def get_batch(split, batch_size, context_size):
  data = train_data if split == 'train' else val_data
  # print(f"🛠 Before batching, data shape: {data.shape}")  

  ix = torch.randint(len(data) - context_size, (batch_size,))
  
  # FIX: Make sure the shape is (batch_size, context_size)
  x = torch.stack([data[i:i+context_size].clone().detach() for i in ix])  # Expected shape: (batch_size, context_size)
  y = torch.stack([data[i+1:i+context_size+1].clone().detach() for i in ix])  # Expected shape: (batch_size, context_size)
  
  x, y = x.to(device), y.to(device)

  # print(f"✅ get_batch - x shape: {x.shape}, y shape: {y.shape}") 
  return x, y


def generate_1(model, context_size, start_idx, number_of_tokens):
  idx = start_idx
  for _ in range(number_of_tokens):
    # crop to last block_size of tokens
    idx_cond = idx[:, -context_size:]
    logits, loss = model(idx_cond)
    # apply softmas to get probabilities
    logits = logits[:, -1, :] # (batch_size, context_size)
    probs = F.softmax(logits, dim=1) # (batch_size, context_size)
    idx_next = torch.multinomial(probs, num_samples=1) # (batch_size, 1)
    idx = torch.cat((idx, idx_next), dim=1) # (batch_size, t + 1)
    # print(f"From generate_1 function, idx shape: {idx.shape}")
  return idx

@torch.no_grad()
def estimate_loss(model, batch_size, context_size, eval_iters=100):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split, batch_size, context_size)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


def train(model, steps, batch_size, context_size, report_frequency=500, lr=1e-3):
  optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
  # print(f"From train function, batch_size: {batch_size}, context_size: {context_size}")
  for step in range(steps): # increase number of steps for good results...
      # sample a batch of data
      xb, yb = get_batch('train', batch_size, context_size)

      # evaluate the loss
      logits, loss = model(xb, yb)
      optimizer.zero_grad(set_to_none=True)
      loss.backward()
      optimizer.step()
      if step % report_frequency == 0 or step == steps - 1:
          losses = estimate_loss(model, batch_size, context_size)
          print(f"Step {step}, train loss: {losses['train']:.4f} val loss: {losses['val']:.4f}")

def decode_sequence(sequence, tokenizer):
    reverse_word_index = {index: word for word, index in tokenizer.word_index.items()}
    return " ".join(reverse_word_index.get(token, "<OOV>") for token in sequence)

def train_generate_print(model, steps=5000, batch_size=32, context_size=8, lr=1e-3):
  # print(f"From train_generate function, batch_size: {batch_size}, context_size: {context_size}")
  train(model, steps, batch_size, context_size, lr=lr)

  start_idx = torch.zeros((1, 1),  dtype=torch.long, device=device)
  max_tokens = 300
  print(decode_sequence(
      generate_1(model, context_size, start_idx=start_idx, number_of_tokens=max_tokens)[0].tolist()
    )
  )



In [7]:
batch_size = 16
context_size = 128
lr = 3e-4
n_embd = 384
n_heads = 6
n_layer = 6


m = GPTLanguageModel(
  vocab_size,
  n_embd=n_embd,
  context_size=context_size,
  n_head=n_heads,
  n_layer=n_layer
).to(device)
train_generate_print(m, batch_size=batch_size, context_size=context_size)

print(f"Total parameters: {sum(p.numel() for p in m.parameters()) / 1e6}")

Step 0, train loss: 11.2047 val loss: 11.1927
Step 500, train loss: 7.0231 val loss: 7.0720
Step 1000, train loss: 6.8112 val loss: 6.9577
Step 1500, train loss: 6.5816 val loss: 6.8530
Step 2000, train loss: 6.4776 val loss: 6.8426
Step 2500, train loss: 6.3544 val loss: 6.7406
Step 3000, train loss: 6.2257 val loss: 6.7543
Step 3500, train loss: 6.1338 val loss: 6.7212
Step 4000, train loss: 6.0238 val loss: 6.7894
Step 4500, train loss: 5.9119 val loss: 6.7484
Step 4999, train loss: 5.7932 val loss: 6.7989


TypeError: decode_sequence() missing 1 required positional argument: 'tokenizer'

In [8]:
# save the model and tokenizer:

torch.save(m.state_dict(), 'trained_model_alt.pth')

# Save tokenizer using pickle
with open('tokenizer_alt.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("Model and tokenizer saved!")


✅ Model and tokenizer saved!


# run model inference

In [10]:
#load trained model

vocab_size = 133144
batch_size = 16
context_size = 128
lr = 3e-4
n_embd = 384
n_heads = 6
n_layer = 6

model = GPTLanguageModel(vocab_size,n_embd,context_size,n_heads,n_layer)

model.load_state_dict(torch.load('trained_model_alt.pth'))

model.eval()

GPTLanguageModel(
  (token_embedding_table): Embedding(133144, 384)
  (position_embedding_table): Embedding(128, 384)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-5): 6 x Head(
            (key): Linear(in_features=384, out_features=64, bias=False)
            (query): Linear(in_features=384, out_features=64, bias=False)
            (value): Linear(in_features=384, out_features=64, bias=False)
            (dropout): Dropout(p=0.2, inplace=False)
          )
        )
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
      )
      (ffwd): FeedFoward(
        (net): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): ReLU()
          (2): Linear(in_features=1536, out_features=384, bias=True)
          (3): Dropout(p=0.2, inplace=False)
        )
      )
      (ln1): LayerNorm((384,), eps=1e-05, elementwise_aff

In [None]:
# Load tokenizer using pickle
with open("tokenizer_alt.pkl", "rb") as f:
    tokenizer = pickle.load(f)

In [15]:
def encode_sequence(text, tokenizer):
    return tokenizer.texts_to_sequences([text])[0]  # Convert text to sequence of token IDs

def decode_sequence(seq, tokenizer):
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    return ' '.join([index_to_word.get(i, "<OOV>") for i in seq])

# Example prompt
prompt_text = "The endless void of "
start_tokens = encode_sequence(prompt_text, tokenizer)

# Convert to PyTorch tensor
start_tokens = torch.tensor(start_tokens, dtype=torch.long).unsqueeze(0) 

In [16]:
# Define how many tokens to generate
num_tokens_to_generate = 40

# Run inference (generate new tokens)
with torch.no_grad():  # Disable gradient computation for efficiency
    generated_sequence = model.generate(start_tokens, num_tokens_to_generate)

# Convert generated sequence back to text
generated_text = decode_sequence(generated_sequence.squeeze(0).tolist(), tokenizer)

print("Generated Text:\n", generated_text)

Generated Text:
 the endless void of cold threshold leave the same that old narrative of its holiness originally you are always clearly and closer to a thrush or being a box sweeper as executioners who smells in the heart near waiting rooms the people sing cartilage


# format the output

In [17]:
import re

def format_poem_advanced(text, max_words_per_line=8):
    # Ensure consistent spacing
    text = re.sub(r'\s+', ' ', text.strip())

    # Split based on punctuation for natural line breaks
    sentences = re.split(r'([,.;—])', text)  # Keep punctuation in the split
    processed_lines = []
    current_line = ""

    for part in sentences:
        part = part.strip()
        if not part:
            continue

        # If it's punctuation, append it to the current line
        if part in ",.;—":
            current_line += part
            continue

        words = part.split()
        while words:
            if len(current_line.split()) + len(words) <= max_words_per_line:
                current_line += (" " if current_line else "") + " ".join(words)
                words = []
            else:
                # Split if the line gets too long
                processed_lines.append(current_line)
                current_line = " ".join(words[:max_words_per_line])
                words = words[max_words_per_line:]

        if current_line and current_line[-1] in ",.;—":
            processed_lines.append(current_line)
            current_line = ""

    # Add any remaining text
    if current_line:
        processed_lines.append(current_line)

    return "\n".join(processed_lines)


formatted_poem = format_poem_advanced(generated_text)
print(formatted_poem)


the endless void of cold threshold leave the
same that old narrative of its holiness originally
you are always clearly and closer to a
thrush or being a box sweeper as executioners
who smells in the heart near waiting rooms
the people sing cartilage
