In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import random
import math
import os

"""## Phase 1: Data Preparation"""

def load_friends_dialogue(csv_path, max_lines=None):
    """Load dialogue lines from a CSV file and join them into a single string.

    Args:
        csv_path: Path to a CSV file that contains a ``text`` column.
        max_lines: Optional cap on how many non-null lines are kept, counted
            from the top of the file. ``None`` keeps every line; ``0`` yields
            an empty string.

    Returns:
        The kept lines joined with newline characters.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist.
        ValueError: If the CSV has no ``text`` column.
    """
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV file {csv_path} not found")
    df = pd.read_csv(csv_path)
    if 'text' not in df.columns:
        raise ValueError("CSV file must contain a 'text' column")
    # Rows with a missing ``text`` value are discarded before the cap is applied.
    texts = df['text'].dropna().astype(str).tolist()
    # Compare against None explicitly so that max_lines=0 is honoured
    # instead of being treated as "no limit".
    if max_lines is not None:
        texts = texts[:max_lines]
    return "\n".join(texts)

# Load the whole corpus (no max_lines cap) and print its first 500 characters as a sanity check.
text = load_friends_dialogue("friends.csv")
print(text[:500])  # Preview

There's nothing to tell! He's just some guy I work with!
C'mon, you're going out with the guy! There's gotta be something wrong with him!
All right Joey, be nice. So does he have a hump? A hump and a hairpiece?
Wait, does he eat chalk?
(They all stare, bemused.)
Just, 'cause, I don't want her to go through what I went through with Carl- oh!
Okay, everybody relax. This is not even a date. It's just two people going out to dinner and- not having sex.
Sounds like a date to me.
[Time Lapse]
Alright,


In [None]:
class CharTokenizer:
    """Character-level tokenizer whose vocabulary is the set of characters in a text.

    Attributes are ``stoi`` (character -> id), ``itos`` (id -> character) and
    ``vocab_size`` (number of distinct characters).
    """

    def __init__(self, text):
        """Give every distinct character of ``text`` a consecutive id, in sorted order."""
        self.stoi = {}
        self.itos = {}
        for index, symbol in enumerate(sorted(set(text))):
            self.stoi[symbol] = index
            self.itos[index] = symbol
        self.vocab_size = len(self.stoi)

    def encode(self, s):
        """Return the list of ids for the characters of ``s``.

        A character that was not in the construction text raises ``KeyError``.
        """
        return list(map(self.stoi.__getitem__, s))

    def decode(self, ids):
        """Return the string spelled by ``ids``; an unknown id raises ``KeyError``."""
        return ''.join(map(self.itos.__getitem__, ids))

# Build the vocabulary from the full corpus so that every character in `text` has an id.
tokenizer = CharTokenizer(text)
vocab_size = tokenizer.vocab_size
print(f"Vocab size: {vocab_size}")

Vocab size: 92


In [None]:
# Show the full id -> character lookup table built by the tokenizer.
tokenizer.itos

{0: '\n',
 1: ' ',
 2: '!',
 3: '"',
 4: '#',
 5: '$',
 6: '%',
 7: '&',
 8: "'",
 9: '(',
 10: ')',
 11: '*',
 12: '+',
 13: ',',
 14: '-',
 15: '.',
 16: '/',
 17: '0',
 18: '1',
 19: '2',
 20: '3',
 21: '4',
 22: '5',
 23: '6',
 24: '7',
 25: '8',
 26: '9',
 27: ':',
 28: ';',
 29: '<',
 30: '=',
 31: '>',
 32: '?',
 33: 'A',
 34: 'B',
 35: 'C',
 36: 'D',
 37: 'E',
 38: 'F',
 39: 'G',
 40: 'H',
 41: 'I',
 42: 'J',
 43: 'K',
 44: 'L',
 45: 'M',
 46: 'N',
 47: 'O',
 48: 'P',
 49: 'Q',
 50: 'R',
 51: 'S',
 52: 'T',
 53: 'U',
 54: 'V',
 55: 'W',
 56: 'X',
 57: 'Y',
 58: 'Z',
 59: '[',
 60: ']',
 61: '^',
 62: '_',
 63: '`',
 64: 'a',
 65: 'b',
 66: 'c',
 67: 'd',
 68: 'e',
 69: 'f',
 70: 'g',
 71: 'h',
 72: 'i',
 73: 'j',
 74: 'k',
 75: 'l',
 76: 'm',
 77: 'n',
 78: 'o',
 79: 'p',
 80: 'q',
 81: 'r',
 82: 's',
 83: 't',
 84: 'u',
 85: 'v',
 86: 'w',
 87: 'x',
 88: 'y',
 89: 'z',
 90: '{',
 91: '}'}

In [None]:
def create_dataset(text, tokenizer, block_size):
    """Build next-token prediction pairs from ``text`` with a stride-1 sliding window.

    Every run of ``block_size`` consecutive token ids becomes one input row;
    its target row is the same window shifted one position to the right.

    Args:
        text: Raw string to encode.
        tokenizer: Object exposing ``encode(str)`` that returns a list of ints.
        block_size: Number of tokens per row; must be at least 1.

    Returns:
        ``(X, Y)``: two independent ``torch.long`` tensors of shape
        ``(num_tokens - block_size, block_size)``. When the text has at most
        ``block_size`` tokens, both tensors have shape ``(0, block_size)``.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")

    data = torch.tensor(tokenizer.encode(text), dtype=torch.long)

    # Not enough tokens for even one (input, target) pair.
    if data.numel() <= block_size:
        empty = torch.empty((0, block_size), dtype=torch.long)
        return empty, empty.clone()

    # Each window holds block_size + 1 tokens: the first block_size are the
    # input and the last block_size are the target. unfold() yields
    # overlapping views, so no per-window Python lists are materialised.
    windows = data.unfold(0, block_size + 1, 1)

    # clone() gives X and Y their own storage, independent of each other.
    return windows[:, :-1].clone(), windows[:, 1:].clone()

# Context length: every example is a window of block_size character ids.
block_size = 32
X, Y = create_dataset(text, tokenizer, block_size)

# Split into train and validation sets
# The first 90% of the windows (in text order) train the model; the rest validate it.
# NOTE(review): create_dataset slides its window one character at a time, so windows on
# either side of the split boundary overlap in text — confirm this leakage is acceptable.
train_size = int(0.9 * len(X))
X_train, Y_train = X[:train_size], Y[:train_size]
X_val, Y_val = X[train_size:], Y[train_size:]
print(f"Training dataset shape: {X_train.shape}")
print(f"Validation dataset shape: {X_val.shape}")

Training dataset shape: torch.Size([3380818, 32])
Validation dataset shape: torch.Size([375647, 32])


In [None]:
"""## Phase 2: Model Definition"""

class PositionalEncoding(nn.Module):
    """Fixed sinusoidal positional encoding that is added to its input.

    Even feature columns hold ``sin(pos * freq)`` and odd columns hold
    ``cos(pos * freq)``, where ``freq`` decays geometrically from 1 towards
    1/10000 across the feature dimension. The table is stored as the buffer
    ``pe`` with shape ``(1, max_len, d_model)``.

    Args:
        d_model: Feature width of the input (even or odd).
        max_len: Longest sequence length the table covers.
    """

    def __init__(self, d_model, max_len=4096):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (max_len, ceil(d_model / 2))
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so trim the angle table to the number of odd columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        # A buffer (not a parameter): follows .to(device) and is saved in the
        # state dict, but is never updated by the optimizer.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the positional table to ``x``.

        Args:
            x: Tensor indexed as ``(batch, seq_len, d_model)``.

        Returns:
            ``x`` plus the first ``seq_len`` rows of the table.

        Raises:
            ValueError: If ``seq_len`` exceeds the ``max_len`` of the table.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            raise ValueError(
                f"Sequence length {seq_len} exceeds positional encoding max_len {self.pe.size(1)}"
            )
        return x + self.pe[:, :seq_len]

class TransformerBlock(nn.Module):
    """Post-norm Transformer block: self-attention, then a position-wise MLP.

    Each sub-layer is wrapped as ``LayerNorm(x + sublayer(x))``.

    Args:
        emb_dim: Feature width of the input and output.
        num_heads: Number of attention heads (passed to ``nn.MultiheadAttention``).
    """

    def __init__(self, emb_dim, num_heads):
        super().__init__()
        self.attn = nn.MultiheadAttention(emb_dim, num_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(emb_dim)
        self.ffn = nn.Sequential(
            nn.Linear(emb_dim, 4 * emb_dim),  # expand to 4x the embedding width
            nn.ReLU(),
            nn.Linear(4 * emb_dim, emb_dim),  # project back to emb_dim
        )
        self.norm2 = nn.LayerNorm(emb_dim)

    def forward(self, x, mask=None):
        """Run the block on ``x``.

        Args:
            x: Tensor indexed as ``(batch, seq_len, emb_dim)``.
            mask: Optional attention mask forwarded to ``nn.MultiheadAttention``
                as ``attn_mask``. When omitted, a causal mask is built so each
                position attends only to itself and earlier positions.

        Returns:
            Tensor with the same shape as ``x``.
        """
        if mask is None:
            seq_len = x.size(1)
            # Boolean mask, True strictly above the diagonal. For
            # nn.MultiheadAttention a True entry means "may not attend", so a
            # bool mask needs no -inf fill. It is built directly on x's device.
            mask = torch.ones(seq_len, seq_len, device=x.device).triu(diagonal=1).bool()
        attn_out, _ = self.attn(x, x, x, attn_mask=mask)
        x = self.norm1(x + attn_out)
        ffn_out = self.ffn(x)
        x = self.norm2(x + ffn_out)
        return x

class GPT(nn.Module):
    """Decoder-only language model assembled from ``TransformerBlock`` layers.

    Pipeline: token embedding -> sinusoidal positional encoding -> ``n_layers``
    Transformer blocks -> final LayerNorm -> linear projection to vocabulary logits.

    Args:
        vocab_size: Number of distinct tokens (size of the embedding table and output layer).
        emb_dim: Embedding / hidden width.
        block_size: Passed to ``PositionalEncoding`` as ``max_len``.
        n_layers: Number of stacked Transformer blocks.
        n_heads: Attention heads per block.
    """

    def __init__(self, vocab_size, emb_dim=64, block_size=32, n_layers=6, n_heads=4):
        super().__init__()
        self.token_emb = nn.Embedding(vocab_size, emb_dim)
        self.pos_enc = PositionalEncoding(emb_dim, max_len=block_size)
        stacked_layers = []
        for _ in range(n_layers):
            stacked_layers.append(TransformerBlock(emb_dim, n_heads))
        self.blocks = nn.ModuleList(stacked_layers)
        self.norm = nn.LayerNorm(emb_dim)
        self.head = nn.Linear(emb_dim, vocab_size)

    def forward(self, x):
        """Map a batch of token ids to per-position logits over the vocabulary."""
        hidden = self.pos_enc(self.token_emb(x))
        for layer in self.blocks:
            hidden = layer(hidden)
        return self.head(self.norm(hidden))

In [None]:
# Prefer a CUDA GPU when PyTorch reports one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cpu


In [None]:
def get_batch(X, Y, batch_size, device):
    """Draw a random mini-batch of aligned rows from ``X`` and ``Y``.

    Row indices are sampled uniformly with ``torch.randint`` (so a row may be
    picked more than once) and the selected rows are moved to ``device``.

    Returns:
        ``(x, y)``: the selected rows of ``X`` and the matching rows of ``Y``.
    """
    n_rows = X.size(0)
    picks = torch.randint(0, n_rows, (batch_size,))
    return X[picks].to(device), Y[picks].to(device)

In [None]:
def train_model(model, X_train, Y_train, X_val, Y_val, optimizer, scheduler, tokenizer, criterion,
                max_iter=5000, eval_interval=100, batch_size=16, device='cpu'):
    """Train ``model`` on random mini-batches with periodic evaluation.

    Every ``eval_interval`` steps the function computes the loss on one random
    validation batch, prints a generated text sample, and writes the model's
    state dict to ``model_step_{step}.pt`` in the current working directory.

    Args:
        model: Network mapping a batch of token ids to per-position logits.
        X_train, Y_train: Training inputs and their shifted targets.
        X_val, Y_val: Validation inputs and targets.
        optimizer: Optimizer over ``model``'s parameters.
        scheduler: Learning-rate scheduler; stepped once per training step.
        tokenizer: Tokenizer used to decode the generated sample.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        max_iter: Total number of optimisation steps.
        eval_interval: Steps between evaluation / sampling / checkpointing.
        batch_size: Rows per mini-batch.
        device: Device the batches are moved to.
    """
    model.train()
    for step in range(1, max_iter + 1):
        xb, yb = get_batch(X_train, Y_train, batch_size, device)
        logits = model(xb)
        # Flatten (batch, time) so the loss sees one prediction per row.
        loss = criterion(logits.reshape(-1, logits.size(-1)), yb.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        if step % eval_interval == 0:
            # Compute validation loss (single random batch, so the value is noisy)
            model.eval()
            with torch.no_grad():
                val_xb, val_yb = get_batch(X_val, Y_val, batch_size, device)
                val_logits = model(val_xb)
                # Use the validation tensors' own shape rather than the
                # training batch's dimensions.
                val_loss = criterion(
                    val_logits.reshape(-1, val_logits.size(-1)), val_yb.reshape(-1)
                )
            # generate_text requires the context length; the training window
            # width is the longest sequence the model has been fed.
            sample = generate_text(
                model, tokenizer, block_size=xb.size(1), max_length=100,
                device=device, temperature=0.7, top_k=50
            )
            # Restore training mode only after sampling, because generation
            # switches the model to eval mode.
            model.train()
            print(f"Step {step}/{max_iter} | Train Loss = {loss.item():.4f} | Val Loss = {val_loss.item():.4f}")
            print("Sample:\n" + sample)
            print("-" * 50)
            # Save checkpoint
            torch.save(model.state_dict(), f"model_step_{step}.pt")

@torch.no_grad()
def generate_text(
    model, tokenizer, block_size, max_length=100, device='cpu',
    start_text="\n", temperature=1.0, top_k=None, beam_width=None
):
    """Generate ``max_length`` new tokens after ``start_text``.

    Two decoding modes are available:

    * ``beam_width is None``: stochastic sampling from the temperature-scaled
      distribution, optionally restricted to the ``top_k`` most likely tokens.
    * ``beam_width`` given: beam search that keeps the ``beam_width`` highest
      scoring sequences (sum of log-probabilities) and returns the best one.

    Only the last ``block_size`` tokens are fed to the model at each step, but
    the returned string always contains the full prompt followed by every
    generated token. The model's train/eval mode is restored on exit.

    Args:
        model: Network mapping ``(1, T)`` token ids to ``(1, T, vocab)`` logits.
        tokenizer: Object with ``encode(str)`` and ``decode(list_of_ids)``.
        block_size: Maximum number of tokens passed to the model at once.
        max_length: Number of tokens to generate.
        device: Device on which the context tensors are created.
        start_text: Prompt; must encode to at least one token.
        temperature: Divisor applied to the logits before softmax.
        top_k: If set, sample only among the ``top_k`` highest logits
            (clamped to the vocabulary size). Ignored in beam-search mode.
        beam_width: If set, number of beams (clamped to the vocabulary size
            when expanding a beam).

    Returns:
        The decoded prompt plus generated continuation.

    Raises:
        ValueError: If ``start_text`` encodes to no tokens or ``beam_width < 1``.
    """
    prompt_ids = list(tokenizer.encode(start_text))
    if not prompt_ids:
        raise ValueError("start_text must encode to at least one token")

    was_training = model.training
    model.eval()
    try:
        if beam_width is None:
            # === Sampling Mode ===
            generated = torch.tensor([prompt_ids], dtype=torch.long).to(device)
            for _ in range(max_length):
                # Crop only the model input; `generated` keeps the full history
                # so the prompt and early tokens survive in the result.
                context = generated
                if context.size(1) > block_size:
                    context = context[:, -block_size:]
                logits = model(context)
                logits = logits[:, -1, :] / temperature
                if top_k is not None:
                    k = min(top_k, logits.size(-1))
                    v, _ = torch.topk(logits, k)
                    # Everything below the k-th largest logit gets zero probability.
                    logits[logits < v[:, [-1]]] = float('-inf')
                probs = F.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, num_samples=1)
                generated = torch.cat([generated, next_id], dim=1)
            return tokenizer.decode(generated[0].tolist())

        # === Beam Search Mode ===
        beam_width = int(beam_width)
        if beam_width < 1:
            raise ValueError(f"beam_width must be >= 1, got {beam_width}")
        beams = [(prompt_ids, 0.0)]  # (tokens, cumulative log-probability)

        for _ in range(max_length):
            candidates = []
            for tokens, score in beams:
                context = torch.tensor([tokens], dtype=torch.long).to(device)
                if context.size(1) > block_size:
                    context = context[:, -block_size:]
                logits = model(context)
                logits = logits[:, -1, :] / temperature
                log_probs = F.log_softmax(logits, dim=-1)

                # A beam cannot branch into more tokens than the vocabulary holds.
                k = min(beam_width, log_probs.size(-1))
                top_scores, top_idxs = torch.topk(log_probs, k, dim=-1)

                for token_id, token_score in zip(top_idxs[0].tolist(), top_scores[0].tolist()):
                    candidates.append((tokens + [token_id], score + token_score))

            # Keep the beam_width best candidates overall
            candidates.sort(key=lambda x: x[1], reverse=True)
            beams = candidates[:beam_width]

        best_seq = beams[0][0]
        return tokenizer.decode(best_seq)
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)


In [None]:
# Hyperparameters
batch_size = 16  # sequences per optimisation step
block_size = 32  # context length in characters; NOTE(review): re-assigns the value used to build X/Y earlier — keep the two equal
max_iter = 5000  # total optimisation steps (also the cosine schedule length)
eval_interval = 100  # steps between validation, sampling and checkpointing
learning_rate = 1e-3  # initial learning rate handed to the optimizer
n_embd = 64  # embedding / hidden width
n_head = 4  # attention heads per Transformer block
n_layers = 10  # number of stacked Transformer blocks

In [None]:
# Instantiate the model from the hyperparameters above and move it to the selected device.
model = GPT(
    vocab_size=tokenizer.vocab_size,
    emb_dim=n_embd,
    block_size=block_size,
    n_layers=n_layers,
    n_heads=n_head
).to(device)

# Count only parameters that receive gradients (buffers are not included).
print(f"MyGPT has {sum(p.numel() for p in model.parameters() if p.requires_grad)} parameters!")

MyGPT has 511836 parameters!


In [None]:
# Loss, optimizer and learning-rate schedule.
# The cosine schedule spans max_iter steps; train_model steps the scheduler once per training step.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_iter)

# Run the training loop; it prints progress and writes model_step_{step}.pt every eval_interval steps.
train_model(
    model=model,
    X_train=X_train,
    Y_train=Y_train,
    X_val=X_val,
    Y_val=Y_val,
    optimizer=optimizer,
    scheduler=scheduler,
    tokenizer=tokenizer,
    criterion=criterion,
    max_iter=max_iter,
    eval_interval=eval_interval,
    batch_size=batch_size,
    device=device
)

Step 100/5000 | Train Loss = 2.5683 | Val Loss = 2.5411
Sample:
Jou t chiind te thit.. l athay yo
--------------------------------------------------
Step 200/5000 | Train Loss = 2.1660 | Val Loss = 2.4062
Sample:
herey am y athimeble tston a I an
--------------------------------------------------
Step 300/5000 | Train Loss = 2.2227 | Val Loss = 2.1287
Sample:
eright thame so do ser thoull, th
--------------------------------------------------
Step 400/5000 | Train Loss = 1.9500 | Val Loss = 1.9793
Sample:
thedo ony. Rossss.
Oh, you was lo
--------------------------------------------------
Step 500/5000 | Train Loss = 1.9687 | Val Loss = 1.9637
Sample:
er him achen lis ry his a rindal,
--------------------------------------------------
Step 600/5000 | Train Loss = 1.8144 | Val Loss = 1.8263
Sample:
nica the the's lat gonna marron.

--------------------------------------------------
Step 700/5000 | Train Loss = 1.9174 | Val Loss = 1.9006
Sample:
's gain a the me? What for teng. 
--------

In [None]:
# Stochastic decoding: generate 200 tokens at temperature 0.8, sampling only among the 5 most likely characters per step.
print("Sampling:")
print(generate_text(model, tokenizer, block_size, start_text="Joey: ", temperature=0.8, top_k=5, max_length=200, device=device))

Sampling:
their fored.
Yeah, three and star


In [None]:
# Beam search decoding: generate 200 tokens while keeping the 3 best-scoring sequences at every step.
print("\nBeam Search:")
print(generate_text(model, tokenizer, block_size, start_text="Monica: ", beam_width=3, max_length=200, device=device))


Beam Search:
Monica: Monica and Chandler and Rachel's, Joey and Rachel's, Ross is and Rachel's, Ross isn't thinking about the couch of there and there's a little back of there.
Oh my God, you're gonna get to be there.
Oh,


In [None]:
def load_gpt_model_from_checkpoint(
    checkpoint_path,
    vocab_size,
    emb_dim=64,
    block_size=32,
    n_layers=6,
    n_heads=4,
    device='cpu'
):
    """Rebuild a ``GPT`` model and load weights saved with ``torch.save(state_dict)``.

    The architecture arguments must match those of the model that produced the
    checkpoint; otherwise ``load_state_dict`` fails on mismatched keys or shapes.

    Args:
        checkpoint_path: File containing the saved state dict.
        vocab_size, emb_dim, block_size, n_layers, n_heads: Forwarded to ``GPT``.
        device: Device for the model and for mapping the loaded tensors.

    Returns:
        The model with loaded weights, switched to eval mode.
    """
    architecture = dict(
        vocab_size=vocab_size,
        emb_dim=emb_dim,
        block_size=block_size,
        n_layers=n_layers,
        n_heads=n_heads,
    )
    restored = GPT(**architecture).to(device)

    # torch.load deserialises the file, so only open checkpoints from a trusted source.
    weights = torch.load(checkpoint_path, map_location=device)
    restored.load_state_dict(weights)
    return restored.eval()


In [None]:
# Reload trained weights into a fresh model with the same architecture hyperparameters.
# NOTE(review): no visible cell writes "gpt_friends.pth" (train_model saves model_step_{step}.pt);
# confirm this file exists or point the path at one of those checkpoints.
checkpoint_path = "gpt_friends.pth"
loaded_model = load_gpt_model_from_checkpoint(
    checkpoint_path,
    vocab_size=tokenizer.vocab_size,
    emb_dim=n_embd,
    block_size=block_size,
    n_layers=n_layers,
    n_heads=n_head,
    device=device
)