In [None]:
%load_ext autoreload
%autoreload 2
import torch
import torch.nn as nn
import numpy as np
import math
import os
from constants import BLOCK_SIZE, BATCH_SIZE, VOCAB_SIZE, N_EMBED, DEVICE, BIAS, N_HEADS, N_LAYERS, DROP_OUT, WANDB_LOG
from model import BigramLanguageModel
from train import get_batch
from torch.nn import functional as F
import tiktoken
from model import GPT
# GPT-2 byte-pair encoding from tiktoken; used to turn token ids back into text.
enc = tiktoken.get_encoding("gpt2")

def decode(x):
    """Return the text that `enc.decode` produces for the token ids in `x`."""
    return enc.decode(x)
from contextlib import nullcontext
from transformers import GPT2LMHeadModel
# Echo the model hyper-parameters imported from `constants` so each run
# records the configuration it was executed with.
print(
    "Model setup:\n -------------\n"
    f" {VOCAB_SIZE=}\n {BLOCK_SIZE=}\n {BATCH_SIZE=}\n {N_EMBED=}\n {BIAS=}\n {N_HEADS=}\n {N_LAYERS=}"
)
# Section header for the training settings (three lines, same as three prints).
print("--------------", "Training setup:", "--------------", sep="\n")
print(f" {DEVICE=}\n {WANDB_LOG=}")

In [None]:
## Smoke test: run the bi-gram language model on one evaluation batch
model = BigramLanguageModel(VOCAB_SIZE)
data, targets = get_batch('eval')

# Forward pass; the result is unpacked below as a rank-3 tensor.
logits = model(data)

# B = batch, T = sequence position, C = size of the last logits dimension
# (the original note calls C the embedding size -- TODO confirm against the model).
B, T, C = logits.size()

# Collapse batch and sequence axes so every position becomes one row.
logits = logits.view(B * T, C)
print(f"{B=}, {T=}, {C=}")

In [None]:
# Compare the bi-gram model's predictions with the targets, one row per batch item.
num_predictions = 1
out = model.predict_next(data, num_predictions)
for i in range(data.shape[0]):
    # Named `prompt` rather than `input`: assigning to `input` at cell level would
    # shadow the builtin `input()` for every later cell in this kernel.
    prompt = decode(data[i].tolist())
    # Only the trailing `num_predictions` tokens of each row are compared.
    target = decode(targets[i, -num_predictions:].tolist())
    pred = decode(out[i, -num_predictions:].tolist())
    # `input={prompt!r}` prints exactly what `{input=}` printed before the rename.
    print(f"{target=},\t\t\t {pred=} \t\t input={prompt!r}")

In [None]:
# Fit the bi-gram baseline with Adam, using the project's training loop.
from train import train

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-2)
train(model, optimizer, num_epochs=10, run_name="bigram-model")

In [None]:
## Implementing GPT
# Hyper-parameters handed to the GPT constructor; every value is imported
# from `constants` in the first cell.
config = {
    "n_heads": N_HEADS,
    "n_embed": N_EMBED,
    "block_size": BLOCK_SIZE,
    "n_layers": N_LAYERS,
    "bias": BIAS,
    "dropout": DROP_OUT,
    "vocab_size": VOCAB_SIZE,
}
# A bare `config` expression in the middle of a cell displays nothing (only the
# last expression of a cell is echoed), so it is not kept here. To inspect the
# values, evaluate `config` as the final line of a cell.
mogpt = GPT(config)

In [None]:
## Baseline: what the GPT model predicts before any training
data, targets = get_batch('eval', mogpt.get_config())
num_predictions = 5
y = mogpt.predict_next(data, num_predictions)

# Print the last `num_predictions` tokens of each target row next to the model's.
for i in range(len(y)):
    target_text = decode(targets[i, -num_predictions:].tolist())
    predicted_text = decode(y[i, -num_predictions:].tolist())
    print(f" targets: \t\t {target_text}, \n prediction: \t\t {predicted_text} \n *-----------------*")


In [None]:
## Train the GPT model
from train import train

# Adam over all GPT parameters; gradients are cleared once before training starts.
optimizer = torch.optim.Adam(params=mogpt.parameters(), lr=1e-2)
optimizer.zero_grad()
train(mogpt, optimizer, num_epochs=10, run_name="gpt-model")


In [None]:
# Re-run prediction with `mogpt` on the `data` / `targets` batch already in the
# kernel, so the output can be compared with the earlier evaluation.
num_predictions = 5
y = mogpt.predict_next(data, num_predictions)
for i in range(len(y)):
    expected = decode(targets[i, -num_predictions:].tolist())
    generated = decode(y[i, -num_predictions:].tolist())
    print(f" targets: \t\t {expected}, \n prediction: \t\t {generated} \n *-----------------*")


In [None]:
## Load pretrained GPT-2 ('gpt2-large'; from Hugging Face per the original note) and evaluate it
pretrained_mogpt = mogpt.load_pretrained_model('gpt2-large')

# Draw the evaluation batch using the pretrained model's own configuration.
data, targets = get_batch('eval', pretrained_mogpt.get_config())
num_predictions = 5
y = pretrained_mogpt.predict_next(data, num_predictions)

# Show the trailing `num_predictions` tokens of target and prediction per row.
for i in range(len(y)):
    reference = decode(targets[i, -num_predictions:].tolist())
    completion = decode(y[i, -num_predictions:].tolist())
    print(f" targets: \t\t {reference}, \n prediction: \t\t {completion} \n *-----------------*")

