In [1]:
import torch
from torch import nn
from torch.nn import functional as F

from models.Bigram import BigramLanguageModel
from models.GPT import GPTLanguageModel

%load_ext autoreload
%autoreload 2

In [2]:
# Seed PyTorch's global random number generator so that the random batch
# offsets drawn with torch.randint in get_batch are repeatable across runs.
torch.manual_seed(1337)

<torch._C.Generator at 0x7f3fc61ace70>

In [3]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cuda'

In [4]:
# Which model to build: compared case-insensitively against 'gpt' and 'bigram'
# when the model is constructed; any other value raises ValueError.
MODEL_TYPE = 'gpt'
# Data Params
# Fraction (0-1, not a percentage) of the encoded corpus used for training;
# the remainder becomes the validation split.
TRAIN_PERCENTAGE = 0.9
# Number of characters in each training window; also passed to the GPT model.
CONTEXT_LENGTH = 256


# Training Params
# Number of windows sampled per batch in get_batch.
BATCH_SIZE = 64
# Learning rate handed to the AdamW optimizer.
LEARNING_RATE = 3e-4
# Number of iterations of the training loop.
MAX_ITER = 5000

In [5]:
# The corpus must already be on disk; fetch it once with:
# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)

# Lookup tables between a character and its integer id (its index in `chars`).
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: turn a string into the list of integer ids of its characters."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: turn an iterable of integer ids back into a string."""
    return ''.join(itos[i] for i in l)


print('Number of unique characters:', vocab_size)
print('Size of dataset:', len(text))

Number of unique characters: 65
Size of dataset: 1115394


In [6]:
# Encode the whole corpus once as a 1-D tensor of int64 character ids.
data = torch.tensor(encode(text), dtype=torch.long)

# Index of the first validation element: everything before it is training
# data, everything from it onwards is held out for validation.
n = int(TRAIN_PERCENTAGE * len(data))
train_data, val_data = data[:n], data[n:]

print('Train size: ', len(train_data))
print('Val size: ', len(val_data))

Train size:  1003854
Val size:  111540


In [7]:
def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        A pair (x, y) of BATCH_SIZE x CONTEXT_LENGTH tensors moved to `device`.
        Row k of y is row k of x shifted one character to the right, i.e. the
        next-character target for every position of x.
    """
    source = val_data if split != 'train' else train_data

    # One random window start per batch row, drawn from
    # [0, len(source) - CONTEXT_LENGTH) so the shifted target window still
    # fits inside the split.
    starts = torch.randint(len(source) - CONTEXT_LENGTH, (BATCH_SIZE,))

    inputs, targets = [], []
    for start in starts:
        stop = start + CONTEXT_LENGTH
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])

    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

In [8]:
# Reject unsupported model names up front, then build the requested model.
if MODEL_TYPE.lower() not in ('gpt', 'bigram'):
    raise ValueError('Unknown model version')

model = (
    GPTLanguageModel(vocab_size, CONTEXT_LENGTH, device)
    if MODEL_TYPE.lower() == 'gpt'
    else BigramLanguageModel(vocab_size, device)
)

# AdamW over all model parameters with the configured learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

GPT model initialized with 12.558401 M parameters


In [9]:
@torch.no_grad()
def estimate_loss(eval_steps=100):
    """Estimate the mean loss on the train and validation splits.

    For each split, `eval_steps` random batches are drawn with get_batch, the
    model is run on them with gradients disabled, and the per-batch losses
    are averaged.

    Args:
        eval_steps: Number of batches to average over per split.

    Returns:
        dict with keys 'train' and 'val', each mapping to a 0-dim tensor
        holding the mean loss for that split.
    """
    # Remember the current mode so it can be restored afterwards; assumes
    # `model` is an nn.Module (it is used via .eval()/.train()/.parameters()).
    was_training = model.training
    model.eval()
    try:
        out = {}
        for split in ['train', 'val']:
            losses = torch.zeros(eval_steps)
            for k in range(eval_steps):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the previous mode even if evaluation fails or is interrupted
        # (e.g. the cell is stopped), so the model is never left stuck in
        # eval mode for subsequent training steps.
        model.train(was_training)
    return out

In [None]:
# Report the estimated train/val loss ten times over the run. The interval is
# an integer computed once (at least 1), so the check below is exact integer
# arithmetic rather than a float modulo.
eval_interval = max(1, MAX_ITER // 10)

# `step` rather than `iter`, so the builtin iter() is not shadowed for the
# rest of the kernel session.
for step in range(MAX_ITER):
    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss(10)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # forward pass: the model returns the logits together with the loss
    logits, loss = model(xb, yb)

    # clear old gradients, backpropagate, and update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [14]:
# Generate 200 new tokens from a single-space prompt and show them as text:
# encode the prompt, let the model continue it, then decode the ids.
start_text = ' '
_ = model.predict(encode(start_text), max_new_tokens=200)
_ = _.tolist()
decode(_)

' you requthring\nAnd now choose noble tto add butcheres it.\nIn this difference, my son and this day,\nLove it it would do what wash, if she see\nOthe worthy whom contented nave hand\nO nuch, giving to peac'

In [12]:
def save_charset(charset, path):
    """Write the characters of `charset` to `path`, back to back.

    Args:
        charset: Iterable of strings (the vocabulary characters), written in
            iteration order with no separator.
        path: Destination file; created or overwritten.
    """
    # UTF-8 matches the encoding the corpus is read with, so non-ASCII
    # characters round-trip regardless of the platform's default encoding.
    # newline='' disables newline translation so that a '\n' in the charset
    # is stored as exactly that one character on every platform.
    with open(path, 'w', encoding='utf-8', newline='') as f:
        f.write(''.join(charset))
                
# Persist the vocabulary next to the model so ids can be decoded later.
save_charset(charset=chars, path='charset.txt')

# NOTE: this pickles the entire module object rather than just its
# state_dict, so loading 'model.pth' needs the model class to be importable
# and should only be done with files from a trusted source.
torch.save(obj=model, f='model.pth')