In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F


In [49]:
with open("game_of_thrones.txt") as data:
  data = data.read()

In [50]:
print("Length of the dataset",len(data))

Length of the dataset 1605965


In [51]:
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(vocab_size)

76


In [52]:
#mapping

str_to_int = {char:i for i,char in enumerate(chars)}
int_to_str = {i:char for i,char in enumerate(chars)}

print(str_to_int)

{'\n': 0, ' ': 1, '!': 2, '&': 3, "'": 4, '(': 5, ')': 6, ',': 7, '-': 8, '.': 9, '0': 10, '1': 11, '2': 12, '3': 13, '4': 14, '5': 15, '6': 16, '7': 17, '8': 18, '9': 19, ':': 20, ';': 21, '?': 22, 'A': 23, 'B': 24, 'C': 25, 'D': 26, 'E': 27, 'F': 28, 'G': 29, 'H': 30, 'I': 31, 'J': 32, 'K': 33, 'L': 34, 'M': 35, 'N': 36, 'O': 37, 'P': 38, 'Q': 39, 'R': 40, 'S': 41, 'T': 42, 'U': 43, 'V': 44, 'W': 45, 'X': 46, 'Y': 47, '[': 48, ']': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75}


In [53]:
encode = lambda string : [str_to_int[char] for char in string]
decode = lambda list_int : "".join([int_to_str[i] for i in list_int])

In [54]:
# Hyper Parameters

batch_size  = 64
block_size = 32
max_iters = 5000
eval_intervals = 100
eval_iters = 200
l_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_embed = 64
n_heads = 4
n_layers = 4
dropout = 0.2 # 20% Of the Parameters

In [55]:
data = torch.tensor(encode(data),dtype=torch.long)
split_percent = int(len(data)*0.9)
train_data = data[:split_percent]
val_data  = data[split_percent:]

In [56]:
def get_batches_of_data(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

# print(get_batches_of_data('train'))

In [57]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batches_of_data(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out


In [58]:
class Head(nn.Module):
  def __init__(self,head_size):
    super().__init__()
    self.key = nn.Linear(n_embed,head_size,bias=False)
    self.query = nn.Linear(n_embed,head_size,bias=False)
    self.value = nn.Linear(n_embed,head_size,bias=False)
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
    self.dropout = nn.Dropout(dropout)
  def forward(self,x):
    B,T,C = x.shape
    k = self.key(x)
    q = self.query(x)
    wei = q @ k.transpose(-2,-1) * C ** -0.5
    wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
    wei = F.softmax(wei, dim=-1)
    wei = self.dropout(wei)
    v = self.value(x)
    out = wei @ v
    return out

In [59]:
class MultiHeadAttention(nn.Module):
  def __init__(self,num_heads,head_size):
    super().__init__()
    self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
    self.proj  = nn.Linear(n_embed,n_embed)
    self.dropout = nn.Dropout(dropout)
  def forward(self,x):
    out = torch.cat([h(x) for h in self.heads], dim = -1)
    out = self.dropout(self.proj(out))
    return out

In [60]:
class FeedForward(nn.Module):
  def __init__(self,n_embed):
    super().__init__()
    self.net = nn.Sequential(
        nn.Linear(n_embed,4*n_embed),
        nn.ReLU(),
        nn.Linear(4*n_embed,n_embed),
        nn.Dropout(dropout)
    )
  def forward(self,x):
    return self.net(x)


In [61]:
class Block(nn.Module):
  def __init__(self,n_embed,n_heads):
    super().__init__()
    head_size = n_embed // n_heads
    self.sa = MultiHeadAttention(n_heads,head_size)
    self.ffwd = FeedForward(n_embed)
    self.lm1 = nn.LayerNorm(n_embed)
    self.lm2 = nn.LayerNorm(n_embed)

  def forward(self,x):
    x = x + self.sa(self.lm1(x))
    x = x + self.ffwd(self.lm2(x))
    return x



In [62]:
class GPTModel(nn.Module):
  def __init__(self):
    super().__init__()
    self.token_embeddings = nn.Embedding(vocab_size,n_embed)
    self.postional_encodings = nn.Embedding(block_size,n_embed)
    self.attention_blocks = nn.Sequential(
        *[Block(n_embed,n_heads=n_heads) for _ in range(n_layers)]
    )
    self.lm = nn.LayerNorm(n_embed)
    self.lm_head = nn.Linear(n_embed,vocab_size)

  def forward(self,idx,targets=None):
    B,T = idx.shape
    tok_embed = self.token_embeddings(idx)
    pos_embed = self.postional_encodings(torch.arange(T,device=device))
    x = tok_embed + pos_embed
    x = self.attention_blocks(x)
    x = self.lm(x)
    logits  = self.lm_head(x)

    if targets is None:
      loss = None
    else:
      B,T,C = logits.shape
      logits = logits.view(B*T,C)
      targets = targets.view(B*T)
      loss  = F.cross_entropy(logits,targets)
    return logits,loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current context
    for _ in range(max_new_tokens):
        # crop idx to the last block_size tokens
        idx_cond = idx[:, -block_size:]
        # get the predictions
        logits, loss = self(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx



In [63]:
model = GPTModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=l_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_intervals == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batches_of_data("train")

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()




0.211148 M parameters
step 0: train loss 4.4859, val loss 4.4811
step 100: train loss 2.5076, val loss 2.5809
step 200: train loss 2.3509, val loss 2.4452
step 300: train loss 2.2244, val loss 2.3428
step 400: train loss 2.1284, val loss 2.2421
step 500: train loss 2.0612, val loss 2.1763
step 600: train loss 2.0019, val loss 2.1278
step 700: train loss 1.9564, val loss 2.0830
step 800: train loss 1.9161, val loss 2.0501
step 900: train loss 1.8851, val loss 2.0114
step 1000: train loss 1.8644, val loss 1.9880
step 1100: train loss 1.8331, val loss 1.9715
step 1200: train loss 1.8155, val loss 1.9389
step 1300: train loss 1.7952, val loss 1.9176
step 1400: train loss 1.7723, val loss 1.9062
step 1500: train loss 1.7654, val loss 1.8968
step 1600: train loss 1.7536, val loss 1.8953
step 1700: train loss 1.7415, val loss 1.8792
step 1800: train loss 1.7237, val loss 1.8517
step 1900: train loss 1.7132, val loss 1.8458
step 2000: train loss 1.7094, val loss 1.8470
step 2100: train loss 1.

In [66]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=2000)[0].tolist()))


Dank or Robet, he looked the said its said the Khallivery back was his mosted. Io him nd khalf-said. Ur should?  He heaped bacain. The madked it red a folloss andead.  She whilve quilled as if you with a ring, If mouther, larm to golarge.  I've teppt its whettere Wintord, Lys,  Lord's He black, childres, yeld said. As, my strequeho basoff, her restly westepted daugh as of the shink as them.

Liadlefight he had it every have she remace ass fer here. This day.

Greather is said, the stroggod, but ide maboy likes. 

 Bring it way a lammen and listed. 
 Tome frail honoto morned, aboutieth out mon mastered and utone only Rober as and us werless of her, guild armed him, and sturned. She looked hurt blace of the pat sking.  The was mulling as the eards her bied wells to prise, and I was never old she toon agot shaimeded spodering acrose all rusted Tyrio telren arm day?  a and taogin the dirlword neat alfer the leat.  Lannister Robert this words,  Master Ear Gnoth. The black of there young be

In [65]:
MODEL_SAVE_PATH = "gpt_game_of_thrones.pth"
torch.save(model.state_dict(), MODEL_SAVE_PATH)
