<a href="https://colab.research.google.com/github/phongvu009/nano_gpt/blob/main/NanoGPT_version_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
torch.__version__

'2.1.0+cu118'

In [2]:
import torch.nn as nn
from torch.nn import functional as F

In [3]:
import math
import os

## Hyperparameters

In [4]:
# Number of sequences sampled for every optimisation step.
batch_size = 64
# Context window: how many consecutive characters make up one training sequence.
seq_len = 64

In [5]:
# --- optimisation schedule ---
max_iters = 5000      # total number of training steps
learning_rate = 3e-4  # step size handed to the optimizer

# --- transformer size ---
n_layer = 6    # number of stacked transformer blocks
n_head = 6     # attention heads per block
d_model = 384  # embedding width; must be divisible by n_head

In [6]:
# Number of batches averaged per split whenever the loss is estimated.
eval_iters = 200
# Training steps between two consecutive loss reports.
check_point = 100

In [7]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Seed torch's random number generators so that batch sampling, weight
# initialisation and text generation start from the same state on every
# "Restart & Run All". (Some GPU kernels may still be non-deterministic.)
seed = 1337
torch.manual_seed(seed);

## Data

In [8]:
# Download the tiny-Shakespeare corpus. -nc (no-clobber) skips the download when
# input.txt already exists, so re-running this cell does not pile up
# input.txt.1, input.txt.2, ... next to the file that is actually read.
!wget -nc https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-12-07 22:15:05--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-12-07 22:15:05 (23.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [9]:
# Print the current working directory (the download above wrote input.txt here).
!pwd

/content


In [10]:
# Load the entire corpus into memory as a single string.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
  text = corpus_file.read()

In [11]:
# Peek at the first 100 characters to confirm the corpus was read.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [12]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))
print(len(chars))
print(chars)

65
['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [13]:
# Character -> integer id, with ids assigned in vocabulary order.
stoi = dict(zip(chars, range(len(chars))))

# Integer id -> character (the inverse lookup).
itos = dict(enumerate(chars))


def encode(x):
  """Turn a string (any iterable of characters) into a list of integer ids."""
  return [stoi[ch] for ch in x]


def decode(x):
  """Turn an iterable of integer ids back into the string they spell."""
  return ''.join(itos[token_id] for token_id in x)


# Round-trip check: encoding then decoding must give back the input.
print(encode('this is a test !'))
print(decode(encode('this is a test !')))

[58, 46, 47, 57, 1, 47, 57, 1, 39, 1, 58, 43, 57, 58, 1, 2]
this is a test !


In [14]:
# Tokenise the whole corpus and keep it as a 1-D tensor of 64-bit character ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)


torch.Size([1115394])


In [15]:
# Hold out the final 10% of the token stream for validation;
# n is the index where the validation part begins.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [16]:
def get_batch(data_stage):
  """Sample one random mini-batch of (input, target) sequences.

  Args:
    data_stage: 'train' draws from ``train_data``; any other value draws
      from ``val_data``.

  Returns:
    Tuple ``(xb, yb)`` of tensors shaped (batch_size, seq_len), moved to
    ``device``. At every position ``yb`` holds the token that follows the
    corresponding ``xb`` token in the source stream.
  """
  source = val_data if data_stage != 'train' else train_data
  # One random start offset per sequence; the exclusive upper bound keeps the
  # one-step-shifted target window inside the split.
  starts = torch.randint(0, len(source) - seq_len, (batch_size,))
  inputs, targets = [], []
  for start in starts:
    inputs.append(source[start:start + seq_len])
    targets.append(source[start + 1:start + 1 + seq_len])
  xb = torch.stack(inputs).to(device)
  yb = torch.stack(targets).to(device)
  return xb, yb


In [17]:
# Sanity check: draw one training batch and print its shape,
# expected to be (batch_size, seq_len).
xb,yb = get_batch('train')
print(xb.shape)
# print(xb)

torch.Size([64, 64])


## Model

In [18]:
vocab_size = len(chars) # number of distinct characters in the corpus

In [19]:
class MultiHeadAttn(nn.Module):
  """Causal multi-head self-attention.

  Args:
    d_model: Width of the input and output embeddings; must be divisible by n_head.
    n_head: Number of attention heads; each head sees d_model // n_head features.
    dropout: Dropout probability applied to the attention weights and to the
      projected output.

  Raises:
    ValueError: If d_model is not divisible by n_head.
  """
  def __init__(self,d_model:int , n_head:int, dropout:float ):
    super().__init__()
    # Fail early with a clear message instead of a shape error inside forward()
    if d_model % n_head != 0:
      raise ValueError(f"d_model ({d_model}) must be divisible by n_head ({n_head})")
    # head dims as d_k = d_model // n_head
    self.d_model = d_model
    self.n_head = n_head
    # one linear layer produces query, key and value together
    self.w_attn = nn.Linear(d_model, 3*d_model, bias =True)
    # output projection applied to the concatenated heads
    self.proj = nn.Linear(d_model, d_model)
    # regularization
    self.att_dropout = nn.Dropout(dropout)
    self.resid_dropout = nn.Dropout(dropout)
    # lower-triangular causal mask sized by the module-level seq_len; registered
    # as a buffer so it follows the module across devices
    self.register_buffer("tril", torch.tril(torch.ones(seq_len,seq_len).
                                            view(1,1, seq_len, seq_len)))

  def forward(self, x):
    """Apply masked self-attention.

    Args:
      x: Tensor of shape (B, T, d_model); T must not exceed the mask size.

    Returns:
      Tensor of shape (B, T, d_model).
    """
    B,T,C = x.shape
    d_k = C // self.n_head
    # project once, then cut the result into query, key and value
    q,k,v = self.w_attn(x).split(self.d_model, dim=2)
    q = q.view(B, T, self.n_head, d_k).transpose(1,2) # (B, head, T, d_k)
    k = k.view(B, T, self.n_head, d_k).transpose(1,2) # (B, head, T, d_k)
    v = v.view(B, T, self.n_head, d_k).transpose(1,2) # (B, head, T, d_k)
    # scaled dot-product attention scores
    att = q @ k.transpose(-2,-1) / math.sqrt(d_k) # (B, head, T, T)
    # causal mask: a position may only attend to itself and earlier positions
    att = att.masked_fill( self.tril[:,:,:T,:T] == 0 , float('-inf'))
    att = F.softmax(att, dim=-1)
    att = self.att_dropout(att)
    att_value = att @ v # (B, head, T, d_k)
    # concat all heads back into one feature axis
    att_value = att_value.transpose(1,2).contiguous().view(B,T,C) # C = head * d_k
    # Project first, then drop out: the dropout then acts on what is added to the
    # residual stream, matching where FeedForward places its dropout.
    out = self.resid_dropout(self.proj(att_value))
    return out



In [20]:
class FeedForward(nn.Module):
  """Position-wise MLP: widen to 4*d_model, ReLU, narrow back, then dropout.

  Args:
    d_model: Width of the input and output features.
    dropout: Dropout probability applied to the MLP output.
  """
  def __init__(self, d_model:int, dropout:float):
    super().__init__()
    hidden = 4 * d_model
    widen = nn.Linear(d_model, hidden)
    narrow = nn.Linear(hidden, d_model)
    # Positions inside the Sequential are part of the state_dict keys; keep the order.
    self.net = nn.Sequential(widen, nn.ReLU(), narrow, nn.Dropout(dropout))

  def forward(self,x):
    """Apply the MLP over the last axis of x, shaped (batch_size, seq_len, d_model)."""
    return self.net(x)

In [21]:
class LayerNorm(nn.Module):
  """Hand-written layer normalisation over the last axis.

  Note: the scale is ``x.std(...)`` with torch's default (Bessel-corrected)
  estimator and ``eps`` is added to the standard deviation itself, so results
  differ slightly from ``torch.nn.LayerNorm``.

  Args:
    features: Size of the last axis; one gain and one bias are learned per feature.
    eps: Small constant added to the standard deviation to avoid division by zero.
  """
  def __init__(self, features:int, eps:float=1e-6) -> None:
    super().__init__()
    self.eps = eps
    # learnable gain (starts at 1) and bias (starts at 0)
    self.gama = nn.Parameter(torch.ones(features))
    self.beta = nn.Parameter(torch.zeros(features))

  def forward(self, x):
    """Normalise x, shaped (batch_size, seq_len, d_model), along its last axis."""
    centered = x - x.mean(dim=-1, keepdim=True)
    scale = x.std(dim=-1, keepdim=True) + self.eps
    return self.gama * (centered / scale) + self.beta

In [22]:
class Block(nn.Module):
  """Transformer block: self-attention then feed-forward, each normalised first
  and added back onto its input through a skip connection.

  Args:
    d_model: Embedding width; must be divisible by n_head.
    n_head: Number of attention heads.
    dropout: Dropout probability handed to both sub-layers.
  """
  def __init__(self, d_model:int, n_head:int , dropout:float):
    super().__init__()
    assert d_model % n_head == 0 , "can not divide into heads"
    # attention sub-layer and the norm applied to its input
    self.ln_1 = LayerNorm(d_model)
    self.sa = MultiHeadAttn(d_model, n_head , dropout)
    # feed-forward sub-layer and the norm applied to its input
    self.ln_2 = LayerNorm(d_model)
    self.ffw = FeedForward(d_model, dropout)

  def forward(self,x ):
    """Run both sub-layers on x, shaped (batch_size, seq_len, d_model)."""
    for norm, sublayer in ((self.ln_1, self.sa), (self.ln_2, self.ffw)):
      # normalise, transform, then add back onto the running stream
      x = x + sublayer(norm(x))
    return x

In [23]:
class NanoGPTModel(nn.Module):
  """Decoder-only transformer language model over integer token ids.

  Args:
    vocab_size: Number of distinct tokens; sizes the token embedding and the output logits.
    d_model: Embedding width used throughout the network.
    n_head: Attention heads per block.
    n_layer: Number of stacked blocks.
    dropout: Dropout probability handed to every block (default 0.0).
  """
  def __init__(self, vocab_size:int, d_model:int, n_head:int , n_layer:int, dropout:float=0.0):
    super().__init__()
    # token embedding
    self.embedding_table = nn.Embedding(vocab_size, d_model)
    # learned position embedding: one row per position, seq_len positions in total
    self.pos_embedding_table = nn.Embedding(seq_len, d_model)
    # stack of attention blocks
    self.blocks = nn.Sequential( *[Block(d_model, n_head, dropout) for _ in range(n_layer)])
    # final layer norm
    self.ln_f = LayerNorm(d_model)
    # maps features to one logit per vocabulary entry
    self.lm_head = nn.Linear(d_model, vocab_size)

  def forward(self, x, targets = None):
    """Compute next-token logits and, when targets are given, the loss.

    Args:
      x: Long tensor of token ids, shape (B, T), with T at most the number of
        learned positions.
      targets: Optional long tensor of shape (B, T) with the expected next tokens.

    Returns:
      Tuple ``(logits, loss)``. Without targets, logits has shape
      (B, T, vocab_size) and loss is None. With targets, logits is returned
      flattened to (B*T, vocab_size) and loss is the cross-entropy over all positions.

    Raises:
      ValueError: If T exceeds the number of learned positions.
    """
    B,T = x.shape
    max_positions = self.pos_embedding_table.num_embeddings
    # Guard here: an out-of-range embedding lookup gives an obscure index error
    if T > max_positions:
      raise ValueError(f"sequence length {T} exceeds the model context size {max_positions}")

    emb_token = self.embedding_table(x) #(B,T,d_model)
    # Build positions on the input's own device rather than the notebook-level
    # `device`, so the model still works after being moved or reloaded elsewhere
    positions = torch.arange(T, device=x.device)
    pos_token = self.pos_embedding_table(positions) # (T, d_model), broadcast over the batch
    x = emb_token + pos_token
    x = self.blocks(x)
    x = self.ln_f(x) # (B, T, C)
    logits = self.lm_head(x) #(B,T,vocab_size)
    if targets is None: # inference: nothing to compare against
      loss = None
    else:
      B,T,C = logits.shape
      logits = logits.view(B*T, C) # cross_entropy expects (N, C) logits ...
      targets = targets.view(B*T) # ... and (N,) class indices
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  @torch.no_grad()
  def generate(self, idx, max_new_tokens=200):
    """Extend idx by sampling max_new_tokens tokens, one at a time.

    Runs without gradient tracking and in eval mode; the module's previous
    train/eval mode is restored before returning.

    Args:
      idx: Long tensor of shape (B, T) holding the starting context.
      max_new_tokens: Number of tokens to append to every row.

    Returns:
      Long tensor of shape (B, T + max_new_tokens).
    """
    # the model's own context size, i.e. the number of learned positions
    context_size = self.pos_embedding_table.num_embeddings
    was_training = self.training
    self.eval()
    try:
      for _ in range(max_new_tokens):
        # keep only the most recent context_size tokens
        idx_cond = idx[:, -context_size:]
        logits = self(idx_cond)[0]
        # only the last position predicts the next token
        logits = logits[:,-1,:] # (B,T,C) -> (B,C)
        probs = F.softmax(logits, dim=-1)
        # sample rather than take the argmax
        idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
        idx = torch.cat( (idx,idx_next),dim=1)
    finally:
      self.train(was_training)
    return idx



## Training

In [24]:
# Build the model from the configured sizes and move its weights to the chosen device.
model = NanoGPTModel(vocab_size, d_model, n_head, n_layer).to(device)
# Report the total number of parameters, in millions.
print(f" Model Parameters - { sum( p.numel() for p in model.parameters()) / 1e6}")

 Model Parameters - 10.722113


In [25]:
# AdamW over all model parameters with the configured learning rate;
# every other optimizer setting is left at its default.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [26]:
@torch.no_grad()
def estimate_loss():
  """Estimate the mean loss on the train and validation splits.

  Averages the loss of ``eval_iters`` freshly sampled batches per split, with
  the model in eval mode and gradient tracking disabled. The model's previous
  train/eval mode is restored afterwards, even if evaluation is interrupted.

  Returns:
    dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.
  """
  was_training = model.training
  model.eval()
  out = {}
  try:
    for data_stage in ['train','val']:
      losses = torch.zeros(eval_iters)
      for i in range(eval_iters):
        xb,yb = get_batch(data_stage)
        _, loss = model(xb,yb)
        losses[i] = loss.item()
      out[data_stage] = losses.mean()
  finally:
    # Without this, an interrupted evaluation would leave the model in eval
    # mode and later training steps would silently run with dropout disabled
    model.train(was_training)
  return out


In [27]:
# The loop variable is `step`, not `iter`, so the built-in iter() is not
# shadowed for the rest of the kernel session.
for step in range(max_iters):
  # Every check_point steps, and on the final step, report the averaged losses
  if step % check_point == 0 or step == max_iters - 1:
    losses = estimate_loss()
    print(f"At {step}/{max_iters}: train loss is {losses['train']:.4f} , val loss is {losses['val']:.4f}")
  # One optimisation step on a fresh random batch
  xb,yb = get_batch('train')
  logits, loss = model(xb,yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

At 0/5000: train loss is 4.3360 , val loss is 4.3306
At 100/5000: train loss is 2.3493 , val loss is 2.3634
At 200/5000: train loss is 2.0799 , val loss is 2.1225
At 300/5000: train loss is 1.8916 , val loss is 1.9828
At 400/5000: train loss is 1.7718 , val loss is 1.8887
At 500/5000: train loss is 1.6871 , val loss is 1.8255
At 600/5000: train loss is 1.6240 , val loss is 1.7860
At 700/5000: train loss is 1.5736 , val loss is 1.7485
At 800/5000: train loss is 1.5405 , val loss is 1.7117
At 900/5000: train loss is 1.5060 , val loss is 1.6934
At 1000/5000: train loss is 1.4823 , val loss is 1.6744
At 1100/5000: train loss is 1.4555 , val loss is 1.6492
At 1200/5000: train loss is 1.4436 , val loss is 1.6426
At 1300/5000: train loss is 1.4220 , val loss is 1.6275
At 1400/5000: train loss is 1.4014 , val loss is 1.6084
At 1500/5000: train loss is 1.3885 , val loss is 1.6049
At 1600/5000: train loss is 1.3795 , val loss is 1.5937
At 1700/5000: train loss is 1.3646 , val loss is 1.5853
At 1

## Generating Characters

In [29]:
# Start from a single token with id 0 and let the model continue it.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated = model.generate(context, max_new_tokens=2000)
idx = generated[0]  # first (only) row of the batch: a 1-D tensor of token ids
print(decode(idx.tolist()))


KING EDWARD IV:
Ah, would your hinder love to tears:
So we he will be my cousin pass'd me against
Crying hostes and eat not another head,
Shall soon ourselves are fortune fortune;
Our scope of thine, the house of itself,
Setting on them and raven! O woe!
Thy day fall is weab the father. Farewell.

PRINCE:
God about thy business.

CORIOLANUS:
Men gentlemen such vineyard of the minim,
That will it more stand tear news with my rock.
And with thy brother tomb, had he for Tybbodd's
Exeter, impect the belly, then
supplus fear this gentleman, which I intend
Should disciple myself and vengeance
And so learned the golden serving how
Each one the boody blessed, and yet it is my life,
good man, be so he'med to see me to your babe.

FRIAR LAURENCE:
Ah, my lord; and leave out a sister,
Nor bid me so gross a precious point that
At a duke low capable. Ah, so grief, wherein he
himself out, hodest he put misery.

MAMILLIUS:
The city are no remedy chits for you: you shall pay with
thee here in weddman 

## Save Model

In [30]:
from google.colab import drive

# Mount Google Drive at /content/gdrive; the checkpoint below is written under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [32]:
#Save weights (state_dict only, not the full module object)
path = "/content/gdrive/MyDrive/Colab Notebooks/trained_models/tiny_grad/nanogpt_v1.pt"
# Create the target folder first: torch.save fails when the parent directory is
# missing, which would throw away the trained weights on a fresh Drive
os.makedirs(os.path.dirname(path), exist_ok=True)
torch.save(model.state_dict(),path)