<a href="https://colab.research.google.com/github/nitin649/Text-Generation-Using-Transformers/blob/main/Textgeneration_using_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive (Colab only) so files under /content/drive are readable from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data loading

In [None]:
# @title Default title text
# Read the whole corpus into memory as one string so we can inspect it.
with open('/content/drive/MyDrive/datasets/shakespeare.txt', 'r',encoding='utf-8') as f:
  text = f.read()

print(text[:200]) # show the first 200 characters

In [None]:
# print the length of the entire dataset (number of characters)
print('len is {}'.format(len(text)))

len is 1115393


## Creating index to character and character to index mapping

In [None]:
# Build the character-level vocabulary and the two lookup tables
# (character -> index and index -> character).
chars = sorted(set(text))  # every distinct character in the corpus, sorted
vocab_size = len(chars)
print('all characters together -->',''.join(chars))
print('vocab size --->',vocab_size)

stoi = dict(zip(chars, range(vocab_size)))
itos = dict(enumerate(chars))


def encode(string):
    """Take a string and return the list of integer ids of its characters."""
    return [stoi[char] for char in string]


def decode(list_index):
    """Take a list of integer ids and return the string they spell."""
    return ''.join(itos[index] for index in list_index)


print(encode("how are you"))
print(decode(encode("how are you")))

all characters together --> 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size ---> 65
[46, 53, 61, 1, 39, 56, 43, 1, 63, 53, 59]
how are you


In [None]:
# encode the entire dataset into a PyTorch tensor of integer character ids
import torch # we use PyTorch: https://pytorch.org
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:100]) # the first 100 characters, shown as the integer ids the model will be fed

torch.Size([1115393]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])


In [None]:
# Split the data into train and validation sets.
# The whole corpus is treated as one long sequence, so the split is a single cut point.
n = int(len(data) * 0.9)  # first 90% will be train, rest val
train_data, val_data = data[:n], data[n:]

In [None]:
block_size = 8  # context length used by the examples that follow
train_data[:block_size+1]  # block_size tokens plus one more, which serves as the last target

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [None]:
# An autoregressive model predicts each token from all the tokens before it:
#   my        --> name
#   my + name --> xyz ....

x = train_data[:block_size]
y = train_data[1:block_size+1]  # the same window shifted by one position
for t in range(block_size):
  context, target = x[:t+1], y[t]
  print(f'when input is {context} the target {target}')


when input is tensor([18]) the target 47
when input is tensor([18, 47]) the target 56
when input is tensor([18, 47, 56]) the target 57
when input is tensor([18, 47, 56, 57]) the target 58
when input is tensor([18, 47, 56, 57, 58]) the target 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target 58


In [None]:
# NOTE(review): `Index` is not referenced anywhere in the visible notebook; this import looks accidental.
from ast import Index
torch.manual_seed(1337)  # fixed seed so the sampled batch below is reproducible
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """Sample a random batch of input/target windows from the train or validation data.

    Reads the module-level ``train_data``, ``val_data``, ``batch_size`` and
    ``block_size`` at call time.

    Args:
        split: 'train' selects ``train_data``; any other value selects ``val_data``.

    Returns:
        (x, y): two stacked tensors, each with ``batch_size`` rows of ``block_size``
        tokens; ``y`` is ``x`` shifted one position to the right (next-token targets).
    """
    data = train_data if split == 'train' else val_data
    # Random window starts; the upper bound leaves room for the one-token-shifted target.
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # No debug print of `ix` here: this function is called once per training step,
    # so printing would flood the cell output.
    x = torch.stack([data[i:i+block_size] for i in ix]) # stack the windows into a single batch
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

# Draw one training batch and show it, then spell out every (context -> target)
# pair that the batch contains.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('----')

for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context, target = xb[b, :t+1], yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

tensor([ 77170, 236647, 935582, 561895])
inputs:
torch.Size([4, 8])
tensor([[53, 59,  6,  1, 58, 56, 47, 40],
        [49, 43, 43, 54,  1, 47, 58,  1],
        [13, 52, 45, 43, 50, 53,  8,  0],
        [ 1, 39,  1, 46, 53, 59, 57, 43]])
targets:
torch.Size([4, 8])
tensor([[59,  6,  1, 58, 56, 47, 40, 59],
        [43, 43, 54,  1, 47, 58,  1, 58],
        [52, 45, 43, 50, 53,  8,  0, 26],
        [39,  1, 46, 53, 59, 57, 43,  0]])
----
when input is [53] the target: 59
when input is [53, 59] the target: 6
when input is [53, 59, 6] the target: 1
when input is [53, 59, 6, 1] the target: 58
when input is [53, 59, 6, 1, 58] the target: 56
when input is [53, 59, 6, 1, 58, 56] the target: 47
when input is [53, 59, 6, 1, 58, 56, 47] the target: 40
when input is [53, 59, 6, 1, 58, 56, 47, 40] the target: 59
when input is [49] the target: 43
when input is [49, 43] the target: 43
when input is [49, 43, 43] the target: 54
when input is [49, 43, 43, 54] the target: 1
when input is [49, 43, 43, 54, 

In [None]:
# exclusive upper bound that get_batch passes to torch.randint when sampling from train_data
len(train_data) - block_size

1003845

## Basic Bigram Model

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    """Bigram model: the logits for the next token are looked up from the current token alone.

    Args:
        vocab_size: number of distinct tokens; the embedding table is
            (vocab_size, vocab_size), so each row holds the logits for one token.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size , vocab_size)

    def forward(self, idx, target=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids.
            target: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without ``target``, logits are (B, T, C) and loss is None.
            With ``target``, logits are flattened to (B*T, C) and loss is a scalar.
        """
        logits = self.token_embedding_table(idx) # (B, T, C): batch, time steps, vocab size
        if target is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) targets, so flatten batch and time.
        logits = logits.view(B*T, C)
        target = target.view(B*T)
        loss = F.cross_entropy(logits, target)
        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: (B, T) tensor of token ids used as the starting context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the sampled ids.
        """
        for _ in range(max_new_tokens):
            logits, _loss = self(idx)
            logits = logits[:, -1, :] # focus only on the last time step -> (B, C)
            probs = F.softmax(logits, dim=-1) # (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # sample from the distribution -> (B, 1)
            idx = torch.cat((idx, idx_next), dim=1) # append to the running sequence -> (B, T+1)
        return idx


# Instantiate the bigram model and run one forward pass on the demo batch.
m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample 100 new tokens starting from a single token with id 0, then decode them to text.
start_ids = torch.zeros((1, 1), dtype=torch.long)
generated_ids = m.generate(idx=start_ids, max_new_tokens=100)
print(decode(generated_ids[0].tolist()))


In [None]:
# create a PyTorch optimizer (AdamW) over the bigram model's parameters
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [None]:
# get_batch reads the global batch_size at call time, so batches drawn below have 32 rows
batch_size = 32
for steps in range(100): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    # clear old gradients, backpropagate the new loss, then update the parameters
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# loss of the last batch only (not an average over the run)
print(loss.item())


4.578680038452148


In [None]:
# sample 100 tokens from the bigram model, starting from a single token with id 0
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


xiKi-RJ:COpVuUa!U?qMH.uk!sCuMXvv!CJFfx;LgRyJknOEti.?I&-gPlLyulId?XlaInQ'q,lT$
3Q&sGlvHQ?mqSq-eON
x?S


In [None]:
# The bigram model above computes its logits from the current token only, with no
# context from previous time steps. Below: the simplest way to mix in that context.
import torch
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
print('x shape is this {} '.format(x.shape))
# average the current and all previous time steps to get a context vector for the current step
# We want xbow[b,t] = mean_{i<=t} x[b,i]
xbow= torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t+1, C): steps 0..t of sequence b
        #print(xprev.shape)
        xbow[b,t] = torch.mean(xprev, 0) # mean over the time axis


print('x' , x[0])
xbow[0] , xbow[0].shape # the nested Python loops make this method inefficient

x shape is this torch.Size([4, 8, 2]) 
x tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])


(tensor([[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]]),
 torch.Size([8, 2]))

## How the attention mechanism works, built up through several approaches

In [None]:
# Toy example: multiplying by a row-normalised lower-triangular matrix performs a
# "weighted aggregation" (here a running average) over the rows of b.
torch.manual_seed(42)
a = torch.ones(3, 3).tril()
a = a / a.sum(dim=1, keepdim=True)  # every row now sums to 1
b = torch.randint(0,10,(3,2)).float()
c = torch.matmul(a, b)
print('a=', a, '--', 'b=', b, '--', 'c=', c, sep='\n')

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
#@title Default title text
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T)) # lower-triangular matrix of ones
print(wei)
wei = wei / wei.sum(1, keepdim=True) # normalise so every row sums to 1
print(wei) # row t now holds equal weights over positions 0..t
xbow2 = wei @ x # (T, T) @ (B, T, C): wei is broadcast over the batch ----> (B, T, C)
xbow2[0]

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T,T))
#print(tril)
wei = torch.zeros((T,T))
#print(wei)
wei = wei.masked_fill(tril==0,float('-inf')) # positions where tril is 0 are set to -inf
#print(wei)
wei = F.softmax(wei, dim=-1) # row-wise softmax: -inf entries become 0, the remaining zeros become equal weights
print(wei)
xbow3= wei @ x
xbow3[0]

# Optional check against the loop version: torch.allclose(xbow, xbow3)



tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
# attention mechanism
import torch
# query, key, value
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)
head_size =16

# three bias-free linear projections from C channels down to head_size
query = nn.Linear(C,head_size,bias=False) # query(x) is (B, T, head_size)
key = nn.Linear(C,head_size,bias=False) # key(x) is (B, T, head_size)
value = nn.Linear(C,head_size,bias=False)
q = query(x)
k = key(x)
wei = q @ k.transpose(-2 , -1) # (B, T, 16) @ (B, 16, T) after transposing the key ----> (B, T, T)
#print(wei.shape) #(B,T,T)

# mask attention: each position may only attend to itself and earlier positions
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril==0,float('-inf')) # wei holds the query @ key.transpose() scores
wei = F.softmax(wei,dim=-1) # each row becomes a set of weights that sum to 1
print('wei',wei[0])

v = value(x)

print(v[0])
out = wei @ v # weighted sum of the value vectors
print(out[0])

wei tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5877, 0.4123, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4457, 0.2810, 0.2733, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2220, 0.7496, 0.0175, 0.0109, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0379, 0.0124, 0.0412, 0.0630, 0.8454, 0.0000, 0.0000, 0.0000],
        [0.5497, 0.2187, 0.0185, 0.0239, 0.1831, 0.0062, 0.0000, 0.0000],
        [0.2576, 0.0830, 0.0946, 0.0241, 0.1273, 0.3627, 0.0507, 0.0000],
        [0.0499, 0.1052, 0.0302, 0.0281, 0.1980, 0.2657, 0.1755, 0.1474]],
       grad_fn=<SelectBackward0>)
tensor([[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007, -0.5239,
         -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,  0.2862,  0.5710],
        [ 0.8321, -0.8144, -0.3242,  0.5191, -0.1252, -0.4898, -0.5287, -0.0314,
          0.1072,  0.8269,  0.8132, -0.0271,  0.4775,  0.4980, -0.1377,  1.4025],
        [ 0.6035, -0.2500, -0.6159,  0.4068

## Main code for the decoder model, including all the required functions

In [None]:
#Complete code
import torch
import torch.nn as nn
from torch.nn import functional as F


# hyperparameters (read as module-level globals by the functions and classes below)
batch_size = 16 # how many independent sequences will we process in parallel?
block_size = 32 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 200
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4 # number of transformer blocks
dropout = 0.0

torch.manual_seed(1337)

with open('/content/drive/MyDrive/datasets/shakespeare.txt', 'r',encoding='utf-8') as f:
  text = f.read()

# data preprocessing
chars = sorted(set(list(text))) # set of all distinct characters, sorted
vocab_size = len(chars)
print('all characters together -->',''.join(chars))
print('vocab size --->',vocab_size)

stoi = {char: i for i , char in enumerate(chars)}
itos = {i : char for i , char in enumerate(chars)}
encode = lambda string  : [stoi[char] for char in string] # take a string, output a list of integers
decode = lambda list_index : ''.join([itos[index] for index in list_index]) # take a list of indices, output a string

# train and validation split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch`` and
    their losses averaged. The model is put in eval mode for the measurement and
    switched back to train mode before returning.

    Returns:
        dict with keys 'train' and 'val', each mapping to a 0-dim tensor (the mean loss).
    """
    def _mean_loss(split):
        # One loss value per sampled batch, averaged at the end.
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        return losses.mean()

    model.eval()
    out = {split: _mean_loss(split) for split in ('train', 'val')}
    model.train()
    return out


#data loading
def get_batch(split):
    """Draw a random batch of inputs and next-token targets, moved to ``device``.

    'train' selects ``train_data``; any other value selects ``val_data``. Each of the
    ``batch_size`` rows is a window of ``block_size`` tokens; the target row is the
    same window shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    # Random window starts; the bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in starts:
        inputs.append(source[i:i+block_size])
        targets.append(source[i+1:i+block_size+1])
    return torch.stack(inputs).to(device), torch.stack(targets).to(device)

class LayerNorm1d:
    """Hand-written layer normalisation over the last (feature) dimension.

    Batch normalisation computes the mean and variance down each column, i.e. across
    the samples of a batch: for [[1,2,3],[5,6,7]] the means are
    [(1+5)/2, (2+6)/2, (3+7)/2], so every sample's output depends on the others.
    Layer normalisation computes them along each row instead:
    [(1+2+3)/3, (5+6+7)/3], so every sample is normalised independently.
    Because the statistics come from the row itself, no running mean/variance
    buffers are needed at test time.

    Args:
        dim: size of the feature dimension (length of gamma and beta).
        eps: small constant added to the variance for numerical stability.
        momentum: unused; kept so the signature stays compatible with existing callers.
    """
    def __init__(self, dim , eps = 1e-5 ,momentum=0.1):
        self.eps = eps
        self.gamma = torch.ones(dim)   # scale
        self.beta = torch.zeros(dim)   # shift (was `self.beta - ...`, which raised AttributeError)

    def __call__(self,x):
        # Statistics over the last dimension: identical to dim 1 for (B, C) input and
        # also correct for (B, T, C) input, where dim 1 would be the time axis.
        xmean = x.mean(-1,keepdim=True) # per-row mean
        xvar = x.var(-1,keepdim=True) # per-row variance (torch default: unbiased estimate)
        xhat = (x-xmean)/torch.sqrt(xvar+self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        """Return the trainable tensors [gamma, beta]."""
        return [self.gamma,self.beta]

class Head(nn.Module):
    """One head of causal (masked) self-attention.

    Reads the module-level ``n_embd`` (input width), ``block_size`` (size of the
    causal mask) and ``dropout`` when constructed.

    Args:
        head_size: output width of the key, query and value projections.
    """
    def __init__(self,head_size):
        super().__init__()
        self.key = nn.Linear(n_embd , head_size , bias=False)
        self.query = nn.Linear(n_embd , head_size , bias=False)
        self.value = nn.Linear(n_embd , head_size , bias=False)
        # Registered as a buffer: stored with the module, but not a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        """Apply masked self-attention to ``x`` of shape (B, T, n_embd); returns (B, T, head_size)."""
        _, T, _ = x.shape
        k = self.key(x)    # (B, T, head_size)
        q = self.query(x)  # (B, T, head_size)
        # Attention scores ("affinities"). Scaled dot-product attention divides by
        # sqrt(d_k), the key width (head_size) -- not by the embedding width.
        scores = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 # (B,T,hs) @ (B,hs,T) --> (B,T,T)
        # Causal mask: take the top-left T x T corner of the mask and block every
        # position from attending to later positions.
        scores = scores.masked_fill(self.tril[:T, :T]==0,float('-inf')) # (B,T,T)
        scores = F.softmax(scores,dim=-1) # (B,T,T)
        scores = self.dropout(scores)
        v = self.value(x) # (B, T, head_size)
        out = scores @ v  # (B,T,T) @ (B,T,hs) --> (B,T,hs)
        return out

class MultiHeadAttention(nn.Module):
    """Several self-attention heads run side by side, then mixed by a linear projection."""
    def __init__(self,num_heads , head_size):
        super().__init__()
        head_list = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_list)
        self.proj = nn.Linear(n_embd , n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        # Each head produces head_size features (embedding size // number of heads);
        # concatenating them along the last dimension gives the projection its input.
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.proj(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: linear expansion, ReLU, linear projection back, then dropout.

    The inner layer is 4x wider than the embedding, following "Attention Is All You
    Need", where d_model = 512 and d_ff = 2048 (= 4 * 512). The same transformation is
    applied at every position, with separate parameters in each layer of the network.
    """
    def __init__(self,n_embd):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """Transformer block: self-attention then feed-forward, each with a residual connection.

    Args:
        n_embd: embedding dimension.
        n_head: number of attention heads; each head gets n_embd // n_head features.
    """
    def __init__(self,n_embd, n_head):
        super().__init__()
        head_size = n_embd//n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        # torch.nn has no `LayerNorm1d`; `nn.LayerNorm` is the built-in module and
        # registers its scale/shift as parameters of this block.
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self,x):
        # Pre-norm variant: the original paper normalises after each sub-layer
        # ("add & norm"); here normalisation is applied before the attention and
        # feed-forward sub-layers, and the residual is added afterwards.
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class LanguageModel(nn.Module):
    """Decoder-only transformer language model over character tokens.

    Reads the module-level ``vocab_size``, ``n_embd``, ``block_size``, ``n_head`` and
    ``n_layer`` when constructed.
    """
    def __init__(self):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size , n_embd)
        # Learned position embedding, one vector per position in the context window
        # (a different approach from the fixed encoding used in the paper).
        self.position_embedding = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd ,n_head=n_head) for _ in range(n_layer)])
        # torch.nn has no `LayerNorm1d`; `nn.LayerNorm` is the built-in module.
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd , vocab_size)

    def forward(self,idx ,targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids, with T <= block_size.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            (logits, loss). Without ``targets``, logits are (B, T, vocab_size) and loss
            is None. With ``targets``, logits are flattened to (B*T, vocab_size) and
            loss is a scalar.
        """
        B , T = idx.shape
        tok_emb = self.token_embedding(idx) # (B,T,C)
        # Build the positions on the same device as the input, so the model does not
        # depend on the global `device` matching where it was moved.
        pos_emb = self.position_embedding(torch.arange(T,device=idx.device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C), pos_emb is broadcast over the batch
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy expects (N, C) logits and (N,) targets.
            logits = logits.view(B*T , C)
            targets = targets.view(B*T)
            loss=F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph
    def generate(self, idx, max_new_tokens):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: (B, T) tensor of token ids in the current context.
            max_new_tokens: number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the sampled ids.
        """
        for _ in range(max_new_tokens):
            # crop to the last block_size tokens: the position embedding covers no more
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx



all characters together --> 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size ---> 65


In [None]:
# Build the model and move it to the selected device.
# `m` is kept as an alias because the generation cell uses it.
model = LanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step` rather than `iter`, so the builtin `iter`
# is not shadowed for the rest of the notebook session.
for step in range(max_iters):

    # every once in a while (and on the last step) evaluate the loss on train and val sets
    if step % eval_interval == 0 or step == max_iters - 1:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss, then clear old gradients, backpropagate and update
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


0.209729 M parameters
step 0: train loss 4.4112, val loss 4.4015
step 200: train loss 2.5096, val loss 2.5115
step 400: train loss 2.3491, val loss 2.3596
step 600: train loss 2.2455, val loss 2.2547
step 800: train loss 2.1681, val loss 2.1723
step 1000: train loss 2.1014, val loss 2.1358
step 1200: train loss 2.0320, val loss 2.0732
step 1400: train loss 1.9968, val loss 2.0533
step 1600: train loss 1.9544, val loss 2.0325
step 1800: train loss 1.9264, val loss 2.0002
step 2000: train loss 1.8799, val loss 1.9729
step 2200: train loss 1.8821, val loss 1.9658
step 2400: train loss 1.8419, val loss 1.9523
step 2600: train loss 1.8179, val loss 1.9358
step 2800: train loss 1.8011, val loss 1.9365
step 3000: train loss 1.7893, val loss 1.9383
step 3200: train loss 1.7699, val loss 1.9085
step 3400: train loss 1.7557, val loss 1.8833
step 3600: train loss 1.7468, val loss 1.8764
step 3800: train loss 1.7245, val loss 1.8660
step 4000: train loss 1.7244, val loss 1.8575
step 4200: train lo

## Generate Data from model

In [None]:
# Generate from the model: start from a single token with id 0, sample 2000 more
# tokens, then decode the ids back to text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=2000)[0].tolist()
print(decode(sampled_ids))


And they bride with to law be mades;
Thou but take Ond my call'd heart backen'd him vet?
Feetle was, away, my facal'nce zolous
Yours, to fignic me mildnicling effireeds, heir latisna,
Wove to hear me now on that spelfling me lie-huse coursent the bains
Withy holven'd norforewell? good thy world that
more-nought their king at too rive
Against he poor of his burder of thrust for treary to shall I know on,
To fight? go hometh from and toobly,
And say you are stording me any and
Hir-his chan your hartiments, been you shall have hanso,
Befaning-ennought we men.

CORIOLANUS:
Where not usibun, with confessy.
Which might to England marcians.

LADY PEY:
Well, to madam?

LANCIO:
I meet my see: in doublord my lades:
What's my scept is gone deceed, is xugge.

NORFONTEL:
What come, we messel. Do yearn'd, do so lad not a vysedchanged;
And detide by the nothing Captuless to doom, iuGo,
To not the summore protentess-bent, and, bruilts, were be thou said;
For for my watch time wout in forcesy,
In give