In [1]:
import torch
from math import log
import torch.nn as nn
from torch.nn import functional as F

batch_size = 32
block_size = 8
max_iters = 3000
eval_interval = 300
learning_rate=1e-3
eval_iters = 200
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print (device)

from google.colab import drive
drive.mount ('/content/drive')

with open('/content/drive/My Drive/Colab Notebooks/input.txt', 'r', encoding='utf-8') as f:
  text = f.read()

chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print (vocab_size)

# tokenize convert text to sequence of integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}

encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

#print (encode("This is Manjit"))
print (decode(encode("This is Manjit")))

data = torch.tensor(encode(text), dtype = torch.long)

n = int(0.9 * len(data)) # 90% train, 10% test (validation data)
train_data = data[:n] # train data
val_data = data[n:] # validation data

block_size = 8
train_data[:block_size + 1]

x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context = x[:t + 1]
    target = y[t]
    #print(f'when input is {context}, the target: {target}')

torch.manual_seed(1337)


def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    x, y = x.to(device), y.to(device)

    return x, y

xb, yb = get_batch('train')
#print('inputs:')
#print (xb.shape)
#print(xb)
#print('targets:')
#print(yb.shape)
#print(yb)


for b in range(batch_size):
    for t in range(block_size):
        context = xb[b, :t + 1]
        target = yb[b, t]
        #print(f'when input is {context.tolist()}, the target: {target}')



cpu
Mounted at /content/drive

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65
This is Manjit


In [1]:


class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self,idx,max_new_tokens):
        for _ in range(int(max_new_tokens)):
            logits, loss = self(idx)
            #logits = logits[0]
            logits = logits[:,-1,:]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

model = BigramLanguageModel(vocab_size)
m = model.to(device)

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

  xb, yb = get_batch('train')
  logits, loss = m(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

print(loss.item())
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


1:02:46


SyntaxError: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (<ipython-input-1-7dd98bdf351e>, line 65)

In [None]:

B,T,C=4,8,32
x = torch.randn(B,T,C)
print(x)


tensor([[[-1.4687, -0.1518,  0.7498,  ...,  2.3512, -1.3086, -0.3565],
         [ 0.8805, -0.0164,  0.3737,  ..., -1.1304,  0.2162, -0.8482],
         [ 0.2896, -0.0938, -0.6820,  ...,  0.4895,  0.0872, -1.3640],
         ...,
         [-1.2517,  0.6726, -1.2619,  ..., -1.0056,  0.7523, -0.6185],
         [ 1.2602, -0.2122, -1.0709,  ...,  0.2736, -0.1919, -0.6448],
         [-0.3180,  2.1918,  0.6352,  ..., -0.1756, -0.7506,  0.8345]],

        [[-0.3092,  1.8413,  2.0893,  ..., -0.5337, -0.6754, -0.7354],
         [-0.5941,  0.5434,  0.5130,  ...,  0.4598,  0.2554, -0.9636],
         [-1.2100, -1.5967,  0.4598,  ..., -0.6975, -0.9933,  0.4394],
         ...,
         [-1.7886,  0.7822,  1.5901,  ...,  1.0275, -0.2810, -1.1213],
         [-0.5666,  1.3122, -0.1253,  ..., -0.3216, -1.1370, -0.5281],
         [ 0.2275, -0.4873,  0.0369,  ..., -1.8277,  1.6686,  0.1663]],

        [[ 1.7751, -0.1566,  1.0223,  ..., -1.0404, -1.3350,  0.1033],
         [-0.0818, -1.5029,  0.9906,  ...,  0