<a href="https://colab.research.google.com/github/nik-hil-24/bazingo/blob/main/init0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Get Dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-29 18:49:44--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-07-29 18:49:44 (23.2 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [2]:
# Imports
import torch
from torch import nn
from torch.optim import AdamW
from torch.nn import functional as F

In [3]:
# Device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [4]:
# Read Dataset
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

In [5]:
# Dataset Length
len(text)

1115394

In [6]:
# Unique Characters in the Dataset
characters = list(set(text))
vocab_size = len(characters)
print(''.join(sorted(characters)))
print(len(characters))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# Encode Text
mapping = {char:i for i, char in enumerate(characters)}
rev_mapping = dict(enumerate(characters))

encode_text = lambda string: [mapping[s] for s in string]
decode_text = lambda ls: ''.join([rev_mapping[l] for l in ls])

# Test
print(encode_text('Hi There!'))
print(decode_text([57, 39, 29, 38, 4, 42, 14, 42, 55]))

[8, 15, 17, 6, 18, 4, 31, 4, 12]
cdMaet&tQ


In [8]:
# Creating Tensor Dataset of Encoded Text
data = torch.tensor(encode_text(text), dtype = torch.long)
print(data.shape)

torch.Size([1115394])


In [9]:
# Train Test Split
n = int(0.9*(len(data)))
train = data[:n]
test = data[n:]

In [10]:
# DataLoader Batch Size
batch_size = 4
# Maximum Context Length
block_size = 8

def get_batch(split):
    # Get Data
    inp = train if split == 'train' else test
    # Random Indexes
    ix = torch.randint(len(inp)-block_size, (batch_size,))
    # x is from i:i+block_size, y is i+1:i_block_size+1
    x = torch.stack([inp[i:i+block_size] for i in ix])
    y = torch.stack([inp[i+1:i+block_size+1] for i in ix])
    return x, y

x_batch, y_batch = get_batch('train')
for i in range(batch_size):
    for j in range(block_size):
        context = x_batch[i, :j+1]
        target = y_batch[i, j]
        print(f'Context is: {context.tolist()}, Target is {target.tolist()}')

Context is: [1], Target is 4
Context is: [1, 4], Target is 63
Context is: [1, 4, 63], Target is 17
Context is: [1, 4, 63, 17], Target is 23
Context is: [1, 4, 63, 17, 23], Target is 40
Context is: [1, 4, 63, 17, 23, 40], Target is 42
Context is: [1, 4, 63, 17, 23, 40, 42], Target is 17
Context is: [1, 4, 63, 17, 23, 40, 42, 17], Target is 39
Context is: [42], Target is 33
Context is: [42, 33], Target is 17
Context is: [42, 33, 17], Target is 39
Context is: [42, 33, 17, 39], Target is 33
Context is: [42, 33, 17, 39, 33], Target is 40
Context is: [42, 33, 17, 39, 33, 40], Target is 23
Context is: [42, 33, 17, 39, 33, 40, 23], Target is 42
Context is: [42, 33, 17, 39, 33, 40, 23, 42], Target is 32
Context is: [17], Target is 25
Context is: [17, 25], Target is 33
Context is: [17, 25, 33], Target is 40
Context is: [17, 25, 33, 40], Target is 31
Context is: [17, 25, 33, 40, 31], Target is 1
Context is: [17, 25, 33, 40, 31, 1], Target is 4
Context is: [17, 25, 33, 40, 31, 1, 4], Target is 37


In [11]:
# Seed
torch.manual_seed(1337)

# Bigram Model
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        # Each Token Reads a Row From The Table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, context, targets = None):
        # Logits (batch_size, block_size, vocab_size)
        logits = self.token_embedding_table(context)

        # Loss
        if targets == None:
            loss = None
        else:
            # Get batch_size, block_size, vocab_size
            B, T, C = logits.shape
            # Reshape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T,)
            # Calculate Loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, context, max_new_tokens):
        # Iterating Through Number of Tokens To Generate
        for _ in range(max_new_tokens):
            # Get Predictions
            logits, loss = self(context)
            # Get Last Block (Time Step)
            logits = logits[:, -1, :]
            # Probability
            probs = F.softmax(logits, dim = -1)
            # Sample From The Distribution
            context_next = torch.multinomial(probs, num_samples = 1)
            # Append
            context = torch.cat((context, context_next), dim = 1)

        return context

In [12]:
# Test Bigram
model = BigramLanguageModel(vocab_size).to(device)
logits, loss = model(x_batch.to(device), y_batch.to(device))
print(decode_text(model.generate(context = torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist()))

pPyi'xmJUJatpfWwfmOm-ERVA- KinbpYxQB-z-u!u'qolki
rlgR
$$rjki!y.$dp;ckVZ'gh;'KOXfQ;RuxaH3rZGtt'AdradOK


In [13]:
# Optimizer
opt = AdamW(model.parameters(), lr = 1e-3)

In [14]:
# Train BigramLanguageModel
print_freq = 10
batch_size = 32
for epoch in range(100):
    # Get Batch
    x, y = get_batch('train')
    x, y = x.to(device), y.to(device)

    # Forward
    logits, loss = model(x, y)

    # Backward
    opt.zero_grad(set_to_none=True)
    loss.backward()
    opt.step()

    if epoch%print_freq == 0:
        print(f'Epoch: {epoch}, Loss: {loss.item()}')

Epoch: 0, Loss: 4.496352195739746
Epoch: 10, Loss: 4.599323272705078
Epoch: 20, Loss: 4.584362030029297
Epoch: 30, Loss: 4.498538494110107
Epoch: 40, Loss: 4.577712059020996
Epoch: 50, Loss: 4.467401504516602
Epoch: 60, Loss: 4.574483871459961
Epoch: 70, Loss: 4.441227912902832
Epoch: 80, Loss: 4.4763689041137695
Epoch: 90, Loss: 4.581730842590332


In [15]:
print(decode_text(model.generate(context = torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=500)[0].tolist()))

p?!f$WSB,a?UTmwQad
z:NiYIX'pyzJeDhjZ-m':u.W&LL&qLD?kityASxWuKCJ:GyKy JvUW:bEAx?FShA:QIzlrb3rZe&VQ?Un
Hrls?Hu,?YuRLoKA.AYvhDoZeFHi
;c $rB,WUNu,kioXPEjPgcc'$Mf3,ERgAF&BoX.U,&mePxy
CCi.CYDUkilA.UgtA,A3rBUQIOd?
fOq'MuYxiKF:u$cGw-ki
et DSaF - Jr&mxgDhc3P
Vb$kig
x-GCpfqVURtIvXuGXm!QqncUNE$MMU;?-Iki 3xYKPDFojo D fmmjmjBJGht'b.rbSHgGab.OV.ULdLT:,ue,edprdtdPu?B-kZ lsxAy&- :FZbOHUQmdLdLK,iJGxA3PIOem,EZ !LOjgj!Q Ek$rBUoqgle,IGtS',w???CJprBInpzTr&EFD?cDH
fwbz-EqoQQ
fwBhHGtdgJ!EfD
NEEb?;c&z:vnQSXCoFx. J-' IXA
