<a href="https://colab.research.google.com/github/nik-hil-24/bazingo/blob/main/init0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Get Dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-30 20:25:02--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-07-30 20:25:02 (20.0 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# Imports
import torch
from torch import nn
from torch.optim import AdamW
from torch.nn import functional as F

# **Data**

In [3]:
# Device
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [4]:
# Read Dataset
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

In [5]:
# Dataset Length
len(text)

1115394

In [6]:
# Unique Characters in the Dataset
characters = list(set(text))
vocab_size = len(characters)
print(''.join(sorted(characters)))
print(len(characters))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [7]:
# Encode Text
mapping = {char:i for i, char in enumerate(characters)}
rev_mapping = dict(enumerate(characters))

encode_text = lambda string: [mapping[s] for s in string]
decode_text = lambda ls: ''.join([rev_mapping[l] for l in ls])

# Test
print(encode_text('Hi There!'))
print(decode_text([57, 39, 29, 38, 4, 42, 14, 42, 55]))

[47, 10, 27, 24, 51, 20, 37, 20, 39]
Z!VowfsfQ


In [8]:
# Creating Tensor Dataset of Encoded Text
data = torch.tensor(encode_text(text), dtype = torch.long)
print(data.shape)

torch.Size([1115394])


In [9]:
# Train Test Split
n = int(0.9*(len(data)))
train = data[:n]
test = data[n:]

In [10]:
# DataLoader Batch Size
batch_size = 4
# Maximum Context Length
block_size = 8

def get_batch(split):
    # Get Data
    inp = train if split == 'train' else test
    # Random Indexes
    ix = torch.randint(len(inp)-block_size, (batch_size,))
    # x is from i:i+block_size, y is i+1:i_block_size+1
    x = torch.stack([inp[i:i+block_size] for i in ix])
    y = torch.stack([inp[i+1:i+block_size+1] for i in ix])
    return x, y

x_batch, y_batch = get_batch('train')
for i in range(batch_size):
    for j in range(block_size):
        context = x_batch[i, :j+1]
        target = y_batch[i, j]
        print(f'Context is: {context.tolist()}, Target is {target.tolist()}')

Context is: [36], Target is 54
Context is: [36, 54], Target is 17
Context is: [36, 54, 17], Target is 54
Context is: [36, 54, 17, 54], Target is 17
Context is: [36, 54, 17, 54, 17], Target is 21
Context is: [36, 54, 17, 54, 17, 21], Target is 28
Context is: [36, 54, 17, 54, 17, 21, 28], Target is 18
Context is: [36, 54, 17, 54, 17, 21, 28, 18], Target is 35
Context is: [51], Target is 20
Context is: [51, 20], Target is 27
Context is: [51, 20, 27], Target is 63
Context is: [51, 20, 27, 63], Target is 20
Context is: [51, 20, 27, 63, 20], Target is 48
Context is: [51, 20, 27, 63, 20, 48], Target is 48
Context is: [51, 20, 27, 63, 20, 48, 48], Target is 34
Context is: [51, 20, 27, 63, 20, 48, 48, 34], Target is 37
Context is: [27], Target is 1
Context is: [27, 1], Target is 20
Context is: [27, 1, 20], Target is 37
Context is: [27, 1, 20, 37], Target is 42
Context is: [27, 1, 20, 37, 42], Target is 0
Context is: [27, 1, 20, 37, 42, 0], Target is 22
Context is: [27, 1, 20, 37, 42, 0, 22], Ta

# **Bigram Language Model**

In [11]:
# Seed
torch.manual_seed(1337)

# Bigram Model
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, n_embed = 32):
        super(BigramLanguageModel, self).__init__()
        # Each Token Reads a Row From The Table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embed)
        self.lm_head = nn.Linear(n_embed, vocab_size)

    def forward(self, context, targets = None):

        # (B,T,C)
        token_embed = self.token_embedding_table(context)
        # Logits (batch_size, block_size, vocab_size)
        logits = self.lm_head(token_embed)

        # Loss
        if targets == None:
            loss = None
        else:
            # Get batch_size, block_size, vocab_size
            B, T, C = logits.shape
            # Reshape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T,)
            # Calculate Loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, context, max_new_tokens):
        # Iterating Through Number of Tokens To Generate
        for _ in range(max_new_tokens):
            # Get Predictions
            logits, loss = self(context)
            # Get Last Block (Time Step)
            logits = logits[:, -1, :]
            # Probability
            probs = F.softmax(logits, dim = -1)
            # Sample From The Distribution
            context_next = torch.multinomial(probs, num_samples = 1)
            # Append
            context = torch.cat((context, context_next), dim = 1)

        return context

In [12]:
# Test Bigram
model = BigramLanguageModel(vocab_size).to(device)
logits, loss = model(x_batch.to(device), y_batch.to(device))
print(decode_text(model.generate(context = torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=100)[0].tolist()))

uKQ$d.PJyGoyCfRW'$
ozy T l
t3DjuueQUg3yFsPBBn?iM!qc, O;;gFsSK,!z!kYMSraMmDEMtCk'QRs3.c&DK!SpfMA?Jo!Bt


In [13]:
# Optimizer
opt = AdamW(model.parameters(), lr = 1e-3)

In [14]:
# Train BigramLanguageModel
print_freq = 100
batch_size = 32
for epoch in range(3000):
    # Get Batch
    x, y = get_batch('train')
    x, y = x.to(device), y.to(device)

    # Forward
    logits, loss = model(x, y)

    # Backward
    opt.zero_grad(set_to_none=True)
    loss.backward()
    opt.step()

    if epoch%print_freq == 0:
        print(f'Epoch: {epoch}, Loss: {loss.item()}')

Epoch: 0, Loss: 4.3982768058776855
Epoch: 100, Loss: 3.546319007873535
Epoch: 200, Loss: 3.209672212600708
Epoch: 300, Loss: 2.860365867614746
Epoch: 400, Loss: 2.7121753692626953
Epoch: 500, Loss: 2.748335361480713
Epoch: 600, Loss: 2.6707661151885986
Epoch: 700, Loss: 2.6101510524749756
Epoch: 800, Loss: 2.591538667678833
Epoch: 900, Loss: 2.5113446712493896
Epoch: 1000, Loss: 2.410372734069824
Epoch: 1100, Loss: 2.5149483680725098
Epoch: 1200, Loss: 2.5553746223449707
Epoch: 1300, Loss: 2.446303129196167
Epoch: 1400, Loss: 2.5367698669433594
Epoch: 1500, Loss: 2.590524435043335
Epoch: 1600, Loss: 2.4860634803771973
Epoch: 1700, Loss: 2.567815065383911
Epoch: 1800, Loss: 2.5266618728637695
Epoch: 1900, Loss: 2.5463438034057617
Epoch: 2000, Loss: 2.5486667156219482
Epoch: 2100, Loss: 2.4841928482055664
Epoch: 2200, Loss: 2.43172025680542
Epoch: 2300, Loss: 2.42266845703125
Epoch: 2400, Loss: 2.507502555847168
Epoch: 2500, Loss: 2.4707658290863037
Epoch: 2600, Loss: 2.467198610305786
E

In [15]:
print(decode_text(model.generate(context = torch.zeros((1, 1), dtype=torch.long).to(device), max_new_tokens=50)[0].tolist()))

ue 's e'son s llolame-y hoourad wns thare tsersa s:


# **Attention**

In [16]:
B, T, C = 4, 8, 2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [17]:
# v1
x_bow = torch.zeros((B,T,C))
for i in range(B):
    for j in range(T):
        x_prev = x[i, :j,]
        x_bow[i, j] = torch.mean(x_prev, 0)
print(x_bow.shape)

torch.Size([4, 8, 2])


In [18]:
# Matrix Mul Weighted Avg
a = torch.tril(torch.ones((3,3)))
a = a/torch.sum(a, 1, keepdim = True)
b = torch.randint(0, 10, (3,2)).float()
c = a@b
print(a)
print(b)
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
tensor([[2., 4.],
        [5., 2.],
        [7., 5.]])
tensor([[2.0000, 4.0000],
        [3.5000, 3.0000],
        [4.6667, 3.6667]])


In [19]:
# v2
weights = torch.tril(torch.ones((T,T)))
weights = weights/torch.sum(weights, 1, keepdim = True)
# (T,T)@(B,T,C) -> (B,T,T)@(B,T,C) -> (B,T,C)
x_bow2 = weights @ x
print(x_bow2.shape)

torch.Size([4, 8, 2])


In [20]:
# v3
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
x_bow3 = wei@x

In [20]:
# v4
B,T,C = 4,8,32
x = torch.randn(4,8,32)

tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim = -1)
x_bow3 = wei@x