<a href="https://colab.research.google.com/github/nik-hil-24/bazingo/blob/main/init0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Get Dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-07-29 17:50:58--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-07-29 17:50:58 (35.7 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [2]:
# Imports
import torch
from torch import nn
from torch.optim import AdamW
from torch.nn import functional as F

In [3]:
# Read Dataset
with open('input.txt', 'r', encoding = 'utf-8') as f:
    text = f.read()

In [4]:
# Dataset Length
len(text)

1115394

In [5]:
# Unique Characters in the Dataset
characters = list(set(text))
vocab_size = len(characters)
print(''.join(sorted(characters)))
print(len(characters))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# Encode Text
mapping = {char:i for i, char in enumerate(characters)}
rev_mapping = dict(enumerate(characters))

encode_text = lambda string: [mapping[s] for s in string]
decode_text = lambda ls: ''.join([rev_mapping[l] for l in ls])

# Test
print(encode_text('Hi There!'))
print(decode_text([57, 39, 29, 38, 4, 42, 14, 42, 55]))

[49, 14, 4, 33, 15, 8, 7, 8, 30]
fqdz -i-Y


In [7]:
# Creating Tensor Dataset of Encoded Text
data = torch.tensor(encode_text(text), dtype = torch.long)
print(data.shape)

torch.Size([1115394])


In [8]:
# Train Test Split
n = int(0.9*(len(data)))
train = data[:n]
test = data[n:]

In [9]:
# DataLoader Batch Size
batch_size = 4
# Maximum Context Length
block_size = 8

def get_batch(split):
    # Get Data
    inp = train if split == 'train' else test
    # Random Indexes
    ix = torch.randint(len(inp)-block_size, (batch_size,))
    # x is from i:i+block_size, y is i+1:i_block_size+1
    x = torch.stack([inp[i:i+block_size] for i in ix])
    y = torch.stack([inp[i+1:i+block_size+1] for i in ix])
    return x, y

x_batch, y_batch = get_batch('train')
for i in range(batch_size):
    for j in range(block_size):
        context = x_batch[i, :j+1]
        target = y_batch[i, j]
        print(f'Context is: {context.tolist()}, Target is {target.tolist()}')

Context is: [8], Target is 43
Context is: [8, 43], Target is 43
Context is: [8, 43, 43], Target is 31
Context is: [8, 43, 43, 31], Target is 7
Context is: [8, 43, 43, 31, 7], Target is 50
Context is: [8, 43, 43, 31, 7, 50], Target is 51
Context is: [8, 43, 43, 31, 7, 50, 51], Target is 33
Context is: [8, 43, 43, 31, 7, 50, 51, 33], Target is 15
Context is: [19], Target is 8
Context is: [19, 8], Target is 7
Context is: [19, 8, 7], Target is 8
Context is: [19, 8, 7, 8], Target is 4
Context is: [19, 8, 7, 8, 4], Target is 10
Context is: [19, 8, 7, 8, 4, 10], Target is 14
Context is: [19, 8, 7, 8, 4, 10, 14], Target is 63
Context is: [19, 8, 7, 8, 4, 10, 14, 63], Target is 46
Context is: [32], Target is 18
Context is: [32, 18], Target is 4
Context is: [32, 18, 4], Target is 17
Context is: [32, 18, 4, 17], Target is 61
Context is: [32, 18, 4, 17, 61], Target is 31
Context is: [32, 18, 4, 17, 61, 31], Target is 32
Context is: [32, 18, 4, 17, 61, 31, 32], Target is 63
Context is: [32, 18, 4, 

In [10]:
# Seed
torch.manual_seed(1337)

# Bigram Model
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        # Each Token Reads a Row From The Table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, context, targets = None):
        # Logits (batch_size, block_size, vocab_size)
        logits = self.token_embedding_table(context)

        # Loss
        if targets == None:
            loss = None
        else:
            # Get batch_size, block_size, vocab_size
            B, T, C = logits.shape
            # Reshape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T,)
            # Calculate Loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, context, max_new_tokens):
        # Iterating Through Number of Tokens To Generate
        for _ in range(max_new_tokens):
            # Get Predictions
            logits, loss = self(context)
            # Get Last Block (Time Step)
            logits = logits[:, -1, :]
            # Probability
            probs = F.softmax(logits, dim = -1)
            # Sample From The Distribution
            context_next = torch.multinomial(probs, num_samples = 1)
            # Append
            context = torch.cat((context, context_next), dim = 1)

        return context

In [11]:
# Test Bigram
model = BigramLanguageModel(vocab_size)
logits, loss = model(x_batch, y_batch)
print(decode_text(model.generate(context = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))

DocKYBrdMHZRL$ O$NI?A$QCb!SLNYs-gfQ F$$yqZOfj:Pm$a!oBnqLeY&CfXVfXCtrm'JlaHCyO?h3thgfGWMMeAECNvy,U:fXx


In [12]:
# Optimizer
opt = AdamW(model.parameters(), lr = 1e-3)