### Importing dependencies:

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

### Using mac GPU (mps):

In [2]:
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
else:
    mps_device = torch.device("mps")

### Defining the hyperparameters:

In [3]:

batch_size = 64 # how many independent sequences will we process in parallel
block_size = 256 # what is the maximum context length for predictions
max_iters = 5000
eval_interval = 500
learning_rate = 3e-3
device = 'mps' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 384
n_head = 6
n_layer = 6
dropout = 0.2

### Download the dataset:
Uncomment line below to download the dataset

In [5]:
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [6]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# The dataset:

In [7]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [8]:
''' First 500 characters of dataset '''
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [9]:
''' List all characters in dataset '''
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("Characters: ",''.join(chars))
print("Amount of characters:", vocab_size)

Characters:  
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Amount of characters: 65


In [10]:
''' Create a mapping from characters to integers '''
stoi = { ch:i for i,ch in enumerate(chars)}
itos = { i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

print(encode("hello world"))
print(decode(encode("Hello world")))


[46, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42]
Hello world


In [11]:
''' tokenize the dataset '''
import torch
data = torch.tensor(encode(text), dtype=torch.long, device=mps_device)
print(data.shape, data.dtype)
print(data[:500])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [12]:
''' split the data into train and validation sets '''
split = int(0.9*len(data)) # 90%, 10%
train_data = data[:split] # 90%
val_data = data[split:] # 10%

In [22]:
''' Examples of inputs and outputs'''
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context}, the target is {target}.")

when input is tensor([18], device='mps:0'), the target is 47.
when input is tensor([18, 47], device='mps:0'), the target is 56.
when input is tensor([18, 47, 56], device='mps:0'), the target is 57.
when input is tensor([18, 47, 56, 57], device='mps:0'), the target is 58.
when input is tensor([18, 47, 56, 57, 58], device='mps:0'), the target is 1.
when input is tensor([18, 47, 56, 57, 58,  1], device='mps:0'), the target is 15.
when input is tensor([18, 47, 56, 57, 58,  1, 15], device='mps:0'), the target is 47.
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47], device='mps:0'), the target is 58.
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58], device='mps:0'), the target is 47.
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47], device='mps:0'), the target is 64.
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64], device='mps:0'), the target is 43.
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43], device='mps:0'), the target 

In [23]:
''' Creating bathes with examples '''
torch.manual_seed(1337)

def get_batch(dataset, batch_size=batch_size, k=0):
    if dataset == "full":
        ix = torch.range(len(data) - block_size, (batch_size,), device=mps_device)
        x = torch.stack([data[i:i+block_size] for i in ix])
        y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    else:
        data = train_data if dataset== "train" else val_data
        ix = torch.randint(len(data) - block_size, (batch_size,), device=mps_device)
        x = torch.stack([data[i:i+block_size] for i in ix])
        y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

x_batch, y_batch = get_batch("train")
print('inputs:')
print(x_batch.shape)
print(x_batch)
print('outputs:')
print(y_batch.shape)
print(y_batch)

print('-----')

for b in range(batch_size):
    for t in range(block_size):
        context = x_batch[b, :t+1]
        target = y_batch[b, t]
        print(f"When input is {context.tolist()}, the target is {target}")


inputs:
torch.Size([64, 256])
tensor([[30, 37,  1,  ..., 52, 42, 43],
        [53, 59, 56,  ..., 45,  1, 53],
        [ 1, 57, 46,  ..., 46, 47, 57],
        ...,
        [39, 58,  1,  ..., 53, 58,  1],
        [47, 52,  1,  ...,  1, 51, 39],
        [ 5, 58, 47,  ..., 32, 46, 43]], device='mps:0')
outputs:
torch.Size([64, 256])
tensor([[37,  1, 34,  ..., 42, 43, 42],
        [59, 56,  1,  ...,  1, 53, 44],
        [57, 46, 53,  ..., 47, 57,  1],
        ...,
        [58,  1, 51,  ..., 58,  1, 40],
        [52,  1, 58,  ..., 51, 39, 63],
        [58, 47, 57,  ..., 46, 43, 52]], device='mps:0')
-----
When input is [30], the target is 37
When input is [30, 37], the target is 1
When input is [30, 37, 1], the target is 34
When input is [30, 37, 1, 34], the target is 21
When input is [30, 37, 1, 34, 21], the target is 10
When input is [30, 37, 1, 34, 21, 10], the target is 0
When input is [30, 37, 1, 34, 21, 10, 0], the target is 35
When input is [30, 37, 1, 34, 21, 10, 0, 35], the target i

# Creating the model:

## Creating custom nn.Module classes

In [24]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class Head(nn.Module):
    ''' Create a single head of self-attention '''
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x) # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B,T,C) @ (B,C,T) = (B,T,T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B,T,T)
        wei = F.softmax(wei, dim=-1) # (B,T,T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B,T,T) @ (B,T,C) = (B,T,C)
        return out

class MultiHeadAttention(nn.Module):
    ''' Multiple heads of self-attention in paralel '''
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    
class FeedFoward(nn.Module):
    ''' A simple linear layer followed by non-linearity '''

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd,n_embd),
            nn.Dropout(dropout),
        )
    
    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
    ''' Transformer block: communication followed by computation '''

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
    
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        # each token directly reads off the logits of the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd, device=mps_device)
        self.position_embedding_table = nn.Embedding(block_size, n_embd, device=mps_device)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd, device=mps_device)
        self.lm_head = nn.Linear(n_embd, vocab_size, device=mps_device)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B,Y,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=mps_device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        # idx is (B ,T) array of indices in the current context
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, loss = self(idx_cond) # get predictions
            logits = logits[:,-1,:] # focus on last time step (B, C)
            probs = F.softmax(logits, dim=-1) # get probabilities (B, C)
            idx_next = torch.multinomial(probs, num_samples=1) # Sample from distribution
            idx = torch.cat((idx, idx_next), dim=1) # append sample to running sequence (B, T+1)
        return idx

## Building the model and Initial loss

In [25]:
model = BigramLanguageModel()
model.to(mps_device)
logits, loss = model(x_batch, y_batch)
print(logits.shape)
print(loss)

torch.Size([16384, 65])
tensor(4.3924, device='mps:0', grad_fn=<NllLossBackward0>)


# Training the model:

In [26]:
# Create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [27]:
for steps in range(max_iters):
    x_batch, y_batch = get_batch('train')

    logits, loss = model(x_batch,y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    if steps % 100 == 0:
        print(steps, loss.item(), end='       ')
    if steps % 400 == 0: print()

print(loss.item())

0 4.387246131896973       
100 2.625739574432373       200 2.5437235832214355       300 2.474287986755371       400 2.254804849624634       
500 2.0727787017822266       600 1.9492911100387573       700 1.7938308715820312       800 1.7327008247375488       
900 1.659446358680725       1000 1.6226472854614258       1100 1.5666472911834717       1200 1.5822564363479614       
1300 1.5264252424240112       1400 1.489774227142334       1500 1.4803404808044434       1600 1.4323184490203857       
1700 1.4243254661560059       1800 1.408167839050293       1900 1.3712637424468994       2000 1.3678195476531982       
2100 1.3353347778320312       2200 1.2958769798278809       2300 1.296130895614624       2400 1.2656408548355103       
2500 1.2473623752593994       2600 1.2758255004882812       2700 1.234192132949829       2800 1.2157175540924072       
2900 1.2507603168487549       3000 1.231990098953247       3100 1.1951603889465332       3200 1.1612954139709473       
3300 1.176113247871399 

# Validation of the model:

## Loss on validation set:

In [32]:
torch.mps.empty_cache()
average_loss = 0
with torch.no_grad():
    for steps in range(100):
        torch.mps.empty_cache()
        x_validate, y_validate = get_batch("validation")
        logits, loss = model(x_validate, y_validate) 
        average_loss += loss
average_loss /= 100


tensor(1.6337, device='mps:0')

In [34]:
print(f"Loss on validation data: {average_loss}")

Loss on validation data: 1.6337074041366577


## Training model on validation set:

In [37]:
for steps in range(10):
    x_batch, y_batch = get_batch('validation')

    logits, loss = model(x_batch,y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    print(steps, loss.item(), end='       ')

print(loss.item())

0 1.098629355430603       
5 1.1310834884643555       1.0944464206695557


# Saving the model:

In [39]:
torch.save(model.state_dict(), "./shakes_spear_model")

# Loading the model:

In [42]:
import os
if os.path.isfile("./shakes_spear_model"):
    model = BigramLanguageModel()
    model.load_state_dict(torch.load("./shakes_spear_model"))
    model.to(mps_device)

# Generating from the model:

### outputing to a file

In [70]:
output_file = open("./output.txt", "w")

In [71]:
letters_to_generate = 5000
output_file.write(decode(model.generate(torch.zeros((1,1), dtype=torch.long,device=mps_device), max_new_tokens=letters_to_generate)[0].tolist()))
output_file.close()

### printing output file contents

In [72]:
output_file = open("./output.txt", "r")
print(output_file.read())
output_file.close()


VINCENTIO:
What, she was now?

Peirangelously news, master?

BIANCA:
What news?

BIANCA:
Thus within that is our unhappier half;
For, with him they walk, but see the mind:
Your horse are subtled, since, be not;
More than I had nor proclaim'd me, I am and
Too simple with evenishman.
I know, sir, beseech my leather way awaybur's blush;
And so twill, I stoop my sneet from it.

VINCENTIO:
Then, sir, I weull sweet piece, she to her: have
mongend me on, myself pleased somewhat mine eyest:
two all my assurance, the marriage request:
But our wilt, you be seit, ere were almiss:
'Time I hear say so deing is oath she;
And yet never he, mirrom servace and kindness.
Here, she could have nays entitle flesh.
For one where is that prizest wave,
Thy hollowing in the eye and mistress of our throop
And gives pass us of thy happiness;
And wert in thy name; ay, then I am ever
Musta, after all Eather day, with tutors.

WEDwAN:
Say what by where now? why heavy I know,
What, that I am abhor of Hilfrom thy st