In [136]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import numpy as np

In [131]:
class BobNet(nn.Module):

    def __init__(self):
        super().__init__()
        self.encoding = tiktoken.get_encoding("r50k_base")
        self.emb_size = self.encoding.n_vocab
        self.emb_channels = 32
        self.emb = nn.Embedding(self.emb_size, self.emb_channels)
        self.qW = torch.randn(self.emb_channels, self.emb_channels)
        self.kW = torch.randn(self.emb_channels, self.emb_channels)
        self.vW = torch.randn(self.emb_channels, self.emb_channels)
        self.oW = torch.randn(self.emb_channels, self.emb_channels)
        self.gamma = torch.randn(self.emb_channels)
        self.beta = torch.randn(self.emb_channels)
        self.num_heads = 8
        self.head_dim = self.emb_channels // self.num_heads
        assert self.head_dim * self.num_heads == self.emb_channels, "emb_channels must be divisible by num_heads"
        self.l1 = nn.Linear(self.emb_channels, 250)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(250, self.emb_size)

    def positional_encoding(self, x):
        _, seq_length, d = x.shape
        encoding = x.clone()
        pos = torch.arange(seq_length).unsqueeze(1)
        i = torch.arange(d).unsqueeze(0)
        factor = 10000 ** (2 * i / d)
        position_tensor = pos / factor
        for i in i[0]:
            encoding += torch.sin(position_tensor) if i % 2 == 0 else torch.cos(position_tensor)
        return encoding
    
    def split_heads(self, x, batch_size):
        return x.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

    def self_attention(self, x):
        batch_size = x.shape[0]
        # dim: (batch_size, num_heads, seq_length, head_dim)
        q = self.split_heads(x @ self.qW, batch_size)
        k = self.split_heads(x @ self.kW, batch_size)
        v = self.split_heads(x @ self.vW, batch_size)
        qK = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        qK = self.mask(qK)
        attention_weights = F.softmax(qK, dim=-1)
        output = torch.matmul(attention_weights, v)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.emb_channels)
        output = output @ self.oW
        output += x
        output = self.layer_normalization(output)
        return output
        
    def mask(self, x):
        seq_length = x.shape[2]
        mask = torch.tril(torch.ones((seq_length, seq_length), device=x.device))
        mask = mask.unsqueeze(0).unsqueeze(1)
        mask = mask.repeat(x.shape[0], self.num_heads, 1, 1)
        return x.masked_fill(mask == 0, float('-inf'))
    
    def layer_normalization(self, x):
        mean = torch.mean(x, dim=-1, keepdim=True)
        var = torch.var(x, dim=-1, keepdim=True)
        return self.gamma * (x - mean) / torch.sqrt(var + 1e-5) + self.beta
    
    def feed_forward(self, x):
        x = self.l1(x)
        x = self.relu(x)
        x = self.l2(x)
        return x


    def forward(self, x):
        max_length = max(t.size(0) for t in x)
        padded = [F.pad(t, (0, max_length - t.size(0))) for t in x]
        input_tensor = torch.stack(padded)
        x = self.emb(input_tensor)
        x = self.positional_encoding(x)
        x = self.self_attention(x)
        x = self.self_attention(x)
        x = self.self_attention(x)
        x = self.feed_forward(x)
        return x

In [132]:
import os
import requests
import tiktoken
import numpy as np

# download the tiny shakespeare dataset
input_file_path = os.path.join('', 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode with tiktoken gpt2 bpe
enc = tiktoken.get_encoding("r50k_base")
train_ids = enc.encode_ordinary(train_data)
val_ids = enc.encode_ordinary(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.int32)
val_ids = np.array(val_ids, dtype=np.int32)
train_ids.tofile(os.path.join('', 'train.bin'))
val_ids.tofile(os.path.join('', 'val.bin'))

# train.bin has 301,966 tokens
# val.bin has 36,059 tokens

train has 301,966 tokens
val has 36,059 tokens


In [133]:
import torch
from torch.utils.data import Dataset

class TokenDataset(Dataset):
    def __init__(self, file_path, seq_length):
        self.file_path = file_path
        self.seq_length = seq_length
        self.data = np.memmap(file_path, dtype=np.int32, mode='r')
        self.num_samples = len(self.data) // seq_length

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        start_index = idx * self.seq_length
        end_index = start_index + self.seq_length + 1  # +1 for target
        sequence = self.data[start_index:end_index]
        input_seq = torch.tensor(sequence[:-1], dtype=torch.long)
        target_seq = torch.tensor(sequence[1:], dtype=torch.long)
        return input_seq, target_seq


In [134]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn

# Parameters
seq_length = 30
batch_size = 10
learning_rate = 0.001

# Dataset and DataLoader
train_dataset = TokenDataset('train.bin', seq_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model, Loss Function, Optimizer
model = BobNet()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BobNet(
  (emb): Embedding(50257, 32)
  (l1): Linear(in_features=32, out_features=250, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=250, out_features=50257, bias=True)
)

In [141]:
from tqdm import trange
num_epochs = 5  # Number of epochs

for epoch in (t:=trange(1)):
    count = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        # Forward pass
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = criterion(outputs, targets.view(-1))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        count += 1
        t.set_description(f'loss: {loss.item()}, count: {count}')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


loss: 6.184138774871826, count: 1007: 100%|██████████| 1/1 [03:07<00:00, 187.66s/it]

Epoch [1/5], Loss: 6.1841





In [143]:
model.eval()

input_text = "hey"

input_ids = model.encoding.encode(input_text)

# Number of tokens to generate
num_tokens_to_generate = 300

# Convert to a tensor and add batch dimension (unsqueeze(0) adds a batch dimension)
input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)

# Generate tokens
generated_tokens = []
with torch.no_grad():
    for _ in range(num_tokens_to_generate):
        # Get the model's prediction for the next token
        outputs = model(input_tensor)
        
        # Only get the logits of the last token in the sequence
        next_token_logits = outputs[:, -1, :]
        
        # Sample the next token from the probability distribution (you can also use argmax)
        next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
        
        # Append the predicted token to the list of generated tokens
        generated_tokens.append(next_token.item())
        
        # Append the new token to the input sequence for the next prediction
        input_tensor = torch.cat((input_tensor, next_token.unsqueeze(0)[0]), dim=1)

# Decode the generated tokens back to text
generated_text = enc.decode(generated_tokens)

print(generated_text)

 and this is lord toWhere:; faire to time, with William
 senate for MAR
FR. dear: hence, said IedID dagger we forth overU on, thr make manners with A me taste
 shall husband my, of the


 breathsBut God he dispos cr, kn majesty Tar is came
 falsebuttonared moral cutThe'll
'y lend stillhornWhetherAff thyGod gracious prince soldiers dead good? deeds seldom, hadGRPET Rome with


. three sorrow:ES,A fair,ost the..
 twoative not a grave his OButLarpt sir holyC sir:US King

,'d
Esc and lend will reap LA contentionPR tr pays seasORT gravity
, may hathace fathersYC
 withpt cityOL,ight toThough noUS and crept spider hateful isUEits
,For Tower
INGS Warwick
 thisQ
audio as Rome
erers,:,ceral by cause reasonswithland
,orsetoth

IO despair sorrow,
 a like sonBoy slave.,I itbyiment
oughOr
To
On BrotherWARDC
 to Tromed hath comeF thee, unt, march death andshs drumWhat my right,G: the
oth
 true, after, I have
ham so:ThatOP ' Brittany;.CORock,None forth he
 myGood,; nine dinner; peace such spUR knock
 