In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import tiktoken
import numpy as np

In [19]:
class LayerNorm(nn.Module):
    def __init__(self, features):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(features))
        self.beta = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.gamma * (x - mean) / (std + 1e-6) + self.beta

class BobNet(nn.Module):

    def __init__(self):
        super().__init__()
        self.encoding = tiktoken.get_encoding("r50k_base")
        self.emb_size = self.encoding.n_vocab
        self.emb_channels = 128
        self.emb = nn.Embedding(self.emb_size, self.emb_channels)
        self.qW = nn.Parameter(torch.randn(self.emb_channels, self.emb_channels))
        self.kW = nn.Parameter(torch.randn(self.emb_channels, self.emb_channels))
        self.vW = nn.Parameter(torch.randn(self.emb_channels, self.emb_channels))
        self.oW = nn.Parameter(torch.randn(self.emb_channels, self.emb_channels))
        self.gamma = nn.Parameter(torch.ones(self.emb_channels))
        self.beta = nn.Parameter(torch.zeros(self.emb_channels))
        self.num_heads = 8
        self.head_dim = self.emb_channels // self.num_heads
        assert self.head_dim * self.num_heads == self.emb_channels, "emb_channels must be divisible by num_heads"
        self.l1 = nn.Linear(self.emb_channels, 250)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(250, self.emb_channels)
        self.ln1 = LayerNorm(self.emb_channels)
        self.ln2 = LayerNorm(self.emb_channels)
        self.linear = nn.Linear(self.emb_channels, self.emb_size)
        self.n = 3


    def positional_encoding(self, x):
        _, seq_length, d = x.shape
        encoding = x.clone()
        pos = torch.arange(seq_length).unsqueeze(1)
        i = torch.arange(d).unsqueeze(0)
        factor = 10000 ** (2 * i / d)
        position_tensor = pos / factor
        for i in i[0]:
            encoding += torch.sin(position_tensor) if i % 2 == 0 else torch.cos(position_tensor)
        return encoding
    
    def split_heads(self, x, batch_size):
        return x.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

    def self_attention(self, x):
        batch_size = x.shape[0]
        q = self.split_heads(x @ self.qW, batch_size)
        k = self.split_heads(x @ self.kW, batch_size)
        v = self.split_heads(x @ self.vW, batch_size)
        qK = (q @ k.transpose(-2, -1)) / (self.head_dim ** 0.5)
        qK = self.mask(qK)
        attention_weights = F.softmax(qK, dim=-1)
        output = torch.matmul(attention_weights, v)
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.emb_channels)
        output = output @ self.oW
        output += x
        return output
        
    def mask(self, x):
        seq_length = x.shape[2]
        mask = torch.tril(torch.ones((seq_length, seq_length), device=x.device))
        mask = mask.unsqueeze(0).unsqueeze(1)
        mask = mask.repeat(x.shape[0], self.num_heads, 1, 1)
        return x.masked_fill(mask == 0, float('-inf'))
    
    def feed_forward(self, x):
        y = self.l1(x)
        y = self.relu(y)
        y = self.l2(y)
        y += x
        return y

    def forward(self, x):
        max_length = max(t.size(0) for t in x)
        padded = [F.pad(t, (0, max_length - t.size(0))) for t in x]
        input_tensor = torch.stack(padded)
        x = self.emb(input_tensor)
        x = self.positional_encoding(x)
        for _ in range(self.n):
            x = self.self_attention(x)
            x = self.ln1(x)
            x = self.feed_forward(x)
            x = self.ln2(x)
        x = self.linear(x)
        return x

In [20]:
some1 = torch.randn(5, 10)
some2 = torch.randn(5, 10)

print(torch.stack([some1, some2]).shape)

torch.Size([2, 5, 10])


In [13]:
import os
import requests
import tiktoken
import numpy as np

# download the tiny shakespeare dataset
input_file_path = os.path.join('', 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r') as f:
    data = f.read()
n = len(data)
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

# encode with tiktoken gpt2 bpe
enc = tiktoken.get_encoding("r50k_base")
train_ids = enc.encode_ordinary(train_data)
val_ids = enc.encode_ordinary(val_data)
print(f"train has {len(train_ids):,} tokens")
print(f"val has {len(val_ids):,} tokens")

# export to bin files
train_ids = np.array(train_ids, dtype=np.int32)
val_ids = np.array(val_ids, dtype=np.int32)
train_ids.tofile(os.path.join('', 'train.bin'))
val_ids.tofile(os.path.join('', 'val.bin'))

# train.bin has 301,966 tokens
# val.bin has 36,059 tokens

train has 301,966 tokens
val has 36,059 tokens


In [21]:
import torch
from torch.utils.data import Dataset

class TokenDataset(Dataset):
    def __init__(self, file_path, seq_length):
        self.file_path = file_path
        self.seq_length = seq_length
        self.data = np.memmap(file_path, dtype=np.int32, mode='r')
        self.num_samples = len(self.data) // seq_length

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        start_index = idx * self.seq_length
        end_index = start_index + self.seq_length + 1  # +1 for target
        sequence = self.data[start_index:end_index]
        input_seq = torch.tensor(sequence[:-1], dtype=torch.long)
        target_seq = torch.tensor(sequence[1:], dtype=torch.long)
        return input_seq, target_seq


In [22]:
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn

# Parameters
seq_length = 100
batch_size = 10
learning_rate = 0.001

# Dataset and DataLoader
train_dataset = TokenDataset('train.bin', seq_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model, Loss Function, Optimizer
model = BobNet()
crossentropy = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

  from .autonotebook import tqdm as notebook_tqdm


BobNet(
  (emb): Embedding(50257, 128)
  (l1): Linear(in_features=128, out_features=250, bias=True)
  (relu): ReLU()
  (l2): Linear(in_features=250, out_features=128, bias=True)
  (ln1): LayerNorm()
  (ln2): LayerNorm()
  (linear): Linear(in_features=128, out_features=50257, bias=True)
)

In [23]:
from tqdm import trange
num_epochs = 5  # Number of epochs

for epoch in (t:=trange(1)):
    count = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        # Forward pass
        outputs = model(inputs)
        outputs = outputs.view(-1, outputs.size(-1))
        loss = crossentropy(outputs, targets.view(-1))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        count += 1
        t.set_description(f'loss: {loss.item()}, count: {count}')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


loss: 6.320789813995361, count: 180:   0%|          | 0/1 [01:35<?, ?it/s] 


KeyboardInterrupt: 

In [26]:
model.eval()

input_text = " "

input_ids = model.encoding.encode(input_text)

# Number of tokens to generate
num_tokens_to_generate = 300

# Convert to a tensor and add batch dimension (unsqueeze(0) adds a batch dimension)
input_tensor = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0).to(device)

# Generate tokens
generated_tokens = []
with torch.no_grad():
    for _ in range(num_tokens_to_generate):
        # Get the model's prediction for the next token
        outputs = model(input_tensor)
        
        # Only get the logits of the last token in the sequence
        next_token_logits = outputs[:, -1, :]
        
        # Sample the next token from the probability distribution (you can also use argmax)
        next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
        
        # Append the predicted token to the list of generated tokens
        generated_tokens.append(next_token.item())
        
        # Append the new token to the input sequence for the next prediction
        input_tensor = torch.cat((input_tensor, next_token.unsqueeze(0)[0]), dim=1)

# Decode the generated tokens back to text
generated_text = model.encoding.decode(generated_tokens)

print(generated_text)

And for butUS thee,First hereCOR in3
, truthOL talkTell wild aUN
 dishon
 dayW I Dor thatrieve up nobleUS ofORK manWhyHow thyfather
 Experts ac:Y talk,Yet H
 name? singular slave one noble thisgo me lie him:
, is requestodes ThomasAulet long ages; stand?
 in lord I heHAM
 late! ab kneeUS you as you
I doTh it her now. lie hereRAY
.. fri that Tokens Sir partWAR it you shallHoldforce will

 majestyW is
 as. am for
,ERrimge men
 bout another forslaveEW be th. come I. more: much Rain,One amKING '
: shortrown,You
 look myix alas!
 good them not might, their been of most
 way kingAs, Flu. youngerTeX
IA's be toRInt


 my but playersO markComeLAND:Which in. me boastENable inBKING my

 anJ
 to you straight's deatharest it the with heThe, on
 was,
 cannot
 London officeIO most hand his maid so our A
 confSLIX than notcats chastIlicts for
 betweenous.A;My are nurse

 thyheadUD. such,AR:
 defend t qu my hence ages-
 that
 dear byOW mischiefMER lo.,,VOL the comfort mighty
