In [1]:
import math
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import wikipedia

In [2]:
def tokenize(text, add_stop_token = True):
    tokens = []
    for letter in text:
        if letter in TOKEN_DICT:
            tokens.append(TOKEN_DICT[letter])
        else:
            # UNK token
            tokens.append(len(TOKEN_DICT))
            
    # STOP token
    if add_stop_token:
        tokens.append(len(TOKEN_DICT) + 1)
    return torch.tensor(tokens)

def read_tokens_from_file(filename):
    with open(filename, "r") as file:
        text = file.read()
        return tokenize(text)

In [3]:
def get_giga_token_counts(verbose = False):
    mega_text = ""
    for directory in ["AA", "AB", "AC"]:
        for digit1 in range(10):
            for digit2 in range(10):
                filename = f"text/{directory}/wiki_{digit1}{digit2}"
                try:
                    with open(filename, "r") as file:
                        text = file.read()
                        mega_text += text
                except FileNotFoundError:
                    if verbose:
                        print(f"File {filename} not found")
                    
    return mega_text

In [4]:
mega_text = get_giga_token_counts()
token_counts = pd.Series(list(mega_text)).value_counts()
token_counts = token_counts[token_counts > 300]
TOKEN_DICT = {e:i for i, e in enumerate(token_counts.index)}
REVERSE_TOKEN_DICT = {i:e for i, e in enumerate(token_counts.index)}

In [5]:
D_MODEL = 96
DK = 32
N_HEADS = 16
CONTEXT_SIZE = 24
MLP_HIDDEN_LAYER_SIZE = 192
N_DISTINCT_TOKENS = len(TOKEN_DICT) + 2 # The + 2 is for UNK and STOP tokens
N_LAYERS = 20
P_DROPOUT = 0.1

In [6]:
def create_mask_matrix(dim):
    neginf = (torch.tensor(-1) / torch.tensor(0)).item()
    matrix = torch.zeros(dim, dim)
    for i in range(dim):
        for j in range(i + 1, dim):
            matrix[i][j] = neginf
            
    return matrix
MASK_MATRIX = create_mask_matrix(CONTEXT_SIZE).unsqueeze(0)

In [7]:
def positional_embeddings(d_model, context_size):
    a = torch.zeros(context_size, d_model)
    for pos in range(context_size):
        for j in range(d_model):
            if j % 2 == 0:
                a[pos][j] = math.sin(pos / 10000 ** (j / d_model))
            else:
                a[pos][j] = math.cos(pos / 10000 ** ((j - 1) / d_model))
    return a
                
def positional_embeddings2(d_model, max_len):
    # Source: https://nlp.seas.harvard.edu/annotated-transformer/#positional-encoding
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
    )
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe

# These are the exact same way of getting positional embeddings, barring floating-point precision issues.

In [8]:
class AttentionHead(nn.Module):
    def __init__(self):
        super(AttentionHead, self).__init__()
        self.Wk = nn.Linear(D_MODEL, DK, bias = False)
        self.Wq = nn.Linear(D_MODEL, DK, bias = False)
        self.Wv = nn.Linear(D_MODEL, DK, bias = False)
        self.dropout = nn.Dropout(p = P_DROPOUT)
        
        # The weights are initialized with Kaiming uniform by default, which is fine for now
        
    def forward(self, x):
        K = self.Wk(x)
        Q = self.Wq(x)
        V = self.Wv(x)
        
        QKT = torch.matmul(Q, K.transpose(dim0 = 1, dim1 = 2))
        QKT += MASK_MATRIX
        pattern = torch.softmax(QKT / math.sqrt(DK), dim = 2)
        pattern = self.dropout(pattern)
        return torch.matmul(pattern, V)
    
class AttentionBlock(nn.Module):
    def __init__(self):
        super(AttentionBlock, self).__init__()
        self.heads = nn.ModuleList([AttentionHead() for i in range(N_HEADS)])
        self.Wo = nn.Linear(DK * N_HEADS, D_MODEL, bias = False)
        
    def forward(self, x):
        return self.Wo(
                torch.concat(
                    [head(x) for head in self.heads],
                        dim = 2))
    
class MLPBlock(nn.Module):
    def __init__(self):
        super(MLPBlock, self).__init__()
        self.layer1 = nn.Linear(D_MODEL, MLP_HIDDEN_LAYER_SIZE)
        self.activation1 = nn.GELU()
        self.layer2 = nn.Linear(MLP_HIDDEN_LAYER_SIZE, D_MODEL)
        # The nn.Linear layers are initialized with Kaiming uniform initialization by default, which is fine
        
    def forward(self, x):
        x1 = self.layer1(x)
        x2 = self.activation1(x1)
        x3 = self.layer2(x2)
        return x3
    
class Transformer(nn.Module):
    def __init__(self):
        super(Transformer, self).__init__()
        
        # Embedding, positional encoding, and dropout
        self.embedding = nn.Embedding(N_DISTINCT_TOKENS, D_MODEL)
        self.positional_embedding = positional_embeddings(D_MODEL, CONTEXT_SIZE).unsqueeze(0)
        self.first_dropout = nn.Dropout(p = P_DROPOUT)

        # Attention
        self.attention_blocks = nn.ModuleList()
        for i in range(N_LAYERS):
            layer_norm = nn.LayerNorm(D_MODEL)
            attention = AttentionBlock()
            dropout = nn.Dropout(P_DROPOUT)
            module = nn.Sequential(layer_norm, attention, dropout)
            self.attention_blocks.append(module)
            
        # MLP
        self.mlps = nn.ModuleList()
        for i in range(N_LAYERS):
            layer_norm = nn.LayerNorm(D_MODEL)
            mlp = MLPBlock()
            dropout = nn.Dropout(P_DROPOUT)
            module = nn.Sequential(layer_norm, mlp, dropout)
            self.mlps.append(module)
            
        # Final layer norm and linear
        self.final_layer_norm = nn.LayerNorm(D_MODEL)
        self.final_linear = nn.Linear(D_MODEL, N_DISTINCT_TOKENS)
            
    def forward(self, x):
        # Step 1: Embedding and positional encoding
        x = self.embedding(x) + self.positional_embedding
        
        # Step 2: First dropout
        x = self.first_dropout(x)
        
        # Step 3: All the blocks
        for i in range(N_LAYERS):
            x = x + self.attention_blocks[i](x)
            x = x + self.mlps[i](x)
            
        # Step 4: Final layer norm
        x = self.final_layer_norm(x)
        
        # Step 5: Final linear layer to get predictions
        x = self.final_linear(x)
        
        # Step 6: Return
        return x

In [9]:
def no_peeking_test():
    my_model = Transformer()
    my_model.eval()
    for pos in range(CONTEXT_SIZE):
        x = torch.arange(0, CONTEXT_SIZE * 2, 2).unsqueeze(0)
        output1 = my_model(x)
        for disturb_pos in range(pos + 1, CONTEXT_SIZE):
            x[0][disturb_pos] += 1
            output2 = my_model(x)
            (output1[0,:pos + 1] == output2[0,:pos + 1]).all()
            
no_peeking_test()

In [10]:
def nonzero_gradient_test():
    my_model = Transformer()
    x = torch.arange(0, 26, 2)
    out = my_model(x)
    loss = out.mean()
    loss.backward()
    print([e.grad for e in my_model.parameters()])
    
# nonzero_gradient_test()

In [11]:
def nonzero_gradient_test():
    my_model = Transformer()

    # 1) turn off dropout
    my_model.eval()

    # 2) (optional) fix RNG
    torch.manual_seed(1234)
    torch.cuda.manual_seed_all(1234)
    torch.use_deterministic_algorithms(True)

    # 3) run forward & backward
    x = torch.arange(0, 26, 2)           # your token indices
    out = my_model(x)
    loss = out.mean()
    loss.backward()

    # 4) inspect gradients
    for name, param in my_model.named_parameters():
        if param.requires_grad:
            g = param.grad
            print(f"{name:30s} → norm = {g.norm().item():.3e}")
        else:
            g = param.grad
            print(f"DOES NOT REQUIRE GRAD: {name:30s} → norm = {g.norm().item():.3e}")
    
# nonzero_gradient_test()

In [12]:
def get_dataset(tokens):
    Xs = []
    ys = []
    for i in range(0, len(tokens) - CONTEXT_SIZE - 1, CONTEXT_SIZE):
        X = tokens[i     : i + CONTEXT_SIZE    ]
        y = tokens[i + 1 : i + CONTEXT_SIZE + 1]
        Xs.append(X)
        ys.append(y)

    X = torch.stack(Xs)
    y = torch.stack(ys)
    return X, y

In [16]:
def train_one_epoch(model, optimizer, loss_fn, tokens, batch_size = 32):
    model.train()
    t0 = time.time()
    X_all, y_all = get_dataset(tokens)
    loss_vals = []
    for i in range(0, X_all.shape[0], batch_size):
        X = X_all[i : i + batch_size]
        y = y_all[i : i + batch_size]
        y_pred = model(X)
        loss = loss_fn(y_pred.view(-1, N_DISTINCT_TOKENS), y.view(-1))
        loss.backward()
        optimizer.step()
        
#         if i == 0:
#             for param in model.parameters():
#                 print(param.grad)
        
        optimizer.zero_grad()
        loss_vals.append(loss.item())
        t1 = time.time()
        
    batch_loss = torch.mean(torch.tensor(loss_vals))
    print(f"{i + 1} samples evaluated, time={int(t1-t0)}, loss = {batch_loss:.4f}")

In [14]:
def run_epoch_on_file(model, optimizer, loss_fn, filename, batch_size = 32):
    train_one_epoch(model, optimizer, loss_fn, read_tokens_from_file(filename), batch_size)

In [19]:
my_model = Transformer()
for digit1 in range(10):
    for digit2 in range(10):
        run_epoch_on_file(my_model,
                        optimizer = torch.optim.Adam(my_model.parameters()),
                        loss_fn = nn.CrossEntropyLoss(),
                        filename = f"text/AA/wiki_{digit1}{digit2}",
                        batch_size = 2048)
# I just realized that this resets the optimizer every epoch, which is probably a bad idea

40961 samples evaluated, time=1041, loss = 3.7226
40961 samples evaluated, time=951, loss = 2.9642
40961 samples evaluated, time=926, loss = 2.7970
43009 samples evaluated, time=985, loss = 2.7214
43009 samples evaluated, time=995, loss = 2.6637
43009 samples evaluated, time=2594, loss = 2.5990
43009 samples evaluated, time=1000, loss = 2.5206
40961 samples evaluated, time=923, loss = 2.4458
40961 samples evaluated, time=962, loss = 2.4308
43009 samples evaluated, time=979, loss = 2.3971
40961 samples evaluated, time=949, loss = 2.3375
40961 samples evaluated, time=935, loss = 2.3081
40961 samples evaluated, time=1131, loss = 2.2925
40961 samples evaluated, time=1124, loss = 2.2495
43009 samples evaluated, time=1129, loss = 2.2032
43009 samples evaluated, time=1113, loss = 2.1887
40961 samples evaluated, time=1139, loss = 2.1753
43009 samples evaluated, time=1142, loss = 2.1543
40961 samples evaluated, time=1114, loss = 2.1327
43009 samples evaluated, time=998, loss = 2.0822
40961 samp

In [29]:
my_optimizer = torch.optim.Adam(my_model.parameters())
for digit1 in range(10):
    for digit2 in range(10):
        run_epoch_on_file(my_model,
                        optimizer = my_optimizer,
                        loss_fn = nn.CrossEntropyLoss(),
                        filename = f"text/AB/wiki_{digit1}{digit2}",
                        batch_size = 2048)


40961 samples evaluated, time=1011, loss = 1.6003
43009 samples evaluated, time=985, loss = 1.6147
40961 samples evaluated, time=1070, loss = 1.6114
43009 samples evaluated, time=1028, loss = 1.5925
40961 samples evaluated, time=951, loss = 1.5713
43009 samples evaluated, time=988, loss = 1.6002
40961 samples evaluated, time=957, loss = 1.5814
40961 samples evaluated, time=951, loss = 1.5846
40961 samples evaluated, time=936, loss = 1.5319
40961 samples evaluated, time=952, loss = 1.5758
40961 samples evaluated, time=939, loss = 1.6156
40961 samples evaluated, time=958, loss = 1.5738
40961 samples evaluated, time=976, loss = 1.6072
40961 samples evaluated, time=976, loss = 1.5859
40961 samples evaluated, time=971, loss = 1.5912
40961 samples evaluated, time=966, loss = 1.6230
43009 samples evaluated, time=981, loss = 1.5883
40961 samples evaluated, time=963, loss = 1.6154
40961 samples evaluated, time=973, loss = 1.6000
40961 samples evaluated, time=962, loss = 1.5738
40961 samples eva

In [20]:
def predict_next_token(model, text):
    model.eval()
    trimmed = text[-CONTEXT_SIZE:]
    tokens = tokenize(trimmed, add_stop_token = False).unsqueeze(0)
    output = model(tokens)
    logits = output[:,0:1,:].squeeze()
    probs = torch.softmax(logits, dim = 0) # We can do argmax just on the logits if we want, but whatever.
    token_id = torch.argmax(logits).item()
    return REVERSE_TOKEN_DICT[token_id]

In [21]:
def predict_next_tokens(model, text, num_tokens):
    model.eval()
    for i in range(num_tokens):
        text += predict_next_token(model, text)
    
    return text

In [33]:
predict_next_tokens(my_model, "Anarchism is a political ", 100)

'Anarchism is a political  neoen etn tntrnenhnonett  n  t h th he   e n  hhtt tthtethete ttt t tteehhthheh he h thhhththh  eeh'

In [23]:
def get_test_error(model, loss_fn, tokens, batch_size = 1024):
    model.eval()
    t0 = time.time()
    X_all, y_all = get_dataset(tokens)
    loss_vals = []
    for i in range(0, X_all.shape[0], batch_size):
        X = X_all[i : i + batch_size]
        y = y_all[i : i + batch_size]
        y_pred = model(X)
        loss = loss_fn(y_pred.view(-1, N_DISTINCT_TOKENS), y.view(-1))
        loss_vals.append(loss.item())
        t1 = time.time()
        
    batch_loss = torch.mean(torch.tensor(loss_vals))
    return batch_loss

In [32]:
get_test_error(my_model,
               nn.CrossEntropyLoss(),
               read_tokens_from_file("text/AD/wiki_62"))

tensor(1.4638)

In [25]:
def save_model(model, filename):
    torch.save(model, "models/" + filename + ".pth") 
    # Load this with: model = torch.load(filename)
    # maybe do: model.eval()
    
    torch.save(model.state_dict(), "models/" + filename + "_state_dict.pth")
    # Load this with:
    # model = Transformer()
    # model.load_state_dict(torch.load(filename))
    # maybe do: model.eval()

In [26]:
save_model(my_model, "model1")

In [31]:
save_model(my_model, "model2")

In [None]:
# OK, clearly the model is really bad right now