In [1]:
from pathlib import Path
from dataclasses import dataclass 
import requests
import numpy as np
import torch
import torch.nn as nn
from jaxtyping import Float, Int

import torch.optim as optim
import torch.nn.functional as F
from typing import List
import requests

import unicodedata
import json
from collections import Counter, defaultdict
import base64
from IPython.display import Image, display


@dataclass 
class Config:
    d_model : int 
    d_vocab : int
    d_hidden : int # for MLP
    n_layers : int
    # d_head : int # for Attn (if separate wq and wk)
    #no n_context
    #name var : type

# guttenburg dataset code in existing notebooks

### Dimensions

\begin{align*}
    &\text{d-model} = d_m & : & \text{model dimension (num neurons)} \\
    &d_v = \text{d-vocab} & : & \text{vocab dimension} \\
    &n_c = \text{n-context} & : & \text{context window (len of seq entered)}
\end{align*}

Where $d_n << d_m$

## Getting Data

In [None]:
url = "https://www.gutenberg.org/files/67098/67098-0.txt"
response = requests.get(url)
text = response.text

start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

start_idx = text.find(start_marker)
end_idx = text.find(end_marker)
content_area = text[start_idx:end_idx].split("\n", 1)[1]

chapter_idx = content_area.upper().find("CHAPTER I")

raw_text = content_area[chapter_idx:]
print(raw_text[:500000])


## Tokenize Text

In [None]:
class Tokenizer:
    def __init__(self, text):
        self.chars = sorted(list(set(text)))
        self.vocab_size = len(self.chars)
    
    #simplify
    #all letters lowercase
    #each punctuation into a token each letter a token
    #get a set of tokens
    #this set is d_vocab
    def clean_text(self, text: str) -> list[str]:
        return [x for x in text.lower() if x.isalpha() or x in " .!?"]


    def tokenize(self, text):
    # will update words to nums with vocab map 
        cleaned = self.clean_text(text)
        # sort the list of chars 
        cleaned_sorted = sorted(set(cleaned))

        # vocab map 
        


    #def detokenize(self, tokens):
    # inverse of tokenize (nums to words )
        

In [None]:
tokenizer = Tokenizer(raw_text)

sample_text = "Chapter I. The starting of the journey!"

encoded = tokenizer.tokenize(sample_text)




Vocabulary Size: 81
Unique Chars: 
 !"&'()*,-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzæ
Original Sample: Chapter I. The starting of the journey!


TypeError: 'NoneType' object is not subscriptable

## Multilayer Perceptron
$$
    \texttt{MLP}(\mathbf{X}) = W_d \cdot \sigma_{\texttt{ReLU}} (W_u \cdot x + b_u) + b_d, \qquad \texttt{MLP} : \mathbb{R}^{d_m} \to \mathbb{R}^{d_m}
$$

In [None]:
class MLP(nn.Module):
    def __init__(self, config: Config): # matrices to initialize
        super().__init__()
        self.linear_up: nn.Linear = nn.Linear(config.d_model, config.d_hidden)
        self.linear_down: nn.Linear = nn.Linear(config.d_hidden, config.d_model)
    
    def forward(self, x: Float[torch.Tensor, "* d_model"]) -> Float[torch.Tensor, "* d_model"]:
        x = self.linear_up(x)
        x = torch.relu(x)
        x = self.linear_down(x)
        return x  

## Attention Head
### Weight Matrix
$$
    \mathbf{W}_{QK} := \mathbf{W}_{Q} \cdot \mathbf{W}_{K}^T, \qquad \mathbf{W}_{Q}, \mathbf{W}_{K} \in \mathbb{R}^{d_m \times d_n}
$$

### Autoregressive Masking (M) Matrix
$$
    M_{i,j} = 
    \begin{cases}
        0 &j \geq i \\
        -\infty &j < i>
    \end{cases}
$$

### Forward Pass
$$A(\mathbf{X}) = \sigma_{\text{softmax}} (\mathbf{X} \; \mathbf{W}_{QK} \; \mathbf{X}^\text{T} + \mathbf{M}) \; \mathbf{X} \; \mathbf{W}_{OV}$$

In [None]:
class AttentionHead(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        # weights (use nn.parameter) to create a matrix to track gradients
        self.wqk = nn.Parameter(torch.randn(config.d_model, config.d_model))
        self.wov = nn.Parameter(torch.randn(config.d_model, config.d_model))

        ## Create M Matrix
    def M_matrix(n):
        # matrix with 0 at and below the diagonal and -inf above the diagonal
        M = torch.ones((n, n))
        M = torch.triu(M, diagonal=1)
        M = M.masked_fill(M == 1, float('-inf'))
        print(M)
        
    
    def forward(self, x: Float[torch.Tensor, "* d_model"]) -> Float[torch.Tensor, "* d_model"]:
        # use weights to compute A⨉
        # X as input: n_seq by d_model
        n_seq = x.shape[0]
        M = self.M_matrix(n_seq)
        attention_pattern = x @ self.wqk @ x.T + M
        attention_of_X = nn.softmax(attention_pattern) @ x @ self.wov
        
        return attention_of_X

### Transformer Block
$$
    \text{TB}(X) = X + A(X) + \text{MLP}(X), \qquad \text{TB}: \mathbb{R}^{n_c \times d_m} \to \mathbb{R}^{n_c \times d_m}
$$

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        #self.ln = nn.LayerNorm(config)
        self.mlp = MLP(config)
        self.attention = AttentionHead(config)


    def foward(self, x: Float[torch.Tensor, "* d_model"]) -> Float[torch.Tensor, "* d_model"]:
        #output = x + mlp(x) + attentionhead(x)   
        output_x = x + self.mlp(x) + self.attention(x)
        #x = self.ln(x_1)
        #x = self.ln(x_2)

        return(output_x)

In [None]:
class transformer(nn.Module):
    def __init__(self, config: Config):
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config.d_vocab, config.d_model)
        #self.transformerblocks = nn.modules list of transformer blocks
        self.transformerblocks = nn.ModuleList([TransformerBlock(config) for _ in range(config.n_layers)])
        
    def forward(self, x: Int[torch.Tensor, "n_context"]) -> Float[torch.Tensor, "n_context d_vocab"]:
        x = self.token_embedding(x) # converts int d-vector to d-model vector
        for i in range(n_layers):
            x = self.transformerblocks[i](x)
        x = x @ self.token_embedding.weight.T # unembedding 
        #n_contex long - sequence if ints of length n  - float tneosry by n_model  and output is float tencsosr by d-vocab \n",
        #d_model to d_vocab transpose or do a lineear map  - unembed nn.linear
        #dmodel to dvocab 

        return x
        
        
        
# use nn.ModuleList for TB seqeunce & MHA (to create a list of TBS)
# print(f"{x.shape = }") for debugging

# pick a unique dataset to train data on

# if traning models: aim for < 10 million parameters for now
#   sum(x.numel() for x in mymodel.parameters())
