## Imports

In [None]:
from pathlib import Path
import requests
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from typing import List
import unicodedata
import json
from collections import Counter, defaultdict
import base64
from IPython.display import Image, display
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Import from local modules
from config import Config
from tokenizer import Tokenizer
from model import MLP, AttentionHead, TransformerBlock, transformer
from dataset import Book_Dataset
from train import train_loop, save_model, load_model

### Dimensions

\begin{align*}
    &d_m = \text{d-model} & : & \text{model dimension (number of neurons)} \\
    &d_v = \text{d-vocab} & : & \text{vocab dimension} \\
    &n_c = \text{n-context} & : & \text{context window (len of seq entered)}
\end{align*}

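Where $d_n \ll d_m$, with $d_n$ the inner dimension of the attention matrices $\mathbf{W}_Q, \mathbf{W}_K \in \mathbb{R}^{d_m \times d_n}$ used in the Attention Head section below.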

## Getting Data

In [None]:
# Download the Project Gutenberg plain-text file and trim it down to the body
# of the book, starting at the first "CHAPTER I" heading.
url = "https://www.gutenberg.org/files/67098/67098-0.txt"
response = requests.get(url, timeout=30)  # do not hang the notebook on a stalled connection
response.raise_for_status()  # fail loudly instead of tokenizing an HTTP error page
text = response.text

start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK"

start_idx = text.find(start_marker)
end_idx = text.find(end_marker)
# str.find returns -1 on a miss; used as a slice bound that would silently
# select the wrong region of the file, so check explicitly.
if start_idx == -1 or end_idx == -1 or end_idx <= start_idx:
    raise ValueError(
        "Project Gutenberg START/END markers not found in the downloaded text"
    )
# Drop the rest of the START marker line; keep everything up to the END marker.
content_area = text[start_idx:end_idx].partition("\n")[2]

# Case-insensitive search for the first chapter heading.
chapter_idx = content_area.upper().find("CHAPTER I")
if chapter_idx == -1:
    raise ValueError('"CHAPTER I" heading not found in the downloaded text')

raw_text = content_area[chapter_idx:]
print(raw_text[:500000])


## Tokenize Text

In [None]:
# Tokenizer class is defined in tokenizer.py

In [None]:
# Smoke test for the Tokenizer class: encode a sample sentence, decode it
# back, and display the round trip.
tokenizer = Tokenizer(raw_text)

sample_text = "Chapter I. The starting of the journey!"

encoded = tokenizer.tokenize(sample_text)
decoded = tokenizer.detokenize(encoded)

# Print the results so the cell actually lets the reader check the tokenizer.
# NOTE(review): no `decoded == sample_text` assert here, because the tokenizer
# may normalize text (case, punctuation) -- confirm against tokenizer.py.
print(f"{sample_text = }")
print(f"{encoded = }")
print(f"{decoded = }")

## Multilayer Perceptron
$$
    \texttt{MLP}(\mathbf{x}) = W_d \cdot \sigma_{\texttt{ReLU}} (W_u \cdot \mathbf{x} + b_u) + b_d, \qquad \texttt{MLP} : \mathbb{R}^{d_m} \to \mathbb{R}^{d_m}
$$

In [None]:
# MLP class is defined in model.py

## Attention Head
### Weight Matrix
$$
    \mathbf{W}_{QK} := \mathbf{W}_{Q} \cdot \mathbf{W}_{K}^T, \qquad \mathbf{W}_{Q}, \mathbf{W}_{K} \in \mathbb{R}^{d_m \times d_n}
$$

### Autoregressive Masking (M) Matrix
$$
    M_{i,j} = 
    \begin{cases}
        0 & j \leq i \\
        -\infty & j > i
    \end{cases}
$$

### Forward Pass
$$A(\mathbf{X}) = \sigma_{\text{softmax}} (\mathbf{X} \; \mathbf{W}_{QK} \; \mathbf{X}^\text{T} + \mathbf{M}) \; \mathbf{X} \; \mathbf{W}_{OV}$$

In [None]:
# AttentionHead class is defined in model.py

### Transformer Block
$$
    \text{TB}(X) = X + A(X) + \text{MLP}(X), \qquad \text{TB}: \mathbb{R}^{n_c \times d_m} \to \mathbb{R}^{n_c \times d_m}
$$

In [None]:
# TransformerBlock class is defined in model.py

In [None]:
class transformer(nn.Module):
    """Language model: token + positional embeddings, a stack of transformer
    blocks, and an unembedding that reuses the token-embedding weights.

    NOTE(review): a `transformer` is also imported from model.py at the top of
    the notebook; this definition shadows that import in every later cell.
    """

    def __init__(self, config: "Config"):
        """Build the embeddings and `config.n_layers` transformer blocks.

        Reads `d_vocab`, `d_model`, `n_context_max` and `n_layers` from
        `config`; the config is also passed through to each TransformerBlock.
        """
        super().__init__()
        self.config = config
        self.token_embedding = nn.Embedding(config.d_vocab, config.d_model)
        self.pos_embedding = nn.Embedding(config.n_context_max, config.d_model)
        self.transformerblocks = nn.ModuleList(
            [TransformerBlock(config) for _ in range(config.n_layers)]
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map integer token ids to next-token logits.

        Args:
            x: integer tensor of token ids with shape (n_context,). The
                sequence length must not exceed `config.n_context_max`.
                The embedding and unembedding steps here also broadcast over
                a leading batch dimension; whether the blocks accept one
                depends on TransformerBlock -- TODO confirm.

        Returns:
            Float tensor of logits with shape (n_context, d_vocab).
        """
        # Sequence length is the last axis of the id tensor (before embedding).
        n_context = x.shape[-1]
        # Build position indices on the same device as the input ids.
        positions = torch.arange(n_context, device=x.device)
        x = self.token_embedding(x) + self.pos_embedding(positions)  # x = E + P
        for block in self.transformerblocks:
            x = block(x)
        # Unembedding: d_model -> d_vocab using the transposed embedding matrix.
        return x @ self.token_embedding.weight.T

    def generator(self, num_tokens=10, input_text="", tokenizer=None):
        """Sample `num_tokens` new tokens after `input_text` and return the text.

        Args:
            num_tokens: number of tokens to append to the prompt.
            input_text: prompt; must tokenize to at least one token.
            tokenizer: object with `tokenize(str)` and `detokenize(list)`.
                When omitted, a `Tokenizer` is built from the notebook-level
                `raw_text`, as before.

        Returns:
            The detokenized prompt followed by the sampled continuation.

        Raises:
            ValueError: if the prompt tokenizes to an empty sequence, since
                there is then no position to predict from.
        """
        if tokenizer is None:
            tokenizer = Tokenizer(raw_text)
        prompt_ids = tokenizer.tokenize(input_text)
        if len(prompt_ids) == 0:
            raise ValueError("input_text must tokenize to at least one token")

        device = self.token_embedding.weight.device
        tokens = torch.tensor(prompt_ids, dtype=torch.long, device=device)
        max_context = self.config.n_context_max

        with torch.no_grad():  # sampling only; no autograd graph needed
            for _ in range(num_tokens):
                # Feed at most n_context_max tokens so the positional
                # embedding lookup never goes out of range.
                logits = self(tokens[-max_context:])
                # Next-token distribution comes from the LAST position's row.
                probabilities = torch.softmax(logits[-1], dim=-1)
                next_token = torch.multinomial(probabilities, num_samples=1)
                tokens = torch.cat([tokens, next_token], dim=-1)

        return tokenizer.detokenize(tokens.tolist())
   
# use nn.ModuleList for the TB sequence & MHA (to create a list of TBs)
# print(f"{x.shape = }") for debugging

# pick a unique dataset to train the model on

# if training models: aim for < 10 million parameters for now
#   sum(x.numel() for x in mymodel.parameters())


NameError: name 'x' is not defined

In [None]:
#Generate function based on user input for n_layers : int, d_model : int, d_vocab : int, d_hidden : int
# def Generator(User_n_layers : int, User_d_model : int, User_d_vocab : int, User_d_hidden : int):
#     configuration = Config(User_n_layers, User_d_model, User_d_vocab, user_d_hidden)
#     Tranformer_model = transformer(configuration)
#     return Tranformer_model

#model_initialize_something 

# Save and Load Models

In [None]:
# save_model and load_model are defined in train.py

In [None]:
# Book_Dataset class is defined in dataset.py


In [None]:
# train_loop is defined in train.py

### Implement Training 

#### Already ran at beginning of code
1) import raw text 
2) tokenizer = Tokenizer(raw_text)


#### Now: 
Set up config, run tokenized text through dataset (update to tensor), train loop  


In [None]:
# Hyperparameters for the model trained in this notebook.
# NOTE(review): d_vocab is hard-coded; presumably it has to match the
# tokenizer's vocabulary size -- confirm against tokenizer.py.
cfg = Config(
    d_model=64,
    d_vocab=31,
    d_hidden=128,
    n_layers=2,
    n_context=64,  # 64 or 128
    n_context_max=64,
)

# Build the model from the config.
model = transformer(cfg)

# Wrap the raw book text in a dataset using the configured context length.
samples = Book_Dataset(raw_text, cfg.n_context)

# Run training with train_loop's default remaining settings.
train_loop(
    samples,
    batchsize=32,
    model=model,
)


In [None]:
# Train the `model` instance created in the previous cell for 50 epochs.
# The model argument must be an instance, not the `transformer` class itself.
train_loop(samples, batchsize=32, model=model, epochs=50)


Initialize model:
1. Training loop on model and generation on model.
2. (model_stupid) no training loop, generation on model.

# Statistical Analysis
(Using Markov chains?)