In [4]:
import numpy as np , pandas as pd
import torch
import torch.nn as nn

import torch
from torch.utils.data import Dataset, DataLoader

import einops
import pickle
from tqdm import tqdm

import configparser


In [5]:
# the settings file
config = configparser.ConfigParser()
config.read('settings.ini')

config= {key: int(value) for key, value in config['INCOME_DATASET'].items()}
config

{'val_split': 20,
 'test_split': 10,
 'epochs': 2,
 'n_embed': 1024,
 'n_heads': 32,
 'transformer_blocks': 2,
 'batch_size': 32,
 'random_seed': 123}

In [6]:
# load the encoded variables
file_path = 'encoded_vars/token_vars_income.pkl'

with open(file_path, 'rb') as file:
    loaded_variable = pickle.load(file)

print("Loaded variable :",loaded_variable.keys())
df = loaded_variable["df"]
col_code = loaded_variable["col_code"]
tokenizer = loaded_variable["tokenizer"]

Loaded variable : dict_keys(['col_code', 'df', 'tokenizer'])


In [7]:
documents = torch.load("encoded_vars/encoded_docs_income.pt")
documents # 0 is blank 1 is start and 21944 is end

tensor([[    0,     1,     2,  ..., 21930, 21938, 21944],
        [    0,     1,     3,  ..., 21930, 21938, 21944],
        [    0,     1,     2,  ..., 21931, 21939, 21944],
        ...,
        [    0,     1,     3,  ..., 21930, 21938, 21944],
        [    0,     1,     3,  ..., 21930, 21938, 21944],
        [    0,     1,     3,  ..., 21930, 21938, 21944]])

In [8]:
# train_test_split
train_ratio = (100 - config["val_split"] - config["test_split"])/100
val_ratio = config["val_split"]/100
test_ratio = config["test_split"]/100
splits = [train_ratio,val_ratio,test_ratio]

gen = torch.Generator().manual_seed(config["random_seed"])
train_set , val_set , test_set = torch.utils.data.random_split(documents,splits,generator=gen)
len(train_set),len(val_set),len(test_set)

(22793, 6512, 3256)

In [109]:
# these are global variables ... dont delete cell
context_wind , vocab_len  = train_set[0].shape[0], train_set[0][-1]
context_wind = context_wind - 2 # you shouldnt need the blank token and start token to predict the end token
vocab_len = vocab_len + 1
context_wind, vocab_len

(34, tensor(21945))

In [110]:
import torch
from torch.utils.data import Dataset, DataLoader

class AutoRegDataset(Dataset):
    def __init__(self, train_set, context_window):
        self.train_set = train_set
        self.context_window = context_window
        self.X, self.y = self.compile_dataset()

    def compile_dataset(self):
        X = []
        y = []
        for row in self.train_set:
            row_tensor = torch.tensor(row[1:], dtype=torch.long)
            context = torch.zeros(self.context_window, dtype=torch.long)

            for value in row_tensor:
                X.append(context.clone().unsqueeze(0))
                y.append(torch.tensor([value]))
                context = torch.cat((context[1:], torch.tensor([value])))

        X = torch.cat(X, dim=0)
        y = torch.cat(y, dim=0)
        return X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [111]:

train_dataset = AutoRegDataset(train_set, context_wind)

# inspect an entire row just to make sure with shuffle = False
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=False)

# Example of iterating over the DataLoader
i = 0
for batch in train_dataloader:
    i += 1
    inputs, targets = batch
    print("Inputs:", inputs)
    print("Targets:", targets)
    if i > 38:
        break


  row_tensor = torch.tensor(row[1:], dtype=torch.long)


Inputs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
Targets: tensor([1])
Inputs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])
Targets: tensor([3])
Inputs: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 1, 3]])
Targets: tensor([12])
Inputs: tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3, 12]])
Targets: tensor([30])
Inputs: tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3, 12, 30]])
Targets: tensor([39])
Inputs: tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3, 12, 30, 39]])
Targets: t

In [112]:
import torch
import torch.nn as nn

class TabTransformer(nn.Module):

    def __init__(self, vocab_len, config, num_blocks=2):
        super().__init__()
        self.n_embed = config["n_embed"]
        self.n_heads = config["n_heads"]
        
        
        # note x should have tokens with val <= vocab_len -1
        self.C = torch.nn.Embedding(num_embeddings=vocab_len, embedding_dim=self.n_embed)
        # pos emb
        self.pe = torch.nn.Parameter(torch.randn(context_wind,self.n_embed) * 0.01)
        
        # no change in the size of representation happens here
        # Stack of transformer blocks
        self.transformer_blocks = nn.ModuleList([
            nn.ModuleDict({
                'multi_head': nn.MultiheadAttention(
                    embed_dim=self.n_embed,
                    num_heads=self.n_heads,
                    batch_first=True
                ),
                'layer_norm1': nn.LayerNorm(self.n_embed), # one before skip connection
                'layer_norm2': nn.LayerNorm(self.n_embed),
                'ffn': nn.Sequential(
                    nn.Linear(self.n_embed, self.n_embed),
                    nn.ReLU(),
                    nn.Linear(self.n_embed, self.n_embed)
                )
            })
            for _ in range(num_blocks)
        ])
        
        # and then a small mlp head. first with bias and second without
        # flatten before sending
        self.mlp_head0 = nn.Linear(in_features=context_wind*self.n_embed, out_features=self.n_embed,bias=True)
        self.mlp_head1 = nn.Parameter(torch.randn(self.n_embed, vocab_len) * 1e-4) # this is increase in represeentation dim

    def forward(self, x):
        # Embedding the input tokens
        emb = self.C(x)  # Shape: (B, wind, n_embed)
        emb = emb + self.pe # add positional embedding

        # Pass through each transformer block
        for block in self.transformer_blocks:
            # Apply multi-head attention with emb as query, key, and value
            attn_output, _ = block['multi_head'](emb, emb, emb)  # Self-attention

            # Add & Norm
            attn_output = block['layer_norm1'](emb + attn_output)

            # Feed-forward network
            ffn_output = block['ffn'](attn_output)

            # Final output with Add & Norm
            emb = block['layer_norm2'](attn_output + ffn_output)
            
        # flatten the representaion
        flat = einops.rearrange(emb, "batch wind n_embed -> batch (wind n_embed)")
        mlp0 = self.mlp_head0(flat)
        out = mlp0 @ self.mlp_head1

        return out

    def calculate_loss(self,x,y):
        logits = self(x)
        return logits , nn.functional.cross_entropy(logits,y)

    def generate(self,seed = None,verbose=False):
        seed = [0] * context_wind if seed is None else seed
        generation = list()
        i = 0
        while True:
            if verbose:
                print(seed)
            logits = self(torch.tensor(seed).view(1,-1))
            probs = nn.functional.softmax(logits,dim=1)
            prediction = torch.multinomial(probs,num_samples=1)
            generation.append(prediction)
            if prediction.item() == vocab_len:
                break
            seed = seed[1:] + [prediction.item()]
            if verbose:
                print(prediction.item())
            i = i+1
        return generation 
        

model = TabTransformer(vocab_len, config, num_blocks=config["transformer_blocks"])



In [113]:
model(inputs).shape # check if this is the correct shape

torch.Size([1, 21945])

In [114]:
model(inputs)[0].shape

torch.Size([21945])

In [115]:
learned_params = 0
for i,p in enumerate(model.parameters()):
    learned_params += p.nelement()

print(f"Tot Matrics : {i=} , tot learned params {learned_params=:_}")

Tot Matrics : i=28 , tot learned params learned_params=93_234_176


In [116]:
# optimizer ::
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01
)

In [117]:
track_loss = []

In [118]:
train_dataset = AutoRegDataset(train_set, context_wind)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)

  row_tensor = torch.tensor(row[1:], dtype=torch.long)


In [119]:
for Xb,yb in tqdm(train_dataloader):
    # forward pass
    _,loss = model.calculate_loss(Xb,yb)

    # flush the gradients 
    optimizer.zero_grad(set_to_none=True)

    # backprop
    loss.backward()

    # learning step
    optimizer.step()

    # for plotting
    track_loss.append(loss.item())
    

  0%|          | 30/6233 [00:39<2:15:45,  1.31s/it]


KeyboardInterrupt: 

In [None]:
for Xb ,yb in train_dataloader:
    if vocab_len in Xb:
        print(True)

In [None]:
model.mlp_head1.shape