In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from time import time
import json

In [None]:
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

# Discrete colour scale: seven boundary edges define six bins, one per colour.
boundaries = [-1, 0.001, 0.10, 0.25, 0.5, 0.75, 1]
colors = ['white', '#eae2b7', '#fcbf49', '#f77f00', '#d62828', '#003049']

cmap = mcolors.ListedColormap(colors)
# clip=True maps values outside the boundary range onto the first/last colour.
norm = mcolors.BoundaryNorm(boundaries, ncolors=cmap.N, clip=True)

# Standalone colour constant (its use is not visible in this section).
rose = '#ff99ac'

In [None]:
# Path of the space-separated events file that is loaded with pd.read_csv.
data_fname = 'data/llm/events_train.txt'

# Organizing data

In [None]:
# Read every field as a string (space-delimited, no header row) and replace
# missing cells with the '<NaN>' placeholder token.
df = pd.read_csv(
    data_fname,
    sep=' ',
    header=None,
    dtype=str,
).fillna('<NaN>')

In [None]:
def _mask_from_column(frame, first_masked, end=18):
    """Return a copy of `frame` with column `first_masked` as its 'target'.

    Columns `first_masked` .. `end - 1` are overwritten with '<NaN>', so the
    target value and every column after it are hidden from the inputs.
    """
    masked = frame.copy()
    masked['target'] = masked[first_masked]
    for hidden in range(first_masked, end):
        masked[hidden] = '<NaN>'
    return masked


# One masked copy of the data per target position 11..17, stacked row-wise.
train_df = pd.concat(
    [_mask_from_column(df, position) for position in range(11, 18)],
    ignore_index=True,
)
# Keep only the rows whose target is an actual token, not the placeholder.
train_df = train_df[train_df['target'] != '<NaN>']

# Tokenizing

In [None]:
# Vocabulary layout (ids are contiguous, assigned in this order):
#   1. the numeric strings '0'..'100' -> ids 0..100
#   2. the distinct values of column 0, most frequent first
#   3. '<PERIOD_OVER>', '<GAME_OVER>'
#   4. '<NaN>' -- always the LAST id
tokenizer_map = {str(i): i for i in range(0, 101)}

# value_counts() is computed once; its index is ordered by descending count.
event_tokens = df[0].value_counts().index
for token in list(event_tokens) + ['<PERIOD_OVER>', '<GAME_OVER>']:
    # Assign the next free id only to unseen tokens. Re-assigning an existing
    # key would leave a gap, and later `len(tokenizer_map)` ids would then
    # collide with ids already handed out. '<NaN>' is skipped here so that it
    # can be appended last even if it occurs in column 0.
    if token != '<NaN>' and token not in tokenizer_map:
        tokenizer_map[token] = len(tokenizer_map)
tokenizer_map['<NaN>'] = len(tokenizer_map)

# Inverse lookup: id -> token.
detokenizer_map = {v: k for k, v in tokenizer_map.items()}

In [None]:
# Persist both lookup tables. The context managers make sure each file is
# flushed and closed as soon as it has been written.
with open('models/llm/tokenizer_map.json', 'w') as fh:
    json.dump(tokenizer_map, fh)
# NOTE: JSON object keys are always strings, so the integer ids used as keys
# of detokenizer_map are written (and later read back) as strings.
with open('models/llm/detokenizer_map.json', 'w') as fh:
    json.dump(detokenizer_map, fh)

In [None]:
# Replace every string token with its integer id: the 18 input columns first,
# then the target column. Tokens missing from tokenizer_map become NaN.
for column in list(range(0, 18)) + ['target']:
    train_df[column] = train_df[column].map(tokenizer_map)

In [None]:
input_size = train_df.shape[1] - 1  # every column except 'target'
output_size = len(tokenizer_map) - 1 # because of <NaN> token not existing in the output

# Inputs: the token ids of all non-target columns, as floats.
X_train = train_df.drop('target', axis=1).astype(float).values

# Targets: one-hot rows with exactly `output_size` columns, where column k
# stands for token id k. get_dummies alone only creates columns for ids that
# occur in the data, so the frame is reindexed onto the full id range; ids
# that never occur as a target become all-zero columns.
Y_train = (
    pd.get_dummies(train_df['target'])
    .reindex(columns=range(output_size), fill_value=0)
    .astype(float)
    .values
)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
Y_train_tensor = torch.tensor(Y_train, dtype=torch.float32)

train_dataset = TensorDataset(X_train_tensor, Y_train_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

# Define Model

In [None]:
# Name embedded in the checkpoint filename.
MODEL_NAME = 'llm_v1_tokens_v2_lite'
# Use the first CUDA device when one is available, otherwise the CPU.
_device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
DEVICE = torch.device(_device_name)

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for chunk in l:
        flat.extend(chunk)
    return flat

class MultiLayerBinaryClassifier(nn.Module):
    """Fully connected network with an independent sigmoid on every output.

    The network is ``Linear -> activation`` for each hidden layer, followed by
    a final ``Linear -> Sigmoid``. All linear layers use Xavier-uniform
    weights and zero biases.

    Args:
        input_size: Number of input features.
        hidden_size: Widths of the hidden layers, in order. A single int is
            accepted as shorthand for one hidden layer.
        output_size: Number of output units.
        activation: One of 'relu', 'sigmoid', 'tanh', 'leaky_relu'.

    Raises:
        KeyError: If `activation` is not one of the supported names.
        ValueError: If `hidden_size` is empty.
    """

    def __init__(self, input_size, hidden_size, output_size, activation='relu'):
        super().__init__()

        activation_dict = {
            'relu': nn.ReLU,
            'sigmoid': nn.Sigmoid,
            'tanh': nn.Tanh,
            'leaky_relu': nn.LeakyReLU,
        }
        if activation not in activation_dict:
            raise KeyError(
                f'unknown activation {activation!r}; '
                f'expected one of {sorted(activation_dict)}'
            )
        make_activation = activation_dict[activation]

        # Accept a bare int as a single hidden layer.
        if isinstance(hidden_size, int):
            hidden_size = [hidden_size]
        hidden_size = list(hidden_size)
        if not hidden_size:
            raise ValueError('hidden_size must contain at least one layer width')

        # Consecutive width pairs give (in_features, out_features) for every
        # hidden Linear layer; each one is followed by its own activation.
        widths = [input_size] + hidden_size
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(make_activation())
        layers.append(nn.Linear(hidden_size[-1], output_size))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)

        # Initialize the linear layers
        self.init_weights()

    def init_weights(self):
        """Reset every Linear layer to Xavier-uniform weights and zero bias."""
        for m in self.model.modules():
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight)
                init.zeros_(m.bias)

    def forward(self, x):
        """Return the per-output sigmoid activations for the batch `x`."""
        return self.model(x)
    
def cyclic_cosine_annealing_lr(lr, T_max, eta_min=0, last_epoch=-1):
    """Cosine-annealed learning rate that restarts every `T_max` epochs.

    Within each cycle the rate follows half a cosine wave, starting at `lr`
    and decaying towards `eta_min`; at every multiple of `T_max` it jumps
    back to `lr`.

    Args:
        lr: Learning rate at the start of each cycle.
        T_max: Cycle length in epochs (must be non-zero).
        eta_min: Lower bound the rate decays towards.
        last_epoch: Index of the epoch the rate is requested for.

    Returns:
        The learning rate as a Python float (`lr` itself when
        `last_epoch == 0`).
    """
    import math  # local import: `math` is not among the notebook's imports

    if last_epoch == 0:
        return lr

    # cos has period 2*pi, so both halves of a 2*T_max window reduce to the
    # same expression in the position inside the current T_max-long cycle.
    phase = (last_epoch % T_max) / T_max
    return float(eta_min + (lr - eta_min) * (1 + math.cos(math.pi * phase)) / 2)

# Train model

In [None]:
def train(model, dataloader, criterion, optimizer, device):
    """Run one optimisation pass over `dataloader`.

    Puts the model in training mode, takes one optimizer step per batch and
    returns the loss averaged over `len(dataloader.dataset)` samples (each
    batch loss is weighted by its batch size).
    """
    model.train()
    weighted_loss_sum = 0.0

    for batch_x, batch_y in dataloader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()

        # criterion yields a per-batch value; scale it by the batch size so
        # the final division gives a per-sample average.
        weighted_loss_sum += batch_loss.item() * batch_x.size(0)

    return weighted_loss_sum / len(dataloader.dataset)

In [None]:
def evaluate(model, dataloader, criterion, device):
    """Compute the average loss over `dataloader` without updating the model.

    Puts the model in eval mode, disables gradient tracking and returns the
    loss averaged over `len(dataloader.dataset)` samples (each batch loss is
    weighted by its batch size).
    """
    model.eval()
    weighted_loss_sum = 0.0

    with torch.no_grad():
        for batch_x, batch_y in dataloader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            batch_loss = criterion(model(batch_x), batch_y)
            weighted_loss_sum += batch_loss.item() * batch_x.size(0)

    return weighted_loss_sum / len(dataloader.dataset)

In [None]:
# Smaller alternative left commented out by the author: hidden sizes [256, 256].
model = MultiLayerBinaryClassifier(input_size, [512, 512, 512], output_size).to(DEVICE)
learning_rate_init = 0.001
num_epochs = 50
best_val_loss = float('inf')  # any finite first loss counts as an improvement

criterion = nn.BCELoss()
# Build the optimizer once so Adam keeps its running moment estimates across
# epochs; only the learning rate is changed per epoch.
optimizer = optim.Adam(model.parameters(), lr=learning_rate_init)
for epoch in range(num_epochs):
    t0 = time()
    # float() accepts both a Python number and a 0-dim tensor.
    lr_update = float(cyclic_cosine_annealing_lr(learning_rate_init, num_epochs, 0, epoch))
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr_update
    train_loss = train(model, train_dataloader, criterion, optimizer, DEVICE)
    # NOTE(review): this "test" loss is computed on the training dataloader;
    # no held-out split is visible in this section.
    test_loss = evaluate(model, train_dataloader, criterion, DEVICE)
    print(f'Epoch: {epoch+1}/{num_epochs}. Training loss: {train_loss:.4f}. Test loss: {test_loss:.4f}. Time: {time() - t0:.2f}s')

    # Checkpoint the whole model object whenever the evaluated loss improves.
    if test_loss < best_val_loss:
        best_val_loss = test_loss
        torch.save(model, f'models/llm/full_{MODEL_NAME}.pth')