In [1]:
import os
import random
import time
import typing as ty
import yaml

import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Switch to the project root so that the project-local imports and the
# relative `configs/...` path used in this notebook resolve.
# The location can be overridden with the FEATURE_ENCODING_EXP_DIR environment
# variable; when it is unset the original hard-coded path is used unchanged.
os.chdir(os.environ.get(
    'FEATURE_ENCODING_EXP_DIR',
    '/home/mrsergazinov/TabLLM/feature_encoding_exp/',
))
from base_models.mlp import MLP
from base_models.tabTransformer import TabTransformer
from base_models.modernNCA import ModernNCA
from encoders.numEncoders import FourierFeatures, BinningFeatures

# String -> class registries. The experiment parameters select a model and a
# numerical-feature encoder by name, and the classes are looked up here.
MODELS = dict(
    MLP=MLP,
    TabTransformer=TabTransformer,
    ModernNCA=ModernNCA,
)
ENCODERS = dict(
    FourierFeatures=FourierFeatures,
    BinningFeatures=BinningFeatures,
)


def set_seed(seed):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Args:
        seed: value handed unchanged to each library's seeding function.
    """
    seeders = (
        random.seed,                 # Python's built-in `random` module
        np.random.seed,              # NumPy legacy global generator
        torch.manual_seed,           # PyTorch default generator
        torch.cuda.manual_seed_all,  # PyTorch generators on all CUDA devices
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Load the experiment configuration from configs/adult.yaml (path is relative
# to the current working directory).
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the platform's default locale encoding.
with open('configs/adult.yaml', 'r', encoding='utf-8') as config_file:
    config = yaml.safe_load(config_file)



In [2]:
# Experiment parameters.
#   model_name            -- key into MODELS.
#   num_encoder           -- None (no numerical encoder) or a key into
#                            ENCODERS: 'FourierFeatures' / 'BinningFeatures'.
#   num_encoder_trainable -- True:  the encoder object is handed to the model;
#                            False: the encoder is applied once to the
#                                   numerical features up front and the model
#                                   receives no encoder.
params = {
    'model_name': 'TabTransformer',
    'num_encoder': 'FourierFeatures',
    'num_encoder_trainable': True,
}

# Seed all random generators before any encoder or model is constructed, so
# that re-running this cell reproduces the same run. The same config value is
# used as the random_state of the train/test split.
set_seed(config['seed'])

# Fetch the OpenML "adult" dataset (version 2) as pandas objects.
data = fetch_openml("adult", version=2, as_frame=True)
X, y = data['data'], data['target']

# Column groups are derived from the dtypes of the full feature frame.
categorical_columns = list(X.select_dtypes(include=['category', 'object']).columns)
numerical_columns = list(X.select_dtypes(include=['int64', 'float64']).columns)

# Hold out 20% of the rows for testing; the split happens before any
# preprocessing is fitted.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=config['seed'],
)

# Map the target labels to integer class ids; the mapping is learned from the
# training labels only and then reused for the test labels.
le_target = LabelEncoder().fit(y_train)
y_train = le_target.transform(y_train)
y_test = le_target.transform(y_test)

# Process categorical columns.
# d_in_cat stays None on the one-hot path; on the TabTransformer path it
# becomes a list holding the number of classes of every categorical column.
d_in_cat = None
if params['model_name'] == 'TabTransformer':
    # The frames returned by train_test_split are selections of X; writing
    # columns into them directly can raise pandas' SettingWithCopyWarning.
    # Take explicit copies before mutating.
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Integer-encode every categorical column.
    d_in_cat = []
    for col in categorical_columns:
        # The label vocabulary is fitted on the full column rather than on the
        # training split: LabelEncoder.transform raises ValueError for labels
        # it has not seen, which would crash whenever a rare category ends up
        # only in the test split. Only the set of category names is used here,
        # no target information. When every test category also occurs in the
        # training split the resulting codes are identical to fitting on the
        # training split alone.
        le = LabelEncoder().fit(X[col])
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])
        d_in_cat.append(len(le.classes_))
    X_train_cat = X_train[categorical_columns].copy()
    X_test_cat = X_test[categorical_columns].copy()
else:
    # One-hot encode each split separately (first level dropped).
    X_train_cat = pd.get_dummies(X_train[categorical_columns], drop_first=True)
    X_test_cat = pd.get_dummies(X_test[categorical_columns], drop_first=True)

    # Give the test frame exactly the training frame's columns: columns
    # missing from the test split are filled with 0, and test-only columns
    # are dropped (join='left').
    X_train_cat, X_test_cat = X_train_cat.align(X_test_cat, join='left', axis=1, fill_value=0)

# Standardise the numerical columns; the scaler statistics come from the
# training split only.
numerical_transformer = StandardScaler()
X_train_num = numerical_transformer.fit_transform(X_train[numerical_columns])
X_test_num = numerical_transformer.transform(X_test[numerical_columns])

# Optional numerical-feature encoder.
#   * no encoder requested -> num_encoder is None
#   * trainable encoder    -> the encoder object is kept for the model
#   * frozen encoder       -> the features are encoded once here (no
#                             gradients) and num_encoder is reset to None
freeze_encoder = not params['num_encoder_trainable']
encoder_name = params['num_encoder']
num_encoder = None
if encoder_name is not None:
    num_encoder = ENCODERS[encoder_name](
        n_features=X_train_num.shape[1],
        **config[encoder_name],
    )
    if freeze_encoder:
        with torch.no_grad():
            X_train_num = num_encoder(
                torch.from_numpy(X_train_num).float(), trainable=False
            ).numpy()
            X_test_num = num_encoder(
                torch.from_numpy(X_test_num).float(), trainable=False
            ).numpy()
        num_encoder = None

# Convert everything to tensors: class ids as int64, numerical features as
# float32.
y_train, y_test = (
    torch.tensor(labels, dtype=torch.long) for labels in (y_train, y_test)
)
X_train_num, X_test_num = (
    torch.tensor(features, dtype=torch.float32)
    for features in (X_train_num, X_test_num)
)

# Categorical inputs are integer codes (int64) for TabTransformer and
# float32 one-hot columns for every other model.
cat_dtype = torch.long if params['model_name'] == 'TabTransformer' else torch.float32
X_train_cat, X_test_cat = (
    torch.tensor(frame.values, dtype=cat_dtype)
    for frame in (X_train_cat, X_test_cat)
)

# Model input / output sizes. d_in_cat keeps the per-column class counts when
# they were already collected; otherwise it is the categorical column count.
d_in_num = X_train_num.shape[1]
if d_in_cat is None:
    d_in_cat = X_train_cat.shape[1]
d_out = len(np.unique(y_train))

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Cross-entropy over the d_out classes is the training objective.
loss_criterion = nn.CrossEntropyLoss()

# Instantiate the selected model; its remaining hyper-parameters come from
# the config section named after the model.
model_cls = MODELS[params['model_name']]
model = model_cls(
    d_in_num=d_in_num,
    d_in_cat=d_in_cat,
    d_out=d_out,
    num_encoder=num_encoder,
    **config[params['model_name']],
)
model.to(device)

# Train on the training split; the remaining training options come from
# config['training'].
# NOTE(review): the data tensors are passed as created (on CPU); presumably
# `fit` moves batches to the model's device -- confirm in the model classes.
model.fit(
    X_num_train=X_train_num,
    X_cat_train=X_train_cat,
    y_train=y_train,
    criterion=loss_criterion,
    **config['training'],
)

# Accuracy metric handed to `model.evaluate` as its criterion.
def accuracy_criterion(outputs: torch.Tensor, targets: torch.Tensor) -> float:
    """Return the top-1 accuracy of `outputs` against `targets`, in percent.

    Args:
        outputs: per-class scores; the predicted class of each row is the
            index of its maximum along dimension 1.
        targets: true class indices, compared element-wise with the
            predictions; its first dimension is the number of samples.

    Returns:
        Share of matching predictions multiplied by 100.

    Raises:
        ZeroDivisionError: if `targets` has no elements along dimension 0.
    """
    with torch.no_grad():
        hits = outputs.argmax(dim=1).eq(targets).sum().item()
    return hits / targets.size(0) * 100

# Score the trained model on the held-out split with the accuracy criterion,
# processing the test data in batches of EVAL_BATCH_SIZE.
EVAL_BATCH_SIZE = 32
test_accuracy = model.evaluate(
    X_num_test=X_test_num,
    X_cat_test=X_test_cat,
    y_test=y_test,
    criterion=accuracy_criterion,
    batch_size=EVAL_BATCH_SIZE,
)
# Bare expression so the notebook still displays whatever `evaluate` returned.
test_accuracy


Iteration [0/611] | Loss: 0.6595
Iteration [50/611] | Loss: 0.2944
Iteration [100/611] | Loss: 0.2935
Iteration [150/611] | Loss: 0.2795
Iteration [200/611] | Loss: 0.3654
Iteration [250/611] | Loss: 0.5282
Iteration [300/611] | Loss: 0.3020
Iteration [350/611] | Loss: 0.4414
Iteration [400/611] | Loss: 0.3226
Iteration [450/611] | Loss: 0.2829
Iteration [500/611] | Loss: 0.2671
Iteration [550/611] | Loss: 0.3936
Iteration [600/611] | Loss: 0.3198
Epoch [1/10] | Loss: 0.3298 | Time: 7.30s
Iteration [0/611] | Loss: 0.2705
Iteration [50/611] | Loss: 0.2513
Iteration [100/611] | Loss: 0.3395
Iteration [150/611] | Loss: 0.3209
Iteration [200/611] | Loss: 0.1987
Iteration [250/611] | Loss: 0.2425
Iteration [300/611] | Loss: 0.2550
Iteration [350/611] | Loss: 0.3230
Iteration [400/611] | Loss: 0.3459
Iteration [450/611] | Loss: 0.2867
Iteration [500/611] | Loss: 0.3798
Iteration [550/611] | Loss: 0.2542
Iteration [600/611] | Loss: 0.2352
Epoch [2/10] | Loss: 0.2969 | Time: 6.36s
Iteration [0

In [3]:
# Sanity check: show the shape of the training numerical-feature tensor.
X_train_num.shape

torch.Size([39073, 6])