In [1]:
import sys, os
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn.functional as F
import torch.nn as nn
import math
from transformers import AutoConfig, AutoTokenizer, AutoModelForMaskedLM, AutoModel

os.environ["TOKENIZERS_PARALLELISM"] = "false"

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and every CUDA device), and switches cuDNN to its
    deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    import random  # local import keeps this definition self-contained

    # Only affects interpreters started after this point (e.g. spawned worker
    # processes); hash randomisation of the running interpreter is already fixed.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current one; this call is silently
    # ignored when CUDA is not available.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Fix all RNG seeds before any data or model code runs.
seed_everything(0)


# Load the training table; the path is relative to the current working directory.
df = pd.read_csv("data/spaceship_titanic_train.csv")
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [2]:
# Columns whose share of distinct values reaches this threshold are left out
# of the feature set.
col_exclude_threshold = 0.2

# Number of distinct values per column divided by the number of rows.
cols_distinct_rate = df.nunique().div(len(df))

# Keep only the column names whose distinct-value rate is below the threshold.
cols_to_use = cols_distinct_rate.loc[cols_distinct_rate < col_exclude_threshold].index.tolist()
cols_to_use

['HomePlanet',
 'CryoSleep',
 'Destination',
 'Age',
 'VIP',
 'RoomService',
 'FoodCourt',
 'ShoppingMall',
 'Spa',
 'VRDeck',
 'Transported']

In [3]:
from sklearn.preprocessing import LabelEncoder

# Names of the columns that end up integer-coded by a LabelEncoder.
cat_cols = []

for col in cols_to_use:
    column_dtype = df[col].dtype

    if column_dtype in ["object", "category"]:
        # Replace the raw values with integer class codes and remember the
        # column as categorical.
        df[col] = LabelEncoder().fit_transform(df[col])
        cat_cols.append(col)
        continue

    if column_dtype in ["bool"]:
        # Pure boolean columns become 0.0 / 1.0 floats.
        df[col] = df[col].astype("float")

df[cols_to_use].dtypes

HomePlanet        int64
CryoSleep         int64
Destination       int64
Age             float64
VIP               int64
RoomService     float64
FoodCourt       float64
ShoppingMall    float64
Spa             float64
VRDeck          float64
Transported     float64
dtype: object

In [4]:
# Boolean Series indexed by column name: True where the column has any null.
cols_with_nulls = df[cols_to_use].isnull().any()

for col in cols_with_nulls.loc[cols_with_nulls].index:
    if col not in cat_cols:
        # Numeric column: record where values were missing in a companion
        # indicator column, then impute the column mean.
        flag_col = f"{col}_is_null"
        df[flag_col] = df[col].isnull().astype("float")
        df[col] = df[col].fillna(df[col].mean())
        cols_to_use.append(flag_col)
        continue

    # Categorical column: missing values get their own code, one past the
    # current maximum.
    df[col] = df[col].fillna(df[col].max() + 1)

In [5]:
df[cols_to_use]

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Age_is_null,RoomService_is_null,FoodCourt_is_null,ShoppingMall_is_null,Spa_is_null,VRDeck_is_null
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8689,0,1,1,18.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8690,0,0,2,26.0,0,0.0,0.0,1872.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
8691,1,0,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.preprocessing import StandardScaler

# Every selected column that was not label-encoded is treated as numeric;
# this includes the "*_is_null" indicator columns appended to cols_to_use.
numeric_cols = [col for col in cols_to_use if col not in cat_cols]

# Standardise the numeric columns in place (StandardScaler defaults:
# zero mean, unit variance per column).
df[numeric_cols] = StandardScaler().fit_transform(df[numeric_cols])

In [7]:
class Config:
    """Hyper-parameters and runtime settings shared by the model and training code."""

    # Use the GPU when one is available; otherwise fall back to the CPU so the
    # notebook still runs on machines without CUDA.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    lr = 4e-4          # learning rate applied by adjust_lr
    shrink = 0.7       # width multiplier applied per encoder layer
    n_layers = 3       # number of Linear layers in the encoder and in the decoder
    emb_size = 8       # embedding dimension per categorical column
    batch_size = 128   # DataLoader batch size
    num_workers = 8    # DataLoader worker processes
    epochs = 50        # number of training epochs



def adjust_lr(optimizer, epoch):
    """Set the learning rate for the coming epoch and return it.

    The schedule is constant: every parameter group is set to ``Config.lr``.

    Args:
        optimizer: Optimizer whose ``param_groups`` are updated in place.
        epoch: Epoch index. Currently unused; kept so an epoch-dependent
            schedule can be introduced without touching callers.

    Returns:
        The learning rate that was applied.
    """
    lr = Config.lr
    # Update every group, not only the first, so optimizers built with several
    # parameter groups stay consistent.
    for group in optimizer.param_groups:
        group['lr'] = lr
    return lr


def get_optimizer(net, lr=None):
    """Build the Adam optimizer used for training.

    Args:
        net: Module whose parameters are optimised.
        lr: Initial learning rate. When omitted, ``Config.lr`` is used so the
            optimizer starts at the same rate the training schedule applies,
            instead of a second, diverging hard-coded value.

    Returns:
        A ``torch.optim.Adam`` instance with a single parameter group.
    """
    if lr is None:
        lr = Config.lr
    params = list(net.parameters())
    optimizer = torch.optim.Adam([{"params": params}], lr=lr, betas=(0.9, 0.999),
                                 eps=1e-08)
    return optimizer


class AEModel(nn.Module):
    """Autoencoder that reconstructs the numeric features.

    The input is the numeric feature vector concatenated with one learned
    embedding per categorical column. The encoder stacks ``Config.n_layers``
    blocks of ``Linear -> BatchNorm1d -> LeakyReLU``; the decoder mirrors the
    widths with blocks of ``BatchNorm1d -> LeakyReLU -> Linear`` and ends at
    ``len(numeric_cols)`` outputs.

    Layer widths and embedding cardinalities are read from the module-level
    ``numeric_cols``, ``cat_cols``, ``df`` and ``Config`` at construction time.
    """

    def __init__(self):
        super().__init__()
        n_numeric = len(numeric_cols)

        def layer_width(power):
            # The outermost width must equal the number of numeric features;
            # hidden widths are kept >= 1 so rounding cannot create an empty layer.
            if power == 0:
                return n_numeric
            return max(1, round(n_numeric * (Config.shrink ** power)))

        encoder_layers, decoder_layers = [], []

        inp_size = n_numeric
        for l in range(1, Config.n_layers + 1):
            out_size = layer_width(l)
            if l == 1:
                # Only the first layer sees the concatenated embeddings.
                inp_size += Config.emb_size * len(cat_cols)
            print(inp_size, out_size)
            encoder_layers.extend([nn.Linear(inp_size, out_size), nn.BatchNorm1d(out_size), nn.LeakyReLU()])
            inp_size = out_size

        self.encoder = nn.Sequential(*encoder_layers)

        for l in range(1, Config.n_layers + 1):
            out_size = layer_width(Config.n_layers - l)
            print(inp_size, out_size)
            decoder_layers.extend([nn.BatchNorm1d(inp_size), nn.LeakyReLU(), nn.Linear(inp_size, out_size)])
            inp_size = out_size

        self.decoder = nn.Sequential(*decoder_layers)
        # One embedding table per categorical column, sized to hold codes
        # 0..max. The max is cast to a plain int because pandas returns a
        # numpy scalar (or a float, if the column was ever upcast).
        self.embeddings = nn.ModuleList(
            [nn.Embedding(int(df[col].max()) + 1, Config.emb_size) for col in cat_cols]
        )

    def forward(self, x_numeric, x_cat):
        """Return the reconstruction of ``x_numeric``.

        Args:
            x_numeric: Float tensor of numeric features, one row per sample.
            x_cat: Integer tensor of categorical codes; column ``i`` is looked
                up in ``self.embeddings[i]``.
        """
        embedded = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x = torch.cat([x_numeric] + embedded, dim=1)

        x = self.encoder(x)
        return self.decoder(x)

    def save(self, path):
        """Write the model's state dict to ``path``."""
        torch.save(self.state_dict(), path)

    def load(self, path):
        """Load a state dict from ``path`` into this model.

        NOTE(review): ``torch.load`` unpickles the file; only load checkpoints
        from trusted sources.
        """
        self.load_state_dict(torch.load(path, map_location='cpu'))

        
# Build the autoencoder and place it on the configured device in one step
# (nn.Module.to returns the module itself).
model = AEModel().to(Config.device)

45 9
9 6
6 4
4 6
6 9
9 13


In [8]:
class AEDataset(Dataset):
    """Map-style dataset yielding ``(numeric_features, categorical_codes)`` per row.

    The numeric and categorical columns (module-level ``numeric_cols`` and
    ``cat_cols``) are converted to typed NumPy arrays once at construction, so
    fetching an item is a plain array lookup rather than a pandas row
    materialisation.
    """

    def __init__(self, df):
        # Kept as an attribute for callers that inspect the underlying frame.
        self.df = df.reset_index(drop=True)
        self._numeric = self.df[numeric_cols].to_numpy(dtype=np.float32)
        self._categorical = self.df[cat_cols].to_numpy(dtype=np.int64)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        # torch.tensor copies, so the returned tensors do not alias the cached arrays.
        return torch.tensor(self._numeric[index]), torch.tensor(self._categorical[index])

In [9]:
def train(model, df, epochs, n_batches):
    """Train ``model`` to reconstruct the numeric features of ``df``.

    Uses mean squared error between the model output and the numeric inputs,
    mixed precision when running on CUDA, gradient-norm clipping at 1.0 and
    gradient accumulation over ``n_batches`` mini-batches.

    Args:
        model: Module called as ``model(x_numeric, x_cat)``.
        df: DataFrame wrapped in an ``AEDataset``.
        epochs: Number of passes over the data.
        n_batches: Gradient-accumulation factor. The optimizer steps once every
            ``n_batches`` mini-batches, and once more at the end of an epoch
            when a partial group is left over.
    """
    use_cuda = Config.device == "cuda"

    train_ds = AEDataset(df)

    train_loader = DataLoader(train_ds, batch_size=Config.batch_size, shuffle=True,
                              num_workers=Config.num_workers, pin_memory=False, drop_last=True)
    n_steps = len(train_loader)

    # Honour the configured device instead of forcing CUDA; the device does not
    # change between epochs, so the model is moved once.
    model.to(Config.device)

    optimizer = get_optimizer(model)

    # Loss scaling is only needed for CUDA mixed precision; when disabled the
    # scaler's methods pass straight through to the optimizer.
    scaler = torch.cuda.amp.GradScaler(enabled=use_cuda)

    for e in range(epochs):
        model.train()
        tbar = tqdm(train_loader, file=sys.stdout)

        lr = adjust_lr(optimizer, e)

        loss_list = []

        optimizer.zero_grad()
        for idx, (x_numeric, x_cat) in enumerate(tbar):
            x_numeric, x_cat = x_numeric.to(Config.device), x_cat.to(Config.device)

            with torch.cuda.amp.autocast(enabled=use_cuda):
                pred = model(x_numeric, x_cat)
                error = (pred - x_numeric)**2
                loss = error.mean()

            # Scale so the accumulated gradient is an average over the group.
            loss = loss / n_batches
            scaler.scale(loss).backward()

            # Step on every full accumulation group, and also on the last batch
            # of the epoch so gradients from a trailing partial group are
            # applied rather than discarded by the next epoch's zero_grad().
            is_last_batch = (idx + 1) == n_steps
            if ((idx + 1) % n_batches) == 0 or is_last_batch:
                # Gradients must be unscaled before clipping by norm.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            loss_list.append(loss.item())
            # Undo the accumulation scaling for the displayed running average.
            avg_loss = np.round(n_batches * np.mean(loss_list), 4)

            tbar.set_description(f"Epoch {e + 1} Loss: {avg_loss} lr: {lr}")

# Gradient-accumulation batch count: if batches don't fit into memory,
# increase this and shrink the batch size.
n_batches = 1

train(model, df, epochs=Config.epochs, n_batches=n_batches)

Epoch 1 Loss: 1.1268 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 40.73it/s]
Epoch 2 Loss: 1.0358 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 59.35it/s]
Epoch 3 Loss: 0.9612 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 65.06it/s]
Epoch 4 Loss: 0.9269 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 55.35it/s]
Epoch 5 Loss: 0.9008 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 56.48it/s]
Epoch 6 Loss: 0.8752 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 62.64it/s]
Epoch 7 Loss: 0.8537 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 65.27it/s]
Epoch 8 Loss: 0.8354 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 55.32it/s]
Epoch 9 Loss: 0.8174 lr: 0.0004: 100%|██████████| 67/67 [00:01<00:00, 60.16it/s]
Epoch 10 Loss: 0.7988 lr: 0.0004: 100%|█████████| 67/67 [00:01<00:00, 58.38it/s]
Epoch 11 Loss: 0.7816 lr: 0.0004: 100%|█████████| 67/67 [00:01<00:00, 56.31it/s]
Epoch 12 Loss: 0.7672 lr: 0.0004: 100%|█████████| 67/67 [00:01<00:00, 63.43it/s]
Epoch 13 Loss: 0.7486 lr: 0.

In [10]:
def inference(model, df):
    """Compute per-feature squared reconstruction errors for every row of ``df``.

    Args:
        model: Module called as ``model(x_numeric, x_cat)``; it is moved to
            ``Config.device`` and switched to eval mode.
        df: DataFrame wrapped in an ``AEDataset``.

    Returns:
        A NumPy array with one row per row of ``df`` (in the original order)
        and one column per numeric feature. For an empty ``df`` an array of
        shape ``(0, len(numeric_cols))`` is returned.
    """
    ds = AEDataset(df)
    loader = DataLoader(ds, batch_size=Config.batch_size, shuffle=False,
                        num_workers=Config.num_workers, pin_memory=False, drop_last=False)

    # Inputs are sent to Config.device below, so the model must live there too.
    model.to(Config.device)
    model.eval()

    tbar = tqdm(loader, file=sys.stdout)

    errors = []

    with torch.no_grad():
        for x_numeric, x_cat in tbar:
            x_numeric, x_cat = x_numeric.to(Config.device), x_cat.to(Config.device)

            pred = model(x_numeric, x_cat)
            error = (pred - x_numeric)**2
            errors.append(error.cpu().numpy())

    if not errors:
        # np.concatenate raises on an empty list; return an empty result instead.
        return np.empty((0, len(numeric_cols)), dtype=np.float32)

    return np.concatenate(errors)


# Per-row score: squared reconstruction error averaged over the numeric features.
errors = inference(model, df)
df["reconstruction_error"] = errors.mean(axis=1)
# Ascending sort, so tail(5) shows the five rows with the largest error.
df.sort_values("reconstruction_error").tail(5)

100%|███████████████████████████████████████████| 68/68 [00:00<00:00, 68.23it/s]


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,VRDeck,Name,Transported,Age_is_null,RoomService_is_null,FoodCourt_is_null,ShoppingMall_is_null,Spa_is_null,VRDeck_is_null,reconstruction_error
7425,7941_01,1,0,C/257/P,2,0.848924,0,-0.34059,-0.214557,17.629008,...,-0.260198,Alrakan Apedishaft,-1.007274,-0.144997,-0.145822,-0.146643,-0.156569,-0.146643,-0.148676,26.045612
6223,6583_01,1,0,B/254/S,2,-1.103897,0,-0.34059,-0.224592,20.22031,...,-0.198426,Charda Sunlove,0.992779,-0.144997,-0.145822,-0.146643,-0.156569,-0.146643,-0.148676,29.138103
4416,4690_02,1,0,,2,-0.127486,0,21.376811,0.645358,-0.290817,...,-0.269023,Tope Dishocatal,-1.007274,-0.144997,-0.145822,-0.146643,-0.156569,-0.146643,-0.148676,32.275818
5619,5977_02,1,0,B/230/S,0,1.964822,0,-0.037422,-0.281669,-0.290817,...,21.027422,,-1.007274,-0.144997,-0.145822,-0.146643,-0.156569,6.819291,-0.148676,32.877277
8415,8989_01,1,0,B/291/P,2,-0.824923,0,-0.34059,-0.057752,39.034033,...,-0.266375,,0.992779,-0.144997,-0.145822,-0.146643,-0.156569,-0.146643,-0.148676,111.955864
