# Categorical Tabular PyTorch Classifier
_By Nick Brooks, 2020-01-10_

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import sys
# Lazily build the full path of every file below the input directory and echo each one.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# Any results you write to the current directory are saved as output.

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
from sklearn import metrics
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import ReduceLROnPlateau
# from torch.utils.data import Dataset, DataLoader

# Report the library and interpreter versions in use for this run.
print("\nPytorch Version: {}".format(torch.__version__))
print("Python Version: {}\n".format(sys.version))

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Pytorch Compute Device: {}".format(device))

from contextlib import contextmanager
import time
import gc
notebookstart = time.time()

@contextmanager
def timer(name):
    """
    Time a block of code and report how long it took.

    Use as ``with timer("label"): ...``. When the body finishes without
    raising, one line is printed containing the label and the elapsed time
    in minutes, rounded to two decimals. Nothing is printed if the body
    raises; the exception propagates unchanged.

    Parameters
    ----------
    name : label inserted into the printed message.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or
    # jump when the system clock is adjusted while the block is running.
    t0 = time.perf_counter()
    yield
    elapsed_minutes = round((time.perf_counter() - t0) / 60, 2)
    print('\n[{}] done in {} Minutes'.format(name, elapsed_minutes))

In [None]:
# Master random seed for the run.
seed = 50
# Apply the seed to the global NumPy and PyTorch generators right away;
# defining it without applying it leaves weight initialisation, dropout and
# DataLoader shuffling different on every run.
np.random.seed(seed)
torch.manual_seed(seed)

# Set to a truthy value to load only a sample of rows for quick iteration.
debug = None

# Row cap handed to pd.read_csv(nrows=...); None loads every row.
nrow = 20000 if debug else None

In [None]:
with timer("Load"):
    PATH = "/kaggle/input/cat-in-the-dat-ii/"

    # train and test are indexed by their 'id' column and share the row cap;
    # the sample submission is read in full with its default index.
    train = pd.read_csv(PATH + "train.csv", nrows=nrow, index_col='id')
    test = pd.read_csv(PATH + "test.csv", nrows=nrow, index_col='id')
    submission_df = pd.read_csv(PATH + "sample_submission.csv")
    for _frame in (train, test, submission_df):
        print(_frame.shape)

    # Remember which ids belong to which split before the frames are merged.
    traindex, testdex = train.index, test.index

    # Keep the label aside and show its class balance.
    y_var = train["target"].copy()
    print("Target Distribution:\n", y_var.value_counts(normalize=True).to_dict())

    # Stack the train features (label removed) on top of the test features so
    # that both splits go through the same feature engineering.
    train_features = train.drop(columns='target')
    df = pd.concat([train_features, test], axis=0)
    del train_features
    del train, test, submission_df

In [None]:
with timer("FE 1"):
    # NOTE(review): drop_cols is collected here but nothing in this cell removes
    # these columns from df, so they still reach the one-hot step below --
    # confirm whether they were meant to be dropped.
    drop_cols = ["bin_0"]

    # Split 2 Letters; This is the only part which is not generic and would actually require data inspection
    df["ord_5a"] = df["ord_5"].str[0]
    df["ord_5b"] = df["ord_5"].str[1]
    drop_cols.append("ord_5")

    xor_cols = []
    nan_cols = []
    for col in df.columns:
        # Null values: object columns get the placeholder "NAN", all others -1.
        tmp_null = df[col].isnull().sum()
        if tmp_null > 0:
            print("{} has {} missing values.. Filling".format(col, tmp_null))
            nan_cols.append(col)
            fill_value = "NAN" if df[col].dtype == "O" else -1
            # Assign the filled column back: an in-place fillna on the result
            # of df.loc[:, col] acts on a temporary object and is not
            # guaranteed to reach df (it never does under Copy-on-Write).
            df[col] = df[col].fillna(fill_value)

        # Categories that do not overlap: values seen in only one of the two
        # splits are collapsed into the single placeholder level "xor".
        train_vals = set(df.loc[traindex, col].unique())
        test_vals = set(df.loc[testdex, col].unique())

        xor_cat_vals = train_vals ^ test_vals
        if xor_cat_vals:
            df.loc[df[col].isin(xor_cat_vals), col] = "xor"
            print("{} has {} xor factors, {} rows".format(col, len(xor_cat_vals),df.loc[df[col] == 'xor',col].shape[0]))
            xor_cols.append(col)

    # One Hot Encode None-Ordered Categories: every column not listed in
    # ordinal_cols is dummy-encoded (first level dropped) and kept sparse.
    ordinal_cols = ['ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5a', 'day', 'month']
    X_oh = df[df.columns.difference(ordinal_cols)]
    oh1 = pd.get_dummies(X_oh, columns=X_oh.columns, drop_first=True, sparse=True)
    ohc1 = oh1.sparse.to_coo()

In [None]:
from sklearn.base import TransformerMixin
from itertools import repeat
import scipy

class ThermometerEncoder(TransformerMixin):
    """
    Thermometer (cumulative) encoder for a single ordinal pandas Series.

    ``fit`` ranks the distinct values of the series in the order given by
    ``sort_key``. ``transform`` returns a sparse int8 matrix in which the row
    for a value of rank ``r`` holds ones in columns ``0 .. r-1``. The matrix
    has one column per fitted value, so its last column never receives a one.

    Assumes all values are known at fit: a value missing from the fitted
    mapping becomes NaN in ``transform``, fails every rank comparison and is
    therefore encoded as an all-zero row.

    Parameters
    ----------
    sort_key : optional one-argument callable passed as ``key`` to ``sorted``
        when ranking the distinct values; None uses their natural order.
    """
    def __init__(self, sort_key=None):
        self.sort_key = sort_key
        # Mapping value -> rank, populated by fit().
        self.value_map_ = None

    def fit(self, X, y=None):
        """Learn the value -> rank mapping from the distinct values of X; returns self."""
        self.value_map_ = {val: i for i, val in enumerate(sorted(X.unique(), key=self.sort_key))}
        return self

    def transform(self, X, y=None):
        """Return the thermometer encoding of X as a scipy COO matrix of shape (len(X), n_fitted_values)."""
        values = X.map(self.value_map_)

        possible_values = sorted(self.value_map_.values())

        # Seed both lists with an empty array so np.concatenate also works
        # when there are fewer than two fitted values (loop body never runs).
        row_parts = [np.empty(0, dtype=np.intp)]
        col_parts = [np.empty(0, dtype=np.intp)]

        # Column `idx` is switched on for every row whose rank exceeds `val`.
        # Indices are gathered as NumPy arrays rather than by extending Python
        # lists one element at a time, which is slow and memory hungry when
        # the column has many rows.
        for idx, val in enumerate(possible_values[:-1]):
            rows = np.flatnonzero((values > val).to_numpy())
            row_parts.append(rows)
            col_parts.append(np.full(rows.shape[0], idx, dtype=np.intp))

        row_idx = np.concatenate(row_parts)
        col_idx = np.concatenate(col_parts)
        ones = np.ones(row_idx.shape[0], dtype="int8")

        result = scipy.sparse.coo_matrix((ones, (row_idx, col_idx)), shape=(len(X), len(possible_values)), dtype="int8")

        return result

In [None]:
# Placeholder levels introduced during feature engineering; they are ranked
# below every genuine level of an ordinal column.
other_classes = ["NAN", 'xor']


def _placeholders_first(value):
    """Sort key for string-coded ordinals: placeholder levels first, then real levels by their string form."""
    label = str(value)
    if label in other_classes:
        return (0, other_classes.index(label), label)
    return (1, 0, label)


with timer("Thermometer Encoder"):
    # Ranking rule per ordinal column. ord_1 and ord_2 use an explicit level
    # list with the placeholders in front. The string-coded columns previously
    # used plain `str`, which ranks "NAN" and "xor" wherever their spelling
    # falls among the level codes (for instance "NAN" above any code starting
    # with "A".."M") instead of below all of them; _placeholders_first makes
    # them consistent with ord_1/ord_2.
    # NOTE(review): assumes ord_3/ord_4/ord_5a hold letter codes -- confirm
    # against the data.
    sort_keys = {
        "ord_1": (other_classes + ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']).index,
        "ord_2": (other_classes + ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']).index,
        "ord_3": _placeholders_first,
        "ord_4": _placeholders_first,
        "ord_5a": _placeholders_first,
        "day": int,
        "month": int,
    }

    thermos = []
    for col in ordinal_cols:
        if col not in sort_keys:
            # An ordinal column without a ranking rule is a configuration error.
            raise ValueError(col)
        sort_key = sort_keys[col]

        enc = ThermometerEncoder(sort_key=sort_key)
        thermos.append(enc.fit_transform(df[col]))

In [None]:
# Place the one-hot block and every thermometer block side by side, then
# convert to CSR so the result can be sliced by row.
ohc = scipy.sparse.hstack([ohc1, *thermos]).tocsr()
display(ohc)

# The first len(traindex) rows are the training rows; the remainder is test.
n_train_rows = len(traindex)
X_sparse, test_sparse = ohc[:n_train_rows], ohc[n_train_rows:]

for _part in (X_sparse, test_sparse):
    print(_part.shape)

# The combined matrix is no longer needed once both slices exist.
del ohc
gc.collect()

In [None]:
# Train Test Split
# Hold out 20% of the training rows for validation. random_state ties the
# shuffle to the notebook-wide seed so the same rows are held out on every run.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_sparse, y_var, test_size=0.2, shuffle=True, random_state=seed
)

for table in (X_train, y_train, X_valid, y_valid):
    print(table.shape)

In [None]:
class TabularDataset(torch.utils.data.Dataset):
    """
    Map-style dataset that serves one row of a feature matrix at a time.

    Each row is densified on access via ``.toarray()``; when targets are
    supplied, the matching target is returned alongside it as a float.

    Parameters
    ----------
    data : matrix supporting ``.shape`` and row indexing whose rows expose
        ``.toarray()`` (presumably a scipy CSR matrix -- confirm with caller).
    y : optional targets aligned by position with the rows of ``data``;
        None for unlabeled data.
    """
    def __init__(self, data, y=None):
        self.n = data.shape[0]
        self.y = y
        self.X = data
        # Convert the targets to a float array once. Casting inside
        # __getitem__ would copy the entire target vector for every single
        # item fetched.
        self._targets = None if y is None else np.asarray(y, dtype=float)

    def __len__(self):
        """Number of rows in the feature matrix."""
        return self.n

    def __getitem__(self, idx):
        """Return ``[dense_row, target]`` when targets were given, else ``[dense_row]``."""
        features = self.X[idx].toarray()
        if self._targets is None:
            return [features]
        return [features, self._targets[idx]]

In [None]:
# Labeled datasets for the training and validation splits.
train_dataset = TabularDataset(X_train, y_train)
valid_dataset = TabularDataset(X_valid, y_valid)
# The test rows have no labels, so this dataset yields features only.
submission_dataset = TabularDataset(test_sparse)

In [None]:
batch_size = 16384

# Only the training loader reshuffles its rows each epoch.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, shuffle=True, batch_size=batch_size)

# Validation and submission batches keep the dataset's row order.
val_loader = torch.utils.data.DataLoader(
    dataset=valid_dataset, shuffle=False, batch_size=batch_size)

submission_loader = torch.utils.data.DataLoader(
    dataset=submission_dataset, shuffle=False, batch_size=batch_size)

In [None]:
# Smoke test: fetch a single batch from the training loader so the notebook
# displays what the loader actually yields.
next(iter(train_loader))

In [None]:
class Net(nn.Module):
    """
    Fully connected binary classifier: three hidden layers and a sigmoid output.

    Each hidden stage applies Linear -> ReLU -> Dropout -> BatchNorm1d, with
    widths 4096, 2048 and 64. The final Linear layer maps to one unit whose
    sigmoid is returned, so outputs lie in [0, 1].

    Parameters
    ----------
    dropout : drop probability shared by all three dropout layers.
    in_features : width of the input vectors. When None (the default) it is
        read from the module-level ``X_sparse`` matrix, which keeps ``Net()``
        working exactly as before.
    """
    def __init__(self, dropout = .60, in_features=None):
        super().__init__()
        self.dropout = dropout

        if in_features is None:
            # Backward-compatible default: size the first layer from the
            # notebook's training matrix.
            in_features = X_sparse.shape[1]

        self.fc1 = nn.Linear(in_features, 4096)
        self.d1 = nn.Dropout(p=self.dropout)
        self.bn1 = nn.BatchNorm1d(num_features=4096)
        self.fc2 = nn.Linear(4096, 2048)
        self.d2 = nn.Dropout(p=self.dropout)
        self.bn2 = nn.BatchNorm1d(num_features=2048)
        self.fc3 = nn.Linear(2048, 64)
        self.d3 = nn.Dropout(p=self.dropout)
        self.bn3 = nn.BatchNorm1d(num_features=64)
        self.fc4 = nn.Linear(64, 1)
        self.out_act = nn.Sigmoid()

    def forward(self, x):
        """Map a (batch, in_features) float tensor to a (batch, 1) tensor of probabilities."""
        x = F.relu(self.fc1(x))
        x = self.d1(x)
        x = self.bn1(x)
        x = F.relu(self.fc2(x))
        x = self.d2(x)
        x = self.bn2(x)
        x = F.relu(self.fc3(x))
        x = self.d3(x)
        x = self.bn3(x)
        x = self.fc4(x)
        x = self.out_act(x)
        return x

In [None]:
# Build the classifier with its default dropout and move its parameters to the
# selected compute device. The trailing expression also makes the notebook
# display the module.
net = Net()
net.to(device)

### Let's follow a recipe:

1. Fix random seed.
1. Do not trust learning rate decay defaults - Check, remove nesterov and momentum.
1.

In [None]:
import copy  # used below to snapshot the best weights

# Training configuration: Adam with a fixed learning rate, binary cross-entropy
# on the network's sigmoid output, at most EPOCHS passes over the training set,
# and early stopping on the (rounded) mean validation loss.
learning_rate = 0.01
# https://github.com/ncullen93/torchsample/blob/master/README.md
# optimizer = torch.optim.SGD(net.parameters(), lr=learning_rate, momentum=0, nesterov=0)
# scheduler = ReduceLROnPlateau(optimizer, min_lr = 0.00001, mode='min', factor=0.5, patience=3, verbose=True)

EPOCHS = 50

criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters(), lr=learning_rate)

nn_output = []          # one summary row per epoch
patience = 0            # epochs elapsed since the validation loss last improved
min_val_loss = np.inf   # np.Inf was removed in NumPy 2.0; np.inf works on every version

full_train_loss = []    # per-batch training losses, one list per epoch
full_val_loss = []      # per-batch validation losses, one list per epoch

for epoch in range(EPOCHS):  # up to EPOCHS full passes over the data
    train_loss = []
    train_metric_pred = []
    train_metric_label = []
    net.train()

    for data in train_loader:  # `data` is a batch of data
        # X is the batch of features, y is the batch of targets. The dataset
        # yields rows shaped (1, n_features), hence squeeze(1). Targets are
        # cast to float32 so their dtype matches the network output, which
        # BCELoss requires.
        X = data[0].to(device).squeeze(1).float()
        y = data[1].to(device).float()
        optimizer.zero_grad()  # sets gradients to 0 before loss calc
        # squeeze(1) drops only the output-unit axis; a bare squeeze() would
        # also collapse the batch axis of a batch holding a single row.
        output = net(X).squeeze(1)
        tloss = criterion(output, y)  # calc and grab the loss value

        tloss.backward()  # apply this loss backwards thru the network's parameters
        optimizer.step()  # attempt to optimize weights to account for loss/gradients

        train_loss.append(tloss.item())
        train_metric_pred.append(output.detach().cpu().numpy())
        train_metric_label.append(y.cpu().numpy())

    train_metric_score = metrics.roc_auc_score(np.concatenate(train_metric_label), np.concatenate(train_metric_pred))
    full_train_loss.append(train_loss)

    # Evaluation with the validation set
    net.eval()  # eval mode
    val_loss = []
    val_metric_pred = []
    val_metric_label = []

    with torch.no_grad():
        for data in val_loader:
            X = data[0].to(device).squeeze(1).float()
            y = data[1].to(device).float()

            preds = net(X).squeeze(1)  # get predictions
            vloss = criterion(preds, y)  # calculate the loss

            val_loss.append(vloss.item())
            val_metric_pred.append(preds.detach().cpu().numpy())
            val_metric_label.append(y.cpu().numpy())

    val_metric_score = metrics.roc_auc_score(np.concatenate(val_metric_label), np.concatenate(val_metric_pred))
    full_val_loss.append(val_loss)

    mean_val_loss = np.mean(val_loss)
    tmp_nn_output = [epoch + 1, EPOCHS,
                     np.mean(train_loss),
                     train_metric_score,
                     mean_val_loss,
                     val_metric_score
                     ]
    nn_output.append(tmp_nn_output)

    # ReduceLROnPlateau
#     scheduler.step(mean_val_loss)

    # Print the loss and metric for the training and validation sets
    print('Epoch [{}/{}] train loss: {:.4f} train metric: {:.4f} valid loss: {:.4f} val metric: {:.4f}'
        .format(*tmp_nn_output))

    # Early Stopping
    if min_val_loss > round(mean_val_loss, 4):
        min_val_loss = round(mean_val_loss, 4)
        patience = 0
        # Checkpoint Best Model so far. state_dict() hands back references to
        # the live tensors and dict.copy() is shallow, so the snapshot must be
        # deep-copied; otherwise it silently follows every later update and
        # ends up holding the last epoch's weights rather than the best ones.
        checkpoint = {'model': Net(),
                      'state_dict': copy.deepcopy(net.state_dict()),
                      'optimizer': copy.deepcopy(optimizer.state_dict())}
    else:
        patience += 1

    if patience > 6:
        print("Early Stopping..")
        break

In [None]:
# Plot loss by batch.. √
# Checkpoint √
# Try batch norm.. √
# Examine Gradients
# Try Adam 3e-4 
# Fix Seed
# Fix decay/ momentum

In [None]:
# One row per completed epoch, in the order the values were recorded in nn_output.
result_columns = ['epoch', 'total_epochs', 'train_loss', 'train_metric', 'valid_loss', 'valid_metric']
pd_results = pd.DataFrame(data=nn_output, columns=result_columns)
display(pd_results)

In [None]:
# Flatten the per-epoch lists of batch losses into one sequence per split.
train_batch_loss = np.concatenate(full_train_loss)
val_batch_loss = np.concatenate(full_val_loss)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))

# Left panel: loss per batch. Each curve carries the label of the data it
# actually shows (the two labels used to be swapped).
axes[0].plot(train_batch_loss, label='train_loss')
axes[0].plot(val_batch_loss, label='validation_loss')
axes[0].set_title("Loss")

axes[0].legend()

# Right panel: ROC AUC per epoch for both splits.
axes[1].plot(pd_results['epoch'],pd_results['valid_metric'], label='Val')
axes[1].plot(pd_results['epoch'],pd_results['train_metric'], label='Train')
# axes[1].plot(pd_results['epoch'],pd_results['test_acc'], label='test_acc')
axes[1].set_title("Roc_AUC Score")
axes[1].legend()
plt.show()

In [None]:
# Load Best Model
# checkpoint['model'] is the Net instance stored with the checkpoint; move it
# to the compute device, then overwrite its parameters with the saved state
# dict. The trailing expression makes the notebook display the value returned
# by load_state_dict.
net = checkpoint['model'].to(device)
net.load_state_dict(checkpoint['state_dict'])

In [None]:
net.eval() # Safety first

# Go through the test set, saving the predictions in... 'predictions'
batch_predictions = []
# Inference only: without no_grad every batch would keep its autograd graph
# alive for as long as its predictions are held.
with torch.no_grad():
    for data in submission_loader:
        X = data[0].squeeze(1).float()
        # squeeze(1) keeps the batch axis even when the final batch holds a
        # single row, so every entry stays one-dimensional for torch.cat.
        batch_predictions.append(net(X.to(device)).squeeze(1))

# Concatenate once at the end instead of re-copying the growing tensor on
# every iteration; an empty loader still yields an empty tensor on the device.
if batch_predictions:
    predictions = torch.cat(batch_predictions)
else:
    predictions = torch.Tensor().to(device)

In [None]:
# Bring the predictions back to host memory as a NumPy array and pair them
# with the test ids.
target_scores = predictions.detach().cpu().numpy()
submission = pd.DataFrame({'id': testdex, 'target': target_scores})
# Write the submission file without the frame's own index, then preview it.
submission.to_csv('submission.csv', index=False)
submission.head()