# Tabular PyTorch Entity Embeddings
_By Nick Brooks, 2019-10-30_

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
# from torch.utils.data import Dataset, DataLoader

# Use the GPU when PyTorch reports one as available; otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
print(device)

In [None]:
# Debug switch: flip the condition to True to read only the first 50k rows.
nrow = 50000 if False else None

train = pd.read_csv(
    "/kaggle/input/cat-in-the-dat/train.csv", index_col="id", nrows=nrow
)
test = pd.read_csv(
    "/kaggle/input/cat-in-the-dat/test.csv", index_col="id", nrows=nrow
)
submission_df = pd.read_csv("/kaggle/input/cat-in-the-dat/sample_submission.csv")

# Remember which ids belong to which split before the frames are merged.
traindex, testdex = train.index, test.index

target = train["target"].copy()
class_share = target.value_counts(normalize=True).mul(100).round(2).to_dict()
print("Class Distribution: ", class_share)

# Stack train (without the label) on top of test so both are encoded together.
df = pd.concat([train.drop(columns="target"), test], axis=0)
del train, test

# Every remaining column is treated as a categorical feature.
categorical = df.columns.tolist()

In [None]:
# Continuous
# Per-row fraction of the (still raw) feature columns whose value equals 0.
df['zero_count'] = (
    df[categorical].eq(0).sum(axis=1).div(df[categorical].shape[1])
)
continuous = ['zero_count']

In [None]:
# Categorical
# Encoder: replace every categorical column with integer codes produced by a
# per-column LabelEncoder fitted on the combined train + test rows.
for col in categorical:
    # Values that occur in the test rows but never in the train rows are
    # collapsed into a single 999 placeholder before encoding.
    train_values = set(df.loc[traindex, col].unique())
    diff = list(set(df.loc[testdex, col].unique()) - train_values)
    if diff:
        print("Column {} has {} unseen categories in test set".format(col, len(diff)))
        df.loc[df[col].isin(diff), col] = 999
    if df[col].dtypes == object:
        # The 999 placeholder is an int; make the column uniformly str.
        df[col] = df[col].astype(str)
    lbl = preprocessing.LabelEncoder()
    # Assign the encoded ndarray directly (positional). Wrapping it in
    # pd.Series would attach a fresh 0..N-1 index and pandas would align it
    # against the `id` index, leaving NaN for every id outside that range
    # (e.g. the test rows when only `nrows` rows of each file are loaded).
    df[col] = lbl.fit_transform(df[col].values)

In [None]:
# Separate the encoded frame back into its train and test rows.
train = df.loc[traindex]
test = df.loc[testdex]
y = target.loc[train.index]

# Train Test Split: hold out 30% of the training rows for validation.
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.3)

# Report the shape of each resulting table.
for table in (X_train, y_train, X_valid, y_valid):
    print(table.shape)

In [None]:
class TabularDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding categorical codes, continuous features and labels.

    Args:
        data: DataFrame containing every column named in ``cat_cols`` and
            ``cont_cols``.
        cat_cols: Names of the categorical columns; their values are cast to
            int64. ``None`` or empty means no categorical features.
        cont_cols: Names of the continuous columns; their values are cast to
            float32. ``None`` or empty means no continuous features.
        y: Optional labels, indexed positionally alongside the rows of
            ``data``. May be a pandas Series or any array-like. When ``None``
            the items carry no label.

    Each item is a list ``[cat, cont, label]`` (or ``[cat, cont]`` without
    labels). A feature group that has no columns is represented by a
    single-column array of zeros of the matching dtype.
    """

    def __init__(self, data, cat_cols=None, cont_cols=None, y=None):
        self.n = data.shape[0]
        self.y = y
        # Convert the labels to an ndarray once instead of on every item
        # access; this also lets plain arrays/lists be passed as ``y``.
        self._labels = None if y is None else np.asarray(y)

        self.cat_cols = cat_cols if cat_cols else []
        self.cont_cols = cont_cols if cont_cols else []

        if self.cont_cols:
            self.cont_X = data[self.cont_cols].astype(np.float32).values
        else:
            # Placeholder uses the same dtype as real continuous features.
            self.cont_X = np.zeros((self.n, 1), dtype=np.float32)

        if self.cat_cols:
            self.cat_X = data[self.cat_cols].astype(np.int64).values
        else:
            # Placeholder uses the same dtype as real categorical codes.
            self.cat_X = np.zeros((self.n, 1), dtype=np.int64)

    def __len__(self):
        """Return the number of rows in the dataset."""
        return self.n

    def __getitem__(self, idx):
        """Return ``[cat, cont, label]`` for row ``idx`` (no label if unlabeled)."""
        if self._labels is not None:
            return [self.cat_X[idx], self.cont_X[idx], self._labels[idx]]
        return [self.cat_X[idx], self.cont_X[idx]]

In [None]:
# Wrap each split in a TabularDataset; the test rows are passed without labels.
train_dataset = TabularDataset(X_train, categorical, continuous, y_train)
valid_dataset = TabularDataset(X_valid, categorical, continuous, y_valid)
submission_dataset = TabularDataset(test, categorical, continuous, None)

In [None]:
batch_size = 64

# Only the training loader shuffles; the validation and submission loaders
# iterate their datasets in order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
val_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=batch_size, shuffle=False
)
submission_loader = torch.utils.data.DataLoader(
    submission_dataset, batch_size=batch_size, shuffle=False
)

In [None]:
# Pull a single batch from the training loader to inspect what it yields.
next(iter(train_loader))

#### Prepare Embeddings

In [None]:
# Number of distinct values in each categorical column, in column order.
cat_dims = [int(n_unique) for n_unique in df[categorical].nunique()]
print(cat_dims)
# Pair every cardinality with an embedding width of half of it (rounded up),
# capped at 50.
emb_dims = [(n_cat, min(50, (n_cat + 1) // 2)) for n_cat in cat_dims]
print(emb_dims)

### Neural Net

In [None]:
class Net(nn.Module):
    """Feed-forward network over categorical embeddings and continuous inputs.

    Args:
        emb_dims: Sequence of ``(num_embeddings, embedding_dim)`` pairs, one
            per categorical column.
        no_of_cont: Number of continuous input features.
        lin_layer_sizes: Output width of each hidden linear layer, in order.
        output_size: Width of the final linear layer.
        emb_dropout: Dropout probability applied to the concatenated
            embeddings.
        lin_layer_dropouts: Dropout probability for each hidden layer.

    ``forward`` returns softmax probabilities over the last dimension, not
    raw logits.
    """

    def __init__(
        self,
        emb_dims,
        no_of_cont,
        lin_layer_sizes,
        output_size,
        emb_dropout,
        lin_layer_dropouts,
    ):
        super().__init__()

        # One embedding table per categorical column.
        self.emb_layers = nn.ModuleList(
            [nn.Embedding(n_categories, width) for n_categories, width in emb_dims]
        )
        self.no_of_embs = sum(width for _, width in emb_dims)
        self.no_of_cont = no_of_cont

        # Hidden linear stack: the first layer consumes the concatenated
        # embeddings plus the continuous features, each later layer consumes
        # the previous layer's output.
        layer_widths = [self.no_of_embs + self.no_of_cont, *lin_layer_sizes]
        self.lin_layers = nn.ModuleList(
            [
                nn.Linear(n_in, n_out)
                for n_in, n_out in zip(layer_widths, layer_widths[1:])
            ]
        )
        for hidden in self.lin_layers:
            nn.init.kaiming_normal_(hidden.weight.data)

        # Batch norm for the raw continuous inputs and for each hidden layer.
        self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
        self.bn_layers = nn.ModuleList(
            [nn.BatchNorm1d(width) for width in lin_layer_sizes]
        )

        # Dropout after the embeddings and after each hidden layer.
        # (The attribute name `droput_layers` is kept as spelled.)
        self.emb_dropout_layer = nn.Dropout(emb_dropout)
        self.droput_layers = nn.ModuleList(
            [nn.Dropout(rate) for rate in lin_layer_dropouts]
        )

        # Final projection; left at PyTorch's default initialisation.
        self.output_layer = nn.Linear(lin_layer_sizes[-1], output_size)

    def forward(self, cont_data, cat_data):
        """Return class probabilities; note continuous data is the FIRST argument."""
        has_embeddings = self.no_of_embs != 0
        has_continuous = self.no_of_cont != 0

        if has_embeddings:
            # Column i of cat_data is looked up in embedding table i.
            embedded = torch.cat(
                [table(cat_data[:, i]) for i, table in enumerate(self.emb_layers)],
                1,
            )
            x = self.emb_dropout_layer(embedded)

        if has_continuous:
            cont_normed = self.first_bn_layer(cont_data)
            x = torch.cat([x, cont_normed], 1) if has_embeddings else cont_normed

        # zip stops at the shortest of the three lists.
        hidden_stack = zip(self.lin_layers, self.droput_layers, self.bn_layers)
        for linear, dropout, batch_norm in hidden_stack:
            x = dropout(batch_norm(F.relu(linear(x))))

        logits = self.output_layer(x)
        return F.softmax(logits, dim=-1)

In [None]:
# Build the model on the selected device. The number of continuous inputs is
# taken from the `continuous` feature list (instead of a hard-coded 1) so the
# network stays in sync with the columns the datasets serve.
net = Net(
    emb_dims,
    no_of_cont=len(continuous),
    lin_layer_sizes=[50, 100],
    output_size=2,
    emb_dropout=0.04,
    lin_layer_dropouts=[0.001, 0.01],
).to(device)
net

In [None]:
def get_num_correct(preds, labels):
    """Return how many rows of ``preds`` have their arg-max column equal to ``labels``."""
    hits = preds.argmax(dim=1) == labels
    return hits.sum().item()

EPOCHS = 10
nn_output = []

# Loss Function
# Net.forward already ends in a softmax, i.e. it returns probabilities.
# nn.CrossEntropyLoss applies log-softmax itself and expects raw logits, so
# feeding it probabilities applies softmax twice. Take the log of the
# probabilities and use the plain negative log-likelihood instead.
criterion = nn.NLLLoss()
PROB_FLOOR = 1e-12  # keeps log() finite if a probability underflows to 0

# Gradient Descent
# NOTE(review): lr=1e-1 is far above Adam's documented default of 1e-3 --
# confirm this is intentional.
optimizer = optim.Adam(net.parameters(), lr=1e-1)

for epoch in range(EPOCHS):
    epoch_loss = 0
    epoch_correct = 0
    t_pred = []
    t_truth = []
    net.train()

    # Batch labels are named y_batch so the global `y` label Series is not
    # overwritten by the loop.
    for cat_x, cont_x, y_batch in train_loader:
        cont_x = cont_x.to(device)
        cat_x = cat_x.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        output = net(cont_x, cat_x)  # class probabilities for this batch
        tloss = criterion(torch.log(output.clamp(min=PROB_FLOOR)), y_batch)
        tloss.backward()  # back-propagate the loss through the network
        optimizer.step()  # update the weights from the gradients

        # Column 1 is kept as the score used for ROC AUC.
        t_pred.extend(output.detach().cpu().numpy()[:, 1])
        t_truth.extend(y_batch.cpu().numpy())

        # The criterion returns the batch MEAN; weight it by the batch size so
        # that dividing by the dataset size below yields a per-sample mean.
        epoch_loss += tloss.item() * y_batch.size(0)
        epoch_correct += get_num_correct(output, y_batch)

    # Evaluation with the validation set
    net.eval()  # eval mode
    val_loss = 0
    val_correct = 0
    v_pred = []
    v_truth = []

    with torch.no_grad():
        for cat_x, cont_x, y_batch in val_loader:
            cont_x = cont_x.to(device)
            cat_x = cat_x.to(device)
            y_batch = y_batch.to(device)

            preds = net(cont_x, cat_x)  # class probabilities
            vloss = criterion(torch.log(preds.clamp(min=PROB_FLOOR)), y_batch)
            v_pred.extend(preds.cpu().numpy()[:, 1])
            v_truth.extend(y_batch.cpu().numpy())

            val_correct += get_num_correct(preds, y_batch)
            val_loss += vloss.item() * y_batch.size(0)

    n_train = len(train_loader.dataset)
    n_valid = len(val_loader.dataset)
    tmp_nn_output = [
        epoch + 1,
        EPOCHS,
        epoch_loss / n_train,
        epoch_correct / n_train * 100,
        roc_auc_score(t_truth, t_pred),
        val_loss / n_valid,
        val_correct / n_valid * 100,
        roc_auc_score(v_truth, v_pred),
    ]
    nn_output.append(tmp_nn_output)

    # Print the loss, accuracy and ROC AUC for the train and validation sets
    print('Epoch [{}/{}] train loss: {:.6f} acc: {:.3f} rocauc: {:.3f} - valid loss: {:.6f} acc: {:.3f} rocauc: {:.3f}'
        .format(*tmp_nn_output))

In [None]:
# Collect the per-epoch metrics into a table and show it.
result_columns = [
    'epoch', 'total_epochs',
    'train_loss', 'train_acc', 'train_rocauc',
    'valid_loss', 'valid_acc', 'valid_rocauc',
]
pd_results = pd.DataFrame(nn_output, columns=result_columns)
display(pd_results)

# Row with the highest validation ROC AUC.
best_row = pd_results.loc[pd_results['valid_rocauc'].idxmax()]
print("Best Epoch: {}".format(best_row['epoch']))

In [None]:
# Left panel: loss per epoch; right panel: ROC AUC per epoch.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
for ax, metric in zip(axes, ('loss', 'rocauc')):
    ax.plot(pd_results['epoch'], pd_results['valid_' + metric], label='validation_' + metric)
    ax.plot(pd_results['epoch'], pd_results['train_' + metric], label='train_' + metric)

axes[0].legend()
axes[1].legend()

In [None]:
net.eval()  # Safety first: inference mode for dropout / batch norm

# Go through the test set, collecting the predictions batch by batch.
# torch.no_grad(): this is inference only. Without it each batch output keeps
# its autograd graph alive, and the accumulated tensor would hold all of them.
batch_preds = []
with torch.no_grad():
    for cat_x, cont_x in submission_loader:
        cont_x = cont_x.to(device)
        cat_x = cat_x.to(device)
        batch_preds.append(net(cont_x, cat_x))

# Concatenate once at the end instead of re-copying the growing tensor on
# every batch. An empty loader yields an empty tensor, as before.
if batch_preds:
    predictions = torch.cat(batch_preds, dim=0)
else:
    predictions = torch.Tensor().to(device)

In [None]:
# Column 1 of the network output is written as the `target` score for each test id.
positive_scores = predictions.detach().cpu().numpy()[:, 1]
submission = pd.DataFrame({'id': testdex, 'target': positive_scores})
submission.to_csv('submission.csv', index=False)
submission.head()