In [1]:
!pip install tab-transformer-pytorch

In [2]:
import pandas as pd
import numpy as np
import torch
import wandb
import shap
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from tab_transformer_pytorch import TabTransformer

# Load SHAP's JavaScript support into the notebook so that the interactive
# force plots produced further down can be rendered.
shap.initjs()

# XGBoost

In [9]:
# Load the match table and separate the feature columns from the target.
df = pd.read_csv('../input/league/dataset.csv')
X = df.drop(columns=['gameId','winner'])
Y = df[['winner']]
del df  # the raw frame is not needed once X / Y exist

# Fixed seed: without it every kernel restart produces a different 80/20
# split, so none of the numbers below would be reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# The evaluation metric is configured on the estimator itself; passing
# `eval_metric` to `fit()` is deprecated and rejected by newer XGBoost releases.
model = XGBClassifier(
    n_estimators=150,
    learning_rate=0.14,
    use_label_encoder=False,
    eval_metric='error',
)
# Both splits are passed as evaluation sets so the classification error is
# reported for train and test after every boosting round.
model.fit(
    X_train,
    Y_train.values.ravel(),
    eval_set=[(X_train, Y_train), (X_test, Y_test)],
    verbose=True,
)

In [10]:
# Predict on both splits with the fitted classifier.
Y_pred_train = model.predict(X_train)
Y_pred_test = model.predict(X_test)

# Share of correctly classified rows in each split.
accuracy_train = accuracy_score(Y_train, Y_pred_train)
accuracy_test = accuracy_score(Y_test, Y_pred_test)

print(f"Test accuracy: {accuracy_test * 100.0:.2f}%")
print(f"Train accuracy: {accuracy_train * 100.0:.2f}%")

# Build a SHAP explainer for the model and explain every training row;
# `shap_values` is what the plotting cells below consume.
explainer = shap.Explainer(model)
shap_values = explainer(X_train)

In [11]:
# Inspect one SHAP entry: take explanation 0, then index 1 within it
# (presumably the second feature's contribution for the first training row — confirm).
print(shap_values[0][1])

In [23]:
# Index into `shap_values` of the explanation to display (0 = first entry).
k = 0
# Visualize that prediction's explanation as a waterfall plot.
shap.plots.waterfall(shap_values[k])
# Visualize the same explanation as a force plot; this is the cell's last
# expression, so its return value is what the notebook displays.
shap.plots.force(shap_values[k])

In [13]:
# Scatter plot of the SHAP values of the `team_1_lp_mean` feature across all
# explained rows. The full explanation is passed as `color`
# (presumably so SHAP can choose the feature used for colouring — confirm).
shap.plots.scatter(shap_values[:,"team_1_lp_mean"], color=shap_values)

In [18]:
# Force plot over a 500-entry slice of the explanations (indices 3000-3499).
# NOTE(review): assumes the explained split has at least 3500 rows — confirm.
shap.plots.force(shap_values[3000:3500])

# Neural Network

In [6]:
class BasicBlock(torch.nn.Module):
    """Dense building block: Linear -> BatchNorm1d -> ReLU -> Dropout.

    Args:
        input_features: width of the incoming feature vector.
        output_features: width produced by the linear layer (and normalised
            by the batch-norm layer).
    """

    def __init__(self, input_features, output_features):
        super().__init__()
        layers = [
            torch.nn.Linear(input_features, output_features),
            torch.nn.BatchNorm1d(output_features),
            torch.nn.ReLU(),
            torch.nn.Dropout(),  # library-default drop probability
        ]
        # The attribute stays named `nn` so parameter names remain `nn.<idx>.*`.
        self.nn = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the four stacked layers and return the result."""
        return self.nn(x)
    

class SimpleFeedForward(torch.nn.Module):
    """Small MLP: two ``BasicBlock`` stages (64 then 32 units) and a 1-unit linear head.

    Args:
        input_features: number of input features per sample (default 66).
    """

    def __init__(self, input_features=66):
        super().__init__()
        stages = []
        width_in = input_features
        # Hidden stages are created in order so parameter names stay `nn.0`, `nn.1`.
        for width_out in (64, 32):
            stages.append(BasicBlock(width_in, width_out))
            width_in = width_out
        # Final projection to a single raw output per sample.
        stages.append(torch.nn.Linear(width_in, 1))
        self.nn = torch.nn.Sequential(*stages)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        return self.nn(x)

class TorchDataset(torch.utils.data.Dataset):
    """Dataset over the match CSV: scaled, imputed features and the `winner` target.

    Args:
        file: path of the CSV to read. The `gameId` and `winner` columns are
            removed from the features; `winner` becomes the target.
    """

    def __init__(self, file='../input/league/dataset.csv'):
        frame = pd.read_csv(file)
        features = frame.drop(columns=['gameId','winner']).values
        target = frame[['winner']].values.astype(np.double)

        # Standardise the features first, then replace remaining NaNs with the
        # column mean. The target goes through the same mean imputation.
        standardised = StandardScaler().fit_transform(features)
        self.X = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(standardised)
        self.Y = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(target)

    def __len__(self):
        """Number of rows in the feature matrix."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the feature row and target row at ``idx`` as a dict."""
        return dict(input=self.X[idx], output=self.Y[idx])

# Parameters
batch_size = 8
learning_rate = 5e-4
epoch = 100
criterion = torch.nn.BCEWithLogitsLoss()
log = True


# Dataset: 80/20 random split. The generator is seeded so the same rows land
# in the same split on every run.
dataset = TorchDataset()
n = len(dataset)
n_test = int(0.2*n)
n_train = n - n_test
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [n_train,n_test],
    generator=torch.Generator().manual_seed(42),
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)
# Evaluation only accumulates sums over the split, so shuffling is unnecessary.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False
)

# Model: the input width is read from the data instead of relying on the
# hard-coded default, so a CSV with a different number of feature columns works.
model = SimpleFeedForward(input_features=dataset.X.shape[1])
# The dataset yields float64 arrays, so the weights are cast to double to match.
model.double()

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

def metrics(pred,target):
    """Return ``(accuracy, precision, recall, f1)`` for ``pred`` against ``target``.

    Each scorer is called as ``scorer(target, pred)``, i.e. ground truth first.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(target, pred) for scorer in scorers)

def visualize_embedding(model,dataset,table=None):
    """Scatter-plot a PCA projection of the model's embedding weights.

    Args:
        model: object exposing ``embedding.weight`` (one row per encoded class).
        dataset: object exposing ``le.classes_``; entry ``i`` is the class whose
            embedding is row ``i``.
        table: optional lookup from a class value to the text drawn next to its
            point. When omitted, a module-level ``table`` is used if one exists
            (the previous behaviour); if there is none, the class value itself
            is used as the label instead of failing with a ``NameError``.
    """
    # Visualize embedding
    pca = PCA(3)

    emb = model.embedding.weight.cpu().detach().numpy()
    # Three components are fitted, but only the first two are plotted.
    emb_proj = pca.fit_transform(emb)

    if table is None:
        table = globals().get('table')

    plt.scatter(emb_proj[:,0],emb_proj[:,1])
    for i, pt in enumerate(emb_proj):
        class_value = dataset.le.classes_[i]
        label = table[class_value] if table is not None else str(class_value)
        # Small offset so the text does not sit on top of the marker.
        plt.text(pt[0]+0.03,pt[1]-0.03, label, fontsize=11)
    plt.show()

# Wandb: start a run and register the model with it when logging is enabled.
if log:
    wandb.init(project="league")
    wandb.watch(model)

# One pass over the training split followed by one pass over the test split
# per epoch; per-sample averages are logged at the end of every epoch.
for k in tqdm(range(epoch)):
    # Running sums for this epoch (loss summed over samples, correct counts).
    train_loss, test_loss, train_acc, test_acc = 0,0,0,0
    # Train
    model.train()
    for batch in train_loader:
        X = batch['input']
        Y = batch['output']
        pred = model(X)
        loss = criterion(pred,Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Add metrics
        train_acc += torch.sum((torch.sigmoid(pred)>0.5).detach() == Y).item()
        # BCEWithLogitsLoss() returns the batch mean by default. Weight it by the
        # batch size so that dividing by the sample count below gives the
        # true per-sample loss (the last batch may be smaller).
        train_loss += loss.item() * Y.shape[0]
    # Test
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            X = batch['input']
            Y = batch['output']
            pred = model(X)
            loss = criterion(pred,Y)

            # Add metrics
            test_acc += torch.sum((torch.sigmoid(pred)>0.5).detach() == Y).item()
            test_loss += loss.item() * Y.shape[0]

    # Log: turn the sums into per-sample averages.
    train_loss /= len(train_dataset)
    train_acc /= n_train
    test_loss /= len(test_dataset)
    test_acc /= n_test
    if log:
        wandb.log({
            "loss_train" : train_loss,
            "loss_test" : test_loss,
            "acc_train" : train_acc,
            "acc_test" : test_acc,
        })

# TabTransformer

In [3]:
class TorchDataset(torch.utils.data.Dataset):
    """Dataset over the match CSV, split into categorical and continuous inputs.

    After removing `gameId` and `winner`, the first ``n_categorical`` columns
    are treated as categorical and label-encoded per column; the remaining
    columns are treated as continuous, standardised and mean-imputed.

    Args:
        file: path of the CSV to read.
        n_categorical: number of leading feature columns that are categorical
            (default 60, the value that used to be hard-coded).

    Attributes:
        cat: integer array of encoded categorical columns.
        cont: float array of processed continuous columns.
        Y: float array holding the (mean-imputed) `winner` column.
        cat_sizes: number of distinct classes found in each categorical column.

    Raises:
        ValueError: if ``n_categorical`` is negative or exceeds the number of
            feature columns.
    """

    def __init__(self,file='../input/league-cont-cat/dataset.csv',n_categorical=60):
        # Get input
        df = pd.read_csv(file)
        self.X = df.drop(columns=['gameId','winner']).values
        self.Y = df[['winner']].values.astype(np.double)
        if not 0 <= n_categorical <= self.X.shape[1]:
            raise ValueError(
                f"n_categorical={n_categorical} is outside 0..{self.X.shape[1]} feature columns"
            )
        # Basic slices: `self.cat` is a view, so the encoded ids written below
        # also land in `self.X`.
        self.cat = self.X[:,:n_categorical]
        self.cont = self.X[:,n_categorical:]

        # Process each inputs: standardise, then fill remaining NaNs with the
        # column mean; the target goes through the same mean imputation.
        scaler = StandardScaler()
        imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
        self.cont = scaler.fit_transform(self.cont)
        self.cont = imputer.fit_transform(self.cont)
        self.Y = imputer.fit_transform(self.Y)

        # Encode every categorical column independently (values compared as
        # strings) and remember how many classes each column has.
        self.cat_sizes = []
        for k in range(self.cat.shape[1]):
            le = LabelEncoder()
            self.cat[:,k] = le.fit_transform(self.cat[:,k].astype(str))
            self.cat_sizes.append(len(le.classes_))

        self.cat = self.cat.astype(int)

    def __len__(self):
        """Number of rows in the dataset."""
        return self.X.shape[0]

    def __getitem__(self,idx):
        """Return the continuous row, categorical row and target at ``idx``."""
        return {
            'cont': self.cont[idx],
            'cat': self.cat[idx],
            'output': self.Y[idx]
        }

# Parameters
batch_size = 32
learning_rate = 3e-4
epoch = 100
criterion = torch.nn.BCEWithLogitsLoss()
log = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset: 80/20 random split. The generator is seeded so the same rows land
# in the same split on every run.
dataset = TorchDataset()
n = len(dataset)
n_test = int(0.2*n)
n_train = n - n_test
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [n_train,n_test],
    generator=torch.Generator().manual_seed(42),
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)
# Evaluation only accumulates sums over the split, so shuffling is unnecessary.
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False
)

# Model
model = TabTransformer(
    categories = tuple(dataset.cat_sizes),    # tuple containing the number of unique values within each category
    num_continuous = dataset.cont.shape[1],   # number of continuous columns, read from the data rather than hard-coded
    dim = 16,                           # dimension, paper set at 32
    dim_out = 1,                        # binary prediction, but could be anything
    depth = 6,                          # depth, paper recommended 6
    heads = 8,                          # heads, paper recommends 8
    attn_dropout = 0.6,                 # post-attention dropout
    ff_dropout = 0.6,                   # feed forward dropout
    mlp_hidden_mults = (4, 2),          # relative multiples of each hidden dimension of the last mlp to logits
    mlp_act = torch.nn.ReLU(),          # activation for final mlp, defaults to relu, but could be anything else (selu etc)
)
# The dataset yields float64 continuous features, so the weights are cast to match.
model.double()
model.to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

# Wandb: start a run and register the model with it when logging is enabled.
if log:
    wandb.init(project="league")
    wandb.watch(model)

# One pass over the training split followed by one pass over the test split
# per epoch; per-sample averages are logged (or printed) after every epoch.
for k in range(epoch):
    # Running sums for this epoch (loss summed over samples, correct counts).
    train_loss, test_loss, train_acc, test_acc = 0,0,0,0
    # Train
    model.train()
    for batch in tqdm(train_loader):
        cat = batch['cat'].to(device)
        cont = batch['cont'].to(device)
        Y = batch['output'].to(device)
        pred = model(cat,cont)
        loss = criterion(pred,Y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Add metrics
        train_acc += torch.sum((torch.sigmoid(pred)>0.5).detach() == Y).item()
        # BCEWithLogitsLoss() returns the batch mean by default. Weight it by the
        # batch size so that dividing by the sample count below gives the
        # true per-sample loss (the last batch may be smaller).
        train_loss += loss.item() * Y.shape[0]
    # Test
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            cat = batch['cat'].to(device)
            cont = batch['cont'].to(device)
            Y = batch['output'].to(device)
            pred = model(cat,cont)
            loss = criterion(pred,Y)

            # Add metrics
            test_acc += torch.sum((torch.sigmoid(pred)>0.5).detach() == Y).item()
            test_loss += loss.item() * Y.shape[0]

    # Log: turn the sums into per-sample averages.
    train_loss /= len(train_dataset)
    train_acc /= n_train
    test_loss /= len(test_dataset)
    test_acc /= n_test
    if log:
        wandb.log({
            "loss_train" : train_loss,
            "loss_test" : test_loss,
            "acc_train" : train_acc,
            "acc_test" : test_acc,
        })
    else:
        print(f'Loss train: {train_loss}, Loss test: {test_loss}, Acc train: {train_acc}, Acc test: {test_acc}')