In [None]:
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
import torch
import wandb
from catboost import CatBoostClassifier, Pool
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, LabelEncoder, OneHotEncoder
from tqdm import tqdm
from xgboost import XGBClassifier

# Set up SHAP's JavaScript support in the notebook; presumably required by the interactive force plots further down.
shap.initjs()

# Machine Learning techniques

### Dataset

In [None]:
# Build the feature matrix X and the label array Y used by the tree-based models below.
onehotcat = False  # True: one-hot encode the categorical columns; False: ordinal integer codes
df = pd.read_csv('../input/league/dataset.csv')
# Drop identifiers, the label, the duration column and the ten ban columns from the features.
X = df.drop(columns=['gameId','winner','duration','Unnamed: 0']).copy()
X = X.drop(columns=[f'ban_100_{k}' for k in range(5)])
X = X.drop(columns=[f'ban_200_{k}' for k in range(5)])
Y = df[['winner']].copy().values
del df

# Remove nans: numeric columns are filled with the column mean, all other columns with a constant.
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer_cst = SimpleImputer(missing_values=np.nan, strategy='constant')

numeric_columns = X.select_dtypes(include=[np.number])
other_columns = X.select_dtypes(exclude=[np.number])

numeric_columns_imputed = imputer_mean.fit_transform(numeric_columns)
other_columns_imputed = imputer_cst.fit_transform(other_columns)

numeric_columns = pd.DataFrame(numeric_columns_imputed, columns = numeric_columns.columns)
other_columns = pd.DataFrame(other_columns_imputed, columns = other_columns.columns)

X = pd.concat([numeric_columns,other_columns],axis=1).copy()
del numeric_columns, other_columns, other_columns_imputed, numeric_columns_imputed

# Separate categorical features (six per player, ten players).
cat_feat = []
for summ in range(10):
    cat_feat.append(f'summoner_{summ}_championId')
    cat_feat.append(f'summoner_{summ}_teamPosition')
    cat_feat.append(f'summoner_{summ}_tier')
    cat_feat.append(f'summoner_{summ}_rank')
    cat_feat.append(f'summoner_{summ}_primaryStyle')
    cat_feat.append(f'summoner_{summ}_subStyle')
n_cat = len(cat_feat)
# Take an explicit copy: X_cat is modified below and must not be a view/slice of X.
X_cat = X[cat_feat].copy()

# Remove every per-player column that is either categorical (kept in X_cat), an identifier,
# or one of the gold/kills/deaths/assists columns.
dropped_player_fields = (
    'teamPosition', 'puuid', 'summonerId', 'championId', 'summoner1Id', 'summoner2Id',
    'primaryStyle', 'subStyle', 'tier', 'rank', 'gold', 'kills', 'deaths', 'assists',
)
X = X.drop(columns=[
    f'summoner_{k}_{field}'
    for k in range(10)
    for field in dropped_player_fields
])

# Process categorical features.
# The mean-imputer returned floats for numeric id columns; cast them back to integers.
# (`np.float` / `np.int` were removed from NumPy, so the builtin / sized types are used.)
float_columns = X_cat.select_dtypes(include=[float])
X_cat[float_columns.columns] = float_columns.astype(int)
X_cat_cols = X_cat.columns
if onehotcat:
    # NOTE(review): with one-hot encoding the first n_cat columns of X are no longer the
    # categorical columns, so code that uses range(n_cat) as categorical indices would be wrong.
    # NOTE(review): recent scikit-learn releases rename `sparse` to `sparse_output` - confirm
    # against the installed version.
    onehot = OneHotEncoder(sparse=False)
    X_cat = onehot.fit_transform(X_cat)
    X_cat = pd.DataFrame(X_cat)
else:
    ordinal = OrdinalEncoder(dtype=np.int64)
    X_cat = ordinal.fit_transform(X_cat)
    X_cat = pd.DataFrame(X_cat, columns=X_cat_cols)

# Categorical columns come first, followed by the remaining numeric columns.
X = pd.concat([X_cat,X],axis=1).copy()
print(f'One hot encodings: {onehotcat}, Dataset shape: {X.shape}')
print(f'Class output balance: {Y.sum()/len(Y)}')
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
del X_cat, Y, float_columns

### CatBoost

In [None]:
# The categorical columns are the first n_cat columns of X (they were concatenated first).
cat_features = list(range(n_cat))
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.15,
    eval_metric='Accuracy',
    use_best_model=True,  # boolean flag; the original passed the string 'True'
    cat_features=cat_features,
    loss_function='CrossEntropy',
    # logging_level='Silent'
)
# Search space for the (currently disabled) grid search below.
params = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.001, 0.01, 0.1, 0.15, 0.4],
    'loss_function': ['Logloss', 'CrossEntropy'],
    'l2_leaf_reg': [1, 3, 5, 10, 100],
}
# Disabled grid search, kept as an inert string literal.
"""
scorer = make_scorer(accuracy_score)
gridsearch = GridSearchCV(
    model, 
    param_grid=params,
    verbose=2,
    scoring=scorer,
    cv=5,
    refit=True
)
gridsearch.fit(X_train, Y_train)

"""

# NOTE(review): the test split doubles as the evaluation set used to pick the best iteration,
# so the test accuracy reported afterwards is presumably optimistic.
model.fit(
    X_train, Y_train,
    eval_set=(X_test, Y_test),
    verbose=True,
    use_best_model=True
)

In [None]:
# Accuracy of the fitted model on the held-out split and on the training split.
Y_pred_test, Y_pred_train = model.predict(X_test), model.predict(X_train)
accuracy_test, accuracy_train = (
    accuracy_score(Y_test, Y_pred_test),
    accuracy_score(Y_train, Y_pred_train),
)
print(f"Test accuracy: {accuracy_test * 100.0:.2f}%")
print(f"Train accuracy: {accuracy_train * 100.0:.2f}%")

In [None]:
# Show the first ten rows of the model's prettified feature-importance table.
model.get_feature_importance(prettified=True).head(10)

In [None]:
# SHAP values of the current model on the held-out split (`Pool` is imported in the imports cell).
feature_names = X_test.columns.values.tolist()
X_test_pool = Pool(data=X_test, cat_features=cat_features, feature_names=feature_names)
shap_values = model.get_feature_importance(X_test_pool,'ShapValues')

# The trailing column is split off as the expected (base) value; the remaining columns are
# kept as the per-feature contributions.
expected_values = shap_values[0,-1]
shap_values = shap_values[:,:-1]

### LightGBM

In [None]:
# LightGBM classifier trained on the same split; `cat_features` holds the positional
# indices of the categorical columns.
lgbm_settings = {
    'max_bin': 500,
    'num_leaves': 60,
    'learning_rate': 0.05,
    'num_iterations': 1000,
}
model = lgb.LGBMClassifier(**lgbm_settings)
model.fit(X_train, Y_train.ravel(), categorical_feature=cat_features)

In [None]:
# Accuracy on both splits, followed by the per-class report for the held-out split.
Y_pred_test, Y_pred_train = model.predict(X_test), model.predict(X_train)
accuracy_test, accuracy_train = (
    accuracy_score(Y_test, Y_pred_test),
    accuracy_score(Y_train, Y_pred_train),
)
print(f"Test accuracy: {accuracy_test * 100.0:.2f}%")
print(f"Train accuracy: {accuracy_train * 100.0:.2f}%")
test_report = classification_report(Y_test, Y_pred_test)
print(test_report)

### XGBoost

In [None]:
# XGBoost classifier; both splits are passed as evaluation sets with the 'error' metric.
evaluation_sets = [(X_train, Y_train), (X_test, Y_test)]
model = XGBClassifier(n_estimators=500, learning_rate=0.14, use_label_encoder=False)
model.fit(
    X_train,
    Y_train.ravel(),
    eval_set=evaluation_sets,
    eval_metric=['error'],
    verbose=True,
)

In [None]:
# Accuracy of the fitted model on the held-out split and on the training split.
Y_pred_test, Y_pred_train = model.predict(X_test), model.predict(X_train)
accuracy_test, accuracy_train = (
    accuracy_score(Y_test, Y_pred_test),
    accuracy_score(Y_train, Y_pred_train),
)
print(f"Test accuracy: {accuracy_test * 100.0:.2f}%")
print(f"Train accuracy: {accuracy_train * 100.0:.2f}%")

In [None]:
# Build a SHAP explainer for the current model and explain every row of the held-out split.
# Note: this rebinds `shap_values`, replacing the array computed in the CatBoost section.
explainer = shap.Explainer(model)
shap_values = explainer(X_test)

In [None]:
# Index of the held-out sample to inspect; reused by the force plot in the next cell.
k = 65
# Waterfall plot of the SHAP explanation for sample k.
shap.plots.waterfall(shap_values[k])

In [None]:
# Visualize the explanation of held-out sample k with a force plot.
# Only the explanation object computed for the current model is passed: the original call
# mixed in `expected_values` and `feature_names` left over from the CatBoost SHAP cell,
# which do not belong to the explainer that produced `shap_values`.
shap.plots.force(shap_values[k])

In [None]:
# Beeswarm plot over all explained held-out samples.
shap.plots.beeswarm(shap_values)

In [None]:
# Bar plot over all explained held-out samples.
shap.plots.bar(shap_values)

In [None]:
# Force plot for the explanations at positions 3500-3999 of the held-out split.
# NOTE(review): the slice bounds are hard-coded; confirm the split has at least 4000 rows.
shap.plots.force(shap_values[3500:4000])

# Neural Network

In [None]:
# Dataset
class TorchDataset(torch.utils.data.Dataset):
    """Per-game dataset exposing per-player and per-team features for the neural networks.

    Every item is a dict with:
      * 'team_1' / 'team_2': scaled aggregate features of each team,
      * 'summ_{i}_cat' / 'summ_{i}_cont' for i in 0..9: ordinal-encoded categorical
        features and scaled continuous features of player i,
      * 'output': the value of the 'winner' column as a float64 array of length 1.

    The categorical columns of a player are, in order: puuid, championId, teamPosition,
    primaryStyle, subStyle, tier, rank. One `OrdinalEncoder` (`self.categorizer`) is fitted
    on the rows of all ten players stacked together, so codes are shared between players.

    Note: imputers, scalers and the encoder are fitted on the whole file, i.e. before any
    train/test split is made from this dataset.

    Args:
        file: path of the CSV file to load.
    """
    def __init__(self,file='../input/league/dataset.csv'):
        scaler = StandardScaler()
        # Columns excluded from the inputs: identifiers, the label, the duration and the
        # per-player gold/kills/deaths/assists columns.
        label_features = ['gameId','winner','duration','Unnamed: 0']
        for summ in range(10):
            label_features.append(f'summoner_{summ}_gold')
            label_features.append(f'summoner_{summ}_kills')
            label_features.append(f'summoner_{summ}_deaths')
            label_features.append(f'summoner_{summ}_assists')
        # Get data
        df = pd.read_csv(file)
        self.X = df.drop(columns=label_features).copy()
        self.X = self.X.drop(columns=[f'ban_100_{k}' for k in range(5)])
        self.X = self.X.drop(columns=[f'ban_200_{k}' for k in range(5)])
        self.Y = df[['winner']].copy().values.astype(np.double)
        del df

        # Remove nans: mean for numeric columns, constant fill for everything else.
        imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        imputer_cst = SimpleImputer(missing_values=np.nan, strategy='constant')

        numeric_columns = self.X.select_dtypes(include=[np.number])
        other_columns = self.X.select_dtypes(exclude=[np.number])

        numeric_columns_imputed = imputer_mean.fit_transform(numeric_columns)
        other_columns_imputed = imputer_cst.fit_transform(other_columns)

        numeric_columns = pd.DataFrame(numeric_columns_imputed, columns = numeric_columns.columns)
        other_columns = pd.DataFrame(other_columns_imputed, columns = other_columns.columns)

        self.X = pd.concat([numeric_columns,other_columns],axis=1).copy()
        del numeric_columns, other_columns, other_columns_imputed, numeric_columns_imputed

        # Split data for each player into categorical and continuous column names.
        summ_features = [{'cat':[], 'cont':[]} for _ in range(10)]
        for summ in range(10):
            summ_features[summ]['cat'].append(f'summoner_{summ}_puuid')
            summ_features[summ]['cat'].append(f'summoner_{summ}_championId')
            summ_features[summ]['cat'].append(f'summoner_{summ}_teamPosition')
            # summoner1Id / summoner2Id are deliberately left out.
            summ_features[summ]['cat'].append(f'summoner_{summ}_primaryStyle')
            summ_features[summ]['cat'].append(f'summoner_{summ}_subStyle')
            summ_features[summ]['cat'].append(f'summoner_{summ}_tier')
            summ_features[summ]['cat'].append(f'summoner_{summ}_rank')
            summ_features[summ]['cont'].append(f'summoner_{summ}_summonerLevel')
            summ_features[summ]['cont'].append(f'summoner_{summ}_lp')
            summ_features[summ]['cont'].append(f'summoner_{summ}_wr')
            summ_features[summ]['cont'].append(f'summoner_{summ}_nb')
            summ_features[summ]['cont'].append(f'summoner_{summ}_champion_mastery')

        # Column names of the aggregate statistics of each team.
        team_features = [[],[]]
        for team in [0,1]:
            for feat in ['lp','mastery','wr']:
                for op in ['mean','std','median','skew','kurtosis','variance']:
                    team_features[team].append(f'team_{team+1}_{feat}_{op}')

        # Team features (the scaler is re-fitted by every fit_transform call).
        self.team_1_feat = scaler.fit_transform(self.X[team_features[0]].values)
        self.team_2_feat = scaler.fit_transform(self.X[team_features[1]].values)

        # Player features
        self.players_features_cat = []
        self.players_features_cont = []
        for summ in range(10):
            # Explicit copy: these frames are modified below and must not be slices of self.X.
            self.players_features_cat.append(self.X[summ_features[summ]['cat']].copy())
            self.players_features_cont.append(self.X[summ_features[summ]['cont']].values)

        # Convert floats in cats to ints (the mean-imputer turned numeric ids into floats).
        # `np.float` was removed from NumPy; the builtin `float` selects the same columns.
        for i, cat in enumerate(self.players_features_cat):
            float_columns = cat.select_dtypes(include=[float])
            self.players_features_cat[i][float_columns.columns] = float_columns.astype(int)

        # Scale cont + Categorize cat
        # `np.int` was removed from NumPy; use an explicit 64-bit integer dtype.
        self.categorizer = OrdinalEncoder(dtype=np.int64)
        self.categorizer.fit(np.concatenate(self.players_features_cat))
        for i, cat in enumerate(self.players_features_cat):
            categorized = self.categorizer.transform(cat)
            self.players_features_cat[i] = categorized.copy()
        for i, cont in enumerate(self.players_features_cont):
            self.players_features_cont[i] = scaler.fit_transform(cont.astype(float))

    def __len__(self):
        """Number of games (rows) in the dataset."""
        return self.X.shape[0]

    def __getitem__(self,idx):
        """Return the feature dict of game `idx` (see the class docstring for the keys)."""
        output = {
            'team_1': self.team_1_feat[idx],
            # Fixed: this entry previously returned the team 1 features a second time.
            'team_2': self.team_2_feat[idx],
            'output': self.Y[idx]
        }
        for summ in range(10):
            output[f'summ_{summ}_cat'] = self.players_features_cat[summ][idx]
            output[f'summ_{summ}_cont'] = self.players_features_cont[summ][idx]
        return output

# Load the dataset once and split it 80/20 into train and test subsets.
dataset = TorchDataset()
n = len(dataset)
n_test = int(0.2*n)
n_train = n - n_test
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [n_train,n_test]
)

# Number of distinct values of each categorical feature (one embedding table per feature).
emb_n_seq = [len(cat) for cat in dataset.categorizer.categories_]
# Embedding width per categorical feature, in the dataset's column order:
# puuid, championId, teamPosition, primaryStyle, subStyle, tier, rank.
emb_dim_seq = [50, 10, 2, 2, 2, 2, 2]
# Feature counts are read off one sample (every player has the same layout).
cont_dim = dataset[0][f'summ_{2}_cont'].shape[0]
cat_dim = dataset[0][f'summ_{2}_cat'].shape[0]
team_dim = dataset[0]['team_1'].shape[0]
print(f'Trainset size: {len(train_dataset)}, testset size: {len(test_dataset)}')
print(f'Size of emb: {emb_n_seq}')
print(f'Embedding dim: {emb_dim_seq}')
print(f'Number of continuous features: {cont_dim}, number of cat features: {cat_dim}, number of team features: {team_dim}')

In [None]:
# Neural Network
class BasicBlock(torch.nn.Module):
    """Fully connected building block: Linear -> LayerNorm -> ReLU -> Dropout.

    Args:
        input_features: size of the last dimension of the input.
        output_features: size of the last dimension of the output.
    """
    def __init__(self,input_features, output_features):
        super().__init__()
        stages = [
            torch.nn.Linear(input_features, output_features),
            torch.nn.LayerNorm([output_features]),
            torch.nn.ReLU(),
            torch.nn.Dropout(),
        ]
        self.nn = torch.nn.Sequential(*stages)

    def forward(self,x):
        """Apply the four stages to `x` and return the result."""
        hidden = self.nn(x)
        return hidden
    
class SummonerNetwork(torch.nn.Module):
    """Encode one player: embed each categorical feature, append the continuous
    features and map the result to a 16-dimensional vector.

    Args:
        emb_n_seq: number of rows of each embedding table.
        emb_dim_seq: width of each embedding table.
        cont_dim: number of continuous features.
        cat_dim: number of categorical features (one embedding table each).
    """
    def __init__(self,emb_n_seq,emb_dim_seq,cont_dim,cat_dim):
        super().__init__()
        tables = []
        for k in range(cat_dim):
            tables.append(torch.nn.Embedding(emb_n_seq[k], emb_dim_seq[k]))
        self.embs = torch.nn.ModuleList(tables)
        self.n_cat = cat_dim
        # Input width of the MLP: all embedding widths plus the continuous features.
        self.n = sum(emb_dim_seq) + cont_dim
        self.nn = torch.nn.Sequential(
            BasicBlock(self.n, 128),
            BasicBlock(128, 64),
            BasicBlock(64, 64),
            torch.nn.Linear(64, 16),
        )

    def forward(self,x_cat,x_cont):
        """Return the encoding for categorical codes `x_cat` (column k feeds table k)
        and continuous features `x_cont`."""
        embedded = []
        for k in range(self.n_cat):
            embedded.append(self.embs[k](x_cat[:, k]))
        emb = torch.cat(embedded, dim=1)
        joined = torch.cat([emb, x_cont], dim=1)
        return self.nn(joined)

        
class BaseLine(torch.nn.Module):
    """Game-level classifier: one shared `SummonerNetwork` encodes each of the ten
    players, the encodings are concatenated with both teams' features and passed
    through an MLP that outputs a single logit.

    Args:
        emb_n_seq, emb_dim_seq, cont_dim, cat_dim: forwarded to `SummonerNetwork`.
        team_dim: number of features per team.
        device: device the inputs are moved to inside `forward`.
    """
    def __init__(self,emb_n_seq,emb_dim_seq,cont_dim,cat_dim,team_dim,device=torch.device('cpu')):
        super().__init__()
        self.device = device
        self.summ_nn = SummonerNetwork(emb_n_seq,emb_dim_seq,cont_dim,cat_dim)
        # Ten 16-dimensional player encodings plus the features of both teams.
        self.n = 16*10 + team_dim*2
        head = [
            BasicBlock(self.n, 256),
            BasicBlock(256, 124),
            BasicBlock(124, 124),
            BasicBlock(124, 64),
            torch.nn.Linear(64, 1),
        ]
        self.nn = torch.nn.Sequential(*head)

    def forward(self,x):
        """Return the logits for a batch dict `x` with the keys produced by the dataset."""
        encodings = []
        for summ in range(10):
            codes = x[f'summ_{summ}_cat'].long().to(self.device)
            values = x[f'summ_{summ}_cont'].float().to(self.device)
            encodings.append(self.summ_nn(codes, values))
        players_features = torch.cat(encodings, dim=1)
        team_1 = x['team_1'].to(self.device)
        team_2 = x['team_2'].to(self.device)
        features = torch.cat([players_features, team_1, team_2], dim=1)
        return self.nn(features)
    
class FCEmbedding(torch.nn.Module):
    """Game-level classifier that embeds every categorical feature of every player,
    concatenates embeddings, continuous features and both teams' features, and feeds
    the result to a fully connected network that outputs a single logit.

    The same embedding table is shared by all ten players for a given feature.

    Args:
        emb_n_seq: number of rows of each embedding table.
        emb_dim_seq: width of each embedding table.
        cont_dim: number of continuous features per player.
        cat_dim: number of categorical features per player.
        team_dim: number of features per team.
        device: device the inputs are moved to inside `forward`.
    """
    def __init__(self,emb_n_seq,emb_dim_seq,cont_dim,cat_dim,team_dim,device=torch.device('cpu')):
        super(FCEmbedding, self).__init__()
        self.device = device
        self.embs = torch.nn.ModuleList([
            torch.nn.Embedding(emb_n_seq[k],emb_dim_seq[k])
            for k in range(cat_dim)
        ])
        # Ten players, each with all embeddings plus continuous features, plus both teams.
        self.n = 10 * (sum(emb_dim_seq) + cont_dim) + team_dim*2
        self.n_cat = cat_dim
        self.nn = torch.nn.Sequential(
            BasicBlock(self.n,512),
            BasicBlock(512,256),
            BasicBlock(256,124),
            BasicBlock(124,124),
            BasicBlock(124,64),
            BasicBlock(64,64),
            torch.nn.Linear(64,1)
        )

    def forward(self,x):
        """Return the logits for a batch dict `x` with the keys produced by the dataset.

        The converted tensors are kept in local lists; the caller's dict is left
        untouched (the previous version overwrote its entries in place).
        """
        cat_inputs = [x[f'summ_{summ}_cat'].long().to(self.device) for summ in range(10)]
        cont_inputs = [x[f'summ_{summ}_cont'].float().to(self.device) for summ in range(10)]
        x_team_1 = x['team_1'].to(self.device)
        x_team_2 = x['team_2'].to(self.device)

        # Get cat embeddings, ordered feature by feature and, within a feature, player by player.
        cat_emb = torch.cat([
            self.embs[k](cat_inputs[summ][:,k])
            for k in range(self.n_cat)
            for summ in range(10)
        ],dim=1)
        cont_feat = torch.cat(cont_inputs, dim=1)
        features = torch.cat([
            cat_emb,
            cont_feat,
            x_team_1,
            x_team_2
        ],dim=1)
        return self.nn(features)
    
        
def test_summ_net():
    """Smoke test: run a freshly built `SummonerNetwork` on player 3 of the first sample."""
    summ = 3
    x_cat = torch.tensor(dataset[0][f'summ_{summ}_cat']).unsqueeze(0).long()
    # The dataset stores the scaled continuous features as float64 while a freshly built
    # model has float32 parameters, so cast the input to float32 before the forward pass.
    x_cont = torch.tensor(dataset[0][f'summ_{summ}_cont']).unsqueeze(0).float()

    model = SummonerNetwork(emb_n_seq,emb_dim_seq,cont_dim,cat_dim)
    print(model(x_cat,x_cont))
    
def test_baseline():
    """Smoke test: print the output of a double-precision `BaseLine` on one batch of four."""
    loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
    net = BaseLine(emb_n_seq,emb_dim_seq,cont_dim,cat_dim,team_dim)
    net.double()
    for sample_batch in loader:
        # Only the first batch is needed.
        print(net(sample_batch))
        break

def test_fcembedding():
    """Smoke test: print the output of a double-precision `FCEmbedding` on one batch of four."""
    loader = torch.utils.data.DataLoader(train_dataset, batch_size=4, shuffle=True)
    net = FCEmbedding(emb_n_seq,emb_dim_seq,cont_dim,cat_dim,team_dim)
    net.double()
    for sample_batch in loader:
        # Only the first batch is needed.
        print(net(sample_batch))
        break
    
# Run the smoke tests; the SummonerNetwork one is currently disabled.
# test_summ_net()
test_baseline()
test_fcembedding()

In [None]:
# Parameters
batch_size = 128
learning_rate = 5e-4
epoch = 500  # number of passes over the training subset
criterion = torch.nn.BCEWithLogitsLoss()
log = True   # send per-epoch metrics to Weights & Biases
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True
)

# Model (double precision, matching the float64 labels produced by the dataset)
model = FCEmbedding(emb_n_seq,emb_dim_seq,cont_dim,cat_dim,team_dim,device)
model.double().to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-3)

def metrics(pred,target):
    """Return (accuracy, precision, recall, f1) of binary predictions `pred` against `target`."""
    accuracy = accuracy_score(target, pred)
    precision = precision_score(target, pred)
    recall = recall_score(target, pred)
    f1 = f1_score(target, pred)
    return accuracy, precision, recall, f1

# Wandb
if log:
    wandb.init(project="league")
    wandb.watch(model)

for epoch_idx in tqdm(range(epoch)):
    train_loss, test_loss, train_acc, test_acc = 0,0,0,0
    # Train
    model.train()
    for batch in train_loader:
        target = batch['output'].to(device)
        pred = model(batch)
        loss = criterion(pred,target)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
        optimizer.step()

        # Add metrics
        train_acc += torch.sum((torch.sigmoid(pred)>0.5).detach() == target).item()
        # `loss` is the mean over the batch; weight it by the batch size so that the
        # division by the number of samples below yields the true per-sample mean.
        train_loss += loss.item() * target.size(0)
    # Test
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            target = batch['output'].to(device)
            pred = model(batch)
            loss = criterion(pred,target)

            # Add metrics
            test_acc += torch.sum((torch.sigmoid(pred)>0.5).detach() == target).item()
            test_loss += loss.item() * target.size(0)

    # Log per-sample averages for this epoch.
    train_loss /= len(train_dataset)
    train_acc /= n_train
    test_loss /= len(test_dataset)
    test_acc /= n_test
    if log:
        wandb.log({
            "loss_train" : train_loss,
            "loss_test" : test_loss,
            "acc_train" : train_acc,
            "acc_test" : test_acc,
        })

In [None]:
# Lookup table from integer champion id to champion name.
# Not referenced by any other code visible in this notebook.
champs = {266:"Aatrox",103:"Ahri",84:"Akali",166:"Akshan",12:"Alistar",32:"Amumu",34:"Anivia",1:"Annie",523:"Aphelios",22:"Ashe",136:"AurelionSol",268:"Azir",432:"Bard",53:"Blitzcrank",63:"Brand",201:"Braum",51:"Caitlyn",164:"Camille",69:"Cassiopeia",31:"Chogath",42:"Corki",122:"Darius",131:"Diana",119:"Draven",36:"DrMundo",245:"Ekko",60:"Elise",28:"Evelynn",81:"Ezreal",9:"Fiddlesticks",114:"Fiora",105:"Fizz",3:"Galio",41:"Gangplank",86:"Garen",150:"Gnar",79:"Gragas",104:"Graves",887:"Gwen",120:"Hecarim",74:"Heimerdinger",420:"Illaoi",39:"Irelia",427:"Ivern",40:"Janna",59:"JarvanIV",24:"Jax",126:"Jayce",202:"Jhin",222:"Jinx",145:"Kaisa",429:"Kalista",43:"Karma",30:"Karthus",38:"Kassadin",55:"Katarina",10:"Kayle",141:"Kayn",85:"Kennen",121:"Khazix",203:"Kindred",240:"Kled",96:"KogMaw",7:"Leblanc",64:"LeeSin",89:"Leona",876:"Lillia",127:"Lissandra",236:"Lucian",117:"Lulu",99:"Lux",54:"Malphite",90:"Malzahar",57:"Maokai",11:"MasterYi",21:"MissFortune",62:"MonkeyKing",82:"Mordekaiser",25:"Morgana",267:"Nami",75:"Nasus",111:"Nautilus",518:"Neeko",76:"Nidalee",56:"Nocturne",20:"Nunu",2:"Olaf",61:"Orianna",516:"Ornn",80:"Pantheon",78:"Poppy",555:"Pyke",246:"Qiyana",133:"Quinn",497:"Rakan",33:"Rammus",421:"RekSai",526:"Rell",58:"Renekton",107:"Rengar",92:"Riven",68:"Rumble",13:"Ryze",360:"Samira",113:"Sejuani",235:"Senna",147:"Seraphine",875:"Sett",35:"Shaco",98:"Shen",102:"Shyvana",27:"Singed",14:"Sion",15:"Sivir",72:"Skarner",37:"Sona",16:"Soraka",50:"Swain",517:"Sylas",134:"Syndra",223:"TahmKench",163:"Taliyah",91:"Talon",44:"Taric",17:"Teemo",412:"Thresh",18:"Tristana",48:"Trundle",23:"Tryndamere",4:"TwistedFate",29:"Twitch",77:"Udyr",6:"Urgot",110:"Varus",67:"Vayne",45:"Veigar",161:"Velkoz",711:"Vex",254:"Vi",234:"Viego",112:"Viktor",8:"Vladimir",106:"Volibear",19:"Warwick",498:"Xayah",101:"Xerath",5:"XinZhao",157:"Yasuo",777:"Yone",83:"Yorick",350:"Yuumi",154:"Zac",238:"Zed",221:"Zeri",115:"Ziggs",26:"Zilean",142:"Zoe",143:"Zyra"}
def visualize_embedding(model,dataset,cat,name):
    """Scatter-plot a 2-D PCA projection of one embedding table.

    Args:
        model: object exposing the embedding tables as `model.embs`.
        dataset: object exposing the fitted encoder as `dataset.categorizer`.
        cat: index of the categorical feature / embedding table to plot.
        name: path passed to `plt.savefig` for the resulting figure.
    """
    reducer = PCA(2)
    plt.figure(figsize=(8, 8), dpi=80)
    labels = dataset.categorizer.categories_[cat]
    print(labels)
    weights = model.embs[cat].weight.cpu().detach().numpy()
    projected = reducer.fit_transform(weights)

    xs, ys = projected[:,0], projected[:,1]
    plt.scatter(xs, ys)
    # Label every point with its category value, nudged slightly off the marker.
    for row in range(len(projected)):
        plt.text(projected[row][0]+0.0001, projected[row][1]-0.0001, labels[row], fontsize=11)

    # Axis ticks carry no meaning after the projection.
    plt.yticks([])
    plt.xticks([])
    plt.savefig(name)
    plt.show()

# Plot the embedding table with index 5 and save the figure as 'rank'.
# NOTE(review): in the categorical column order built by the TorchDataset defined above
# (puuid, championId, teamPosition, primaryStyle, subStyle, tier, rank) index 5 is the
# tier column and index 6 the rank column - confirm which one is intended here.
visualize_embedding(model,dataset,5,'rank')

# TabTransformer

In [None]:
!pip install tab-transformer-pytorch
from tab_transformer_pytorch import TabTransformer

In [None]:
class TorchDataset(torch.utils.data.Dataset):
    """Flat per-game dataset for the TabTransformer.

    Every item is a dict with:
      * 'cat': ordinal codes of the categorical columns (six per player, ten players),
      * 'cont': the remaining numeric columns, unscaled,
      * 'output': the value of the 'winner' column as an array of length 1.

    After construction `cat_sizes` holds the number of distinct values of each
    categorical column and `cont_size` the number of continuous columns.

    Note: this class reuses the name of the dataset class defined in the
    "Neural Network" section and replaces it once this cell has run.

    Args:
        file: path of the CSV file to load.
    """
    def __init__(self,file='../input/league/dataset.csv'):
        # Get input
        df = pd.read_csv(file)
        X = df.drop(columns=['gameId','winner','duration','Unnamed: 0']).copy()
        X = X.drop(columns=[f'ban_100_{k}' for k in range(5)])
        X = X.drop(columns=[f'ban_200_{k}' for k in range(5)])
        self.Y = df[['winner']].copy().values
        del df

        # Remove nans: mean for numeric columns, constant fill for everything else.
        imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
        imputer_cst = SimpleImputer(missing_values=np.nan, strategy='constant')

        numeric_columns = X.select_dtypes(include=[np.number])
        other_columns = X.select_dtypes(exclude=[np.number])

        numeric_columns_imputed = imputer_mean.fit_transform(numeric_columns)
        other_columns_imputed = imputer_cst.fit_transform(other_columns)

        numeric_columns = pd.DataFrame(numeric_columns_imputed, columns = numeric_columns.columns)
        other_columns = pd.DataFrame(other_columns_imputed, columns = other_columns.columns)

        X = pd.concat([numeric_columns,other_columns],axis=1).copy()
        del numeric_columns, other_columns, other_columns_imputed, numeric_columns_imputed

        # Separate categorical features
        cat_feat = []
        for summ in range(10):
            cat_feat.append(f'summoner_{summ}_championId')
            cat_feat.append(f'summoner_{summ}_teamPosition')
            cat_feat.append(f'summoner_{summ}_tier')
            cat_feat.append(f'summoner_{summ}_rank')
            cat_feat.append(f'summoner_{summ}_primaryStyle')
            cat_feat.append(f'summoner_{summ}_subStyle')
        # Explicit copy: X_cat is modified below and must not be a slice of X.
        X_cat = X[cat_feat].copy()
        # Drop categorical, identifier and gold/kills/deaths/assists columns of every player
        # from the continuous part.
        for k in range(10):
            X = X.drop(columns=[
                f'summoner_{k}_teamPosition',
                f'summoner_{k}_puuid',
                f'summoner_{k}_summonerId',
                f'summoner_{k}_championId',
                f'summoner_{k}_summoner1Id',
                f'summoner_{k}_summoner2Id',
                f'summoner_{k}_primaryStyle',
                f'summoner_{k}_subStyle',
                f'summoner_{k}_tier',
                f'summoner_{k}_rank',
                f'summoner_{k}_gold',
                f'summoner_{k}_kills',
                f'summoner_{k}_deaths',
                f'summoner_{k}_assists',
            ])

        # Process categorical features.
        # The mean-imputer returned floats for numeric id columns; cast them back to integers.
        # (`np.float` / `np.int` were removed from NumPy, so the builtin / sized types are used.)
        float_columns = X_cat.select_dtypes(include=[float])
        X_cat[float_columns.columns] = float_columns.astype(int)
        X_cat_cols = X_cat.columns
        ordinal = OrdinalEncoder(dtype=np.int64)
        X_cat = ordinal.fit_transform(X_cat)
        self.X_cat = pd.DataFrame(X_cat, columns=X_cat_cols).copy().values
        self.X_cont = X.copy().values
        self.cat_sizes = [len(x) for x in ordinal.categories_]
        self.cont_size = self.X_cont.shape[1]

    def __len__(self):
        """Number of games (rows) in the dataset."""
        return self.Y.shape[0]

    def __getitem__(self,idx):
        """Return the 'cont', 'cat' and 'output' arrays of game `idx`."""
        return {
            'cont': self.X_cont[idx],
            'cat': self.X_cat[idx],
            'output': self.Y[idx]
        }

# Parameters
batch_size = 32
learning_rate = 3e-4
epoch = 250  # number of passes over the training subset
criterion = torch.nn.BCEWithLogitsLoss()
log = True   # True: send per-epoch metrics to Weights & Biases; False: print them
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Dataset: reload with the flat layout and split 80/20.
dataset = TorchDataset()
n = len(dataset)
n_test = int(0.2*n)
n_train = n - n_test
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [n_train,n_test]
)
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=True
)

# Model
model = TabTransformer(
    categories = dataset.cat_sizes,     # tuple containing the number of unique values within each category
    num_continuous = dataset.cont_size ,# number of continuous values
    dim = 16,                           # dimension, paper set at 32
    dim_out = 1,                        # binary prediction, but could be anything
    depth = 6,                          # depth, paper recommended 6
    heads = 8,                          # heads, paper recommends 8
    attn_dropout = 0.6,                 # post-attention dropout
    ff_dropout = 0.6,                   # feed forward dropout
    mlp_hidden_mults = (4, 2),          # relative multiples of each hidden dimension of the last mlp to logits
    mlp_act = torch.nn.ReLU(),          # activation for final mlp, defaults to relu, but could be anything else (selu etc)
)
model.double()
model.to(device)

# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)

# Wandb
if log:
    wandb.init(project="league")
    wandb.watch(model)

for epoch_idx in range(epoch):
    train_loss, test_loss, train_acc, test_acc = 0,0,0,0
    # Train
    model.train()
    for batch in tqdm(train_loader):
        cat = batch['cat'].to(device)
        cont = batch['cont'].to(device)
        target = batch['output'].to(device)
        pred = model(cat,cont)
        # The model runs in double precision; give the loss a target of the same dtype
        # as the logits instead of a float32 one.
        loss = criterion(pred,target.to(pred.dtype))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Add metrics
        train_acc += torch.sum((torch.sigmoid(pred)>0.5).detach() == target).item()
        # `loss` is the mean over the batch; weight it by the batch size so that the
        # division by the number of samples below yields the true per-sample mean.
        train_loss += loss.item() * target.size(0)
    # Test
    model.eval()
    with torch.no_grad():
        for batch in test_loader:
            cat = batch['cat'].to(device)
            cont = batch['cont'].to(device)
            target = batch['output'].to(device)
            pred = model(cat,cont)
            loss = criterion(pred,target.to(pred.dtype))

            # Add metrics
            test_acc += torch.sum((torch.sigmoid(pred)>0.5).detach() == target).item()
            test_loss += loss.item() * target.size(0)

    # Log per-sample averages for this epoch.
    train_loss /= len(train_dataset)
    train_acc /= n_train
    test_loss /= len(test_dataset)
    test_acc /= n_test
    if log:
        wandb.log({
            "loss_train" : train_loss,
            "loss_test" : test_loss,
            "acc_train" : train_acc,
            "acc_test" : test_acc,
        })
    else:
        print(f'Loss train: {train_loss}, Loss test: {test_loss}, Acc train: {train_acc}, Acc test: {test_acc}')