In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
torch.version.cuda

'9.0'

In [3]:
# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
ufc_df = pd.read_csv("ufc_combined.csv", index_col=0)
ufc_df.head()

Unnamed: 0,fighter1,fighter2,winner,weight_class,title_fight,method,end_round,fight_year,win_fighter1,lose_fighter1,...,SLpM_fighter2,Str_Acc_fighter2,SApM_fighter2,Str_Def_fighter2,TD_Avg_fighter2,TD_Acc_fighter2,TD_Def_fighter2,Sub_Avg_fighter2,win%_fighter2,born_year_fighter2
0,Santiago Ponzinibbio,Neil Magny,1,Welterweight,f,KO/TKO,4,2018,28,3,...,3.86,46.0,2.22,56.0,2.62,46.0,60.0,0.3,72.413795,1987
1,Ricardo Lamas,Darren Elkins,1,Featherweight,f,KO/TKO,3,2018,19,7,...,3.36,37.0,2.83,53.0,2.68,35.0,57.0,1.3,78.125,1984
2,Johnny Walker,Khalil Rountree Jr.,1,Light Heavyweight,f,KO/TKO,1,2018,15,3,...,2.3,34.0,3.29,38.0,0.0,0.0,50.0,0.3,66.666664,1990
3,Cezar Ferreira,Ian Heinisch,0,Middleweight,f,DEC,3,2018,14,7,...,3.49,58.0,1.67,52.0,0.81,14.0,50.0,1.6,92.30769,1988
4,Marlon Vera,Guido Cannetti,1,Bantamweight,f,SUB,2,2018,12,5,...,2.8,52.0,1.95,51.0,2.8,50.0,75.0,0.3,61.53846,1979


In [5]:
# categorical features 
cats = ["fighter1", "fighter2", "weight_class", "title_fight", "method", "end_round", "fight_year",
        "height_fighter1", "reach_fighter1", "stance_fighter1", "born_year_fighter1",
        "height_fighter2", "reach_fighter2", "stance_fighter2", "born_year_fighter2"]

for cat in cats:
    ufc_df[cat].fillna("unk", inplace=True)
    ufc_df[cat] = ufc_df[cat].astype("category")

In [6]:
ufc_df[cats] = ufc_df[cats].apply(lambda col: col.cat.codes)
ufc_df.head()

Unnamed: 0,fighter1,fighter2,winner,weight_class,title_fight,method,end_round,fight_year,win_fighter1,lose_fighter1,...,SLpM_fighter2,Str_Acc_fighter2,SApM_fighter2,Str_Def_fighter2,TD_Avg_fighter2,TD_Acc_fighter2,TD_Def_fighter2,Sub_Avg_fighter2,win%_fighter2,born_year_fighter2
0,1185,1007,1,10,0,2,3,24,28,3,...,3.86,46.0,2.22,56.0,2.62,46.0,60.0,0.3,72.413795,25
1,1098,328,1,2,0,2,2,24,19,7,...,3.36,37.0,2.83,53.0,2.68,35.0,57.0,1.3,78.125,22
2,673,791,1,5,0,2,0,24,15,3,...,2.3,34.0,3.29,38.0,0.0,0.0,50.0,0.3,66.666664,28
3,188,532,0,7,0,0,2,24,14,7,...,3.49,58.0,1.67,52.0,0.81,14.0,50.0,1.6,92.30769,26
4,892,504,1,0,0,3,1,24,12,5,...,2.8,52.0,1.95,51.0,2.8,50.0,75.0,0.3,61.53846,17


In [7]:
# make sure no missing values
ufc_df.isnull().values.any()

False

In [8]:
# split the dataset
X, y = ufc_df.drop("winner", axis=1), ufc_df["winner"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(3672, 41) (919, 41) (3672,) (919,)


In [10]:
print(y_train.mean(), y_test.mean())

0.5 0.49836779107725787


In [11]:
# normalization
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

conts = X.drop(cats, axis=1).columns.values.tolist()

scaler = StandardScaler()
X_train_scaled.loc[:, conts] = scaler.fit_transform(X_train[conts])
X_test_scaled.loc[:, conts] = scaler.transform(X_test[conts])

In [12]:
X_train_scaled.head()

Unnamed: 0,fighter1,fighter2,weight_class,title_fight,method,end_round,fight_year,win_fighter1,lose_fighter1,draw_fighter1,...,SLpM_fighter2,Str_Acc_fighter2,SApM_fighter2,Str_Def_fighter2,TD_Avg_fighter2,TD_Acc_fighter2,TD_Def_fighter2,Sub_Avg_fighter2,win%_fighter2,born_year_fighter2
452,1235,448,5,1,0,2,11,-0.125871,0.220705,-0.364883,...,1.11963,-0.832735,-0.244045,0.625381,-0.454634,0.399963,-0.056388,-0.050106,0.198109,17
1948,419,659,0,0,3,0,18,-0.125871,-0.255038,-0.364883,...,0.012168,0.700353,-0.235904,-2.486759,-1.253991,-1.876225,-0.366288,5.971406,-1.21262,24
517,1180,610,5,0,3,1,11,-0.888215,-0.730782,-0.364883,...,-2.294349,-2.719612,-1.383751,-0.451898,-1.253991,-1.876225,-2.579856,8.647634,-0.415252,4
3377,80,1291,0,0,2,0,21,-0.561496,-0.968653,-0.364883,...,2.185457,0.228634,1.253853,0.984474,-1.253991,-1.876225,0.740496,-0.830673,1.626012,29
3288,1302,486,5,0,2,1,21,-0.888215,-0.255038,-0.364883,...,1.011381,-0.125156,1.791143,0.026893,-0.850506,-0.785552,1.094667,-0.830673,-0.769637,23


In [13]:
class UfcDataset(Dataset):
    def __init__(self, df, cats, conts, targets):
        self.X_cats = df[cats].astype(np.int64).values
        self.X_conts = df[conts].astype(np.float32).values
        self.y = targets.astype(np.float32).values.reshape(-1, 1)
        
        
    def __len__(self):
        return self.y.shape[0]
    
    def __getitem__(self, i):
        return [self.X_cats[i], self.X_conts[i], self.y[i]]

In [14]:
train_dataset = UfcDataset(X_train_scaled, cats, conts, y_train)
test_dataset = UfcDataset(X_test_scaled, cats, conts, y_test)

In [15]:
# Data loaders
bz=32
train_dl = torch.utils.data.DataLoader(train_dataset, batch_size=bz, shuffle=True)
test_dl = torch.utils.data.DataLoader(test_dataset, batch_size=bz, shuffle=False)

In [16]:
class UfcNet(nn.Module):
    def __init__(self, emb_dims, num_conts, fc_layer_sizes, emb_drop, ps):
        super(UfcNet, self).__init__()
        
        # embedding layers for categorical features
        self.emb_layers = nn.ModuleList([nn.Embedding(x, y) for x, y in emb_dims])
        self.num_embs = sum([y for _, y in emb_dims])
        self.num_conts = num_conts
        
        # fully connected layers
        fc_layer_sizes = [self.num_embs + self.num_conts] + fc_layer_sizes
        self.fc_layers = nn.ModuleList([nn.Linear(fc_layer_sizes[i],fc_layer_sizes[i+1])
                                        for i in range(len(fc_layer_sizes)-1)])
        
        # out layer
        self.out = nn.Linear(fc_layer_sizes[-1], 1)
        
        # batch norm layers
        self.first_bn = nn.BatchNorm1d(self.num_conts)
        self.bn_layers = nn.ModuleList([nn.BatchNorm1d(sz)
                                        for sz in fc_layer_sizes[1:]])
        # dropout layers
        self.emb_drop = nn.Dropout(emb_drop)
        self.dropout_layers = nn.ModuleList([nn.Dropout(p) for p in ps])
        
    def forward(self, x_cats, x_conts):
        x = [e(x_cats[:, i]) for i, e in enumerate(self.emb_layers)]
        x = torch.cat(x, 1)
        x = self.emb_drop(x)
        
        x_c = self.first_bn(x_conts)
        x = torch.cat([x, x_c], 1)
        
        for fc, bn, d in zip(self.fc_layers, self.bn_layers, self.dropout_layers):
            x = F.relu(fc(x))
            x = bn(x)
            x = d(x)
        
        x = self.out(x)
        return torch.sigmoid(x)

In [17]:
emb_dims = [(len(ufc_df[cat].unique()), min(50, len(ufc_df[cat].unique())//2)) for cat in cats]
emb_dims

[(1396, 50),
 (1405, 50),
 (15, 7),
 (2, 1),
 (4, 2),
 (5, 2),
 (25, 12),
 (24, 12),
 (26, 13),
 (5, 2),
 (38, 19),
 (23, 11),
 (26, 13),
 (6, 3),
 (37, 18)]

In [18]:
num_conts = len(conts)
fc_layer_sizes = [256, 64, 16]
emb_drop = 0.5
ps = [0.5] * 3

In [19]:
ufc_model = UfcNet(emb_dims, num_conts, fc_layer_sizes, emb_drop, ps).to(device)

In [20]:
criterion = nn.BCELoss()
learning_rate = 1e-2
optimizer = torch.optim.Adam(ufc_model.parameters(), lr=learning_rate, weight_decay=1e-4)  

In [22]:
num_epochs = 4
total_step = len(train_dl)
for epoch in range(num_epochs):
    for i, (x_cats, x_conts, y) in enumerate(train_dl):
        x_cats, x_conts, y = x_cats.to(device), x_conts.to(device), y.to(device)
        
        # forward
        outputs = ufc_model(x_cats, x_conts)
        loss = criterion(outputs, y)
        
        # backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/4], Step [100/115], Loss: 0.6192
Epoch [2/4], Step [100/115], Loss: 0.4864
Epoch [3/4], Step [100/115], Loss: 0.4440
Epoch [4/4], Step [100/115], Loss: 0.4844


In [23]:
with torch.no_grad():
    correct = 0
    total = 0
    for x_cats, x_conts, y in test_dl:
        x_cats, x_conts, y = x_cats.to(device), x_conts.to(device), y.to(device)
        outputs = ufc_model(x_cats, x_conts)
        preds = (outputs>0.5).type(torch.cuda.FloatTensor)
        total += y.size(0)
        correct += (preds == y).sum().item()

print ("Accuracy: {:.2f}%".format(100*correct/total))

Accuracy: 68.77%
