## Import

In [1]:
# libraries

import pandas as pd
import numpy as np

# models

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.svm import LinearSVC
from sklearn.linear_model import Lasso
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#XG boost
from sklearn import datasets
import xgboost as xgb

# other tools

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler  

import warnings

# visualization
import seaborn as sns
import matplotlib.pyplot as plt

## Load excel file (ufc.xlsx) and print out the first couple rows

In [2]:
ufc = pd.read_excel('ufc-master.xlsx')
ufc.head(5)

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1
0,Jessica Eye,Cynthia Calvillo,120,-130,120.0,76.923077,2020-06-13,"Las Vegas, Nevada, USA",USA,Blue,...,-1,3,-5.08,-5.08,-1,0.33,0.9,1.37,1,1
1,Karl Roberson,Marvin Vettori,210,-230,210.0,43.478261,2020-06-13,"Las Vegas, Nevada, USA",USA,Blue,...,0,-1,-2.54,0.0,-3,0.89,-0.2,0.36,1,1
2,Charles Rosa,Kevin Aguilar,170,-185,170.0,54.054054,2020-06-13,"Las Vegas, Nevada, USA",USA,Red,...,0,-2,-5.08,10.16,-2,0.52,-1.0,-2.11,1,1
3,Andre Fili,Charles Jourdain,-220,200,45.454545,200.0,2020-06-13,"Las Vegas, Nevada, USA",USA,Red,...,-2,0,-5.08,-12.7,-5,0.85,-0.2,-2.49,1,1
4,Jordan Espinosa,Mark De La Rosa,-167,157,59.88024,157.0,2020-06-13,"Las Vegas, Nevada, USA",USA,Red,...,0,1,0.0,-10.16,-5,-0.97,-0.2,-0.19,1,1


## Get number of rows and columns

In [3]:
print('Number of rows:', ufc.shape[0])
print('Number of columns:', ufc.shape[1])

Number of rows: 4304
Number of columns: 79


## Check for missing data

In [4]:
ufc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4304 entries, 0 to 4303
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   R_fighter                     4304 non-null   object        
 1   B_fighter                     4304 non-null   object        
 2   R_odds                        4304 non-null   int64         
 3   B_odds                        4304 non-null   int64         
 4   R_ev                          4304 non-null   float64       
 5   B_ev                          4304 non-null   float64       
 6   date                          4304 non-null   datetime64[ns]
 7   location                      4304 non-null   object        
 8   country                       4304 non-null   object        
 9   Winner                        4304 non-null   object        
 10  title_bout                    4304 non-null   bool          
 11  weight_class                  

## Check for duplicates

In [5]:
duplicates = ufc.duplicated()
ufc[duplicates]

# turns out: no duplicates

Unnamed: 0,R_fighter,B_fighter,R_odds,B_odds,R_ev,B_ev,date,location,country,Winner,...,ko_dif,sub_dif,height_dif,reach_dif,age_dif,sig_str_dif,avg_sub_att_dif,avg_td_dif,empty_arena,constant_1


## Delete rows with at least 1 missing value (NaN), then check shape again

In [6]:
ufc_df = ufc.dropna()

print('Number of rows:', ufc_df.shape[0])
print('Number of columns:', ufc_df.shape[1])

Number of rows: 3243
Number of columns: 79


## Check for missing data again

In [7]:
ufc_df.info()

# we have 3244 non-null value in every column --> no missing data

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3243 entries, 0 to 4303
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   R_fighter                     3243 non-null   object        
 1   B_fighter                     3243 non-null   object        
 2   R_odds                        3243 non-null   int64         
 3   B_odds                        3243 non-null   int64         
 4   R_ev                          3243 non-null   float64       
 5   B_ev                          3243 non-null   float64       
 6   date                          3243 non-null   datetime64[ns]
 7   location                      3243 non-null   object        
 8   country                       3243 non-null   object        
 9   Winner                        3243 non-null   object        
 10  title_bout                    3243 non-null   bool          
 11  weight_class                  

## Set data (X) and target (y)

In [None]:
X = ufc_df[['R_Height_cms', 'B_Height_cms', 'R_Reach_cms', 'B_Reach_cms', 'R_Weight_lbs', 'B_Weight_lbs', 'R_age', 'B_age', 'R_current_win_streak', 'B_current_win_streak', 'R_avg_SIG_STR_landed', 'B_avg_SIG_STR_landed']]
y = ufc_df['Winner']

print('X shape:', X.shape)
print('y shape:', y.shape)

# now we have 3243 instances instead of 4304

## Get training and testing sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print("X_train has shape", X_train.shape)
print("X_test has shape", X_test.shape)
print("y_train has shape", y_train.shape)
print("y_test has shape", y_test.shape)

## Get prediction set 

In [None]:
user_inputs = []
input_names = ['R_Height_cms', 'B_Height_cms', 'R_Reach_cms', 'B_Reach_cms', 'R_Weight_lbs', 'B_Weight_lbs', 'R_age', 'B_age', 'R_current_win_streak', 'B_current_win_streak', 'R_avg_SIG_STR_landed', 'B_avg_SIG_STR_landed']


for i in input_names:
    print(i, ':')
    user_inputs.append(input())
    print('')

## Build models

In [None]:
# knn
knn = KNeighborsClassifier()

# neural network
mpl = MLPClassifier()

# decision tree
tree = DecisionTreeClassifier()

# random forest
forest = RandomForestClassifier()

classifiers = [knn, mpl, tree, forest]

# Scale Data

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 'applyModel' function

In [None]:
def applyModel(model,name,X_train, y_train, X_test, y_test):
    m = model.fit(X_train,y_train)
    print(name, '- Training accuracy:', m.score(X_train, y_train))
    print(name, '- Testing accuracy:', m.score(X_test, y_test))
    
    X_new = np.array([user_inputs]) 
    prediction = knn.predict(X_new) 
    print("The prediction: ", prediction)

## Train & test with the different models

In [None]:
for c in classifiers:
    n = str(c)
    applyModel(c, n, X_train, y_train, X_test, y_test)
    print('')
    
warnings.filterwarnings("ignore")

## Play around with KNN

In [None]:
training_accuracy = []
testing_accuracy = []
number_of_neighbors =[]
weighting_choice = []

weight_values = ['distance', 'uniform']

for n_neighbors in range(1,100):
    for weights in weight_values:
        clf = KNeighborsClassifier(n_neighbors = n_neighbors, weights = weights)
        clf.fit(X_train, y_train)
        training_accuracy.append(clf.score(X_train, y_train))
        testing_accuracy.append(clf.score(X_test, y_test))
        number_of_neighbors.append(n_neighbors)
        weighting_choice.append(weights)
      
     
combinations_sorted_knn = sorted(list(zip(number_of_neighbors, weighting_choice, training_accuracy, testing_accuracy)), key = lambda e:e[3], reverse = True)

print('Top 5 results, sorted by test accuracy:\n')
print(*combinations_sorted_knn[0:5], sep = "\n")

# save the best variables
knn_best_n_neighbors = combinations_sorted_knn[0][0]
knn_best_weights = combinations_sorted_knn[0][1]

## Play around with Neural Network

In [None]:
train_acc = []
test_acc = []
alpha_value = []
learning_rate_value = []


alphas = (0.0001, 0.001, 0.1, 0, 1, 5, 100)
learning_rates = ('constant', 'invscaling', 'adaptive')

for a in alphas:
    for l in learning_rates:
        mpl = MLPClassifier(alpha=a, learning_rate = l)
        mpl.fit(X_train, y_train)
        train_acc.append(accuracy_score(mpl.predict(X_train), y_train))
        test_acc.append(accuracy_score(mpl.predict(X_test), y_test))
        alpha_value.append(a)
        learning_rate_value.append(l)
        

combinations_sorted_mpl = sorted(list(zip(alpha_value, learning_rate_value, train_acc, test_acc)), key = lambda e:e[3], reverse = True)

print('Top 5 results, sorted by test accuracy:\n')
print(*combinations_sorted_mpl[0:5], sep = "\n")

# save the best variables
mpl_best_alpha = combinations_sorted_knn[0][0]
mpl_best_learning_rate = combinations_sorted_knn[0][1]

## Play around with Decision Tree

In [None]:
train_acc = []
test_acc = []
max_depth_value = []

for i in range(1,30):
    dt = DecisionTreeClassifier(max_depth = i, random_state=0)
    dt.fit(X_train, y_train)
    train_acc.append(accuracy_score(dt.predict(X_train), y_train))
    test_acc.append(accuracy_score(dt.predict(X_test), y_test))
    max_depth_value.append(i)

combinations_sorted_tree = sorted(list(zip(max_depth_value, train_acc, test_acc)), key = lambda e:e[2], reverse = True)

print('Top 5 results, sorted by test accuracy:\n   (Values: depth, training accracy, test accuracy)\n')
print(*combinations_sorted_tree[0:5], sep = "\n")

# save the best variable
tree_best_max_depth = combinations_sorted_tree[0][0]

## Play around with Random Forest

In [None]:
train_acc = []
test_acc = []
max_depth_value = []
criterion_value = []
max_features_value = []

criterions = ('gini', 'entropy')
number_of_features = range(1,6)

for i in range(1,9):
    for c in criterions:
        for f in number_of_features:
                rf = RandomForestClassifier(criterion = c, max_depth = i, max_features = f, random_state=0)
                rf.fit(X_train, y_train)
                train_acc.append(accuracy_score(rf.predict(X_train), y_train))
                test_acc.append(accuracy_score(rf.predict(X_test), y_test))
                max_depth_value.append(i)
                criterion_value.append(c)
                max_features_value.append(f)

combinations_sorted_forest = sorted(list(zip(max_features_value, criterion_value, max_depth_value, train_acc, test_acc)), key = lambda e:e[3], reverse = True)

print('Top 5 results, sorted by test accuracy:\n   (Values: depth, training accracy, test accuracy)\n')
print(*combinations_sorted_forest[0:5], sep = "\n")

# save the best variable


## *Here we should try XGBoost too*

In [8]:
#First we create dummy variables. 
# create dataset
df = pd.DataFrame(ufc_df)
  
# display dataset
print(df)
  
# create dymmy variables
df.drop(["R_fighter", "B_fighter", "location", "country"], axis=1)
df = pd.get_dummies(df)

df.head()

            R_fighter             B_fighter  R_odds  B_odds        R_ev  \
0         Jessica Eye      Cynthia Calvillo     120    -130  120.000000   
1       Karl Roberson        Marvin Vettori     210    -230  210.000000   
2        Charles Rosa         Kevin Aguilar     170    -185  170.000000   
3          Andre Fili      Charles Jourdain    -220     200   45.454545   
4     Jordan Espinosa       Mark De La Rosa    -167     157   59.880240   
...               ...                   ...     ...     ...         ...   
4296   Alessio Sakara           James Irvin    -120     100   83.333333   
4297       Clay Guida       Shannon Gugerty    -420     335   23.809524   
4298   Eliot Marshall  Vladimir Matyushenko     145    -165  145.000000   
4301   Brendan Schaub         Chase Gormley    -260     220   38.461538   
4303     Eric Schafer           Jason Brilz     140    -160  140.000000   

            B_ev       date                   location country Winner  ...  \
0      76.923077 2020

Unnamed: 0,R_odds,B_odds,R_ev,B_ev,date,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,...,gender_FEMALE,gender_MALE,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,120,-130,120.0,76.923077,2020-06-13,False,5,0,2,1,...,1,0,0,1,0,0,0,1,0,0
1,210,-230,210.0,43.478261,2020-06-13,False,3,0,2,1,...,0,1,0,0,1,0,0,0,1,0
2,170,-185,170.0,54.054054,2020-06-13,False,3,2,0,0,...,0,1,0,1,0,0,0,0,0,1
3,-220,200,45.454545,200.0,2020-06-13,False,3,0,1,0,...,0,1,0,0,0,1,0,1,0,0
4,-167,157,59.88024,157.0,2020-06-13,False,3,3,0,0,...,0,1,0,1,0,0,0,1,0,0


In [10]:
X = df[['R_Height_cms', 'B_Height_cms', 'R_Reach_cms', 'B_Reach_cms', 'R_Weight_lbs', 'B_Weight_lbs', 'R_age', 'B_age', 'R_current_win_streak', 'B_current_win_streak', 'R_avg_SIG_STR_landed', 'B_avg_SIG_STR_landed']]
y = df['Red']

KeyError: 'Red'

In [None]:
D_train = xgb.DMatrix(X_train, label=y_train)
D_test = xgb.DMatrix(X_test, label=y_test)

## Best combinations with each model

# print('KNN:', combinations_sorted_knn[0])
print('Neural Network:', combinations_sorted_mpl[0])
print('Decision Tree:', combinations_sorted_tree[0])
print('Random Forest:', combinations_sorted_forest[0])

# for now, neural network model has the highest test accuracy