In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from keras.models import Sequential
from keras.layers import Dense
from keras import regularizers

# import pydot
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

%matplotlib inline

Using TensorFlow backend.


In [2]:
# read data for fights and fighters
matches = pd.read_csv("ufc_bouts.csv", index_col = 0)
fighters = pd.read_csv("ufc_fighters.csv")

In [3]:
# check if there are fighters with the same name
fighters[fighters.duplicated(subset="name", keep=False)]

Unnamed: 0,fighter_id,name,win,lose,draw,nc,height,weight,reach,stance,dob,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg,last_updated
511,512,Michael McDonald,17,4,0,0,"5' 9""",135 lbs.,"70""",Orthodox,"Jan 15, 1991",2.69,42%,2.76,57%,1.09,66%,52%,1.4,2018-11-21 20:06:32.497948
513,514,Michael McDonald,1,1,0,0,"5' 11""",205 lbs.,,Orthodox,"Feb 06, 1965",0.0,0%,0.4,50%,0.0,0%,0%,0.0,2018-11-21 20:06:39.825372
845,845,Dong Hyun Kim,16,8,3,0,"5' 11""",155 lbs.,"70""",Orthodox,"Sep 09, 1988",3.77,45%,4.53,53%,1.67,60%,33%,0.0,2018-11-21 20:26:57.805101
852,852,Dong Hyun Kim,22,4,1,1,"6' 2""",170 lbs.,"76""",Southpaw,"Nov 17, 1981",2.12,49%,1.89,58%,2.93,43%,71%,0.6,2018-11-21 20:27:20.82459
946,947,Tony Johnson,11,3,0,0,"6' 1""",265 lbs.,,,,2.0,53%,4.73,31%,2.0,22%,0%,0.0,2018-11-21 20:33:09.727428
954,955,Tony Johnson,4,2,0,0,,185 lbs.,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0,2018-11-21 20:33:41.289451
1674,1674,Mike Davis,5,1,0,0,"6' 0""",145 lbs.,"72""",Orthodox,"Oct 07, 1992",5.13,49%,7.4,48%,0.0,0%,100%,0.0,2018-11-21 21:17:43.856431
1679,1679,Mike Davis,2,0,0,0,,,,,,0.0,0%,0.0,0%,0.0,0%,0%,0.0,2018-11-21 21:18:02.161503


In [4]:
# note that we got two Michael McDonalds, Dong Hyun Kims, and Tony Johnsons
# Fortunately, they belong to different weight classes
fighters.iloc[485, 1] = "Michael McDonald 205"
fighters.iloc[800, 1] = "Dong Hyun Kim 155"
fighters.iloc[903, 1] = "Tony Johnson 185"

In [5]:
# use fighter names as index
fighters.drop("fighter_id", axis=1, inplace=True)
fighters.set_index("name", inplace=True)

In [6]:
# clean up dataset, if all the eight indicators are zeros
# it means that there is no statistics for that fighter
fighers_clean = fighters.loc[~((fighters["SLpM"] == 0) &
                               (fighters["Str_Acc"] == "0%") & 
                               (fighters["SApM"] == 0) &
                               (fighters["Str_Def"] == "0%") &
                               (fighters["TD_Avg"] == 0) &
                               (fighters["TD_Acc"] == "0%") &
                               (fighters["TD_Def"] == "0%") &
                               (fighters["Sub_Avg"] == 0))].copy()                        

In [7]:
print("{0} fighers in total, after clean up: {1} fighers".format(len(fighters), len(fighers_clean)))

3187 fighers in total, after clean up: 2559 fighers


In [8]:
# add winning percentages
fighers_clean["win%"] = (fighers_clean["win"] / (fighers_clean["win"] +
                                                 fighers_clean["lose"] +
                                                 fighers_clean["draw"] +
                                                 fighers_clean["nc"]))
# change datatypes
percentages = ["Str_Acc", "Str_Def", "TD_Acc", "TD_Def"]
statistics = ["SLpM", "Str_Acc", "SApM", "Str_Def", "TD_Avg", "TD_Acc", "TD_Def", "Sub_Avg", "win%"]

fighers_clean.loc[:, percentages] = fighers_clean.loc[:, percentages].applymap(
    lambda x: x.replace("%", ""))

fighers_clean.loc[:, statistics] = fighers_clean.loc[:, statistics].astype(np.float32)

In [9]:
# remove other columns
fighers_clean = fighers_clean[statistics]

In [10]:
# make sure no missing values
fighers_clean.isnull().values.any()

False

In [11]:
fighers_clean.sample(10)

Unnamed: 0_level_0,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg,win%
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Fabio Mello,0.86,35.0,3.94,50.0,0.0,0.0,0.0,0.0,0.571429
Artur Oumakhanov,1.2,33.0,1.67,82.0,3.0,75.0,33.0,0.0,0.6
Idris Wasi,0.4,31.0,3.53,46.0,0.0,0.0,20.0,0.0,0.125
Ryo Kawamura,0.6,16.0,3.93,39.0,0.0,0.0,6.0,0.0,0.545455
Ivan Lopez,0.9,26.0,2.24,65.0,0.0,0.0,30.0,0.0,0.8
Fabricio Camoes,1.29,33.0,1.85,61.0,2.34,32.0,20.0,1.5,0.583333
Cain Carrizosa,2.72,42.0,5.8,51.0,0.0,0.0,14.0,0.0,0.75
Roman Zentsov,1.18,36.0,0.53,70.0,1.6,100.0,33.0,3.2,0.580645
Keita Nakamura,2.21,36.0,3.46,58.0,2.0,55.0,62.0,0.4,0.733333
Alan Jouban,5.41,53.0,3.48,60.0,0.52,50.0,62.0,0.0,0.727273


In [12]:
# use matches that have a winner, ignore nc, draws
matches_clean = matches.loc[matches["result"] == "win"].copy()

# remove unnecessary columns
matches_clean = matches_clean.loc[:, ["fighter1", "fighter2", "winner", "weight_class"]]
matches_clean.reset_index(drop=True, inplace=True)
matches_clean.head()

Unnamed: 0,fighter1,fighter2,winner,weight_class
0,Santiago Ponzinibbio,Neil Magny,Santiago Ponzinibbio,Welterweight
1,Ricardo Lamas,Darren Elkins,Ricardo Lamas,Featherweight
2,Johnny Walker,Khalil Rountree Jr.,Johnny Walker,Light Heavyweight
3,Ian Heinisch,Cezar Ferreira,Ian Heinisch,Middleweight
4,Marlon Vera,Guido Cannetti,Marlon Vera,Bantamweight


In [13]:
# note that winner is the same as figher1
matches_clean["fighter1"].equals(matches_clean["winner"])

True

In [14]:
# randomly swap figher1 and fighter2 for about half of the matches
# Since in the original dataset, the winner is always treated as fighter 1, 
# we will need to randomly swap fighter1 and fighter2 for half of 
# the dataset in order to make the predictions work.
swap_indices = np.random.choice(len(matches_clean), size = int(len(matches_clean) / 2), replace = False)
matches_clean.iloc[swap_indices, [0, 1]] = matches_clean.iloc[swap_indices, [1, 0]].values

In [15]:
matches_clean.head(10)

Unnamed: 0,fighter1,fighter2,winner,weight_class
0,Neil Magny,Santiago Ponzinibbio,Santiago Ponzinibbio,Welterweight
1,Darren Elkins,Ricardo Lamas,Ricardo Lamas,Featherweight
2,Johnny Walker,Khalil Rountree Jr.,Johnny Walker,Light Heavyweight
3,Ian Heinisch,Cezar Ferreira,Ian Heinisch,Middleweight
4,Marlon Vera,Guido Cannetti,Marlon Vera,Bantamweight
5,Poliana Botelho,Cynthia Calvillo,Cynthia Calvillo,Women's Strawweight
6,Michel Prazeres,Bartosz Fabinski,Michel Prazeres,Welterweight
7,Ulka Sasaki,Alexandre Pantoja,Alexandre Pantoja,Flyweight
8,Austin Arnett,Humberto Bandenay,Austin Arnett,Featherweight
9,Laureano Staropoli,Hector Aldana,Laureano Staropoli,Welterweight


In [16]:
# handel duplicate names
for col in ["fighter1", "fighter2", "winner"]:
    matches_clean[col][(matches_clean[col] == "Michael McDonald") &
                       (matches_clean["weight_class"] == "Light Heavyweight")] = "Michael McDonald 205"
    
    matches_clean[col][(matches_clean[col] == "Dong Hyun Kim") &
                       (matches_clean["weight_class"] == "Lightweight")] = "Dong Hyun Kim 155"
    
    matches_clean[col][(matches_clean[col] == "Tony Johnson") &
                       (matches_clean["weight_class"] == "Middleweight")] = "Tony Johnson 185"

In [17]:
# remove weight_class column
matches_clean.drop("weight_class", axis=1, inplace=True)

# binary results
matches_clean["result"] = (matches_clean["winner"] == matches_clean["fighter1"]).astype("int")
matches_clean.drop("winner", axis=1, inplace=True)

In [18]:
# combine the two
# make sure all fighters are in the figher database
# otherwise remove the corresponding match
all_fighter_names = fighers_clean.index.values.tolist()

matches_clean = matches_clean.loc[(matches_clean["fighter1"].isin(all_fighter_names)) &
                                  (matches_clean["fighter2"].isin(all_fighter_names))].copy()

# use fighter 1 - fighter 2 (the differences) for learning
for col in statistics:
    matches_clean[col] = matches_clean.apply(
        lambda row: fighers_clean.loc[row["fighter1"], col] - fighers_clean.loc[row["fighter2"], col], axis=1)
    
# drop fighter1 and fighter2 columns
matches_clean.drop(["fighter1", "fighter2"], axis=1, inplace=True)

In [19]:
matches_clean.head()

Unnamed: 0,result,SLpM,Str_Acc,SApM,Str_Def,TD_Avg,TD_Acc,TD_Def,Sub_Avg,win%
0,0,-0.41,7,-1.8,-8,2.08,2,0,0.3,-0.179088
1,0,0.49,-8,0.22,-6,1.02,3,10,0.3,0.0504808
2,1,3.07,36,0.0699999,-13,0.89,100,50,2.3,0.166667
3,1,1.66,16,-0.47,-3,-2.16,-45,-40,1.1,0.25641
4,1,0.43,-8,1.98,1,-2.02,-13,-7,1.0,0.051282


In [20]:
matches_clean.isnull().values.any()

False

In [21]:
# get ready for deep learning

# X, y = matches_clean.iloc[:, 1:], matches_clean.iloc[:, 0]
X = matches_clean.drop("result", axis=1)
y = matches_clean["result"]

print(X.shape, y.shape)

# from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# # normalization
from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
# print(X_train)

(4553, 9) (4553,)


In [22]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [25]:
# X_train = np.asarray(list(X_train), dtype=np.float)
# X_train = np.asarray(list(X_train))
# X_train = pd.to_numeric(X_train, errors='coerce')
# X_train.isnull().values.any()
# X_scaler = StandardScaler().fit(X_train)

print(X_train)

# X_train.describe()



ValueError: setting an array element with a sequence.

In [None]:
model = Sequential()

model.add(Dense(16, input_dim=X_train_scaled.shape[1],
                activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])

In [None]:
model.fit(x=X_train_scaled, y=y_train, epochs=200, batch_size=64, verbose=0)
test_results = model.evaluate(x = X_test_scaled, y = y_test, verbose=0)
print("Test Accuracy = {}".format(test_results[1]))

In [None]:
# function for predictions
def predict(model, figher1, figher2):
    data1 = fighers_clean.loc[figher1, statistics]
    data2 = fighers_clean.loc[figher2, statistics]
    data_diff = (data1 - data2).values.reshape(1, -1)
    data_diff_scaled = scaler.transform(data_diff)
    return model.predict(data_diff_scaled)

print(predict(model, "Cub Swanson", "Brian Ortega"))