In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from cleanser import *

In [3]:
fights = pd.read_csv('raw_total_fight_data.csv', sep=';')

In [4]:
cleanser = RawFightsCleanser()

In [5]:
fights = cleanser.cleanse_column_names(fights)

In [6]:
fights = cleanser.split_composite_columns(fights)

In [7]:
# TODO:  further consolidation of setup and cleansing, so less copy / paste

In [8]:
def find_loser(row):
    if row.winner == row.r_fighter:
        return row.b_fighter
    elif row.winner == row.b_fighter:
        return row.r_fighter
    else:
        return 'None'

In [9]:
fights['loser'] = fights.apply(find_loser, axis=1)

In [10]:
def winner_b_r(row):
    if row['winner'] == row.r_fighter:
        return 'r'
    elif row['winner'] == row.b_fighter:
        return 'b'
    else:
        return 'None' 
    

In [11]:
fights['r_b_winner'] = fights.apply(lambda row: winner_b_r(row), axis=1)

In [12]:
fights['r_won'] = fights.r_b_winner.apply(lambda x: 1 if x == 'r' else 0)

In [13]:
fights.head()[['r_fighter', 'b_fighter', 'winner', 'r_b_winner']]

Unnamed: 0,r_fighter,b_fighter,winner,r_b_winner
0,Henry Cejudo,Marlon Moraes,Henry Cejudo,r
1,Valentina Shevchenko,Jessica Eye,Valentina Shevchenko,r
2,Tony Ferguson,Donald Cerrone,Tony Ferguson,r
3,Jimmie Rivera,Petr Yan,Petr Yan,b
4,Tai Tuivasa,Blagoy Ivanov,Blagoy Ivanov,b


In [14]:
# See if we can use the data of the fight to predict if red or blue won.  This won't help us predict who will win before
# the fight, but it is a good ML exercise, and could reveal something about what is effective, and may also help predict
# the winner during the fight and the stats start tom come in

In [15]:
# Start with logistic regression.
# Start with using the diff columns that represent the difference in successful strikes etc. between R and B.

In [16]:
diff_columns = [c for c in fights.columns if '_diff' in c]
diff_columns

['sig_str_diff',
 'total_str_diff',
 'td_diff',
 'head_diff',
 'body_diff',
 'leg_diff',
 'distance_diff',
 'clinch_diff',
 'ground_diff']

In [22]:
diffs = fights[diff_columns + ['r_won']]
diffs.head(10)

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff,r_won
0,33,40,1,38,6,-11,-9,17,25,1
1,6,-5,2,4,4,-2,3,2,1,1
2,36,36,-1,22,10,4,35,1,0,1
3,17,18,-1,2,2,13,18,-1,0,0
4,-9,-15,-2,-26,-1,18,-12,9,-6,0
5,27,41,4,-2,7,22,3,3,21,1
6,69,67,0,79,7,-17,70,-1,0,1
7,-58,-71,0,-37,-6,-15,-44,-14,0,0
8,-10,-10,0,-9,-7,6,-4,-2,-4,0
9,23,27,0,20,-10,13,27,-5,1,1


In [23]:
# I think you need to "scale" or "normalize" variables for logistic regression.
# From the graphs in the exploratory notebook, all the diff columns look like bell curves
# We'll try both ways and see which is better.

In [24]:
scaled_diffs = diffs.copy()

In [25]:
scaled_diffs.head()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff,r_won
0,33,40,1,38,6,-11,-9,17,25,1
1,6,-5,2,4,4,-2,3,2,1,1
2,36,36,-1,22,10,4,35,1,0,1
3,17,18,-1,2,2,13,18,-1,0,0
4,-9,-15,-2,-26,-1,18,-12,9,-6,0


In [26]:
scaled_diffs.describe()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff,r_won
count,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0
mean,5.174572,9.611003,0.349533,3.732309,0.850505,0.591757,1.859642,0.614502,2.700428,0.674572
std,24.959977,45.75649,2.506284,19.704231,8.030155,8.417927,16.4406,7.444167,12.987118,0.46858
min,-119.0,-276.0,-12.0,-115.0,-85.0,-62.0,-117.0,-54.0,-94.0,0.0
25%,-7.0,-10.0,-1.0,-5.0,-2.0,-2.0,-4.0,-2.0,-2.0,0.0
50%,4.0,6.0,0.0,3.0,0.0,0.0,1.0,0.0,1.0,1.0
75%,17.25,30.0,1.0,13.0,4.0,3.0,7.0,3.0,7.0,1.0
max,180.0,315.0,20.0,157.0,46.0,61.0,180.0,66.0,88.0,1.0


In [27]:
scaled_diffs.columns

Index(['sig_str_diff', 'total_str_diff', 'td_diff', 'head_diff', 'body_diff',
       'leg_diff', 'distance_diff', 'clinch_diff', 'ground_diff', 'r_won'],
      dtype='object')

In [28]:
diff_columns

['sig_str_diff',
 'total_str_diff',
 'td_diff',
 'head_diff',
 'body_diff',
 'leg_diff',
 'distance_diff',
 'clinch_diff',
 'ground_diff']

In [29]:
for column in diff_columns:
    #col = scaled_diffs[column]
    mean = scaled_diffs[column].mean()
    sd = scaled_diffs[column].std()
    scaled_diffs[column] = scaled_diffs[column].apply(lambda x: (x - mean) / sd)

In [30]:
scaled_diffs.td_diff.mean()

-1.1050431349301714e-17

In [31]:
diffs.td_diff.mean()

0.3495334370139969

In [32]:
scaled_diffs.describe()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff,r_won
count,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0,5144.0
mean,2.7626080000000002e-17,0.0,-1.1050430000000002e-17,-1.1050430000000002e-17,-2.762608e-18,6.9065200000000004e-18,5.525216e-18,5.525216e-18,-1.1050430000000002e-17,0.674572
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.46858
min,-4.974947,-6.241978,-4.927428,-6.025727,-10.69102,-7.435531,-7.229641,-7.33655,-7.445872,0.0
25%,-0.4877638,-0.428595,-0.53846,-0.4431693,-0.3549752,-0.3078855,-0.3564129,-0.3512149,-0.36193,0.0
50%,-0.04705823,-0.078918,-0.1394628,-0.03716509,-0.105914,-0.07029728,-0.05228777,-0.08254816,-0.1309319,1.0
75%,0.4837916,0.445598,0.2595343,0.4703401,0.3922085,0.286085,0.3126624,0.3204519,0.3310644,1.0
max,7.00423,6.674223,7.84048,7.778415,5.622494,7.176142,10.83539,8.783454,6.568014,1.0


In [33]:
scaled_diffs.head()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff,r_won
0,1.114802,0.664146,0.259534,1.739103,0.64127,-1.377032,-0.660538,2.201119,1.717053,1
1,0.03307,-0.319321,0.658531,0.013585,0.392208,-0.307885,0.069362,0.186119,-0.130932,1
2,1.234994,0.576727,-0.53846,0.927095,1.139392,0.404879,2.015763,0.051785,-0.207931,1
3,0.473776,0.18334,-0.53846,-0.087916,0.143147,1.474026,0.981738,-0.216882,-0.207931,0
4,-0.567892,-0.537869,-0.937457,-1.50893,-0.230445,2.067996,-0.843013,1.126452,-0.669927,0


In [34]:
(1 -.349) / 2.5

0.2604

In [35]:
(3 - 1.859642) / 16.44

0.06936484184914841

In [36]:
# A couple of them look like they were correctly converted.
# Now I think we should convert the target column to 1 / 0 because I think that is what logistic regression wants.

In [39]:
scaled_diffs.head(25)[['r_won']]

Unnamed: 0,r_won
0,1
1,1
2,1
3,0
4,0
5,1
6,1
7,0
8,0
9,1


In [41]:
scaled_diffs.head()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff,r_won
0,1.114802,0.664146,0.259534,1.739103,0.64127,-1.377032,-0.660538,2.201119,1.717053,1
1,0.03307,-0.319321,0.658531,0.013585,0.392208,-0.307885,0.069362,0.186119,-0.130932,1
2,1.234994,0.576727,-0.53846,0.927095,1.139392,0.404879,2.015763,0.051785,-0.207931,1
3,0.473776,0.18334,-0.53846,-0.087916,0.143147,1.474026,0.981738,-0.216882,-0.207931,0
4,-0.567892,-0.537869,-0.937457,-1.50893,-0.230445,2.067996,-0.843013,1.126452,-0.669927,0


In [42]:
# => Now we have a data frame that is scaled or normalized or whatever you call it, and has a 1 / 0 target column.
# Now to figure out logistic regression.

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
x_train, x_test, y_train, y_test = train_test_split(scaled_diffs[['sig_str_diff', 'total_str_diff', 'td_diff', 'head_diff', 'body_diff',
       'leg_diff', 'distance_diff', 'clinch_diff', 'ground_diff']], scaled_diffs['r_won'], test_size=.33, random_state=1)

In [45]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3446, 9), (1698, 9), (3446,), (1698,))

In [46]:
x_train.head()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff
4375,0.553904,0.511162,-0.139463,0.876344,0.018617,-0.42668,-0.052288,0.186119,1.024059
398,-0.327507,-0.297466,-0.139463,-0.189417,-0.479506,-0.070297,-0.356413,-0.082548,-0.130932
4908,-0.087122,0.554872,0.658531,0.064336,-0.354975,-0.070297,-0.173938,-0.082548,0.100066
797,-1.088726,-0.690853,-0.139463,-0.950674,-2.721057,1.59282,-1.207963,-0.485548,-0.284931
1877,-0.64802,-0.494159,-0.53846,-0.696922,0.143147,-0.42668,-0.356413,-0.216882,-0.669927


In [47]:
x_test.head()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff
4227,0.433711,0.117776,-0.139463,0.41959,0.267678,0.048497,0.069362,1.529452,-0.130932
3712,0.233391,0.030356,-0.139463,1.028596,0.516739,-2.208591,0.495137,0.051785,-0.207931
5112,0.113198,-0.078918,-0.53846,-0.138666,-0.105914,0.761261,0.008537,-0.082548,0.254065
4250,-0.207315,-0.231902,0.259534,-0.138666,-0.105914,-0.189091,-0.113113,-0.082548,-0.207931
457,-1.529431,-1.390207,-0.53846,-1.305928,-1.226689,-0.307885,-1.086313,-1.694549,-0.592928


In [48]:
y_train.head()

4375    1
398     1
4908    1
797     0
1877    0
Name: r_won, dtype: int64

In [49]:
y_test.head()

4227    1
3712    1
5112    1
4250    1
457     0
Name: r_won, dtype: int64

In [50]:
from sklearn.linear_model import LogisticRegression

In [51]:
lr = LogisticRegression()

In [52]:
lr.fit(x_train, y_train)

LogisticRegression()

In [53]:
predictions = lr.predict(x_test)

In [54]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [55]:
confusion_matrix(y_test, predictions)

array([[ 346,  197],
       [  77, 1078]], dtype=int64)

In [56]:
accuracy_score(y_test, predictions)

0.8386336866902238

In [57]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.82      0.64      0.72       543
           1       0.85      0.93      0.89      1155

    accuracy                           0.84      1698
   macro avg       0.83      0.79      0.80      1698
weighted avg       0.84      0.84      0.83      1698



In [76]:
scaled_diffs.r_won.value_counts() / scaled_diffs.shape[0]

1    0.674572
0    0.325428
Name: r_won, dtype: float64

In [77]:
fights.r_b_winner.value_counts() / fights.shape[0]

r       0.674572
b       0.309292
None    0.016135
Name: r_b_winner, dtype: float64

In [78]:
pd.Series(predictions).value_counts() / len(predictions)

1    0.750883
0    0.249117
dtype: float64

In [61]:
# The red winner won about 67.5 % of the time.  So if we were to guess "Red" as the winner all the time, we would be right
# about 67.4 % of the time.
# The logistic regression model was correct about 83.9% of the time.  So it did better than just guessing.

In [62]:
# Let's try to run a logistic regression on the original diffs df.

In [63]:
lr2 = LogisticRegression()

In [64]:
# using diffs, not scaled_diffs
x_train2, x_test2, y_train2, y_test2 = train_test_split(diffs[['sig_str_diff', 'total_str_diff', 'td_diff', 'head_diff', 'body_diff',
       'leg_diff', 'distance_diff', 'clinch_diff', 'ground_diff']], diffs['r_won'], test_size=.33, random_state=1)

In [65]:
x_train2.shape, x_test2.shape, y_train2.shape, y_test2.shape

((3446, 9), (1698, 9), (3446,), (1698,))

In [66]:
x_train2.head()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff
4375,19,33,0,21,1,-3,1,2,16
398,-3,-4,0,0,-3,0,-4,0,1
4908,3,35,2,5,-2,0,-1,0,4
797,-22,-22,0,-15,-21,14,-18,-3,-1
1877,-11,-13,-1,-10,2,-3,-4,-1,-6


In [67]:
lr2.fit(x_train2, y_train2)

LogisticRegression()

In [68]:
predictions2 = lr.predict(x_test2)

In [69]:
confusion_matrix(y_test2, predictions2)

array([[426, 117],
       [179, 976]], dtype=int64)

In [70]:
accuracy_score(y_test2, predictions2)

0.8256772673733804

In [71]:
print(classification_report(y_test2, predictions2))

              precision    recall  f1-score   support

           0       0.70      0.78      0.74       543
           1       0.89      0.85      0.87      1155

    accuracy                           0.83      1698
   macro avg       0.80      0.81      0.81      1698
weighted avg       0.83      0.83      0.83      1698



In [72]:
pd.Series(predictions2).value_counts() / len(predictions2)

1    0.643698
0    0.356302
dtype: float64

In [73]:
scaled_diffs.r_won.value_counts() / scaled_diffs.shape[0]

1    0.674572
0    0.325428
Name: r_won, dtype: float64

In [65]:
# That is interesting.  Using the non scaled version is almost as accurate as using the scaled version.

In [79]:
# reminder of the predictions histogram
pd.Series(predictions).value_counts() / len(predictions)

1    0.750883
0    0.249117
dtype: float64

In [None]:
# Se the first one was more skewed toward predicting "r won".  The second was closer to the actual ratio of red to blue, while still being almost as accurate.
# Something about that seems better to me.  But I could be wrong.