In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
plt.style.use('ggplot') # Apply matplotlib's built-in 'ggplot' style sheet to the figures drawn below

import statsmodels.formula.api as smf

In [31]:
# Load the raw fight data (path is relative to the notebook's working directory).
UFC = pd.read_csv('Data/2a) UFC Fights.csv')

# Replace spaces in column names with underscores so every column can be
# reached with attribute access (UFC.SOME_COLUMN), as the code below does.
UFC.columns = [c.replace(' ', '_') for c in UFC.columns]

# Binary target: win = 1, every other listed result = 0.
# Series.map yields NaN for any label that is not a key of the dict.
outcome_codes = {'win': 1, 'loss': 0, 'draw': 0, 'NC': 0}
UFC['OUTCOME1'] = UFC.OUTCOME.map(outcome_codes)

# Encode stance and stance difference as 1/0.
# First check which stances occur.
# print(x) with a single argument behaves the same on Python 2 and Python 3.
print(UFC.STANCE.value_counts())

# Orthodox = 1, every other listed stance = 0 (unlisted / missing stays NaN).
stance_codes = {
    'Orthodox': 1,
    'Southpaw': 0,
    'Switch': 0,
    'Open Stance': 0,
    'Sideways': 0,
}
UFC['STANCE1'] = UFC.STANCE.map(stance_codes)

# Same stance = 0, different stance = 1.
stance_diff_codes = {'Same Stance': 0, 'Different Stance': 1}
UFC['STANCE_DIFF1'] = UFC.STANCE_DIFF.map(stance_diff_codes)

# Column names with their dtypes, then (rows, columns).
print(UFC.dtypes)
print(UFC.shape)

Orthodox       5338
Southpaw       1350
Switch          161
Open Stance      24
Sideways          5
Name: STANCE, dtype: int64
FIGHT_ORDER                 int64
KEY                        object
ROUND_FORMAT               object
EVENT_TYPE                 object
EVENT_NAME                 object
EVENT_YEAR                  int64
EVENT_MONTH                 int64
EVENT_DAY                   int64
LOCATION_COUNTRY           object
LOCATION_CITY              object
VENUE                      object
ATTENDANCE                float64
MAIN_FIGHT                 object
MID                         int64
WEIGHT_CLASS               object
REF                        object
METHOD                     object
METHOD_D                   object
ROUND                     float64
FINISH_ROUND_MIN          float64
FINISH_ROUND_SEC          float64
FINISH_ROUND_INSECONDS    float64
TOTAL_INSECONDS           float64
FIGHTER_NAME               object
FIGHER_NO                  object
OUTCOME                

In [32]:
# Business filters.

# Exclude single-round fights: keep only rows flagged as multi-round.
is_multi_round = UFC.ROUND_FORMAT == 'MULTI-ROUND'

# How many rows does each weight class have before any class is excluded?
print(UFC[is_multi_round].WEIGHT_CLASS.value_counts())

# From the initial investigation we exclude the women's classes, Catch Weight
# and Super Heavyweight.
excluded_classes = [
    'Super Heavyweight',
    'Catch Weight',
    'Womens Bantamweight',
    'Womens Strawweight',
]

# `~` is the logical NOT for a boolean Series. The unary minus used previously
# raises a TypeError on current NumPy/pandas.
# isin() is False for NaN, so its negation would keep rows with a missing
# weight class; the explicit notnull() drops them.
is_kept_class = (~UFC.WEIGHT_CLASS.isin(excluded_classes)) & UFC.WEIGHT_CLASS.notnull()

UFC1 = UFC[is_multi_round & is_kept_class]
print(UFC1.WEIGHT_CLASS.value_counts())

Lightweight            1434
Welterweight           1400
Middleweight           1094
Light Heavyweight       726
Heavyweight             676
Featherweight           546
Bantamweight            434
Flyweight               212
Womens Bantamweight     110
Womens Strawweight       76
Catch Weight             28
Super Heavyweight         2
Name: WEIGHT_CLASS, dtype: int64
Lightweight          1434
Welterweight         1400
Middleweight         1094
Light Heavyweight     726
Heavyweight           676
Featherweight         546
Bantamweight          434
Flyweight             212
Name: WEIGHT_CLASS, dtype: int64


In [33]:
# Check out missing values.
#print(UFC1.isnull().sum())

# We need to decide whether to fill the missing values or just exclude them.
# First see how much data is lost if every row with any missing value is dropped.
UFC1a = UFC1.dropna()
print(UFC1.shape)
print(UFC1a.shape)

# Roughly half the observations are dropped. Before checking by weight class,
# see whether the nickname carries any signal; otherwise drop it, since it
# accounts for most of the missing values.
# NOTE(review): this expression is not the last one in the cell, so its value is
# never displayed; wrap it in print() to see the count.
UFC1.NICK.isnull().sum()
#print(UFC1[(UFC1.NICK.isnull())&(UFC1.OUTCOME=='win')].shape)
#print(UFC1[(UFC1.NICK.notnull())&(UFC1.OUTCOME=='win')].shape)
# Nickname seems irrelevant: remove the column, then drop incomplete rows again
# to keep more data points.

# axis is passed by keyword: the positional form drop('NICK', 1) is deprecated
# and no longer accepted by pandas 2.x.
UFC1b = UFC1.drop('NICK', axis=1).dropna()
print(UFC1.shape)
print(UFC1a.shape)
print(UFC1b.shape)

# How much remains in each weight class?
print(UFC1b.WEIGHT_CLASS.value_counts())

(6522, 55)
(3520, 55)
(6522, 55)
(3520, 55)
(4694, 54)
Lightweight          1074
Welterweight          975
Middleweight          748
Light Heavyweight     496
Featherweight         460
Heavyweight           411
Bantamweight          339
Flyweight             191
Name: WEIGHT_CLASS, dtype: int64


In [37]:
# Let's now look at the correlation matrix for each class - a single matrix over
# all classes won't mean much, as fighters fight at different weight classes.

# Weight class        Upper weight limit
# Strawweight         115 lb (52.2 kg; 8.2 st)
# Flyweight           125 lb (56.7 kg; 8.9 st)
# Bantamweight        135 lb (61.2 kg; 9.6 st)
# Featherweight       145 lb (65.8 kg; 10.4 st)
# Lightweight         155 lb (70.3 kg; 11.1 st)
# Welterweight        170 lb (77.1 kg; 12.1 st)
# Middleweight        185 lb (83.9 kg; 13.2 st)
# Light Heavyweight   205 lb (93.0 kg; 14.6 st)
# Heavyweight         265 lb (120.2 kg; 18.9 st)
# Super Heavyweight   N/A

# For each class, look at which variables correlate highest with the outcome
# and use this to build the first-generation model.
# We also need to check whether these variables are correlated with each other.


def _numeric_only(frame):
    """Return only the numeric and boolean columns of ``frame``.

    DataFrame.corr() raises on pandas >= 2.0 when the frame still holds object
    columns, while older versions silently ignored them. Selecting the columns
    explicitly gives the same correlation matrix on both.
    """
    return frame.select_dtypes(include=[np.number, 'bool'])


def _outcome_corr(frame, weight_class):
    """Correlation of each numeric column with OUTCOME1 within one weight class.

    frame        -- DataFrame with WEIGHT_CLASS and OUTCOME1 columns.
    weight_class -- WEIGHT_CLASS label to keep, e.g. 'Flyweight'.
    Returns the OUTCOME1 column of the correlation matrix, as a Series.
    """
    in_class = frame[frame.WEIGHT_CLASS == weight_class]
    return _numeric_only(in_class).corr().OUTCOME1


FLYCORR = _outcome_corr(UFC1b, 'Flyweight')
#FLYCORR

# NOTE(review): despite its name, FLYCORR1 is not restricted to Flyweight - it
# takes columns 34..53 (by position) of the whole UFC1b frame.
FLYCORR1 = UFC1b.iloc[:, list(range(34, 54))]
_numeric_only(FLYCORR1).corr()

#BANTCORR = _outcome_corr(UFC1b, 'Bantamweight')
#BANTCORR

#FEATCORR = _outcome_corr(UFC1b, 'Featherweight')
#FEATCORR
#weight difference seems to be important in this class!!!

#LIGHTCORR = _outcome_corr(UFC1b, 'Lightweight')
#LIGHTCORR

#WELTCORR = _outcome_corr(UFC1b, 'Welterweight')
#WELTCORR

#MIDDLECORR = _outcome_corr(UFC1b, 'Middleweight')
#LHCORR = _outcome_corr(UFC1b, 'Light Heavyweight')
#HEAVYCORR = _outcome_corr(UFC1b, 'Heavyweight')





FIGHT_ORDER               0.009309
EVENT_YEAR                0.004417
EVENT_MONTH               0.024460
EVENT_DAY                 0.009076
ATTENDANCE               -0.016541
MID                       0.015229
ROUND                    -0.003581
FINISH_ROUND_MIN         -0.033595
FINISH_ROUND_SEC          0.017719
FINISH_ROUND_INSECONDS   -0.033698
TOTAL_INSECONDS          -0.010126
FIGHTER_ID                0.057751
STR                       0.292652
TD                        0.316421
SUB                       0.302158
PASS                      0.319390
BIRTH_YEAR                0.113906
AGE_AT_FIGHT             -0.115286
HEIGHT_CM                -0.163583
WEIGHT_KG                 0.025946
REACH_INCH               -0.074713
WIN_AT_FIGHT              0.166004
NOTWIN_AT_FIGHT          -0.003696
TOTAL_AT_FIGHT            0.124945
WINRATIO_AT_FIGHT         0.134179
AGE_DIFF                 -0.192418
HEIGHT_DIFF              -0.248488
WEIGHT_DIFF               0.022338
REACH_DIFF          

Unnamed: 0,BIRTH_YEAR,AGE_AT_FIGHT,HEIGHT_CM,WEIGHT_KG,REACH_INCH,WIN_AT_FIGHT,NOTWIN_AT_FIGHT,TOTAL_AT_FIGHT,WINRATIO_AT_FIGHT,AGE_DIFF,HEIGHT_DIFF,WEIGHT_DIFF,REACH_DIFF,EXP_DIFF,WINRATIO_DIFF,OUTCOME1,STANCE1,STANCE_DIFF1
BIRTH_YEAR,1.0,-0.813834,-0.179348,-0.34294,-0.145516,-0.2657,-0.271848,-0.292103,-0.090887,-0.553699,0.101719,0.029312,0.103448,-0.140256,0.01514,0.06806,0.029785,-0.008787
AGE_AT_FIGHT,-0.813834,1.0,0.10097,0.260969,0.074624,0.33403,0.39085,0.386405,0.067017,0.628491,-0.117356,-0.031521,-0.116145,0.162265,-0.016881,-0.079608,-0.040784,0.0183
HEIGHT_CM,-0.179348,0.10097,1.0,0.769471,0.86272,0.107238,0.019394,0.082601,0.091755,-0.081095,0.411222,0.102313,0.26735,-0.000532,0.009579,0.007238,0.006451,-0.008449
WEIGHT_KG,-0.34294,0.260969,0.769471,1.0,0.726451,0.148159,0.068507,0.130419,0.107469,-0.014541,0.049477,0.185512,0.045641,0.009121,0.005965,0.013881,0.014576,-0.009521
REACH_INCH,-0.145516,0.074624,0.86272,0.726451,1.0,0.115793,0.003397,0.082335,0.103693,-0.089035,0.306579,0.112938,0.463196,-0.002649,0.020534,0.031727,0.01902,-0.002086
WIN_AT_FIGHT,-0.2657,0.33403,0.107238,0.148159,0.115793,1.0,0.654623,0.955379,0.475927,0.105603,0.010735,0.017335,0.00981,0.513203,0.16741,0.030761,-0.08742,0.054838
NOTWIN_AT_FIGHT,-0.271848,0.39085,0.019394,0.068507,0.003397,0.654623,1.0,0.84871,0.019109,0.182812,-0.025987,0.021473,-0.039378,0.534499,-0.050208,-0.023879,0.007977,0.009162
TOTAL_AT_FIGHT,-0.292103,0.386405,0.082601,0.130419,0.082335,0.955379,0.84871,1.0,0.34042,0.145311,-0.002644,0.020518,-0.008523,0.567883,0.0975,0.01219,-0.058041,0.041944
WINRATIO_AT_FIGHT,-0.090887,0.067017,0.091755,0.107469,0.103693,0.475927,0.019109,0.34042,1.0,-0.016936,0.012298,0.013332,0.021109,0.095237,0.567596,0.050039,-0.044282,0.037374
AGE_DIFF,-0.553699,0.628491,-0.081095,-0.014541,-0.089035,0.105603,0.182812,0.145311,-0.016936,1.0,-0.188113,-0.046978,-0.18091,0.255797,-0.026749,-0.123812,-0.04745,-0.001053
