In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
plt.style.use('ggplot') # This styles the graphs in a nicer format

import statsmodels.formula.api as smf

In [3]:
# Load the raw fight-level data.
UFC = pd.read_csv('Data/2a) UFC Fights.csv')

# Replace spaces in the column names with underscores.
UFC.columns = [c.replace(' ', '_') for c in UFC.columns]

# Binary target: win = 1, everything else 0.
# An equality test is used instead of a label -> value map so that any outcome
# label other than 'win' (including missing or unexpected ones) becomes 0
# rather than silently turning into NaN.
UFC['OUTCOME1'] = (UFC['OUTCOME'] == 'win').astype(int)

# Only the last expression of a cell is displayed, so a bare `UFC.dtypes` here
# would show nothing; inspect the column names and types in a separate cell.
UFC.shape

(7138, 53)

In [4]:
# Business filters
# Exclude single-round fights: keep the multi-round format only.
UFC_MULTI = UFC[UFC.ROUND_FORMAT == 'MULTI-ROUND']
# See how many fights there are for each weight class.
print(UFC_MULTI.WEIGHT_CLASS.value_counts())

# From the initial investigation we exclude the Women's classes, Catch Weight
# and Super Heavyweight.
EXCLUDED_CLASSES = ['Super Heavyweight', 'Catch Weight',
                    'Womens Bantamweight', 'Womens Strawweight']
# `~` is the element-wise boolean NOT; unary `-` on a boolean Series is not
# supported by current numpy/pandas. isin() is False for missing values, so the
# negated mask would keep them -- notnull() removes those rows explicitly.
keep_class = (~UFC_MULTI.WEIGHT_CLASS.isin(EXCLUDED_CLASSES)) & UFC_MULTI.WEIGHT_CLASS.notnull()
UFC1 = UFC_MULTI[keep_class]
print(UFC1.WEIGHT_CLASS.value_counts())

Lightweight            1434
Welterweight           1400
Middleweight           1094
Light Heavyweight       726
Heavyweight             676
Featherweight           546
Bantamweight            434
Flyweight               212
Womens Bantamweight     110
Womens Strawweight       76
Catch Weight             28
Super Heavyweight         2
Name: WEIGHT_CLASS, dtype: int64
Lightweight          1434
Welterweight         1400
Middleweight         1094
Light Heavyweight     726
Heavyweight           676
Featherweight         546
Bantamweight          434
Flyweight             212
Name: WEIGHT_CLASS, dtype: int64


In [5]:
# Check out missing values.
#print(UFC1.isnull().sum())

# We need to decide whether to fill the missing values or just exclude them.
# First see how much data is lost if every observation with any missing value
# is dropped.
UFC1a = UFC1.dropna()
print(UFC1.shape)
print(UFC1a.shape)

# Roughly half the observations are dropped. Before checking by weight class,
# check whether the nickname means anything; otherwise drop it, as it makes up
# most of the missing values.
#print(UFC1.NICK.isnull().sum())
#print(UFC1[(UFC1.NICK.isnull())&(UFC1.OUTCOME=='win')].shape)
#print(UFC1[(UFC1.NICK.notnull())&(UFC1.OUTCOME=='win')].shape)
# It seems irrelevant, so remove the column and exclude again to keep more
# data points.

# axis is passed by keyword: the positional form drop('NICK', 1) is not
# accepted by newer pandas releases.
UFC1b = UFC1.drop('NICK', axis=1).dropna()
print(UFC1.shape)
print(UFC1a.shape)
print(UFC1b.shape)

# How much data remains for each class?
print(UFC1b.WEIGHT_CLASS.value_counts())

(6522, 53)
(3520, 53)
(6522, 53)
(3520, 53)
(4694, 52)
Lightweight          1074
Welterweight          975
Middleweight          748
Light Heavyweight     496
Featherweight         460
Heavyweight           411
Bantamweight          339
Flyweight             191
Name: WEIGHT_CLASS, dtype: int64


In [24]:
# Look at the correlation with the outcome separately for each weight class --
# a single overall matrix won't mean much, as fighters fight at different
# weight classes.

# Weight class        Upper weight limit
# Strawweight         115 lb (52.2 kg; 8.2 st)
# Flyweight           125 lb (56.7 kg; 8.9 st)
# Bantamweight        135 lb (61.2 kg; 9.6 st)
# Featherweight       145 lb (65.8 kg; 10.4 st)
# Lightweight         155 lb (70.3 kg; 11.1 st)
# Welterweight        170 lb (77.1 kg; 12.1 st)
# Middleweight        185 lb (83.9 kg; 13.2 st)
# Light Heavyweight   205 lb (93.0 kg; 14.6 st)
# Heavyweight         265 lb (120.2 kg; 18.9 st)
# Super Heavyweight   N/A


def outcome_corr(fights, weight_class, target='OUTCOME1'):
    """Correlation of every numeric column with `target` within one weight class.

    Parameters
    ----------
    fights : DataFrame with a WEIGHT_CLASS column and a `target` column.
    weight_class : value of WEIGHT_CLASS selecting the rows to use.
    target : name of the column whose correlations are returned.

    Returns
    -------
    Series indexed by column name: the `target` column of the correlation
    matrix computed on the selected rows.
    """
    in_class = fights[fights.WEIGHT_CLASS == weight_class]
    # Restrict to numeric/bool columns explicitly: newer pandas raises when
    # corr() meets string columns instead of silently skipping them.
    numeric = in_class.select_dtypes(include=['number', 'bool'])
    return numeric.corr()[target]


FLYCORR = outcome_corr(UFC1b, 'Flyweight')
BANTCORR = outcome_corr(UFC1b, 'Bantamweight')
FEATCORR = outcome_corr(UFC1b, 'Featherweight')
LIGHTCORR = outcome_corr(UFC1b, 'Lightweight')
WELTCORR = outcome_corr(UFC1b, 'Welterweight')
MIDDLECORR = outcome_corr(UFC1b, 'Middleweight')
LHCORR = outcome_corr(UFC1b, 'Light Heavyweight')
HEAVYCORR = outcome_corr(UFC1b, 'Heavyweight')

LIGHTCORR

FIGHT_ORDER              -0.003482
EVENT_YEAR               -0.004577
EVENT_MONTH               0.003071
EVENT_DAY                 0.000557
ATTENDANCE                0.008482
MID                       0.006118
ROUND                    -0.005023
FINISH_ROUND_MIN          0.005327
FINISH_ROUND_SEC         -0.009988
FINISH_ROUND_INSECONDS    0.003787
TOTAL_INSECONDS          -0.003223
FIGHTER_ID                0.066602
STR                       0.248722
TD                        0.254760
SUB                       0.120424
PASS                      0.328087
BIRTH_YEAR                0.116619
AGE_AT_FIGHT             -0.129337
HEIGHT_CM                 0.019949
WEIGHT_KG                 0.048809
REACH_INCH                0.025906
WIN_AT_FIGHT              0.034591
NOTWIN_AT_FIGHT          -0.016923
TOTAL_AT_FIGHT            0.016752
WINRATIO_AT_FIGHT         0.056973
AGE_DIFF                 -0.194598
HEIGHT_DIFF               0.029992
WEIGHT_DIFF               0.067754
REACH_DIFF          