In [1]:
#Import various useful libraries
import pandas as pd
import numpy as np

import datetime as dt
import datetime

from sqlalchemy import create_engine

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# SQLAlchemy URL of the SQLite database file; the path is relative to the notebook's working directory
sql_con = "sqlite:///../database/ufc_data.db"
# Engine handed to the pd.read_sql calls that load the cleaned tables
con = create_engine(sql_con)

In [3]:
# Load both cleaned tables in full; fighter_df is later joined onto the bouts through its fighter_name column
fighter_df = pd.read_sql("SELECT * FROM clean_fighter_data", con)
# bout_df supplies the results, date and fighter_1/fighter_2 columns that the feature engineering below is built on
bout_df = pd.read_sql("SELECT * FROM clean_bout_data", con)

In [4]:
#Add losses to the data for fighter_1
# Every 'win' bout is mirrored into a second row seen from the other corner, so that
# each fighter of a decided bout shows up once as fighter_1.
all_data = bout_df.copy()

# Keep only the wins and relabel them as losses for the mirrored view
all_data_losses = all_data[all_data.results=='win'].copy()
all_data_losses['results'] = 'loss'

# Swap EVERY fighter_1*/fighter_2* column pair, not just the two name columns.
# Swapping only the names would leave the winner's per-corner stats (strikes, takedowns,
# submissions, ...) attached to the loser, and the cumulative per-fighter features
# computed later would be credited to the wrong fighter.
corner_swap = {}
for col in all_data_losses.columns:
    if col.startswith('fighter_1'):
        partner = 'fighter_2' + col[len('fighter_1'):]
        if partner in all_data_losses.columns:  # only swap complete pairs
            corner_swap[col] = partner
            corner_swap[partner] = col
all_data_losses = all_data_losses.rename(columns=corner_swap)

# NOTE(review): rows whose result is not 'win' (draw / nc) are not mirrored, so the
# fighter listed as fighter_2 in those bouts never gets a fighter_1 row for them — confirm intended.

# Stable sort keeps the source order of bouts that share a date; the fresh index removes
# the duplicate labels that concatenating a frame with a slice of itself produces.
all_data_doubled = (
    pd.concat([all_data, all_data_losses])
    .sort_values(by='date', kind='mergesort')
    .reset_index(drop=True)
)

In [5]:
#Count the number of ufc wins, losses, draws and no-contests each fighter_1 had BEFORE each bout
result_cols = ['draw','loss','nc','win']        # alphabetical, i.e. the order pivot_table emits them
bout_keys = ['date','fighter_1','fighter_2']

# Long form: running count of each result type per fighter_1 (1-based, includes the current bout)
all_data_doubled['ufc_f1_wins_and_losses'] = all_data_doubled.groupby(by=['fighter_1','results']).event_name.cumcount() + 1
all_data_mini = all_data_doubled[bout_keys + ['results','ufc_f1_wins_and_losses']].copy() #Subset the data in preparation for pivotting

# Wide form: one column per result type; a bout only populates the column of its own result
all_data_pivot = all_data_mini.pivot_table(index=bout_keys, columns='results').reset_index()
all_data_pivot.columns = bout_keys + result_cols #Flatten the multi-indexed column names

# Carry each fighter's latest known count forward, then shift by one bout so a row only
# sees results from earlier bouts (no leakage of the current outcome).
# Columns are selected with a list: tuple-style selection on a groupby was deprecated and
# later removed from pandas; .ffill() replaces the deprecated fillna(method='ffill').
all_data_pivot[result_cols] = all_data_pivot.groupby(by='fighter_1')[result_cols].ffill()
all_data_pivot[result_cols] = all_data_pivot.groupby(by='fighter_1')[result_cols].shift()

all_data_pivot = all_data_pivot.fillna(0) #Anything still missing means "none so far"
all_data_doubled = all_data_doubled.merge(all_data_pivot, on=bout_keys) #Adds the draw, loss, nc and win columns
all_data_doubled = all_data_doubled.drop(columns=['ufc_f1_wins_and_losses']) #Drop the unneeded helper column

# Prefix the counters so they read ufc_draw, ufc_loss, ufc_nc, ufc_win
all_data_doubled = all_data_doubled.rename(columns={col: 'ufc_' + col for col in result_cols})

  # Remove the CWD from sys.path while we load stuff.


In [6]:
#Merge in the fighter df

# The fighter table's win/loss/draw totals get a max_ prefix before the join
fighter_df = fighter_df.rename(columns={'wins':'max_wins','losses':'max_losses','draws':'max_draws'})

# Fighter-table columns that receive the f1_ prefix once attached to fighter_1
fighter_columns = [
    'height', 'weight', 'reach', 'stance', 'dob',
    'strikes_landed_per_min', 'strike_accuracy', 'strikes_absorbed_per_min',
    'strike_defense', 'takedowns_per_15_min', 'takedown_accuracy',
    'takedown_defense', 'submission_attempts_per_15_min', 'max_wins',
    'max_losses', 'max_draws', 'total_fights', 'win_pct', 'loss_pct',
    'draw_pct',
]

# Join on the fighter's name, prefix the attribute columns in one pass, then drop the
# duplicated join key. The merge is an inner join (pandas default), so bouts whose
# fighter_1 has no row in fighter_df are dropped here.
all_data_doubled = (
    all_data_doubled
    .merge(fighter_df, left_on='fighter_1', right_on='fighter_name')
    .rename(columns={name: 'f1_' + name for name in fighter_columns})
    .drop(columns='fighter_name')
)

In [7]:
#Create total fights features

# Largest value each running UFC counter reaches for every fighter_1
ufc_counters = ['ufc_win','ufc_draw','ufc_loss','ufc_nc']
max_fights = (
    all_data_doubled[['fighter_1'] + ufc_counters]
    .groupby(by='fighter_1')
    .max()
    .add_prefix('f1_max_')   # -> f1_max_ufc_win, f1_max_ufc_draw, f1_max_ufc_loss, f1_max_ufc_nc
)
all_data_doubled = all_data_doubled.merge(max_fights,on='fighter_1',how='left')

# Make the fighter explicit in the running counters: ufc_win -> f1_ufc_win, ...
all_data_doubled = all_data_doubled.rename(columns={name: 'f1_' + name for name in ufc_counters})

ufc_cols = ['f1_ufc_win','f1_ufc_loss','f1_ufc_draw']
max_ufc_cols = ['f1_max_ufc_win', 'f1_max_ufc_loss', 'f1_max_ufc_draw']
max_all_cols = ['f1_max_wins', 'f1_max_losses', 'f1_max_draws']
all_cols = ['f1_all_win','f1_all_loss','f1_all_draw']

# Record entering the bout = fighter-table total - largest UFC counter + UFC counter so far.
# NOTE(review): the UFC counters were shifted by one bout when they were built, so their
# maximum excludes the fighter's most recent bout; if f1_max_* are full career totals the
# difference is off by one for that bout's result type — confirm.
for target, table_total, ufc_peak, ufc_so_far in zip(all_cols, max_all_cols, max_ufc_cols, ufc_cols):
    all_data_doubled[target] = all_data_doubled[table_total] - all_data_doubled[ufc_peak] + all_data_doubled[ufc_so_far]

# The helper totals are no longer needed once the f1_all_* columns exist
all_data_doubled = all_data_doubled.drop(columns=max_all_cols + list(max_fights.columns))

# Overwrites the f1_total_fights value that came from the fighter table
all_data_doubled['f1_total_fights'] = all_data_doubled['f1_all_win'] + all_data_doubled['f1_all_loss'] + all_data_doubled['f1_all_draw'] + all_data_doubled['f1_ufc_nc']

In [8]:
#Drop the leakage columns
# These fighter-table rate/percentage columns are removed from the feature set
leakage_cols = [
    'f1_strikes_landed_per_min',
    'f1_strike_accuracy',
    'f1_strikes_absorbed_per_min',
    'f1_strike_defense',
    'f1_takedowns_per_15_min',
    'f1_takedown_accuracy',
    'f1_takedown_defense',
    'f1_submission_attempts_per_15_min',
    'f1_win_pct',
    'f1_loss_pct',
    'f1_draw_pct',
]
all_data_doubled = all_data_doubled.drop(columns=leakage_cols)

In [9]:
#Create total UFC fights feature
# Sum of the four running UFC counters for fighter_1; added in this fixed order
ufc_record_cols = ['f1_ufc_draw', 'f1_ufc_loss', 'f1_ufc_nc', 'f1_ufc_win']
all_data_doubled['f1_ufc_total_fights'] = sum(all_data_doubled[col] for col in ufc_record_cols)

In [10]:
#Sum cumulative features
# NOTE(review): unlike the ufc_* counters, these running sums are not shifted, so each row's
# total includes the current bout — confirm that is intended before treating them as pre-fight features.

# Running totals per fighter_1
orig_f1_features = ['no_rounds','total_fight_time','fighter_1_strikes','fighter_1_td','fighter_1_sub']
cum_f1_target = ['f1_cum_rnd','f1_cum_ftime','f1_cum_strikes','f1_cum_td','f1_cum_sub']

for source_col, running_col in zip(orig_f1_features, cum_f1_target):
    all_data_doubled[running_col] = all_data_doubled.groupby(by=['fighter_1'])[source_col].cumsum()

# Running totals per fighter_2
orig_f2_features = ['no_rounds','total_fight_time','fighter_2_strikes','fighter_2_td','fighter_2_sub']
cum_f2_target = ['f2_cum_rnd','f2_cum_ftime','f2_cum_strikes','f2_cum_td','f2_cum_sub']

for source_col, running_col in zip(orig_f2_features, cum_f2_target):
    all_data_doubled[running_col] = all_data_doubled.groupby(by=['fighter_2'])[source_col].cumsum()

# Rates: cumulative count divided by cumulative fight time
# NOTE(review): a cumulative fight time of 0 yields inf/NaN here — confirm it cannot occur.
f1_per_min_orig = ['f1_cum_strikes','f1_cum_td','f1_cum_sub']
f1_per_min_target = ['f1_stpm','f1_tdpm','f1_subpm']

for total_col, rate_col in zip(f1_per_min_orig, f1_per_min_target):
    all_data_doubled[rate_col] = all_data_doubled[total_col] / all_data_doubled['f1_cum_ftime']

f2_per_min_orig = ['f2_cum_strikes','f2_cum_td','f2_cum_sub']
f2_per_min_target = ['f2_stpm','f2_tdpm','f2_subpm']

for total_col, rate_col in zip(f2_per_min_orig, f2_per_min_target):
    all_data_doubled[rate_col] = all_data_doubled[total_col] / all_data_doubled['f2_cum_ftime']

In [11]:
#Create a win streak feature for f1
# A streak is a run of consecutive wins by the same fighter_1; any non-win starts a new run.
f1_names = all_data_doubled['fighter_1']
won_bout = all_data_doubled.results == 'win'

# Count of non-wins so far (this bout included) labels the run every bout belongs to
win_run_label = (~won_bout).astype(int).groupby(f1_names).cumsum()
# Wins accumulated inside the current run, this bout included
wins_in_run = won_bout.astype(int).groupby([f1_names, win_run_label]).cumsum()
# Shift one bout per fighter so the value is the streak entering the bout (0 for the first bout)
all_data_doubled['f1_win_streak'] = wins_in_run.groupby(f1_names).shift(periods=1, fill_value=0)

In [12]:
#Create a loss streak feature for f1
# A streak is a run of consecutive losses by the same fighter_1; any non-loss starts a new run.
f1_names = all_data_doubled['fighter_1']
lost_bout = all_data_doubled.results == 'loss'

# Count of non-losses so far (this bout included) labels the run every bout belongs to
loss_run_label = (~lost_bout).astype(int).groupby(f1_names).cumsum()
# Losses accumulated inside the current run, this bout included
losses_in_run = lost_bout.astype(int).groupby([f1_names, loss_run_label]).cumsum()
# Shift one bout per fighter so the value is the streak entering the bout (0 for the first bout)
all_data_doubled['f1_loss_streak'] = losses_in_run.groupby(f1_names).shift(periods=1, fill_value=0)

In [13]:
#List the f1 features to duplicate as f2
# These names are passed to dup_f1f2, which adds a matching f2_* column for each of them.

features_to_duplicate = ['f1_ufc_draw', 'f1_ufc_loss', 'f1_ufc_nc',
       'f1_ufc_win', 'f1_height', 'f1_weight', 'f1_reach', 'f1_stance',
       'f1_dob', 'f1_total_fights', 'f1_all_win', 'f1_all_loss', 'f1_all_draw',
       'f1_ufc_total_fights', 'f1_win_streak','f1_loss_streak']

In [14]:
# Define the helper that attaches the opponent's f1_* features as f2_* columns

def dup_f1f2(features_to_duplicate, all_data_doubled):
    """Attach each bout's opponent features as ``f2_*`` columns.

    Every row describes a bout from ``fighter_1``'s point of view. The opponent's
    values live on the mirrored row of the same bout (same ``date``, with
    ``fighter_1`` and ``fighter_2`` exchanged). This looks that row up and copies
    its ``f1_*`` features across under the matching ``f2_*`` names.

    Joining on ``['fighter_1', 'date']`` alone would pair each row with itself,
    making every ``f2_*`` column a copy of the row's own ``f1_*`` value, and would
    multiply rows whenever a fighter has several bouts on one date.

    Parameters
    ----------
    features_to_duplicate : list of str
        ``f1_*`` column names to copy; "f1" in each name becomes "f2".
    all_data_doubled : pandas.DataFrame
        Bout table containing ``fighter_1``, ``fighter_2``, ``date`` and the
        listed feature columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows, in the same order, plus one ``f2_*``
        column per requested feature. Rows with no mirrored counterpart get NaN
        in the new columns.
    """
    bout_keys = ['fighter_1', 'fighter_2', 'date']

    # Take the requested features together with the keys that identify a bout
    opponent_view = all_data_doubled[bout_keys + list(features_to_duplicate)].copy()

    # Exchange the corners and relabel the features so the frame reads "as seen by the opponent"
    relabel = {'fighter_1': 'fighter_2', 'fighter_2': 'fighter_1'}
    relabel.update({feat: feat.replace("f1", "f2") for feat in features_to_duplicate})
    opponent_view = opponent_view.rename(columns=relabel)

    # One lookup row per bout key, so the merge can never duplicate rows of the left frame
    opponent_view = opponent_view.drop_duplicates(subset=bout_keys)

    # Left merge keeps every original row, including those without a mirrored counterpart
    return all_data_doubled.merge(opponent_view, on=bout_keys, how='left')

In [15]:
# Add an f2_* counterpart for every column named in features_to_duplicate
all_data_doubled = dup_f1f2(features_to_duplicate, all_data_doubled)

In [16]:
def fill_na_mean(df,gb_feat,replace):
    """Fill missing values of one column with the mean of the row's group.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute. The column is updated on this same object.
    gb_feat : str
        Column whose values define the groups.
    replace : str
        Column whose missing entries are filled with their group's mean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Entries stay missing when the group has no
        observed value for ``replace`` or the grouping value is itself missing.
    """
    # Mean of `replace` within each `gb_feat` group, broadcast back onto every row
    group_mean = df.groupby(by=gb_feat)[replace].transform('mean')

    # Only missing entries take the group mean; observed values are left untouched
    df[replace] = df[replace].fillna(group_mean)

    return df

In [None]:
,

In [17]:
#Replace missing reach, height, weight and dob values with the appropriate group means
# (group column, column to fill) pairs, applied in this order
mean_fill_plan = [
    ('weight_class', 'f1_reach'),
    ('weight_class', 'f2_reach'),
    ('weight_class', 'f1_height'),
    ('weight_class', 'f2_height'),
    ('weight_class', 'f1_weight'),
    ('weight_class', 'f2_weight'),
    # dob is filled from fighters with the same number of total fights
    ('f1_total_fights', 'f1_dob'),
    ('f2_total_fights', 'f2_dob'),
]

for group_col, target_col in mean_fill_plan:
    all_data_doubled = fill_na_mean(all_data_doubled, group_col, target_col)

In [32]:
# Same weight imputation as the f1_weight / f2_weight calls in the previous cell, run a second time
for weight_col in ('f1_weight', 'f2_weight'):
    all_data_doubled = fill_na_mean(all_data_doubled, 'weight_class', weight_col)

In [19]:
#List the columns used to create the f1 - f2 features: both fighter names plus the f1_* and f2_* features
# NOTE(review): f1_stance / f2_stance (and possibly the dob columns) look non-numeric — confirm how the difference step treats them
diff_feat = ['fighter_1','fighter_2',
        'f1_ufc_draw', 'f1_ufc_loss', 'f1_ufc_nc',
       'f1_ufc_win', 'f1_height', 'f1_weight', 'f1_reach', 'f1_stance',
       'f1_dob', 'f1_total_fights', 'f1_all_win', 'f1_all_loss', 'f1_all_draw',
       'f1_ufc_total_fights', 'f1_cum_rnd', 'f1_cum_ftime', 'f1_cum_strikes',
       'f1_cum_td', 'f1_cum_sub', 'f2_cum_rnd', 'f2_cum_ftime',
       'f2_cum_strikes', 'f2_cum_td', 'f2_cum_sub', 'f1_stpm', 'f1_tdpm',
       'f1_subpm', 'f2_stpm', 'f2_tdpm', 'f2_subpm', 'f1_win_streak',
       'f1_loss_streak', 'f2_ufc_draw', 'f2_ufc_loss', 'f2_ufc_nc',
       'f2_ufc_win', 'f2_height', 'f2_weight', 'f2_reach', 'f2_stance',
       'f2_dob', 'f2_total_fights', 'f2_all_win', 'f2_all_loss', 'f2_all_draw',
       'f2_ufc_total_fights', 'f2_win_streak', 'f2_loss_streak'
]