# Purpose
Include 3 fight averages

### Features
- Career Average Significant Strike Attempts per 15 Minutes
- 3 Fight Average Significant Strike Attempts per 15 Minutes
- Career Average Takedown Attempts per 15 Minutes
- 3 Fight Average Takedown Attempts per 15 Minutes
- Career Average Significant Strike Attempts per 15 Minutes Differentials
- 3 Fight Average Significant Strike Attempts per 15 Minutes Differentials
- Career Average Takedown Attempts per 15 Minutes Differentials
- 3 Fight Average Takedown Attempts per 15 Minutes Differentials

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from sqlalchemy import create_engine
from src import local
from src import functions

#### Load the data

In [2]:
# Database connection settings, read from the project's `src.local` module
# rather than being written into the notebook.
USER, PASS = local.user, local.password
HOST, PORT = local.host, local.port

# SQLAlchemy engine for the `match_finder` PostgreSQL database.
# NOTE(review): the URL is built by plain string interpolation, so a password
# containing characters such as '@' or '/' would presumably need to be
# percent-encoded first -- confirm against the stored credentials.
connection_url = f'postgresql://{USER}:{PASS}@{HOST}:{PORT}/match_finder'
engine = create_engine(connection_url)

#### Join the striking-position and grappling advanced statistics tables with the bouts and events tables to get the date of each bout

In [3]:
# Pull the per-round stats for every fighter in every bout, together with the
# date column picked up through the bouts -> events joins.
#   s_ss_a_p15m    = d_ss_a_p15m    + c_ss_a_p15m
#   s_ss_a_p15m_di = d_ss_a_p15m_di + c_ss_a_p15m_di
# The two advanced-stat tables are matched on (bout_id, fighter_id, round).
# Each key column is compared on its own instead of comparing CONCAT()-ed
# strings: concatenated keys can collide in general and keep the database
# from using indexes on the individual key columns.
# The CAST on `round` is kept from the original query so the comparison does
# not depend on both tables storing `round` with the same type.
query = """
SELECT striking_position_adv.bout_id, striking_position_adv.fighter_id, striking_position_adv.round, date,
(d_ss_a_p15m + c_ss_a_p15m) as s_ss_a_p15m, td_a_p15m,
(d_ss_a_p15m_di + c_ss_a_p15m_di) as s_ss_a_p15m_di, td_a_p15m_di
FROM striking_position_adv
JOIN grappling_adv
    ON striking_position_adv.bout_id = grappling_adv.bout_id
    AND striking_position_adv.fighter_id = grappling_adv.fighter_id
    AND CAST(striking_position_adv.round AS CHAR) = CAST(grappling_adv.round AS CHAR)
JOIN bouts ON bouts.id = striking_position_adv.bout_id
JOIN events ON events.id = bouts.event_id
"""

# Run the query through the engine and load the result into a DataFrame
data = pd.read_sql(query, engine)

In [4]:
# Preview the first ten per-round rows returned by the query
data.iloc[:10]

Unnamed: 0,bout_id,fighter_id,round,date,s_ss_a_p15m,td_a_p15m,s_ss_a_p15m_di,td_a_p15m_di
0,000da3152b7b5ab1,6da99156486ed6c2,1,"July 08, 2006",111.0,6.0,81.0,6.0
1,000da3152b7b5ab1,6da99156486ed6c2,2,"July 08, 2006",78.0,9.0,39.0,3.0
2,000da3152b7b5ab1,6da99156486ed6c2,3,"July 08, 2006",45.0,6.0,24.0,3.0
3,000da3152b7b5ab1,d1a1314976c50bef,1,"July 08, 2006",30.0,0.0,-81.0,-6.0
4,000da3152b7b5ab1,d1a1314976c50bef,2,"July 08, 2006",39.0,6.0,-39.0,-3.0
5,000da3152b7b5ab1,d1a1314976c50bef,3,"July 08, 2006",21.0,3.0,-24.0,-3.0
6,0019ec81fd706ade,326f94d6cfb1bf25,1,"October 18, 2019",96.0,0.0,-21.0,0.0
7,0019ec81fd706ade,326f94d6cfb1bf25,2,"October 18, 2019",105.0,0.0,12.0,0.0
8,0019ec81fd706ade,326f94d6cfb1bf25,3,"October 18, 2019",90.0,0.0,-15.0,0.0
9,0019ec81fd706ade,85073dbd1be65ed9,1,"October 18, 2019",117.0,0.0,21.0,0.0


In [5]:
# Summarise column dtypes and non-null counts of the query result
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25772 entries, 0 to 25771
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bout_id         25772 non-null  object 
 1   fighter_id      25772 non-null  object 
 2   round           25772 non-null  object 
 3   date            25772 non-null  object 
 4   s_ss_a_p15m     25772 non-null  float64
 5   td_a_p15m       25772 non-null  float64
 6   s_ss_a_p15m_di  25772 non-null  float64
 7   td_a_p15m_di    25772 non-null  float64
dtypes: float64(4), object(4)
memory usage: 1.6+ MB


In [6]:
# Parse the textual dates (e.g. "July 08, 2006") into pandas datetime values
data = data.assign(date=pd.to_datetime(data['date']))

## Create fighter-bout instance dataframe

A fighter-bout instance represents one fighter in one bout.
 - The same fighter has exactly one fighter-bout instance for every single bout he has been in. 
 - Every bout has exactly two fighter-bout instances, one for each fighter in the bout. 
  
In this case a fighter-bout instance is assigned a unique identifier made up of the bout_id concatenated with the fighter_id.

In [7]:
# Build the fighter-bout instance table from the per-round data. Judging by
# the table displayed below, the metric named here ends up in the helper's
# `target` column.
target_metric = 's_ss_a_p15m'
fighter_bout_inst = functions.create_fighter_bout_instance_table(data, target_metric)

In [8]:
# Display the fighter-bout instance table (pandas truncates the middle rows)
fighter_bout_inst

Unnamed: 0_level_0,bout_id,fighter_id,date,target
fighter_bout_inst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000da3152b7b5ab16da99156486ed6c2,000da3152b7b5ab1,6da99156486ed6c2,2006-07-08,78.000000
000da3152b7b5ab1d1a1314976c50bef,000da3152b7b5ab1,d1a1314976c50bef,2006-07-08,30.000000
0019ec81fd706ade326f94d6cfb1bf25,0019ec81fd706ade,326f94d6cfb1bf25,2019-10-18,97.000000
0019ec81fd706ade85073dbd1be65ed9,0019ec81fd706ade,85073dbd1be65ed9,2019-10-18,105.000000
0027e179b743c86c3aa794cbe1e3484b,0027e179b743c86c,3aa794cbe1e3484b,2015-03-14,38.250000
...,...,...,...,...
ffe629a5232a878bb361180739bed4b0,ffe629a5232a878b,b361180739bed4b0,2003-06-06,0.000000
ffea776913451b6d22a92d7f62195791,ffea776913451b6d,22a92d7f62195791,2015-02-28,171.974522
ffea776913451b6d75e5fec9f72910ef,ffea776913451b6d,75e5fec9f72910ef,2015-02-28,114.649682
fffa21388cdd78b75d7bdab5e03e3216,fffa21388cdd78b7,5d7bdab5e03e3216,2013-10-19,113.000000


## Calculate metrics

The metrics will be calculated from each fighter's entire career and from their last 3 fights, to give an idea of their current state.
Career averages are prefixed with 'ca_' and 3 fight averages with '3fa_'

In [9]:
# Per-round metrics to average for each fighter; the "_di" suffix marks the
# differential version of a metric.
list_of_metrics = [
    's_ss_a_p15m',     # significant strike attempts per 15 minutes
    'td_a_p15m',       # takedown attempts per 15 minutes
    's_ss_a_p15m_di',  # significant strike attempts per 15 minutes, differential
    'td_a_p15m_di',    # takedown attempts per 15 minutes, differential
]

#### How these metrics are calculated
The following cell iterates through each row in the fighter-bout instance table. It takes the fighter_id and the date from that row and calculates that fighter's metrics up until that date. This reflects the fact that our model will only have prior knowledge of the fighters when making its predictions.


This cell takes about 5-10 minutes to run.

In [10]:
def _career_and_recent(row, metric_name):
    """Call functions.calculate_metric_average for one fighter-bout row.

    The helper is given the metric name, the row's fighter_id and date, and
    the full per-round `data` frame. Element 0 of its result is stored below
    as the career average and element 1 as the 3-fight average.
    """
    return functions.calculate_metric_average(metric_name,
                                              row['fighter_id'],
                                              row['date'],
                                              data)


for metric in list_of_metrics:
    # Progress marker: the cell is slow, so print each metric as it starts
    print(metric+'\n')
    values = fighter_bout_inst.apply(_career_and_recent, axis=1, metric_name=metric)

    # Split the per-row result into the two new feature columns
    fighter_bout_inst[f'ca_{metric}'] = values.map(lambda pair: pair[0])
    fighter_bout_inst[f'3fa_{metric}'] = values.map(lambda pair: pair[1])

s_ss_a_p15m

td_a_p15m

s_ss_a_p15m_di

td_a_p15m_di



### Debut fights and inexperienced fighters
Many fights include fighters who have never fought in the UFC before, and some fighters do not have long enough records to calculate 3-fight averages. These fighters would have null values in their career and 3-fight averages, so I drop every bout that contains such a fighter (both fighters' rows) here.

In [14]:
# Flag fighter-bout rows with no 3-fight average for the first metric, then
# collect the ids of the bouts those rows belong to.
fighter_mask = fighter_bout_inst['3fa_s_ss_a_p15m'].isna()
debut_bouts = fighter_bout_inst.loc[fighter_mask, 'bout_id']

In [15]:
# Mark every row (either fighter) whose bout_id is among the flagged bouts,
# so that a bout is removed as a whole rather than leaving one fighter behind.
bout_mask = fighter_bout_inst.bout_id.isin(debut_bouts)
# `~` is the element-wise NOT for a boolean mask; unary minus on a boolean
# Series is rejected by current NumPy/pandas with a TypeError.
fighter_bout_inst = fighter_bout_inst[~bout_mask]

## Create the final dataframe

The current fighter-bout instance table has two rows for each fight (one row for each fighter). In order to create a table where each row represents one fight, I need to get both fighters onto the same row.

In [20]:
# Combine the two fighter-bout rows of each bout into one row; in the result
# displayed below, per-fighter columns carry the suffixes _0 and _1.
model_df = functions.merge_fighter_instances(fighter_bout_inst)

## Creating Combined Significant Strike Attempts Per 15 Minutes (c_s_ss_a_p15m)

The fighter-bout instance table holds the significant strike attempts for each fighter separately. To get the bout-level target, the two fighters' values need to be added together.

In [21]:
# Bout-level target: the sum of the two fighters' individual target values
combined_target = model_df['target_0'].add(model_df['target_1'])
model_df['c_s_ss_a_p15m'] = combined_target
model_df

Unnamed: 0,bout_id,fighter_id_0,date_0,target_0,ca_s_ss_a_p15m_0,3fa_s_ss_a_p15m_0,ca_td_a_p15m_0,3fa_td_a_p15m_0,ca_s_ss_a_p15m_di_0,3fa_s_ss_a_p15m_di_0,...,ca_s_ss_a_p15m_1,3fa_s_ss_a_p15m_1,ca_td_a_p15m_1,3fa_td_a_p15m_1,ca_s_ss_a_p15m_di_1,3fa_s_ss_a_p15m_di_1,ca_td_a_p15m_di_1,3fa_td_a_p15m_di_1,inst_id_1,c_s_ss_a_p15m
0,000da3152b7b5ab1,d1a1314976c50bef,2006-07-08,30.000000,40.846154,40.8462,2.587045,2.58704,-11.174089,-11.1741,...,77.963058,77.9631,12.964286,12.9643,5.577906,5.57791,9.941558,9.94156,000da3152b7b5ab16da99156486ed6c2,108.000000
1,0027e179b743c86c,91ea901c458e95dd,2015-03-14,29.125000,58.128358,44.8977,3.300000,3.85714,18.420896,18.6013,...,56.000000,56,0.000000,0,-89.000000,-89,-1.000000,-1,0027e179b743c86c3aa794cbe1e3484b,67.375000
2,002921976d27b7da,ebc1f40e00e0c481,2014-12-13,17.786561,110.627907,132.044,0.616720,0,-5.867031,-2.9005,...,80.736725,77.633,1.455984,2.02077,-53.833359,17.1986,-0.562281,0.22077,002921976d27b7dab4ad3a06ee4d660c,35.573123
3,002c1562708ac307,44470bfd9483c7ad,2014-05-24,160.975610,43.000000,43,4.000000,4,-19.000000,-19,...,189.508547,190.929,1.948718,0.857143,30.673382,39.0306,-1.503663,-3.58163,002c1562708ac30722a92d7f62195791,406.097561
4,002cb1bb411c5f60,d897897060f10a3a,2006-03-04,130.800000,150.972124,191.764,0.850746,0,89.987048,118.068,...,48.074099,38.9674,1.851064,0,-10.016784,15.7357,-0.933250,-5.01176,002cb1bb411c5f6022e47b53e4ceb27c,174.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3311,ff64fc34065565d0,6fb1ba67bef41b37,2015-05-30,85.000000,83.083333,92.3333,11.342105,12,11.828947,26,...,80.918964,50.6719,6.640719,8.23346,-2.015486,1.50551,1.670613,5.57812,ff64fc34065565d007225ba28ae309b6,164.857143
3312,ff872fa3e9ec32a9,b7d524c77c27389b,2008-06-07,50.000000,70.742909,74.1412,2.981765,3.54331,0.808173,-37.7997,...,148.356981,96.3333,1.615385,2.33333,73.870502,35.3333,-4.532799,-4.33333,ff872fa3e9ec32a99fe85152f351e737,91.000000
3313,ffd3e3d37cba32da,92a9aa9c93192871,2014-10-25,152.000000,125.173992,162.832,8.342624,6,51.198658,64.4622,...,188.460491,190.157,0.375000,0.428571,2.664204,7.11352,-2.250000,-2.57143,ffd3e3d37cba32da7413b80dbb0f8f9f,234.000000
3314,ffea776913451b6d,75e5fec9f72910ef,2015-02-28,114.649682,78.074522,94.3333,7.724068,7.66667,-13.552672,-34,...,157.058175,108.383,1.888696,1.79866,17.877106,-1.31731,-2.877713,-4.93879,ffea776913451b6d22a92d7f62195791,286.624204


#### drop unnecessary columns

In [22]:
# List all columns so the ones to keep can be chosen in the next cell
model_df.columns

Index(['bout_id', 'fighter_id_0', 'date_0', 'target_0', 'ca_s_ss_a_p15m_0',
       '3fa_s_ss_a_p15m_0', 'ca_td_a_p15m_0', '3fa_td_a_p15m_0',
       'ca_s_ss_a_p15m_di_0', '3fa_s_ss_a_p15m_di_0', 'ca_td_a_p15m_di_0',
       '3fa_td_a_p15m_di_0', 'inst_id_0', 'fighter_id_1', 'date_1', 'target_1',
       'ca_s_ss_a_p15m_1', '3fa_s_ss_a_p15m_1', 'ca_td_a_p15m_1',
       '3fa_td_a_p15m_1', 'ca_s_ss_a_p15m_di_1', '3fa_s_ss_a_p15m_di_1',
       'ca_td_a_p15m_di_1', '3fa_td_a_p15m_di_1', 'inst_id_1',
       'c_s_ss_a_p15m'],
      dtype='object')

In [23]:
# Keep the date, the identifiers, both fighters' engineered features and the
# combined target. The columns left out are target_0, target_1, date_1,
# inst_id_0 and inst_id_1.
final_columns = [
    'date_0',
    'bout_id',
    # fighter 0: career averages, then 3-fight averages
    'fighter_id_0',
    'ca_s_ss_a_p15m_0', 'ca_td_a_p15m_0', 'ca_s_ss_a_p15m_di_0', 'ca_td_a_p15m_di_0',
    '3fa_s_ss_a_p15m_0', '3fa_td_a_p15m_0', '3fa_s_ss_a_p15m_di_0', '3fa_td_a_p15m_di_0',
    # fighter 1: career averages, then 3-fight averages
    'fighter_id_1',
    'ca_s_ss_a_p15m_1', 'ca_td_a_p15m_1', 'ca_s_ss_a_p15m_di_1', 'ca_td_a_p15m_di_1',
    '3fa_s_ss_a_p15m_1', '3fa_td_a_p15m_1', '3fa_s_ss_a_p15m_di_1', '3fa_td_a_p15m_di_1',
    # target
    'c_s_ss_a_p15m',
]
model_df = model_df.loc[:, final_columns]

In [24]:
# Display the final table after column selection (pandas truncates the view)
model_df

Unnamed: 0,date_0,bout_id,fighter_id_0,ca_s_ss_a_p15m_0,ca_td_a_p15m_0,ca_s_ss_a_p15m_di_0,ca_td_a_p15m_di_0,3fa_s_ss_a_p15m_0,3fa_td_a_p15m_0,3fa_s_ss_a_p15m_di_0,...,fighter_id_1,ca_s_ss_a_p15m_1,ca_td_a_p15m_1,ca_s_ss_a_p15m_di_1,ca_td_a_p15m_di_1,3fa_s_ss_a_p15m_1,3fa_td_a_p15m_1,3fa_s_ss_a_p15m_di_1,3fa_td_a_p15m_di_1,c_s_ss_a_p15m
0,2006-07-08,000da3152b7b5ab1,d1a1314976c50bef,40.846154,2.587045,-11.174089,-5.681781,40.8462,2.58704,-11.1741,...,6da99156486ed6c2,77.963058,12.964286,5.577906,9.941558,77.9631,12.9643,5.57791,9.94156,108.000000
1,2015-03-14,0027e179b743c86c,91ea901c458e95dd,58.128358,3.300000,18.420896,1.500000,44.8977,3.85714,18.6013,...,3aa794cbe1e3484b,56.000000,0.000000,-89.000000,-1.000000,56,0,-89,-1,67.375000
2,2014-12-13,002921976d27b7da,ebc1f40e00e0c481,110.627907,0.616720,-5.867031,-0.747690,132.044,0,-2.9005,...,b4ad3a06ee4d660c,80.736725,1.455984,-53.833359,-0.562281,77.633,2.02077,17.1986,0.22077,35.573123
3,2014-05-24,002c1562708ac307,44470bfd9483c7ad,43.000000,4.000000,-19.000000,-5.000000,43,4,-19,...,22a92d7f62195791,189.508547,1.948718,30.673382,-1.503663,190.929,0.857143,39.0306,-3.58163,406.097561
4,2006-03-04,002cb1bb411c5f60,d897897060f10a3a,150.972124,0.850746,89.987048,-1.923989,191.764,0,118.068,...,22e47b53e4ceb27c,48.074099,1.851064,-10.016784,-0.933250,38.9674,0,15.7357,-5.01176,174.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3311,2015-05-30,ff64fc34065565d0,6fb1ba67bef41b37,83.083333,11.342105,11.828947,6.388158,92.3333,12,26,...,07225ba28ae309b6,80.918964,6.640719,-2.015486,1.670613,50.6719,8.23346,1.50551,5.57812,164.857143
3312,2008-06-07,ff872fa3e9ec32a9,b7d524c77c27389b,70.742909,2.981765,0.808173,1.968778,74.1412,3.54331,-37.7997,...,9fe85152f351e737,148.356981,1.615385,73.870502,-4.532799,96.3333,2.33333,35.3333,-4.33333,91.000000
3313,2014-10-25,ffd3e3d37cba32da,92a9aa9c93192871,125.173992,8.342624,51.198658,7.797170,162.832,6,64.4622,...,7413b80dbb0f8f9f,188.460491,0.375000,2.664204,-2.250000,190.157,0.428571,7.11352,-2.57143,234.000000
3314,2015-02-28,ffea776913451b6d,75e5fec9f72910ef,78.074522,7.724068,-13.552672,3.802460,94.3333,7.66667,-34,...,22a92d7f62195791,157.058175,1.888696,17.877106,-2.877713,108.383,1.79866,-1.31731,-4.93879,286.624204


In [27]:
# Write the final table to CSV; the row index is not written to the file
output_path = '../../data/modelling_data/model_4_data.csv'
model_df.to_csv(output_path, index=False)