# Purpose
Create a dataframe ready to be used for modelling. To do this, I calculate the career average of each fighter's stats leading up to the bout being predicted.

### Features
- Career Average Standing Significant Strike Attempts per 15 Minutes

### Target
- Combined Standing Significant Strike Attempts per 15 Minutes

This target was chosen because it excludes strikes thrown on the ground, which are the types of strikes only thrown by grapplers. By including clinch strikes, we enable our feature to include fighters who use dirty boxing and Muay Thai techniques.

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
from urllib.parse import quote
# Resolve the directory two levels above the current working directory and
# make sure it is on the import path, so the `src` package imported below
# can be found. The membership check keeps re-runs from adding duplicates.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
already_importable = any(entry == module_path for entry in sys.path)
if not already_importable:
    sys.path.append(module_path)

import pandas as pd
from sqlalchemy import create_engine
from src import local
from src import functions

#### Load the data

In [2]:
# Set up tables
# Credentials are read from the project's `local` module rather than being
# written into the notebook.
USER = local.user
PASS = local.password
HOST = local.host
PORT = local.port

# Create the engine.
# The user name and password are percent-encoded before being placed in the
# URL: a raw '@', ':' or '/' inside a credential would otherwise be parsed as
# a URL delimiter and yield a wrong or invalid connection URL. `safe=''`
# makes sure '/' is encoded too; credentials without special characters are
# passed through unchanged.
engine = create_engine(
    f"postgresql://{quote(str(USER), safe='')}:{quote(str(PASS), safe='')}"
    f"@{HOST}:{PORT}/match_finder"
)

#### Join the striking_position_adv table with the bouts and events tables to get the date of each bout

In [3]:
# Select bout_id, fighter_id, round and date for every row of
# striking_position_adv, joining through bouts to events so each row carries
# a `date` column.
# s_ss_a_p15m is computed in SQL as d_ss_a_p15m + c_ss_a_p15m
# (presumably distance + clinch significant strike attempts per 15 minutes;
# TODO confirm against the table schema).
query = """
SELECT striking_position_adv.bout_id, striking_position_adv.fighter_id, striking_position_adv.round, date, 
(d_ss_a_p15m + c_ss_a_p15m) as s_ss_a_p15m
FROM striking_position_adv
JOIN bouts ON bouts.id = striking_position_adv.bout_id
JOIN events ON events.id = bouts.event_id
"""

# Run the query through the engine and load the result into a DataFrame.
data = pd.read_sql(query, engine)

In [4]:
data.head(10)

Unnamed: 0,bout_id,fighter_id,round,date,s_ss_a_p15m
0,11f715fa5e825e51,e1147d3d2dabe1ce,1,"July 25, 2020",90.0
1,11f715fa5e825e51,e1147d3d2dabe1ce,2,"July 25, 2020",69.0
2,11f715fa5e825e51,e1147d3d2dabe1ce,3,"July 25, 2020",96.0
3,11f715fa5e825e51,e1147d3d2dabe1ce,4,"July 25, 2020",102.0
4,11f715fa5e825e51,e1147d3d2dabe1ce,5,"July 25, 2020",93.0
5,a3820f1eb82f2ba6,73e09f837f3b5ecc,1,"July 25, 2020",90.0
6,a3820f1eb82f2ba6,73e09f837f3b5ecc,2,"July 25, 2020",105.0
7,a3820f1eb82f2ba6,73e09f837f3b5ecc,3,"July 25, 2020",93.0
8,f786136e55007e17,492b202d2064e7a9,1,"July 25, 2020",24.0
9,5632f2364016e879,d910665038efc639,1,"July 25, 2020",0.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25772 entries, 0 to 25771
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   bout_id      25772 non-null  object 
 1   fighter_id   25772 non-null  object 
 2   round        25772 non-null  object 
 3   date         25772 non-null  object 
 4   s_ss_a_p15m  25772 non-null  float64
dtypes: float64(1), object(4)
memory usage: 1006.8+ KB


In [6]:
# Replace the string `date` column with parsed datetimes so bouts can be
# compared chronologically.
parsed_dates = pd.to_datetime(data['date'])
data = data.assign(date=parsed_dates)

## Create fighter-bout instance dataframe

A fighter-bout instance represents one fighter in one bout.
 - The same fighter has exactly one fighter-bout instance for every single bout he has been in. 
 - Every bout has exactly two fighter-bout instances, one for each fighter in the bout. 
  
In this case a fighter-bout instance is assigned a unique identifier made up of the bout_id concatenated with the fighter_id.

In [7]:
# Build the fighter-bout instance table from the round-level data. The
# 's_ss_a_p15m' column is passed as the metric; in the displayed result it
# appears as the `target` column (the aggregation itself lives in
# src.functions and is not shown in this notebook).
fighter_bout_inst = functions.create_fighter_bout_instance_table(data, 's_ss_a_p15m')

In [8]:
fighter_bout_inst

Unnamed: 0_level_0,bout_id,fighter_id,date,target
fighter_bout_inst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
000da3152b7b5ab16da99156486ed6c2,000da3152b7b5ab1,6da99156486ed6c2,2006-07-08,78.000000
000da3152b7b5ab1d1a1314976c50bef,000da3152b7b5ab1,d1a1314976c50bef,2006-07-08,30.000000
0019ec81fd706ade326f94d6cfb1bf25,0019ec81fd706ade,326f94d6cfb1bf25,2019-10-18,97.000000
0019ec81fd706ade85073dbd1be65ed9,0019ec81fd706ade,85073dbd1be65ed9,2019-10-18,105.000000
0027e179b743c86c3aa794cbe1e3484b,0027e179b743c86c,3aa794cbe1e3484b,2015-03-14,38.250000
...,...,...,...,...
ffe629a5232a878bb361180739bed4b0,ffe629a5232a878b,b361180739bed4b0,2003-06-06,0.000000
ffea776913451b6d22a92d7f62195791,ffea776913451b6d,22a92d7f62195791,2015-02-28,171.974522
ffea776913451b6d75e5fec9f72910ef,ffea776913451b6d,75e5fec9f72910ef,2015-02-28,114.649682
fffa21388cdd78b75d7bdab5e03e3216,fffa21388cdd78b7,5d7bdab5e03e3216,2013-10-19,113.000000


## Calculate metrics

The metrics will be calculated from the fighter's entire career and from their last 3 fights, to give an idea of their current state.
Career averages are prefixed with 'ca_' and 3-fight averages with '3fa_'. Only the career average ('ca_') is stored in this notebook.

In [9]:
# Names of the metric columns in `data` for which prior-career averages are
# computed in the loop below.
list_of_metrics = ['s_ss_a_p15m']

#### How these metrics are calculated
The following cell iterates through each row in the fighter-bout instance table. It takes the fighter_id and the date from that row and calculates the fighter's metrics up until that date. This reflects the fact that our model will only have prior knowledge of the fighters when making its predictions.


This cell takes about 5-10 minutes to load.

In [10]:
for metric in list_of_metrics:
    print(f'{metric}\n')

    def _prior_averages(row):
        # Averages of `metric` for this row's fighter, computed from `data`
        # using the row's bout date as the cut-off.
        return functions.calculate_metric_average(metric,
                                                  row['fighter_id'],
                                                  row['date'],
                                                  data)

    # One result per fighter-bout instance (row-wise apply).
    values = fighter_bout_inst.apply(_prior_averages, axis=1)

    # Element 0 of each result is stored as the career average.
    fighter_bout_inst['ca_'+metric] = values.map(lambda result: result[0])

s_ss_a_p15m



### Debut fights and inexperienced fighters
Many fights include fighters who have never fought in the UFC before, and some do not have long enough records to calculate 3-fight averages. These fighters would have null values in their career and 3-fight averages, so I drop both rows of every bout in which either fighter has a null career average.

In [11]:
# Flag the fighter-bout instances that have no career average, then collect
# the bout ids those instances belong to.
fighter_mask = fighter_bout_inst['ca_s_ss_a_p15m'].isna()
debut_bouts = fighter_bout_inst.loc[fighter_mask, 'bout_id']

In [12]:
# True for every instance (both fighters) of a bout that contains at least
# one fighter without a career average.
bout_mask = fighter_bout_inst.bout_id.isin(debut_bouts)

# Keep only the remaining bouts. `~` is the boolean NOT operator pandas
# documents for masks (unary `-` is not meant for booleans). The explicit
# copy makes the filtered frame independent of the original, so columns can
# be added to it later without triggering SettingWithCopyWarning.
fighter_bout_inst = fighter_bout_inst[~bout_mask].copy()

## Create the final dataframe

The current fighter-bout instance table has two rows for each fight (one row for each fighter). In order to create a table where each row represents one fight, I need to get both fighters onto the same row.

In [13]:
# Combine the two fighter-bout rows of each bout into a single row. In the
# displayed result the per-fighter columns carry `_0` / `_1` suffixes (the
# pairing logic lives in src.functions and is not shown in this notebook).
model_df = functions.merge_fighter_instances(fighter_bout_inst)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  instances_df['inst_id'] = instances_df['bout_id'] + instances_df['fighter_id']


## Creating Combined Standing Significant Strike Attempts per 15 Minutes (c_s_ss_a_p15m)

In [14]:
# The modelling target is the sum of both fighters' `target` values for the
# bout; it is appended as a new column.
combined_target = model_df['target_0'].add(model_df['target_1'])
model_df = model_df.assign(c_s_ss_a_p15m=combined_target)
model_df

Unnamed: 0,bout_id,fighter_id_0,date_0,target_0,ca_s_ss_a_p15m_0,inst_id_0,fighter_id_1,date_1,target_1,ca_s_ss_a_p15m_1,inst_id_1,c_s_ss_a_p15m
0,000da3152b7b5ab1,d1a1314976c50bef,2006-07-08,30.000000,40.846154,000da3152b7b5ab1d1a1314976c50bef,6da99156486ed6c2,2006-07-08,78.000000,77.963058,000da3152b7b5ab16da99156486ed6c2,108.000000
1,0027e179b743c86c,91ea901c458e95dd,2015-03-14,29.125000,58.128358,0027e179b743c86c91ea901c458e95dd,3aa794cbe1e3484b,2015-03-14,38.250000,56.000000,0027e179b743c86c3aa794cbe1e3484b,67.375000
2,002921976d27b7da,ebc1f40e00e0c481,2014-12-13,17.786561,110.627907,002921976d27b7daebc1f40e00e0c481,b4ad3a06ee4d660c,2014-12-13,17.786561,80.736725,002921976d27b7dab4ad3a06ee4d660c,35.573123
3,002c1562708ac307,44470bfd9483c7ad,2014-05-24,160.975610,43.000000,002c1562708ac30744470bfd9483c7ad,22a92d7f62195791,2014-05-24,245.121951,189.508547,002c1562708ac30722a92d7f62195791,406.097561
4,002cb1bb411c5f60,d897897060f10a3a,2006-03-04,130.800000,150.972124,002cb1bb411c5f60d897897060f10a3a,22e47b53e4ceb27c,2006-03-04,43.200000,48.074099,002cb1bb411c5f6022e47b53e4ceb27c,174.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
3966,ff872fa3e9ec32a9,b7d524c77c27389b,2008-06-07,50.000000,70.742909,ff872fa3e9ec32a9b7d524c77c27389b,9fe85152f351e737,2008-06-07,41.000000,148.356981,ff872fa3e9ec32a99fe85152f351e737,91.000000
3967,ffbc12e4f821ec68,7a703c565ccaa18f,2014-02-15,38.000000,116.911339,ffbc12e4f821ec687a703c565ccaa18f,3591d0d5d382a381,2014-02-15,71.000000,47.812834,ffbc12e4f821ec683591d0d5d382a381,109.000000
3968,ffd3e3d37cba32da,92a9aa9c93192871,2014-10-25,152.000000,125.173992,ffd3e3d37cba32da92a9aa9c93192871,7413b80dbb0f8f9f,2014-10-25,82.000000,188.460491,ffd3e3d37cba32da7413b80dbb0f8f9f,234.000000
3969,ffea776913451b6d,75e5fec9f72910ef,2015-02-28,114.649682,78.074522,ffea776913451b6d75e5fec9f72910ef,22a92d7f62195791,2015-02-28,171.974522,157.058175,ffea776913451b6d22a92d7f62195791,286.624204


#### drop unnecessary columns

In [15]:
model_df.columns

Index(['bout_id', 'fighter_id_0', 'date_0', 'target_0', 'ca_s_ss_a_p15m_0',
       'inst_id_0', 'fighter_id_1', 'date_1', 'target_1', 'ca_s_ss_a_p15m_1',
       'inst_id_1', 'c_s_ss_a_p15m'],
      dtype='object')

In [16]:
# Columns kept for modelling: the bout date and id, each fighter's id and
# career average, and the combined target. Everything else is dropped.
columns_to_keep = [
    'date_0',
    'bout_id',
    'fighter_id_0',
    'ca_s_ss_a_p15m_0',
    'fighter_id_1',
    'ca_s_ss_a_p15m_1',
    'c_s_ss_a_p15m',
]
model_df = model_df.loc[:, columns_to_keep]

In [17]:
model_df

Unnamed: 0,date_0,bout_id,fighter_id_0,ca_s_ss_a_p15m_0,fighter_id_1,ca_s_ss_a_p15m_1,c_s_ss_a_p15m
0,2006-07-08,000da3152b7b5ab1,d1a1314976c50bef,40.846154,6da99156486ed6c2,77.963058,108.000000
1,2015-03-14,0027e179b743c86c,91ea901c458e95dd,58.128358,3aa794cbe1e3484b,56.000000,67.375000
2,2014-12-13,002921976d27b7da,ebc1f40e00e0c481,110.627907,b4ad3a06ee4d660c,80.736725,35.573123
3,2014-05-24,002c1562708ac307,44470bfd9483c7ad,43.000000,22a92d7f62195791,189.508547,406.097561
4,2006-03-04,002cb1bb411c5f60,d897897060f10a3a,150.972124,22e47b53e4ceb27c,48.074099,174.000000
...,...,...,...,...,...,...,...
3966,2008-06-07,ff872fa3e9ec32a9,b7d524c77c27389b,70.742909,9fe85152f351e737,148.356981,91.000000
3967,2014-02-15,ffbc12e4f821ec68,7a703c565ccaa18f,116.911339,3591d0d5d382a381,47.812834,109.000000
3968,2014-10-25,ffd3e3d37cba32da,92a9aa9c93192871,125.173992,7413b80dbb0f8f9f,188.460491,234.000000
3969,2015-02-28,ffea776913451b6d,75e5fec9f72910ef,78.074522,22a92d7f62195791,157.058175,286.624204


In [18]:
# Persist the modelling table as CSV. The path is relative to the notebook's
# working directory. No `index` argument is passed, so pandas' default
# applies and the row index is written as the first, unnamed column.
model_df.to_csv('../../data/modelling_data/model_1_data.csv')