In [1]:
%load_ext autoreload
%autoreload 2
# Add custom functions to path
import os
import sys
# Resolve the directory two levels above the current working directory so the
# `src` package imported below can be found.
# NOTE(review): this is relative to wherever the kernel was started - confirm
# the notebook is always run from its own folder.
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
# Guard so that re-running the cell does not append the same entry twice.
if module_path not in sys.path:
    sys.path.append(module_path)

from src import functions
import pandas as pd
from sqlalchemy import create_engine

# Credentials
from src import local
# Connection settings are read from the project-local `src.local` module rather
# than being written into the notebook.
USER = local.user
PASS = local.password
HOST = local.host
PORT = local.port

# Create the engine for the `match_finder` PostgreSQL database.
# NOTE(review): the credentials are interpolated into the URL verbatim; a
# password containing characters such as '@' or '/' would need URL-escaping
# first - confirm this cannot happen with the values in `local`.
engine = create_engine(f'postgresql://{USER}:{PASS}@{HOST}:{PORT}/match_finder')

# Every column of `rounds`, plus date, time, final_round and timeformat, which
# are pulled in through the joins to `bouts` and `events`.
query = """
SELECT rounds.*, date, time, 
final_round, timeformat
FROM rounds
JOIN bouts ON bouts.id = rounds.bout_id
JOIN events ON events.id = bouts.event_id
"""

# Load the joined result into a DataFrame used by the rest of the notebook.
rounds = pd.read_sql(query, engine)

In [2]:
rounds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26214 entries, 0 to 26213
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   fighter      26214 non-null  object
 1   kd           26214 non-null  int64 
 2   sba          26214 non-null  int64 
 3   ps           26214 non-null  int64 
 4   rev          26214 non-null  int64 
 5   ss_s         26214 non-null  int64 
 6   ss_a         26214 non-null  int64 
 7   h_ss_s       26214 non-null  int64 
 8   h_ss_a       26214 non-null  int64 
 9   b_ss_s       26214 non-null  int64 
 10  b_ss_a       26214 non-null  int64 
 11  l_ss_s       26214 non-null  int64 
 12  l_ss_a       26214 non-null  int64 
 13  d_ss_s       26214 non-null  int64 
 14  d_ss_a       26214 non-null  int64 
 15  c_ss_s       26214 non-null  int64 
 16  c_ss_a       26214 non-null  int64 
 17  g_ss_s       26214 non-null  int64 
 18  g_ss_a       26214 non-null  int64 
 19  ts_s         26214 non-nu

In [3]:
rounds.columns

Index(['fighter', 'kd', 'sba', 'ps', 'rev', 'ss_s', 'ss_a', 'h_ss_s', 'h_ss_a',
       'b_ss_s', 'b_ss_a', 'l_ss_s', 'l_ss_a', 'd_ss_s', 'd_ss_a', 'c_ss_s',
       'c_ss_a', 'g_ss_s', 'g_ss_a', 'ts_s', 'ts_a', 'td_s', 'td_a', 'outcome',
       'bout_id', 'fighter_id', 'round', 'date', 'time', 'final_round',
       'timeformat'],
      dtype='object')

### Differential Calculation
A differential is the difference between the number of times a fighter executes a technique and the number of times his opponent does.
To calculate this, we transform our original table so that each row has a fighter AND his opponent.

In [4]:
# Put a fighter and his opponent on the same row; the two sides are
# distinguished by the _0 and _1 column suffixes.
# NOTE(review): with rounds=False the preview below shows one fighter row
# matched to each of the opponent's rounds (round_1 = 3, 2, 1) - confirm this
# is intended before reading the result as a per-round differential.
dif_calc_df = functions.merge_fighter_instances(rounds, rounds=False)
dif_calc_df.head()

Unnamed: 0,fighter_0,kd_0,sba_0,ps_0,rev_0,ss_s_0,ss_a_0,h_ss_s_0,h_ss_a_0,b_ss_s_0,...,td_s_1,td_a_1,outcome_1,fighter_id_1,round_1,date_1,time_1,final_round_1,timeformat_1,inst_id_1
0,Josh Neer,0,1,0,0,5,9,5,9,0,...,2,2,W,6da99156486ed6c2,3,"July 08, 2006",5:00,3,3 Rnd (5-5-5),000da3152b7b5ab16da99156486ed6c2
1,Josh Neer,0,1,0,0,5,9,5,9,0,...,1,3,W,6da99156486ed6c2,2,"July 08, 2006",5:00,3,3 Rnd (5-5-5),000da3152b7b5ab16da99156486ed6c2
2,Josh Neer,0,1,0,0,5,9,5,9,0,...,1,2,W,6da99156486ed6c2,1,"July 08, 2006",5:00,3,3 Rnd (5-5-5),000da3152b7b5ab16da99156486ed6c2
3,Josh Neer,0,0,0,0,7,19,7,18,0,...,2,2,W,6da99156486ed6c2,3,"July 08, 2006",5:00,3,3 Rnd (5-5-5),000da3152b7b5ab16da99156486ed6c2
4,Josh Neer,0,0,0,0,7,19,7,18,0,...,1,3,W,6da99156486ed6c2,2,"July 08, 2006",5:00,3,3 Rnd (5-5-5),000da3152b7b5ab16da99156486ed6c2


##### In this dataframe, each row has two fighters. The first is suffixed with a _0 and the second with a _1.
##### In order to get the significant strike attempts per round differential (ss_a_pr_di), all we need to do 
##### is subtract ss_a_1 from ss_a_0

In [5]:
# Positive values mean fighter _0 attempted more significant strikes than fighter _1.
dif_calc_df['ss_a_pr_di'] = dif_calc_df['ss_a_0'].sub(dif_calc_df['ss_a_1'])

In [6]:
# Show the last five rows. `.tail()` selects by position, so it does not rely
# on the index being a default 0..n-1 range the way a label slice starting at
# len(df) - 5 does.
dif_calc_df[['ss_a_pr_di', 'ss_a_0', 'ss_a_1']].tail(5)

Unnamed: 0,ss_a_pr_di,ss_a_0,ss_a_1
35998,-15,26,41
35999,-33,26,59
36000,18,36,18
36001,-5,36,41
36002,-23,36,59


##### In the first row above, the first fighter attempted 26 significant strikes while his opponent attempted 41, resulting in a differential of -15

#### Calculating for all stats
##### I wrote a function that calculates the differentials for
##### per round counts and per 15 minute rates

In [7]:
# Compute the advanced metrics for every base stat. The groups follow the
# table split used later in this notebook; `_a` columns are attempts, and `_s`
# is presumably the successful/landed count - TODO confirm.
advanced_stats = functions.calculate_stats_alt(
    rounds,
    [
        # knockdowns, submission attempts, guard passes, reversals
        'kd', 'sba', 'ps', 'rev',
        # significant strikes overall
        'ss_s', 'ss_a',
        # significant strikes by target: head, body, leg
        'h_ss_s', 'h_ss_a', 'b_ss_s', 'b_ss_a', 'l_ss_s', 'l_ss_a',
        # significant strikes by position: distance, clinch, ground
        'd_ss_s', 'd_ss_a', 'c_ss_s', 'c_ss_a', 'g_ss_s', 'g_ss_a',
        # total strikes and takedowns
        'ts_s', 'ts_a', 'td_s', 'td_a',
    ],
)

calculating minutes



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['round_id'] = df['bout_id']+df['round']


combining rows

calculating differentials for kd

calculating differentials for sba

calculating differentials for ps

calculating differentials for rev

calculating differentials for ss_s

calculating differentials for ss_a

calculating differentials for h_ss_s

calculating differentials for h_ss_a

calculating differentials for b_ss_s

calculating differentials for b_ss_a

calculating differentials for l_ss_s

calculating differentials for l_ss_a

calculating differentials for d_ss_s

calculating differentials for d_ss_a

calculating differentials for c_ss_s

calculating differentials for c_ss_a

calculating differentials for g_ss_s

calculating differentials for g_ss_a

calculating differentials for ts_s

calculating differentials for ts_a

calculating differentials for td_s

calculating differentials for td_a

cleaning df



In [8]:
advanced_stats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25772 entries, 0 to 12885
Data columns (total 100 columns):
 #   Column          Non-Null Count  Dtype          
---  ------          --------------  -----          
 0   fighter         25772 non-null  object         
 1   kd              25772 non-null  int64          
 2   sba             25772 non-null  int64          
 3   ps              25772 non-null  int64          
 4   rev             25772 non-null  int64          
 5   ss_s            25772 non-null  int64          
 6   ss_a            25772 non-null  int64          
 7   h_ss_s          25772 non-null  int64          
 8   h_ss_a          25772 non-null  int64          
 9   b_ss_s          25772 non-null  int64          
 10  b_ss_a          25772 non-null  int64          
 11  l_ss_s          25772 non-null  int64          
 12  l_ss_a          25772 non-null  int64          
 13  d_ss_s          25772 non-null  int64          
 14  d_ss_a          25772 non-null  int64

### Splitting the stats tables
Because this dataframe has 100 columns, I'm going to split it up into the following advanced stats tables:
- Grappling, includes
    - Takedowns
    - Submission Attempts
    - Reversals
    - Guard Passes
- Strikes
    - Significant Strikes
    - Total Strikes
    - Knockdowns
- Strikes by Target
    - Head
    - Leg
    - Body
- Strikes by Position
    - Distance
    - Ground 
    - Clinch

#### Functions

In [9]:
def get_metric_columns(stat, metrics_list=None):
    """
    Build the column names that belong to a single stat.

    input: stat (string) - the abbreviation of a stat from the dataframe
           metrics_list (list, optional) - suffixes to append to the stat;
           when omitted or empty, the default suffixes listed below are used
    output: list starting with the stat itself, followed by one column name
            per suffix. Default suffixes:
            - per round differential (_pr_di)
            - per 15 minute rate (_p15m)
            - per 15 minute differential (_p15m_di)
    """
    # Any falsy value (None, empty list) falls back to the defaults.
    suffixes = metrics_list if metrics_list else ['_pr_di', '_p15m', '_p15m_di']

    # The raw stat comes first, then the stat combined with each suffix.
    return [stat] + [stat + suffix for suffix in suffixes]

In [10]:
def get_advanced_stats_table_columns(table_stats, metrics_list=None):
    """
    Assemble the full column list for one advanced stats table.

    input: table_stats (list) - list of stats that will go in the table
           metrics_list (list, optional) - passed through to get_metric_columns
    output: list of column names - the identifying columns first, then each
            stat followed by its metric columns, in the order given
    """
    id_columns = ['bout_id', 'fighter_id', 'round', 'fighter']
    stat_columns = [
        name
        for stat in table_stats
        for name in get_metric_columns(stat, metrics_list=metrics_list)
    ]
    return id_columns + stat_columns

#### Grappling

In [11]:
get_advanced_stats_table_columns(['td_s', 'td_a', 'sba', 'rev', 'ps'])

['bout_id',
 'fighter_id',
 'round',
 'fighter',
 'td_s',
 'td_s_pr_di',
 'td_s_p15m',
 'td_s_p15m_di',
 'td_a',
 'td_a_pr_di',
 'td_a_p15m',
 'td_a_p15m_di',
 'sba',
 'sba_pr_di',
 'sba_p15m',
 'sba_p15m_di',
 'rev',
 'rev_pr_di',
 'rev_p15m',
 'rev_p15m_di',
 'ps',
 'ps_pr_di',
 'ps_p15m',
 'ps_p15m_di']

In [12]:
# Stats that make up the grappling table (per the section list: takedowns,
# submission attempts, reversals, guard passes).
grappling_stats = ['td_s', 'td_a', 'sba', 'rev', 'ps']
grappling_columns = get_advanced_stats_table_columns(grappling_stats)

# Keep only the identifying columns plus the grappling metrics.
grappling = advanced_stats.loc[:,grappling_columns]
grappling.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25772 entries, 0 to 12885
Data columns (total 24 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   bout_id       25772 non-null  object 
 1   fighter_id    25772 non-null  object 
 2   round         25772 non-null  object 
 3   fighter       25772 non-null  object 
 4   td_s          25772 non-null  int64  
 5   td_s_pr_di    25772 non-null  int64  
 6   td_s_p15m     25772 non-null  float64
 7   td_s_p15m_di  25772 non-null  float64
 8   td_a          25772 non-null  int64  
 9   td_a_pr_di    25772 non-null  int64  
 10  td_a_p15m     25772 non-null  float64
 11  td_a_p15m_di  25772 non-null  float64
 12  sba           25772 non-null  int64  
 13  sba_pr_di     25772 non-null  int64  
 14  sba_p15m      25772 non-null  float64
 15  sba_p15m_di   25772 non-null  float64
 16  rev           25772 non-null  int64  
 17  rev_pr_di     25772 non-null  int64  
 18  rev_p15m      25772 non-nu

##### Send to SQL

In [13]:
# Persist the table twice: as a CSV file (path is relative to the working
# directory) and as the `grappling_adv` table in the database.
# if_exists='replace' drops any existing table of that name before writing.
grappling.to_csv('../../data/ufcstats_data/advanced_stats/grappling_adv.csv', index=False)
grappling.to_sql('grappling_adv', engine, if_exists='replace', index=False)

#### Striking

In [14]:
# Stats that make up the striking table (per the section list: significant
# strikes, total strikes, knockdowns).
striking_stats = ['ss_s', 'ss_a', 'ts_s', 'ts_a', 'kd']
striking_columns = get_advanced_stats_table_columns(striking_stats)

# Keep only the identifying columns plus the striking metrics.
striking = advanced_stats.loc[:,striking_columns]
list(striking.columns)

['bout_id',
 'fighter_id',
 'round',
 'fighter',
 'ss_s',
 'ss_s_pr_di',
 'ss_s_p15m',
 'ss_s_p15m_di',
 'ss_a',
 'ss_a_pr_di',
 'ss_a_p15m',
 'ss_a_p15m_di',
 'ts_s',
 'ts_s_pr_di',
 'ts_s_p15m',
 'ts_s_p15m_di',
 'ts_a',
 'ts_a_pr_di',
 'ts_a_p15m',
 'ts_a_p15m_di',
 'kd',
 'kd_pr_di',
 'kd_p15m',
 'kd_p15m_di']

##### Send to SQL

In [15]:
# Persist the table twice: as a CSV file (path is relative to the working
# directory) and as the `striking_adv` table in the database.
# if_exists='replace' drops any existing table of that name before writing.
striking.to_csv('../../data/ufcstats_data/advanced_stats/striking_adv.csv', index=False)
striking.to_sql('striking_adv', engine, if_exists='replace', index=False)

#### Striking by Target

In [16]:
# Significant strikes broken down by target (per the section list: head, body, leg).
striking_target_stats = ['h_ss_s', 'h_ss_a', 'b_ss_s', 'b_ss_a', 'l_ss_s', 'l_ss_a', ]
striking_target_columns = get_advanced_stats_table_columns(striking_target_stats)

# Keep only the identifying columns plus the by-target metrics.
striking_target = advanced_stats.loc[:,striking_target_columns]
striking_target.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25772 entries, 0 to 12885
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bout_id         25772 non-null  object 
 1   fighter_id      25772 non-null  object 
 2   round           25772 non-null  object 
 3   fighter         25772 non-null  object 
 4   h_ss_s          25772 non-null  int64  
 5   h_ss_s_pr_di    25772 non-null  int64  
 6   h_ss_s_p15m     25772 non-null  float64
 7   h_ss_s_p15m_di  25772 non-null  float64
 8   h_ss_a          25772 non-null  int64  
 9   h_ss_a_pr_di    25772 non-null  int64  
 10  h_ss_a_p15m     25772 non-null  float64
 11  h_ss_a_p15m_di  25772 non-null  float64
 12  b_ss_s          25772 non-null  int64  
 13  b_ss_s_pr_di    25772 non-null  int64  
 14  b_ss_s_p15m     25772 non-null  float64
 15  b_ss_s_p15m_di  25772 non-null  float64
 16  b_ss_a          25772 non-null  int64  
 17  b_ss_a_pr_di    25772 non-null 

##### Send to SQL

In [17]:
# Persist the table twice: as a CSV file (path is relative to the working
# directory) and as the `striking_target_adv` table in the database.
# if_exists='replace' drops any existing table of that name before writing.
striking_target.to_csv('../../data/ufcstats_data/advanced_stats/striking_target_adv.csv', index=False)
striking_target.to_sql('striking_target_adv', engine, if_exists='replace', index=False)

#### Striking by Position

In [18]:
# Significant strikes broken down by position (per the section list: distance,
# clinch, ground).
striking_position_stats = ['d_ss_s', 'd_ss_a', 'c_ss_s', 'c_ss_a', 'g_ss_s', 'g_ss_a', ]
striking_position_columns = get_advanced_stats_table_columns(striking_position_stats)

# Keep only the identifying columns plus the by-position metrics.
striking_position = advanced_stats.loc[:,striking_position_columns]
striking_position.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25772 entries, 0 to 12885
Data columns (total 28 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   bout_id         25772 non-null  object 
 1   fighter_id      25772 non-null  object 
 2   round           25772 non-null  object 
 3   fighter         25772 non-null  object 
 4   d_ss_s          25772 non-null  int64  
 5   d_ss_s_pr_di    25772 non-null  int64  
 6   d_ss_s_p15m     25772 non-null  float64
 7   d_ss_s_p15m_di  25772 non-null  float64
 8   d_ss_a          25772 non-null  int64  
 9   d_ss_a_pr_di    25772 non-null  int64  
 10  d_ss_a_p15m     25772 non-null  float64
 11  d_ss_a_p15m_di  25772 non-null  float64
 12  c_ss_s          25772 non-null  int64  
 13  c_ss_s_pr_di    25772 non-null  int64  
 14  c_ss_s_p15m     25772 non-null  float64
 15  c_ss_s_p15m_di  25772 non-null  float64
 16  c_ss_a          25772 non-null  int64  
 17  c_ss_a_pr_di    25772 non-null 

##### Send to SQL

In [19]:
# Persist the table twice: as a CSV file (path is relative to the working
# directory) and as the `striking_position_adv` table in the database.
# if_exists='replace' drops any existing table of that name before writing.
striking_position.to_csv('../../data/ufcstats_data/advanced_stats/striking_position_adv.csv', index=False)
striking_position.to_sql('striking_position_adv', engine, if_exists='replace', index=False)

In [20]:
# Display-only check: drop_duplicates() returns a new frame and the result is
# not assigned, so `striking` and the table already exported above are unchanged.
striking.drop_duplicates()

Unnamed: 0,bout_id,fighter_id,round,fighter,ss_s,ss_s_pr_di,ss_s_p15m,ss_s_p15m_di,ss_a,ss_a_pr_di,...,ts_s_p15m,ts_s_p15m_di,ts_a,ts_a_pr_di,ts_a_p15m,ts_a_p15m_di,kd,kd_pr_di,kd_p15m,kd_p15m_di
0,000da3152b7b5ab1,d1a1314976c50bef,3,Josh Neer,5,-3,15.000000,-9.000000,9,-14,...,45.000000,-21.000000,21,-16,63.000000,-48.000000,0,0,0.0,0.0
1,000da3152b7b5ab1,d1a1314976c50bef,2,Josh Neer,7,-6,21.000000,-18.000000,19,-7,...,48.000000,-6.000000,28,-3,84.000000,-9.000000,0,0,0.0,0.0
2,000da3152b7b5ab1,d1a1314976c50bef,1,Josh Neer,7,-7,21.000000,-21.000000,11,-28,...,33.000000,-27.000000,15,-30,45.000000,-90.000000,0,-1,0.0,-3.0
3,0019ec81fd706ade,85073dbd1be65ed9,1,Greg Hardy,19,10,57.000000,30.000000,39,7,...,57.000000,30.000000,39,7,117.000000,21.000000,0,0,0.0,0.0
4,0019ec81fd706ade,85073dbd1be65ed9,2,Greg Hardy,18,11,54.000000,33.000000,31,-4,...,54.000000,33.000000,31,-4,93.000000,-12.000000,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12881,ffe629a5232a878b,08ae5cd9aef7ddd3,1,Kimo Leopoldo,1,1,7.563025,7.563025,1,1,...,7.563025,-15.126050,1,-2,7.563025,-15.126050,0,0,0.0,0.0
12882,ffea776913451b6d,22a92d7f62195791,1,Tony Ferguson,11,9,63.057325,51.592357,38,18,...,97.452229,85.987261,45,25,257.961783,143.312102,0,0,0.0,0.0
12883,fffa21388cdd78b7,5d7bdab5e03e3216,2,CB Dollaway,8,1,24.000000,3.000000,18,4,...,72.000000,21.000000,34,6,102.000000,18.000000,0,0,0.0,0.0
12884,fffa21388cdd78b7,5d7bdab5e03e3216,3,CB Dollaway,21,5,63.000000,15.000000,41,15,...,93.000000,3.000000,51,7,153.000000,21.000000,0,0,0.0,0.0
