# Purpose
This notebook is designed to convert the existing databases into a new fighter_round_performance table.
This notebook only creates stats for significant strikes.

### Result

Fighter Round Performance:
 - SSA - Significant Strike Attempts
 - SSS - Significant Strike Successes
 - SS_AC - Significant Strike Accuracy
 - SS_DE - Significant Strike Defense
 - SSA_DI - Significant Strike Differential
 - SSS_DI - Significant Strike Differential
 - SSA_P1M - Significant Strike Attempts Per 1 Minute
 - SSS_P1M - Significant Strike Successes Per 1 Minute

In [46]:
%load_ext autoreload
%autoreload 2

import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from sqlalchemy import create_engine
from src import local
from src import functions

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [47]:
# Credentials
USER = local.user 
PASS = local.password
HOST = local.host
PORT = local.port

#create engine
engine = create_engine(f'postgresql://{USER}:{PASS}@{HOST}:{PORT}/match_finder')

# Get data from postgres database
### Join tables

- get the date from the events table
- use the bouts table to join the dates to the general table
- use the general table to join the bouts with the fighters


#### Accuracy case statement

In [48]:
accuracy_column = """
CASE 
    WHEN (sig_str_attempted > 0) THEN (CAST(sig_str_successful AS FLOAT)/CAST(sig_str_attempted AS FLOAT))
    ELSE 0
END AS accuracy
"""

In [49]:
query = """
SELECT bout_link, fighter_link, sig_str_attempted, 
sig_str_successful, "Date", round, "Time", 
"Round" as final_round, "Timeformat",
"""+accuracy_column+"""
FROM strikes_cleaned
JOIN bouts ON bouts.link = strikes_cleaned.bout_link
JOIN events ON events.link = bouts.event_link
"""

data = pd.read_sql(query, engine)

In [50]:
data = functions.format_data(data, event=False)

In [51]:
data

Unnamed: 0,bout_link,fighter_link,sig_str_attempted,sig_str_successful,Date,round,Time,final_round,Timeformat,accuracy,date,fighter_id,bout_id
0,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,11,"July 25, 2020",1,5:00,5,5 Rnd (5-5-5-5-5),0.366667,2020-07-25,e1147d3d2dabe1ce,11f715fa5e825e51
6,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,15,"July 25, 2020",2,5:00,5,5 Rnd (5-5-5-5-5),0.500000,2020-07-25,e1147d3d2dabe1ce,11f715fa5e825e51
12,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,32,13,"July 25, 2020",3,5:00,5,5 Rnd (5-5-5-5-5),0.406250,2020-07-25,e1147d3d2dabe1ce,11f715fa5e825e51
18,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,34,13,"July 25, 2020",4,5:00,5,5 Rnd (5-5-5-5-5),0.382353,2020-07-25,e1147d3d2dabe1ce,11f715fa5e825e51
24,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,31,17,"July 25, 2020",5,5:00,5,5 Rnd (5-5-5-5-5),0.548387,2020-07-25,e1147d3d2dabe1ce,11f715fa5e825e51
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26447,http://www.ufcstats.com/fight-details/cecdc0da...,http://www.ufcstats.com/fighter-details/a5c53b...,0,0,"November 12, 1993",1,2:18,1,No Time Limit,0.000000,1993-11-12,a5c53b3ddb31cc7d,cecdc0da584274b9
26448,http://www.ufcstats.com/fight-details/2d2bbc86...,http://www.ufcstats.com/fighter-details/598a58...,27,15,"November 12, 1993",1,4:20,1,No Time Limit,0.555556,1993-11-12,598a58db87b890ee,2d2bbc86e941e05c
26449,http://www.ufcstats.com/fight-details/2d2bbc86...,http://www.ufcstats.com/fighter-details/d3711d...,28,12,"November 12, 1993",1,4:20,1,No Time Limit,0.428571,1993-11-12,d3711d3784b76255,2d2bbc86e941e05c
26450,http://www.ufcstats.com/fight-details/567a09fd...,http://www.ufcstats.com/fighter-details/279093...,5,3,"November 12, 1993",1,0:26,1,No Time Limit,0.600000,1993-11-12,279093302a6f44b3,567a09fd200cfa05


In [52]:
data_original = data

In order to get the striking defense, we need each column to include the fighters opponent. 

In [53]:
data_0 = functions.merge_fighter_instances(data, rounds=True)
data_1 = functions.merge_fighter_instances(data, rounds=True, flip=True)

data = pd.concat((data_0, data_1))

In [54]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26214 entries, 0 to 13106
Data columns (total 29 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   bout_link_0           26214 non-null  object        
 1   fighter_link_0        26214 non-null  object        
 2   sig_str_attempted_0   26214 non-null  int64         
 3   sig_str_successful_0  26214 non-null  int64         
 4   Date_0                26214 non-null  object        
 5   round_0               26214 non-null  object        
 6   Time_0                26214 non-null  object        
 7   final_round_0         26214 non-null  int64         
 8   Timeformat_0          26214 non-null  object        
 9   accuracy_0            26214 non-null  float64       
 10  date_0                26214 non-null  datetime64[ns]
 11  fighter_id_0          26214 non-null  object        
 12  bout_id_0             26214 non-null  object        
 13  round_id        

In [55]:
data = data.loc[:, ['bout_link_0', 'fighter_link_0', 'sig_str_attempted_0',
                    'sig_str_successful_0', 'date_0', 'round_0', 'accuracy_0',
                    'fighter_id_0', 'bout_id_0', 'round_id', 'inst_id_0', 
                    'sig_str_attempted_1', 'sig_str_successful_1', 'accuracy_1',
                    'Time_0', 'Timeformat_0', 'fighter_id_1']]

In [56]:
data

Unnamed: 0,bout_link_0,fighter_link_0,sig_str_attempted_0,sig_str_successful_0,date_0,round_0,accuracy_0,fighter_id_0,bout_id_0,round_id,inst_id_0,sig_str_attempted_1,sig_str_successful_1,accuracy_1,Time_0,Timeformat_0,fighter_id_1
0,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,11,2020-07-25,1,0.366667,e1147d3d2dabe1ce,11f715fa5e825e51,11f715fa5e825e511,11f715fa5e825e51e1147d3d2dabe1ce,24,12,0.500000,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7
1,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,15,2020-07-25,2,0.500000,e1147d3d2dabe1ce,11f715fa5e825e51,11f715fa5e825e512,11f715fa5e825e51e1147d3d2dabe1ce,12,3,0.250000,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7
2,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,32,13,2020-07-25,3,0.406250,e1147d3d2dabe1ce,11f715fa5e825e51,11f715fa5e825e513,11f715fa5e825e51e1147d3d2dabe1ce,21,6,0.285714,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7
3,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,34,13,2020-07-25,4,0.382353,e1147d3d2dabe1ce,11f715fa5e825e51,11f715fa5e825e514,11f715fa5e825e51e1147d3d2dabe1ce,19,8,0.421053,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7
4,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,31,17,2020-07-25,5,0.548387,e1147d3d2dabe1ce,11f715fa5e825e51,11f715fa5e825e515,11f715fa5e825e51e1147d3d2dabe1ce,23,12,0.521739,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13102,http://www.ufcstats.com/fight-details/ac7ca2ec...,http://www.ufcstats.com/fighter-details/279093...,17,11,1993-11-12,1,0.647059,279093302a6f44b3,ac7ca2ec38b96c1a,ac7ca2ec38b96c1a1,ac7ca2ec38b96c1a279093302a6f44b3,3,0,0.000000,0:59,No Time Limit,598a58db87b890ee
13103,http://www.ufcstats.com/fight-details/46acd54c...,http://www.ufcstats.com/fighter-details/46c8ec...,8,4,1993-11-12,1,0.500000,46c8ec317aff28ac,46acd54cc0c905fb,46acd54cc0c905fb1,46acd54cc0c905fb46c8ec317aff28ac,1,1,1.000000,1:49,No Time Limit,63b65af1c5cb02cb
13104,http://www.ufcstats.com/fight-details/cecdc0da...,http://www.ufcstats.com/fighter-details/429e7d...,3,0,1993-11-12,1,0.000000,429e7d3725852ce9,cecdc0da584274b9,cecdc0da584274b91,cecdc0da584274b9429e7d3725852ce9,0,0,0.000000,2:18,No Time Limit,a5c53b3ddb31cc7d
13105,http://www.ufcstats.com/fight-details/2d2bbc86...,http://www.ufcstats.com/fighter-details/598a58...,27,15,1993-11-12,1,0.555556,598a58db87b890ee,2d2bbc86e941e05c,2d2bbc86e941e05c1,2d2bbc86e941e05c598a58db87b890ee,28,12,0.428571,4:20,No Time Limit,d3711d3784b76255


### Calculate significant strike defense by subtracting the opponents accuracy from 1

In [57]:
data['ss_de'] = 1 - data['accuracy_1']

Clean the columns for continuity

## Fixing columns

Her we rename the columns. An important thing to note is that there are two sets of statistics. The second set, which is indicated by a '_1' suffix on the columns name, represents the stats for the opponent.

In [58]:
data.columns

Index(['bout_link_0', 'fighter_link_0', 'sig_str_attempted_0',
       'sig_str_successful_0', 'date_0', 'round_0', 'accuracy_0',
       'fighter_id_0', 'bout_id_0', 'round_id', 'inst_id_0',
       'sig_str_attempted_1', 'sig_str_successful_1', 'accuracy_1', 'Time_0',
       'Timeformat_0', 'fighter_id_1', 'ss_de'],
      dtype='object')

In [59]:
data.columns = ['bout_link', 'fighter_link', 'ssa',
                'sss', 'date', 'round', 'ss_ac',
                'fighter_id', 'bout_id', 'round_id', 'inst_id',
                'ssa_1', 'sss_1', 'ss_ac_1', 
                'time', 'timeformat', 'fighter_id_1', 'ss_de']

In [60]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26214 entries, 0 to 13106
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   bout_link     26214 non-null  object        
 1   fighter_link  26214 non-null  object        
 2   ssa           26214 non-null  int64         
 3   sss           26214 non-null  int64         
 4   date          26214 non-null  datetime64[ns]
 5   round         26214 non-null  object        
 6   ss_ac         26214 non-null  float64       
 7   fighter_id    26214 non-null  object        
 8   bout_id       26214 non-null  object        
 9   round_id      26214 non-null  object        
 10  inst_id       26214 non-null  object        
 11  ssa_1         26214 non-null  int64         
 12  sss_1         26214 non-null  int64         
 13  ss_ac_1       26214 non-null  float64       
 14  time          26214 non-null  object        
 15  timeformat    26214 non-null  object

In [61]:
data.columns

Index(['bout_link', 'fighter_link', 'ssa', 'sss', 'date', 'round', 'ss_ac',
       'fighter_id', 'bout_id', 'round_id', 'inst_id', 'ssa_1', 'sss_1',
       'ss_ac_1', 'time', 'timeformat', 'fighter_id_1', 'ss_de'],
      dtype='object')

In [62]:
data.loc[:,['date', 'bout_id', 'fighter_id', 'round', 'ssa', 'sss',  'ss_ac', 'ss_de',
            'time', 'timeformat', 'fighter_id_1', 'ssa_1', 'sss_1', 'ss_ac_1']]

Unnamed: 0,date,bout_id,fighter_id,round,ssa,sss,ss_ac,ss_de,time,timeformat,fighter_id_1,ssa_1,sss_1,ss_ac_1
0,2020-07-25,11f715fa5e825e51,e1147d3d2dabe1ce,1,30,11,0.366667,0.500000,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,24,12,0.500000
1,2020-07-25,11f715fa5e825e51,e1147d3d2dabe1ce,2,30,15,0.500000,0.750000,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,12,3,0.250000
2,2020-07-25,11f715fa5e825e51,e1147d3d2dabe1ce,3,32,13,0.406250,0.714286,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,21,6,0.285714
3,2020-07-25,11f715fa5e825e51,e1147d3d2dabe1ce,4,34,13,0.382353,0.578947,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,19,8,0.421053
4,2020-07-25,11f715fa5e825e51,e1147d3d2dabe1ce,5,31,17,0.548387,0.478261,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,23,12,0.521739
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13102,1993-11-12,ac7ca2ec38b96c1a,279093302a6f44b3,1,17,11,0.647059,1.000000,0:59,No Time Limit,598a58db87b890ee,3,0,0.000000
13103,1993-11-12,46acd54cc0c905fb,46c8ec317aff28ac,1,8,4,0.500000,0.000000,1:49,No Time Limit,63b65af1c5cb02cb,1,1,1.000000
13104,1993-11-12,cecdc0da584274b9,429e7d3725852ce9,1,3,0,0.000000,1.000000,2:18,No Time Limit,a5c53b3ddb31cc7d,0,0,0.000000
13105,1993-11-12,2d2bbc86e941e05c,598a58db87b890ee,1,27,15,0.555556,0.571429,4:20,No Time Limit,d3711d3784b76255,28,12,0.428571


### Calculating significant strike differential

In [63]:
data['sss_di'] = data['sss'] - data['sss_1']
data['ssa_di'] = data['ssa'] - data['ssa_1']

In [64]:
data.reset_index(inplace = True)

In [65]:
data[data['bout_id'] == data['bout_id'][0]]

Unnamed: 0,index,bout_link,fighter_link,ssa,sss,date,round,ss_ac,fighter_id,bout_id,...,inst_id,ssa_1,sss_1,ss_ac_1,time,timeformat,fighter_id_1,ss_de,sss_di,ssa_di
0,0,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,11,2020-07-25,1,0.366667,e1147d3d2dabe1ce,11f715fa5e825e51,...,11f715fa5e825e51e1147d3d2dabe1ce,24,12,0.5,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.5,-1,6
1,1,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,15,2020-07-25,2,0.5,e1147d3d2dabe1ce,11f715fa5e825e51,...,11f715fa5e825e51e1147d3d2dabe1ce,12,3,0.25,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.75,12,18
2,2,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,32,13,2020-07-25,3,0.40625,e1147d3d2dabe1ce,11f715fa5e825e51,...,11f715fa5e825e51e1147d3d2dabe1ce,21,6,0.285714,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.714286,7,11
3,3,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,34,13,2020-07-25,4,0.382353,e1147d3d2dabe1ce,11f715fa5e825e51,...,11f715fa5e825e51e1147d3d2dabe1ce,19,8,0.421053,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.578947,5,15
4,4,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,31,17,2020-07-25,5,0.548387,e1147d3d2dabe1ce,11f715fa5e825e51,...,11f715fa5e825e51e1147d3d2dabe1ce,23,12,0.521739,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.478261,5,8
13107,0,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,24,12,2020-07-25,1,0.5,9ce6d5a03af801b7,11f715fa5e825e51,...,11f715fa5e825e519ce6d5a03af801b7,30,11,0.366667,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.633333,1,-6
13108,1,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,12,3,2020-07-25,2,0.25,9ce6d5a03af801b7,11f715fa5e825e51,...,11f715fa5e825e519ce6d5a03af801b7,30,15,0.5,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.5,-12,-18
13109,2,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,21,6,2020-07-25,3,0.285714,9ce6d5a03af801b7,11f715fa5e825e51,...,11f715fa5e825e519ce6d5a03af801b7,32,13,0.40625,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.59375,-7,-11
13110,3,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,19,8,2020-07-25,4,0.421053,9ce6d5a03af801b7,11f715fa5e825e51,...,11f715fa5e825e519ce6d5a03af801b7,34,13,0.382353,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.617647,-5,-15
13111,4,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,23,12,2020-07-25,5,0.521739,9ce6d5a03af801b7,11f715fa5e825e51,...,11f715fa5e825e519ce6d5a03af801b7,31,17,0.548387,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.451613,-5,-8


### Calculate SSA_P1M
This is the Significant Strike Attempts per Minute. 

### Create round length column
First we need to have a time for each round. The current time column only measures the time on the clock at which the fight was stopped. This number can only be used for the last round. We'll group all the bout_ids and create a datafram that matches the highest round_id value to the time column.

This will be based on the assumption that all rounds in the UFC are 5 minutes long, let's see if that's accurate.

In [66]:
data.timeformat.value_counts()

3 Rnd (5-5-5)           22652
5 Rnd (5-5-5-5-5)        2964
1 Rnd + OT (12-3)         192
3 Rnd + OT (5-5-5-5)      106
No Time Limit              74
1 Rnd + 2OT (15-3-3)       62
2 Rnd (5-5)                50
1 Rnd (20)                 42
1 Rnd (15)                 16
1 Rnd (10)                 12
1 Rnd (12)                  8
1 Rnd + OT (30-5)           6
1 Rnd + OT (15-3)           6
1 Rnd + 2OT (24-3-3)        6
1 Rnd + OT (27-3)           4
1 Rnd + OT (31-5)           4
1 Rnd (18)                  4
1 Rnd + OT (30-3)           4
1 Rnd (30)                  2
Name: timeformat, dtype: int64

It looks like there are a lot of different round formats. They are likely from before they standardized the rules, so let's look at when these rounds took place.

In [67]:
data.groupby('timeformat').date.min().sort_values().index

Index(['No Time Limit', '1 Rnd + OT (31-5)', '1 Rnd (20)', '1 Rnd (30)',
       '1 Rnd + OT (30-5)', '1 Rnd + OT (30-3)', '1 Rnd (15)', '1 Rnd (18)',
       '1 Rnd + OT (27-3)', '1 Rnd (10)', '1 Rnd + 2OT (15-3-3)',
       '1 Rnd + OT (12-3)', '1 Rnd + 2OT (24-3-3)', '1 Rnd + OT (15-3)',
       '1 Rnd (12)', '2 Rnd (5-5)', '3 Rnd (5-5-5)', '5 Rnd (5-5-5-5-5)',
       '3 Rnd + OT (5-5-5-5)'],
      dtype='object', name='timeformat')

We'll only look at fights with five minute rounds to make the calculation easier.

In [68]:
non_standard_rounds = ['No Time Limit', '1 Rnd + OT (31-5)', '1 Rnd (20)', '1 Rnd (30)',
                   '1 Rnd + OT (30-5)', '1 Rnd + OT (30-3)', '1 Rnd (15)', '1 Rnd (18)',
                   '1 Rnd + OT (27-3)', '1 Rnd (10)', '1 Rnd + 2OT (15-3-3)',
                   '1 Rnd + OT (12-3)', '1 Rnd + 2OT (24-3-3)', '1 Rnd + OT (15-3)',
                   '1 Rnd (12)']

mask = data.timeformat.map(lambda x: functions.black_list_entry(x, non_standard_rounds))
new_data = data[mask]
new_data.timeformat.value_counts()

3 Rnd (5-5-5)           22652
5 Rnd (5-5-5-5-5)        2964
3 Rnd + OT (5-5-5-5)      106
2 Rnd (5-5)                50
Name: timeformat, dtype: int64

In [69]:
data = new_data

We still have most of our fights, so we'll use this as our timeframe from now on.

In [70]:
bout_groups = data.groupby('bout_id')
round_id = bout_groups.round_id.max()
round_length = bout_groups.time.max()

final_round_lengths = pd.DataFrame(dict(round_id = round_id, round_length = round_length))

In [71]:
final_round_lengths.set_index('round_id', inplace=True)
final_round_lengths

Unnamed: 0_level_0,round_length
round_id,Unnamed: 1_level_1
000da3152b7b5ab13,5:00
0019ec81fd706ade3,5:00
0027e179b743c86c3,3:12
002921976d27b7da1,4:13
002c1562708ac3071,4:06
...,...
ffd3e3d37cba32da3,5:00
ffe4379d6bd1e82b2,1:43
ffe629a5232a878b1,1:59
ffea776913451b6d1,2:37


In [72]:
new_data = data.join(final_round_lengths, on='round_id', how='outer')
new_data.head(15)

Unnamed: 0,index,bout_link,fighter_link,ssa,sss,date,round,ss_ac,fighter_id,bout_id,...,ssa_1,sss_1,ss_ac_1,time,timeformat,fighter_id_1,ss_de,sss_di,ssa_di,round_length
0,0,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,11,2020-07-25,1,0.366667,e1147d3d2dabe1ce,11f715fa5e825e51,...,24,12,0.5,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.5,-1,6,
13107,0,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,24,12,2020-07-25,1,0.5,9ce6d5a03af801b7,11f715fa5e825e51,...,30,11,0.366667,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.633333,1,-6,
1,1,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,15,2020-07-25,2,0.5,e1147d3d2dabe1ce,11f715fa5e825e51,...,12,3,0.25,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.75,12,18,
13108,1,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,12,3,2020-07-25,2,0.25,9ce6d5a03af801b7,11f715fa5e825e51,...,30,15,0.5,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.5,-12,-18,
2,2,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,32,13,2020-07-25,3,0.40625,e1147d3d2dabe1ce,11f715fa5e825e51,...,21,6,0.285714,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.714286,7,11,
13109,2,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,21,6,2020-07-25,3,0.285714,9ce6d5a03af801b7,11f715fa5e825e51,...,32,13,0.40625,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.59375,-7,-11,
3,3,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,34,13,2020-07-25,4,0.382353,e1147d3d2dabe1ce,11f715fa5e825e51,...,19,8,0.421053,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.578947,5,15,
13110,3,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,19,8,2020-07-25,4,0.421053,9ce6d5a03af801b7,11f715fa5e825e51,...,34,13,0.382353,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.617647,-5,-15,
4,4,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,31,17,2020-07-25,5,0.548387,e1147d3d2dabe1ce,11f715fa5e825e51,...,23,12,0.521739,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.478261,5,8,5:00
13111,4,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,23,12,2020-07-25,5,0.521739,9ce6d5a03af801b7,11f715fa5e825e51,...,31,17,0.548387,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.451613,-5,-8,5:00


 Now that we have the final rounds filled in, ever null value should be '5:00'.

In [73]:
new_data.round_length = new_data.round_length.fillna('5:00')

In [74]:
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25772 entries, 0 to 25992
Data columns (total 22 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   index         25772 non-null  int64         
 1   bout_link     25772 non-null  object        
 2   fighter_link  25772 non-null  object        
 3   ssa           25772 non-null  int64         
 4   sss           25772 non-null  int64         
 5   date          25772 non-null  datetime64[ns]
 6   round         25772 non-null  object        
 7   ss_ac         25772 non-null  float64       
 8   fighter_id    25772 non-null  object        
 9   bout_id       25772 non-null  object        
 10  round_id      25772 non-null  object        
 11  inst_id       25772 non-null  object        
 12  ssa_1         25772 non-null  int64         
 13  sss_1         25772 non-null  int64         
 14  ss_ac_1       25772 non-null  float64       
 15  time          25772 non-null  object

### Calculate assa-m

before calculating, we need to convert the round length column into a timedelta object.

In [75]:
new_data.round_length = '00:0' + new_data.round_length

In [76]:
new_data.round_length = pd.to_timedelta(new_data.round_length)

In [77]:
new_data.round_length.describe()

count                     25772
mean     0 days 00:04:28.456774
std      0 days 00:01:10.459661
min             0 days 00:00:05
25%             0 days 00:05:00
50%             0 days 00:05:00
75%             0 days 00:05:00
max             0 days 00:05:00
Name: round_length, dtype: object

In [78]:
new_data['minutes'] = new_data.round_length.map(lambda x: x.total_seconds()/60)

In [79]:
new_data['ssa_p1m'] = new_data['ssa'] / new_data.minutes
new_data['sss_p1m'] = new_data['sss'] / new_data.minutes
new_data

Unnamed: 0,index,bout_link,fighter_link,ssa,sss,date,round,ss_ac,fighter_id,bout_id,...,time,timeformat,fighter_id_1,ss_de,sss_di,ssa_di,round_length,minutes,ssa_p1m,sss_p1m
0,0,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,11,2020-07-25,1,0.366667,e1147d3d2dabe1ce,11f715fa5e825e51,...,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.500000,-1,6,00:05:00,5.00,6.000000,2.200000
13107,0,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,24,12,2020-07-25,1,0.500000,9ce6d5a03af801b7,11f715fa5e825e51,...,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.633333,1,-6,00:05:00,5.00,4.800000,2.400000
1,1,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,30,15,2020-07-25,2,0.500000,e1147d3d2dabe1ce,11f715fa5e825e51,...,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.750000,12,18,00:05:00,5.00,6.000000,3.000000
13108,1,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/9ce6d5...,12,3,2020-07-25,2,0.250000,9ce6d5a03af801b7,11f715fa5e825e51,...,5:00,5 Rnd (5-5-5-5-5),e1147d3d2dabe1ce,0.500000,-12,-18,00:05:00,5.00,2.400000,0.600000
2,2,http://www.ufcstats.com/fight-details/11f715fa...,http://www.ufcstats.com/fighter-details/e1147d...,32,13,2020-07-25,3,0.406250,e1147d3d2dabe1ce,11f715fa5e825e51,...,5:00,5 Rnd (5-5-5-5-5),9ce6d5a03af801b7,0.714286,7,11,00:05:00,5.00,6.400000,2.600000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25990,12883,http://www.ufcstats.com/fight-details/c6f85419...,http://www.ufcstats.com/fighter-details/a54a35...,20,10,1999-07-16,1,0.500000,a54a35a670d8e852,c6f8541973f69080,...,2:51,2 Rnd (5-5),efaf544314bb5c2e,0.541667,-12,-28,00:02:51,2.85,7.017544,3.508772
12884,12884,http://www.ufcstats.com/fight-details/1db8bed8...,http://www.ufcstats.com/fighter-details/c2a762...,0,0,1999-07-16,1,0.000000,c2a7623f398d9bd7,1db8bed8b4d30fb3,...,5:00,2 Rnd (5-5),911fb265462f0d94,0.307692,-9,-13,00:05:00,5.00,0.000000,0.000000
25991,12884,http://www.ufcstats.com/fight-details/1db8bed8...,http://www.ufcstats.com/fighter-details/911fb2...,13,9,1999-07-16,1,0.692308,911fb265462f0d94,1db8bed8b4d30fb3,...,5:00,2 Rnd (5-5),c2a7623f398d9bd7,1.000000,9,13,00:05:00,5.00,2.600000,1.800000
12885,12885,http://www.ufcstats.com/fight-details/1db8bed8...,http://www.ufcstats.com/fighter-details/c2a762...,0,0,1999-07-16,2,0.000000,c2a7623f398d9bd7,1db8bed8b4d30fb3,...,5:00,2 Rnd (5-5),911fb265462f0d94,0.157895,-16,-19,00:05:00,5.00,0.000000,0.000000


In [80]:
new_data.ssa_p1m.describe()

count    25772.000000
mean         7.758239
std          5.656875
min          0.000000
25%          3.800000
50%          6.600000
75%         10.400000
max         95.000000
Name: ssa_p1m, dtype: float64

In [81]:
data=new_data

## Clean up
Clean the new dataframe so that it has all of the info in the correct places and then I'm going to merge it with the original strikes table.

In [82]:
data.columns

Index(['index', 'bout_link', 'fighter_link', 'ssa', 'sss', 'date', 'round',
       'ss_ac', 'fighter_id', 'bout_id', 'round_id', 'inst_id', 'ssa_1',
       'sss_1', 'ss_ac_1', 'time', 'timeformat', 'fighter_id_1', 'ss_de',
       'sss_di', 'ssa_di', 'round_length', 'minutes', 'ssa_p1m', 'sss_p1m'],
      dtype='object')

In [83]:
to_drop = ['index', 'round_length', 'timeformat', 'time']
data.drop(to_drop, axis=1, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25772 entries, 0 to 25992
Data columns (total 21 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   bout_link     25772 non-null  object        
 1   fighter_link  25772 non-null  object        
 2   ssa           25772 non-null  int64         
 3   sss           25772 non-null  int64         
 4   date          25772 non-null  datetime64[ns]
 5   round         25772 non-null  object        
 6   ss_ac         25772 non-null  float64       
 7   fighter_id    25772 non-null  object        
 8   bout_id       25772 non-null  object        
 9   round_id      25772 non-null  object        
 10  inst_id       25772 non-null  object        
 11  ssa_1         25772 non-null  int64         
 12  sss_1         25772 non-null  int64         
 13  ss_ac_1       25772 non-null  float64       
 14  fighter_id_1  25772 non-null  object        
 15  ss_de         25772 non-null  float6

In [84]:
data.columns

Index(['bout_link', 'fighter_link', 'ssa', 'sss', 'date', 'round', 'ss_ac',
       'fighter_id', 'bout_id', 'round_id', 'inst_id', 'ssa_1', 'sss_1',
       'ss_ac_1', 'fighter_id_1', 'ss_de', 'sss_di', 'ssa_di', 'minutes',
       'ssa_p1m', 'sss_p1m'],
      dtype='object')

In [85]:
data = data.loc[:,['date', 'bout_link', 'fighter_link', 'round', 'minutes',
            'ssa', 'sss', 'ss_ac', 'ss_de', 'sss_di', 'ssa_di', 'ssa_p1m', 'sss_p1m',
            'fighter_id_1', 'ssa_1', 'sss_1', 'ss_ac_1']]

In [86]:
data.to_csv('../../data/ufcstats_data/fighter_round_performance.csv', index=False)