# Purpose
Separate the columns for each strike statistic into attempted and landed columns. 
For example, the sig_str column will be split into:
 - sig_str_a: how many significant strikes were attempted
 - sig_str_s: how many significant strikes were successful

The outcome of this notebook will be a new sql table witht eh new columns replacing the old ones.
The new table will be called strikes_clean

In [16]:
import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir, os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)

import pandas as pd
from sqlalchemy import create_engine
from src import local

In [17]:
# Credentials
USER = local.user 
PASS = local.password
HOST = local.host
PORT = local.port

#create engine
engine = create_engine(f'postgresql://{USER}:{PASS}@{HOST}:{PORT}/match_finder')

# Get strikes table from postgres database

In [18]:
query = """
SELECT *
FROM strikes
"""

data = pd.read_sql(query, engine)

In [19]:
data

Unnamed: 0,fighter,sig_str,sig_str_prcnt,head,body,leg,distance,clinch,ground,round,bout_link,outcome,fighter_link
0,Robert Whittaker,11 of 30,36%,4 of 22,1 of 2,6 of 6,10 of 29,1 of 1,0 of 0,1,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
1,Robert Whittaker,15 of 30,50%,7 of 22,1 of 1,7 of 7,10 of 23,0 of 0,5 of 7,2,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
2,Robert Whittaker,13 of 32,40%,8 of 26,0 of 1,5 of 5,11 of 30,2 of 2,0 of 0,3,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
3,Robert Whittaker,13 of 34,38%,7 of 26,1 of 2,5 of 6,12 of 31,1 of 3,0 of 0,4,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
4,Robert Whittaker,17 of 31,54%,6 of 20,4 of 4,7 of 7,14 of 26,3 of 5,0 of 0,5,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26239,Art Jimmerson,0 of 0,0%,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,1,http://www.ufcstats.com/fight-details/cecdc0da...,L,http://www.ufcstats.com/fighter-details/a5c53b...
26240,Kevin Rosier,15 of 27,55%,12 of 23,3 of 4,0 of 0,4 of 10,4 of 9,7 of 8,1,http://www.ufcstats.com/fight-details/2d2bbc86...,W,http://www.ufcstats.com/fighter-details/598a58...
26241,Zane Frazier,12 of 28,42%,7 of 19,3 of 6,2 of 3,0 of 7,10 of 19,2 of 2,1,http://www.ufcstats.com/fight-details/2d2bbc86...,L,http://www.ufcstats.com/fighter-details/d3711d...
26242,Gerard Gordeau,3 of 5,60%,3 of 5,0 of 0,0 of 0,1 of 3,0 of 0,2 of 2,1,http://www.ufcstats.com/fight-details/567a09fd...,W,http://www.ufcstats.com/fighter-details/279093...


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26244 entries, 0 to 26243
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   fighter        26244 non-null  object
 1   sig_str        26244 non-null  object
 2   sig_str_prcnt  26244 non-null  object
 3   head           26244 non-null  object
 4   body           26244 non-null  object
 5   leg            26244 non-null  object
 6   distance       26244 non-null  object
 7   clinch         26244 non-null  object
 8   ground         26244 non-null  object
 9   round          26244 non-null  int64 
 10  bout_link      26244 non-null  object
 11  outcome        26244 non-null  object
 12  fighter_link   26244 non-null  object
dtypes: int64(1), object(12)
memory usage: 2.6+ MB


# Data cleaning

## Create successful significant strikes column
The following steps will be turned into a function that can be applied to all columns.
#### First, remove whitespace from the column

In [21]:
data['sig_str'] = data['sig_str'].map(lambda x: x.strip())

#### Create functions that return the first number in a string as successes, and the second number as attempts

In [22]:
def get_successful(string):
    """
    input: a string that follows the format: [number landed] of [number thrown] 
                                            ie 11 of 30 means they landed 11 strikes out of 30
    output: an integer of the number landed
                                            ie 11 from the example above
    """
    return int(string.split(' ')[0])

In [23]:
def get_attempts(string):
    """
    input: a string that follows the format: [number landed] of [number thrown] 
                                            ie 11 of 30 means they landed 11 strikes out of 30
    output: an integer of the number attempted
                                            ie 30 from the example above
    """
    return int(string.split(' ')[-1])

In [24]:
data['sig_str'+'_s'] = data['sig_str'].map(get_successful)
data['sig_str'+'_a'] = data['sig_str'].map(get_attempts)

In [25]:
data

Unnamed: 0,fighter,sig_str,sig_str_prcnt,head,body,leg,distance,clinch,ground,round,bout_link,outcome,fighter_link,sig_str_s,sig_str_a
0,Robert Whittaker,11 of 30,36%,4 of 22,1 of 2,6 of 6,10 of 29,1 of 1,0 of 0,1,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,11,30
1,Robert Whittaker,15 of 30,50%,7 of 22,1 of 1,7 of 7,10 of 23,0 of 0,5 of 7,2,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,15,30
2,Robert Whittaker,13 of 32,40%,8 of 26,0 of 1,5 of 5,11 of 30,2 of 2,0 of 0,3,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,13,32
3,Robert Whittaker,13 of 34,38%,7 of 26,1 of 2,5 of 6,12 of 31,1 of 3,0 of 0,4,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,13,34
4,Robert Whittaker,17 of 31,54%,6 of 20,4 of 4,7 of 7,14 of 26,3 of 5,0 of 0,5,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,17,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26239,Art Jimmerson,0 of 0,0%,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,1,http://www.ufcstats.com/fight-details/cecdc0da...,L,http://www.ufcstats.com/fighter-details/a5c53b...,0,0
26240,Kevin Rosier,15 of 27,55%,12 of 23,3 of 4,0 of 0,4 of 10,4 of 9,7 of 8,1,http://www.ufcstats.com/fight-details/2d2bbc86...,W,http://www.ufcstats.com/fighter-details/598a58...,15,27
26241,Zane Frazier,12 of 28,42%,7 of 19,3 of 6,2 of 3,0 of 7,10 of 19,2 of 2,1,http://www.ufcstats.com/fight-details/2d2bbc86...,L,http://www.ufcstats.com/fighter-details/d3711d...,12,28
26242,Gerard Gordeau,3 of 5,60%,3 of 5,0 of 0,0 of 0,1 of 3,0 of 0,2 of 2,1,http://www.ufcstats.com/fight-details/567a09fd...,W,http://www.ufcstats.com/fighter-details/279093...,3,5


## Create clean dataframe
Now that I successfully did this with the first column, I will select which columns I want to do this to next and put them into a list. With this list I will iterate over the columns adding the new ones.

In [26]:
list(data.columns)

['fighter',
 'sig_str',
 'sig_str_prcnt',
 'head',
 'body',
 'leg',
 'distance',
 'clinch',
 'ground',
 'round',
 'bout_link',
 'outcome',
 'fighter_link',
 'sig_str_s',
 'sig_str_a']

In [27]:
to_be_formatted = ['sig_str',
                 'head',
                 'body',
                 'leg',
                 'distance',
                 'clinch',
                 'ground']

In [28]:
for column in to_be_formatted:
    data[column] = data[column].map(lambda x: x.strip())
    data[column+'_successful'] = data[column].map(get_successful)
    data[column+'_attempts'] = data[column].map(get_attempts)

In [29]:
data

Unnamed: 0,fighter,sig_str,sig_str_prcnt,head,body,leg,distance,clinch,ground,round,...,body_successful,body_attempts,leg_successful,leg_attempts,distance_successful,distance_attempts,clinch_successful,clinch_attempts,ground_successful,ground_attempts
0,Robert Whittaker,11 of 30,36%,4 of 22,1 of 2,6 of 6,10 of 29,1 of 1,0 of 0,1,...,1,2,6,6,10,29,1,1,0,0
1,Robert Whittaker,15 of 30,50%,7 of 22,1 of 1,7 of 7,10 of 23,0 of 0,5 of 7,2,...,1,1,7,7,10,23,0,0,5,7
2,Robert Whittaker,13 of 32,40%,8 of 26,0 of 1,5 of 5,11 of 30,2 of 2,0 of 0,3,...,0,1,5,5,11,30,2,2,0,0
3,Robert Whittaker,13 of 34,38%,7 of 26,1 of 2,5 of 6,12 of 31,1 of 3,0 of 0,4,...,1,2,5,6,12,31,1,3,0,0
4,Robert Whittaker,17 of 31,54%,6 of 20,4 of 4,7 of 7,14 of 26,3 of 5,0 of 0,5,...,4,4,7,7,14,26,3,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26239,Art Jimmerson,0 of 0,0%,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,0 of 0,1,...,0,0,0,0,0,0,0,0,0,0
26240,Kevin Rosier,15 of 27,55%,12 of 23,3 of 4,0 of 0,4 of 10,4 of 9,7 of 8,1,...,3,4,0,0,4,10,4,9,7,8
26241,Zane Frazier,12 of 28,42%,7 of 19,3 of 6,2 of 3,0 of 7,10 of 19,2 of 2,1,...,3,6,2,3,0,7,10,19,2,2
26242,Gerard Gordeau,3 of 5,60%,3 of 5,0 of 0,0 of 0,1 of 3,0 of 0,2 of 2,1,...,0,0,0,0,1,3,0,0,2,2


## Remove old rows

In [31]:
to_be_dropped = ['sig_str',
             'head',
             'body',
             'leg',
             'distance',
             'clinch',
             'ground',
             'sig_str_prcnt']

data.drop(to_be_dropped, axis=1, inplace=True)

In [32]:
data

Unnamed: 0,fighter,round,bout_link,outcome,fighter_link,sig_str_s,sig_str_a,sig_str_successful,sig_str_attempts,head_successful,...,body_successful,body_attempts,leg_successful,leg_attempts,distance_successful,distance_attempts,clinch_successful,clinch_attempts,ground_successful,ground_attempts
0,Robert Whittaker,1,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,11,30,11,30,4,...,1,2,6,6,10,29,1,1,0,0
1,Robert Whittaker,2,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,15,30,15,30,7,...,1,1,7,7,10,23,0,0,5,7
2,Robert Whittaker,3,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,13,32,13,32,8,...,0,1,5,5,11,30,2,2,0,0
3,Robert Whittaker,4,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,13,34,13,34,7,...,1,2,5,6,12,31,1,3,0,0
4,Robert Whittaker,5,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,17,31,17,31,6,...,4,4,7,7,14,26,3,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26239,Art Jimmerson,1,http://www.ufcstats.com/fight-details/cecdc0da...,L,http://www.ufcstats.com/fighter-details/a5c53b...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26240,Kevin Rosier,1,http://www.ufcstats.com/fight-details/2d2bbc86...,W,http://www.ufcstats.com/fighter-details/598a58...,15,27,15,27,12,...,3,4,0,0,4,10,4,9,7,8
26241,Zane Frazier,1,http://www.ufcstats.com/fight-details/2d2bbc86...,L,http://www.ufcstats.com/fighter-details/d3711d...,12,28,12,28,7,...,3,6,2,3,0,7,10,19,2,2
26242,Gerard Gordeau,1,http://www.ufcstats.com/fight-details/567a09fd...,W,http://www.ufcstats.com/fighter-details/279093...,3,5,3,5,3,...,0,0,0,0,1,3,0,0,2,2


## Send to sql

In [33]:
data.to_sql('strikes_cleaned', engine, index=False)

## Clean general table

In [34]:
query = """
SELECT *
FROM general
"""

general = pd.read_sql(query, engine)

In [36]:
general.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26244 entries, 0 to 26243
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   fighter        26244 non-null  object
 1   kd             26244 non-null  int64 
 2   sig_str        26244 non-null  object
 3   sig_str_prcnt  26244 non-null  object
 4   total_str      26244 non-null  object
 5   td_count       26244 non-null  object
 6   td_prcnt       26244 non-null  object
 7   sub_att        26244 non-null  int64 
 8   pass           26244 non-null  int64 
 9   rev            26244 non-null  int64 
 10  round          26244 non-null  int64 
 11  bout_id        26244 non-null  object
 12  outcome        26244 non-null  object
 13  fighter_link   26244 non-null  object
dtypes: int64(5), object(9)
memory usage: 2.8+ MB


In [37]:
general.head()

Unnamed: 0,fighter,kd,sig_str,sig_str_prcnt,total_str,td_count,td_prcnt,sub_att,pass,rev,round,bout_id,outcome,fighter_link
0,Robert Whittaker,0,11 of 30,36%,13 of 32,0 of 2,0%,0,0,0,1,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
1,Robert Whittaker,1,15 of 30,50%,42 of 57,0 of 0,0%,0,1,0,2,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
2,Robert Whittaker,0,13 of 32,40%,13 of 32,0 of 2,0%,0,0,0,3,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
3,Robert Whittaker,0,13 of 34,38%,14 of 35,0 of 2,0%,0,0,0,4,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...
4,Robert Whittaker,0,17 of 31,54%,18 of 32,2 of 7,28%,0,0,0,5,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...


In [35]:
list(general.columns)

['fighter',
 'kd',
 'sig_str',
 'sig_str_prcnt',
 'total_str',
 'td_count',
 'td_prcnt',
 'sub_att',
 'pass',
 'rev',
 'round',
 'bout_id',
 'outcome',
 'fighter_link']

In [38]:
to_be_formatted = ['total_str',
                     'td_count']

In [39]:
for column in to_be_formatted:
    general[column] = general[column].map(lambda x: x.strip())
    general[column+'_successful'] = general[column].map(get_successful)
    general[column+'_attempts'] = general[column].map(get_attempts)

In [40]:
general

Unnamed: 0,fighter,kd,sig_str,sig_str_prcnt,total_str,td_count,td_prcnt,sub_att,pass,rev,round,bout_id,outcome,fighter_link,total_str_successful,total_str_attempts,td_count_successful,td_count_attempts
0,Robert Whittaker,0,11 of 30,36%,13 of 32,0 of 2,0%,0,0,0,1,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,13,32,0,2
1,Robert Whittaker,1,15 of 30,50%,42 of 57,0 of 0,0%,0,1,0,2,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,42,57,0,0
2,Robert Whittaker,0,13 of 32,40%,13 of 32,0 of 2,0%,0,0,0,3,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,13,32,0,2
3,Robert Whittaker,0,13 of 34,38%,14 of 35,0 of 2,0%,0,0,0,4,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,14,35,0,2
4,Robert Whittaker,0,17 of 31,54%,18 of 32,2 of 7,28%,0,0,0,5,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,18,32,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26239,Art Jimmerson,0,0 of 0,0%,0 of 0,0 of 0,0%,0,0,0,1,http://www.ufcstats.com/fight-details/cecdc0da...,L,http://www.ufcstats.com/fighter-details/a5c53b...,0,0,0,0
26240,Kevin Rosier,2,15 of 27,55%,38 of 53,0 of 0,0%,0,0,0,1,http://www.ufcstats.com/fight-details/2d2bbc86...,W,http://www.ufcstats.com/fighter-details/598a58...,38,53,0,0
26241,Zane Frazier,0,12 of 28,42%,13 of 29,0 of 0,0%,0,0,0,1,http://www.ufcstats.com/fight-details/2d2bbc86...,L,http://www.ufcstats.com/fighter-details/d3711d...,13,29,0,0
26242,Gerard Gordeau,0,3 of 5,60%,3 of 5,0 of 0,0%,0,0,0,1,http://www.ufcstats.com/fight-details/567a09fd...,W,http://www.ufcstats.com/fighter-details/279093...,3,5,0,0


In [41]:
to_be_dropped = ['sig_str',
                 'sig_str_prcnt',
                 'total_str',
                 'td_count',
                 'td_prcnt',]

general.drop(to_be_dropped, axis=1, inplace=True)

In [42]:
general

Unnamed: 0,fighter,kd,sub_att,pass,rev,round,bout_id,outcome,fighter_link,total_str_successful,total_str_attempts,td_count_successful,td_count_attempts
0,Robert Whittaker,0,0,0,0,1,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,13,32,0,2
1,Robert Whittaker,1,0,1,0,2,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,42,57,0,0
2,Robert Whittaker,0,0,0,0,3,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,13,32,0,2
3,Robert Whittaker,0,0,0,0,4,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,14,35,0,2
4,Robert Whittaker,0,0,0,0,5,http://www.ufcstats.com/fight-details/11f715fa...,W,http://www.ufcstats.com/fighter-details/e1147d...,18,32,2,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26239,Art Jimmerson,0,0,0,0,1,http://www.ufcstats.com/fight-details/cecdc0da...,L,http://www.ufcstats.com/fighter-details/a5c53b...,0,0,0,0
26240,Kevin Rosier,2,0,0,0,1,http://www.ufcstats.com/fight-details/2d2bbc86...,W,http://www.ufcstats.com/fighter-details/598a58...,38,53,0,0
26241,Zane Frazier,0,0,0,0,1,http://www.ufcstats.com/fight-details/2d2bbc86...,L,http://www.ufcstats.com/fighter-details/d3711d...,13,29,0,0
26242,Gerard Gordeau,0,0,0,0,1,http://www.ufcstats.com/fight-details/567a09fd...,W,http://www.ufcstats.com/fighter-details/279093...,3,5,0,0


In [43]:
general.to_sql('general_cleaned', engine, index=False)