## Pipeline 2 - Handling Skew 

In [85]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [86]:
# read the excel file
df = pd.read_csv('../csv_files/Model_Ready_Classification.csv',index_col=0)
df.head(10)

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.833,0.461538,0.395604,0.32967,0.506173,0.036585,1.785714,0.952,0.628049,0.463415,0.256098,0.493827,0.10989,2.142857,0
1,0.885,0.430851,0.404255,0.276596,0.538462,0.02439,1.133333,0.87,0.542683,0.414634,0.280488,0.461538,0.053191,2.266667,0
2,0.844,0.590909,0.428571,0.415584,0.486486,0.076923,2.5,0.935,0.512821,0.397436,0.397436,0.513514,0.025974,1.8125,1
3,0.727,0.567568,0.364865,0.297297,0.475,0.053333,2.0,0.71,0.48,0.453333,0.413333,0.525,0.094595,1.727273,1
4,0.722,0.511905,0.369048,0.214286,0.540816,0.02381,2.363636,0.806,0.47619,0.511905,0.369048,0.459184,0.047619,3.166667,0
5,0.889,0.551724,0.471264,0.310345,0.52381,0.0625,2.909091,0.654,0.53125,0.4375,0.325,0.47619,0.08046,2.5,1
6,0.7,0.561224,0.418367,0.102041,0.404255,0.060606,2.583333,0.789,0.525253,0.393939,0.191919,0.595745,0.05102,2.0,0
7,0.808,0.47549,0.441176,0.254902,0.466102,0.098039,1.647059,0.667,0.441176,0.431373,0.323529,0.533898,0.04902,1.1,1
8,0.606,0.464706,0.411765,0.388235,0.455556,0.035714,2.0,0.87,0.535714,0.309524,0.27381,0.544444,0.058824,2.076923,0
9,0.743,0.538043,0.445652,0.380435,0.569892,0.045977,3.857143,0.76,0.482759,0.367816,0.287356,0.430108,0.065217,1.461538,1


In [87]:
# Dropping the target variable from the dataset in order to only have the variables
df = df.drop(['Target'], axis=1) 

In [88]:
df.head()

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio
0,0.833,0.461538,0.395604,0.32967,0.506173,0.036585,1.785714,0.952,0.628049,0.463415,0.256098,0.493827,0.10989,2.142857
1,0.885,0.430851,0.404255,0.276596,0.538462,0.02439,1.133333,0.87,0.542683,0.414634,0.280488,0.461538,0.053191,2.266667
2,0.844,0.590909,0.428571,0.415584,0.486486,0.076923,2.5,0.935,0.512821,0.397436,0.397436,0.513514,0.025974,1.8125
3,0.727,0.567568,0.364865,0.297297,0.475,0.053333,2.0,0.71,0.48,0.453333,0.413333,0.525,0.094595,1.727273
4,0.722,0.511905,0.369048,0.214286,0.540816,0.02381,2.363636,0.806,0.47619,0.511905,0.369048,0.459184,0.047619,3.166667


In [89]:
# Looking at the skew of the variables
df.skew()

H_FTPct           -0.385370
H_EFGPct           0.158915
H_ThreePARt        0.336457
H_FTR              0.727429
H_REBPct          -0.014475
H_BLKPct           0.626443
H_AST_TOV_Ratio    2.300406
A_FTPct           -0.357750
A_EFGPct           0.137635
A_ThreePARt        0.352216
A_FTR              0.650330
A_REBPct           0.014475
A_BLKPct           0.571297
A_AST_TOV_Ratio    1.862044
dtype: float64

## Working to fix the skew of the dataset

In [90]:
# Using the squared root 
df['H_FTR'] = np.sqrt(df['H_FTR'])
df['H_FTR'].skew()

0.11379656149194062

In [91]:
# Using the squared root 
df['A_FTR'] = np.sqrt(df['A_FTR'])
df['A_FTR'].skew()

0.07544048775796099

In [92]:
# Using the squared root 
df['H_BLKPct'] = np.sqrt(df['H_BLKPct'])
df['H_BLKPct'].skew()

-0.45456891724284504

In [93]:
# Using the squared root of the cubed root 
df['H_AST_TOV_Ratio'] = np.sqrt(np.cbrt(df['H_AST_TOV_Ratio']))
df['H_AST_TOV_Ratio'].skew()

0.4969422446662296

In [94]:
# Using the squared root 
df['A_FTR'] = np.sqrt(df['A_FTR'])
df['A_FTR'].skew()

-0.2413223629076931

In [95]:
# Using the squared root 
df['A_AST_TOV_Ratio'] = np.sqrt(np.cbrt(df['A_AST_TOV_Ratio']))
df['A_AST_TOV_Ratio'].skew()

0.37018359414000407

In [96]:
# Checking to make sure that it is no longer skewed
df.skew()

H_FTPct           -0.385370
H_EFGPct           0.158915
H_ThreePARt        0.336457
H_FTR              0.113797
H_REBPct          -0.014475
H_BLKPct          -0.454569
H_AST_TOV_Ratio    0.496942
A_FTPct           -0.357750
A_EFGPct           0.137635
A_ThreePARt        0.352216
A_FTR             -0.241322
A_REBPct           0.014475
A_BLKPct           0.571297
A_AST_TOV_Ratio    0.370184
dtype: float64

In [97]:
# Saving to a csv
df.to_csv('../csv_files/Capstone_p2_handled_skew.csv')