In [1]:
import pandas as pd
from sklearn.ensemble.partial_dependence import plot_partial_dependence
from sklearn.ensemble.partial_dependence import partial_dependence
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix,roc_auc_score,roc_curve
import seaborn as sns
import matplotlib.pyplot as plt

In [41]:
def get_data(file):
    '''
    Read a pickle file from disk with pandas.

    INPUT: file - path to the pickle file to load.

    OUTPUT: The object stored in the pickle, as returned by pd.read_pickle
            (a pandas dataframe for the tables used in this notebook).
    '''
    # NOTE: unpickling can execute arbitrary code, so only point this at
    # pickle files that come from a trusted source.
    return pd.read_pickle(file)

In [43]:
# Load the three pickled source tables used to build the modelling frame.
# Paths are relative to the notebook's working directory.
salary = get_data(file='data/salary.p')            # salary table (filtered on 'Year' / 'Salary' later)
stats = get_data(file='data/stats.p')              # per-year player stats table
free_agents = get_data(file='data/free_agents.p')  # free-agency records

In [47]:
def combine_data(salary_data,stats_data,free_agent_data):
    '''
    INPUT: Takes in all salary data, stats data, and free agent data in that order and
            combines them into one big dataframe.

            All three frames are matched on their index (the player) and on the
            year: 'Year of Free Agency' in free_agent_data, 'Year' in stats_data
            and in salary_data.

    OUTPUT: One pandas dataframe with, for every (player, free-agency year) pair
            found in free_agent_data, the free-agent columns followed by the
            stats columns followed by the salary columns. Pairs with no matching
            stats or salary rows are kept with missing values in those columns.
            An empty dataframe is returned when free_agent_data has no rows.
    '''
    year_col = 'Year of Free Agency'
    pieces = []
    for player in free_agent_data.index.unique():
        # Filter each table down to this player once, outside the year loop.
        p = free_agent_data[free_agent_data.index == player]
        player_stats = stats_data[stats_data.index == player]
        player_salary = salary_data[salary_data.index == player]
        # Walk the distinct years (in order of first appearance) so a year that
        # is listed more than once for a player is not processed repeatedly.
        for year in p[year_col].unique():
            df1 = p[p[year_col] == year]
            df2 = player_stats[player_stats['Year'] == year]
            df3 = player_salary[player_salary['Year'] == year]
            # sort=False keeps the index order as-is and silences the pandas
            # FutureWarning raised when the pieces' indexes are not aligned
            # (e.g. when there is no stats or salary row for this year).
            pieces.append(pd.concat([df1, df2, df3], axis=1, sort=False))
    if not pieces:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing the frame inside the loop.
    return pd.concat(pieces, sort=False)

In [73]:
# Build the combined (free agent + stats + salary) frame; arguments are passed
# by keyword so the expected order of the three tables cannot be mixed up.
final_data = combine_data(salary_data=salary, stats_data=stats, free_agent_data=free_agents)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  from ipykernel import kernelapp as app


In [74]:
# Target label: 1 when the 'From' and 'To' values differ, 0 otherwise.
# NOTE(review): a missing value compares unequal to everything (NaN != NaN is
# True), so rows lacking 'From' or 'To' are labelled Churn = 1 — confirm this
# is intended.
final_data['Churn'] = (final_data['From'] != final_data['To']).astype(int)

# Remove the columns that are no longer needed once the label exists.
final_data.drop(columns=['Avg. Salary', 'Pos.', 'To', 'From'], inplace=True)

# Columns cast to float, in the order they are converted.
float_columns = [
    'PTS', 'PF', 'TOV', 'BLK', 'STL', 'AST', 'TRB',
    'FT%', 'FTA', 'FT',
    'eFG%',
    '2P%', '2PA', '2P',
    '3P%', '3PA', '3P',
    'FG%', 'FGA', 'FG',
    'MP', 'GS', 'G',
    'Salary',
]
for column in float_columns:
    final_data[column] = final_data[column].astype(float)

# Columns stored as pandas categoricals.
category_columns = ['Pos', 'Yrs', 'Type', 'Year of Free Agency']
for column in category_columns:
    final_data[column] = final_data[column].astype('category')

In [75]:
# Display the pairwise correlation matrix of final_data's columns
# (including the 'Churn' label) as this cell's output.
# NOTE(review): recent pandas versions raise here when non-numeric columns are
# present unless numeric_only=True is passed; this notebook appears to target
# an older pandas — confirm before upgrading.
final_data.corr()

Unnamed: 0,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,3P%,...,STL,BLK,TOV,PF,PTS,Year,Playoffs,Salary,Year.1,Churn
Age,1.0,0.143069,0.088214,0.11496,0.055669,0.062576,-0.00045,0.135273,0.12809,0.096021,...,0.068441,0.000201,0.049645,0.07766,0.062312,0.05079,0.161648,0.319562,0.049266,0.012703
G,0.143069,1.0,0.555234,0.613372,0.554314,0.516469,0.283492,0.353948,0.330552,0.160704,...,0.458049,0.284116,0.410359,0.469044,0.546026,0.099058,0.066006,0.305489,0.074144,-0.202486
GS,0.088214,0.555234,1.0,0.75317,0.702934,0.668642,0.199186,0.361438,0.350198,0.107163,...,0.571584,0.385975,0.581081,0.508388,0.6927,0.030882,0.030006,0.485978,0.018134,-0.205693
MP,0.11496,0.613372,0.75317,1.0,0.88737,0.898502,0.163639,0.577396,0.59018,0.204168,...,0.759199,0.354404,0.768715,0.694463,0.890474,0.0332,-0.034043,0.515219,0.025684,-0.218667
FG,0.055669,0.554314,0.702934,0.88737,1.0,0.973858,0.263352,0.527572,0.528774,0.177865,...,0.658476,0.380945,0.789416,0.584245,0.98853,0.09218,-0.004902,0.586436,0.083091,-0.23385
FGA,0.062576,0.516469,0.668642,0.898502,0.973858,1.0,0.101327,0.613618,0.636278,0.203725,...,0.676991,0.278716,0.797531,0.555021,0.976662,0.07416,-0.038108,0.565368,0.068024,-0.207793
FG%,-0.00045,0.283492,0.199186,0.163639,0.263352,0.101327,1.0,-0.122637,-0.192189,0.040021,...,0.083966,0.386568,0.119885,0.288014,0.219946,0.095975,0.083927,0.126003,0.084438,-0.135442
3P,0.135273,0.353948,0.361438,0.577396,0.527572,0.613618,-0.122637,1.0,0.978442,0.506128,...,0.464187,-0.126125,0.40095,0.229119,0.60104,0.1349,0.018348,0.262366,0.135758,-0.136776
3PA,0.12809,0.330552,0.350198,0.59018,0.528774,0.636278,-0.192189,0.978442,1.0,0.443129,...,0.487742,-0.148229,0.425518,0.22758,0.602253,0.143276,0.001141,0.271259,0.146183,-0.123569
3P%,0.096021,0.160704,0.107163,0.204168,0.177865,0.203725,0.040021,0.506128,0.443129,1.0,...,0.138205,-0.147668,0.101127,0.022198,0.214311,0.056088,0.040025,0.035212,0.052552,-0.005691


In [76]:
# Derived feature: 'PTS' divided by 'MP' (points per minute played).
# NOTE(review): a row with MP == 0 would yield inf/NaN here — confirm whether
# such rows can occur.
final_data['PPM'] = final_data['PTS'] / final_data['MP']

# Drop shooting columns that are not kept as features. The original list named
# 'FT%' twice; each label is now listed once, which removes the same columns.
# NOTE(review): the saved outputs further down still list 'FG%' and 'FT%', so
# they look like they were produced by a different version of this cell —
# re-run the notebook top to bottom to refresh them.
final_data = final_data.drop(
    columns=['3P%', '3PA', 'FG', 'FGA', '2P%', '2PA', 'FTA', 'FT%', 'FG%']
)

In [77]:
# Display the pairwise correlation matrix of final_data's columns; the output
# includes the derived 'PPM' column alongside 'Churn'.
final_data.corr()

Unnamed: 0,Age,G,GS,MP,FG%,3P,2P,eFG%,FT,FT%,...,BLK,TOV,PF,PTS,Year,Playoffs,Salary,Year.1,Churn,PPM
Age,1.0,0.143069,0.088214,0.11496,-0.00045,0.135273,0.005925,0.077644,0.01041,0.095147,...,0.000201,0.049645,0.07766,0.062312,0.05079,0.161648,0.319562,0.049266,0.012703,-0.033169
G,0.143069,1.0,0.555234,0.613372,0.283492,0.353948,0.490948,0.343652,0.408251,0.1778,...,0.284116,0.410359,0.469044,0.546026,0.099058,0.066006,0.305489,0.074144,-0.202486,0.324998
GS,0.088214,0.555234,1.0,0.75317,0.199186,0.361438,0.65918,0.21134,0.57887,0.115781,...,0.385975,0.581081,0.508388,0.6927,0.030882,0.030006,0.485978,0.018134,-0.205693,0.323606
MP,0.11496,0.613372,0.75317,1.0,0.163639,0.577396,0.780672,0.232269,0.72737,0.228465,...,0.354404,0.768715,0.694463,0.890474,0.0332,-0.034043,0.515219,0.025684,-0.218667,0.44402
FG%,-0.00045,0.283492,0.199186,0.163639,1.0,-0.122637,0.356237,0.884972,0.194953,-0.055888,...,0.386568,0.119885,0.288014,0.219946,0.095975,0.083927,0.126003,0.084438,-0.135442,0.373791
3P,0.135273,0.353948,0.361438,0.577396,-0.122637,1.0,0.185434,0.226036,0.353285,0.364653,...,-0.126125,0.40095,0.229119,0.60104,0.1349,0.018348,0.262366,0.135758,-0.136776,0.428729
2P,0.005925,0.490948,0.65918,0.780672,0.356237,0.185434,1.0,0.235634,0.80882,0.138634,...,0.493228,0.742938,0.577815,0.887953,0.048379,-0.014161,0.566174,0.037697,-0.211246,0.658722
eFG%,0.077644,0.343652,0.21134,0.232269,0.884972,0.226036,0.235634,1.0,0.157174,0.10226,...,0.234262,0.116904,0.240563,0.276815,0.163897,0.096095,0.115087,0.155528,-0.144654,0.438144
FT,0.01041,0.408251,0.57887,0.72737,0.194953,0.353285,0.80882,0.157174,1.0,0.278592,...,0.314047,0.734064,0.467681,0.874878,0.04933,-0.012263,0.535138,0.043571,-0.175912,0.657139
FT%,0.095147,0.1778,0.115781,0.228465,-0.055888,0.364653,0.138634,0.10226,0.278592,1.0,...,-0.152368,0.190902,0.018873,0.29828,0.121142,-0.038146,0.096351,0.117898,-0.046131,0.380123


In [78]:
# Print a summary of final_data: index range, and each column's non-null count
# and dtype (useful for spotting columns with missing values).
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1713 entries, Marc Gasol to Marcus Paige
Data columns (total 28 columns):
Age                    1623 non-null float64
Type                   1711 non-null category
Yrs                    1713 non-null category
Year of Free Agency    1713 non-null category
Pos                    1408 non-null category
Tm                     1408 non-null object
G                      1408 non-null float64
GS                     1408 non-null float64
MP                     1408 non-null float64
FG%                    1406 non-null float64
3P                     1408 non-null float64
2P                     1408 non-null float64
eFG%                   1406 non-null float64
FT                     1408 non-null float64
FT%                    1366 non-null float64
TRB                    1408 non-null float64
AST                    1408 non-null float64
STL                    1408 non-null float64
BLK                    1408 non-null float64
TOV                    

In [70]:
# Keep only the rows that have a value for 'MP'; rows where it is missing are
# left out of the working subset.
subset = final_data.loc[final_data['MP'].notna()]

In [72]:
# Print a summary of subset: index range, and each column's non-null count and
# dtype, to check which columns still contain missing values.
subset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1408 entries, Marc Gasol to Marcus Paige
Data columns (total 28 columns):
Age                    1339 non-null float64
Type                   1408 non-null category
Yrs                    1408 non-null category
Year of Free Agency    1408 non-null category
Pos                    1408 non-null category
Tm                     1408 non-null object
G                      1408 non-null float64
GS                     1408 non-null float64
MP                     1408 non-null float64
FG%                    1406 non-null float64
3P                     1408 non-null float64
2P%                    1401 non-null float64
eFG%                   1406 non-null float64
FT                     1408 non-null float64
FT%                    1366 non-null float64
TRB                    1408 non-null float64
AST                    1408 non-null float64
STL                    1408 non-null float64
BLK                    1408 non-null float64
TOV                    

In [93]:
# Look up salary rows whose index label ends with 'Barea'
# (the saved output shows no matching rows).
barea_mask = salary.index.str.endswith('Barea')
salary.loc[barea_mask]

Unnamed: 0_level_0,Salary,Year
Player,Unnamed: 1_level_1,Unnamed: 2_level_1


In [83]:
# Show the rows of subset that have no 'Salary' value, i.e. (player, year)
# pairs for which no salary row was matched when the tables were combined.
missing_salary = subset['Salary'].isna()
subset.loc[missing_salary]

Unnamed: 0,Age,Type,Yrs,Year of Free Agency,Pos,Tm,G,GS,MP,FG%,...,BLK,TOV,PF,PTS,Year,Playoffs,Salary,Year.1,Churn,PPM
Luc Richard Mbah a Moute,25.0,RFA,4,2011,PF,MIL,79.0,52.0,26.5,0.463,...,0.4,1.0,2.3,6.7,2011.0,0.0,,,0,0.25283
Luc Richard Mbah a Moute,29.0,UFA,-,2015,PF,PHI,67.0,61.0,28.6,0.395,...,0.3,1.5,1.6,9.9,2015.0,0.0,,,1,0.346154
Luc Richard Mbah a Moute,30.0,UFA,2,2016,PF,LAC,75.0,61.0,17.0,0.454,...,0.3,0.5,1.3,3.1,2016.0,1.0,,,0,0.182353
Luc Richard Mbah a Moute,31.0,UFA,1,2017,SF,LAC,80.0,76.0,22.3,0.505,...,0.4,0.6,1.5,6.1,2017.0,1.0,,,1,0.273543
Luc Richard Mbah a Moute,32.0,UFA,1,2018,PF,HOU,61.0,15.0,25.6,0.481,...,0.4,1.1,1.6,7.5,2018.0,1.0,,,1,0.292969
Jose Barea,27.0,UFA,4,2011,PG,DAL,81.0,2.0,20.6,0.439,...,0.0,1.7,1.7,9.5,2011.0,1.0,,,1,0.461165
Jose Barea,31.0,UFA,4,2015,PG,DAL,77.0,10.0,17.7,0.42,...,0.0,0.9,1.4,7.5,2015.0,1.0,,,0,0.423729
Sasha Pavlovic,27.0,UFA,1,2011,SG-SF,TOT,31.0,7.0,11.7,0.403,...,0.2,0.4,1.1,2.5,2011.0,0.0,,,0,0.213675
Sasha Pavlovic,28.0,UFA,-,2012,SF,BOS,45.0,7.0,11.7,0.391,...,0.3,0.4,1.2,2.7,2012.0,1.0,,,1,0.230769
Sasha Pavlovic,29.0,UFA,-,2013,SG,POR,39.0,1.0,13.5,0.353,...,0.1,0.4,1.5,2.6,2013.0,0.0,,,1,0.192593
