## Some Cleaning

In [2]:
from basketball import *

import numpy as np
import pickle

In [3]:
with open('databases.pickle', 'rb') as f:
    databases = pickle.load(f)

In [7]:
dataframes = []

# unpack database
for year, database in databases.items():
    if year != 2015:
        dfs, sals = database_to_36min_and_salaries(database=databases[year])
        dataframes.append(stats_salary_join(year=year, dfs=dfs, targets=sals))
    
df = pd.concat(dataframes)
df.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Rk,Age,G,GS,MP,FG,FGA,FG%,3P,3PA,...,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Salary
Name,Team,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Paul Pierce,BOS,2008,1,30,80,80,2874,6.4,13.8,0.464,1.8,4.6,...,0.7,4.5,5.1,4.5,1.3,0.5,2.8,2.5,19.7,16360094
Ray Allen,BOS,2008,2,32,73,73,2624,6.0,13.5,0.445,2.5,6.2,...,1.0,2.6,3.7,3.1,0.9,0.2,1.7,2.0,17.5,16000000
Kevin Garnett,BOS,2008,3,31,71,71,2328,8.3,15.3,0.539,0.0,0.2,...,2.1,8.0,10.1,3.8,1.5,1.4,2.1,2.5,20.7,23750000
Rajon Rondo,BOS,2008,4,21,77,77,2306,5.5,11.1,0.492,0.1,0.3,...,1.2,3.8,5.0,6.1,2.0,0.2,2.3,2.9,12.7,1229280
Kendrick Perkins,BOS,2008,5,23,78,78,1912,4.0,6.6,0.615,0.0,0.0,...,2.7,6.2,8.9,1.6,0.6,2.1,2.4,4.6,10.2,4480912
James Posey,BOS,2008,6,31,74,2,1821,3.4,8.2,0.418,2.1,5.5,...,0.6,5.8,6.4,2.3,1.4,0.4,1.3,3.6,10.8,3206000
Eddie House,BOS,2008,7,29,78,2,1480,5.3,12.9,0.409,2.8,7.2,...,0.5,3.6,4.1,3.7,1.4,0.2,1.8,2.8,14.2,1500000
Tony Allen,BOS,2008,8,26,75,11,1373,4.4,10.2,0.434,0.5,1.5,...,0.9,3.5,4.4,3.0,1.6,0.6,2.9,4.4,13.0,1868141
Glen Davis,BOS,2008,9,22,69,1,940,4.1,8.5,0.484,0.0,0.0,...,3.6,4.3,8.0,1.1,1.2,0.8,2.5,6.1,12.0,427163
Leon Powe,BOS,2008,10,24,56,5,809,6.9,12.0,0.572,0.0,0.0,...,4.2,5.9,10.1,0.7,0.7,0.7,1.9,5.7,19.8,687456


In [23]:
def prepare_dataframe(df):
    
    mapper = {'2P':'TwoPoint',
              '2PA':'TwoPointAttempt',
              '2P%':'TwoPointPercent',
              'FG%': 'FieldGoalPercent',
              'FT%':'FreeThrowPercent',
              '3P%': 'ThreePointPercent'
         }

    df.rename(columns=mapper, inplace=True)

    preliminary_columns = ['DRB', 'TwoPoint', 'PTS', 'FG', 'BLK', 'FT',
                           'TwoPointAttempt', 'Age', 'PF', 'Salary']

    to_remove = ['Salary', 'FreeThrowPercent', 'ThreePointPercent',
                 'TwoPointPercent', 'FieldGoalPercent']

    columns = list(df.columns)
    for column in to_remove:
        columns.remove(column)

    # let's drop players with low minutes played per game, say less than 10
    df['MPperG'] = df['MP']/df['G']
    
    # We want over 10 minutes played per game and nonzero salary
    mask = ((df['MPperG'] > 10) & (df['Salary'] > 0))
    df = df[mask]
    
    X = df[columns].values
    y = df['Salary'].values
    y = np.log(y)
    
    return X, y, df

X, y, df = prepare_dataframe(df) 

## Training

We have the years between 2008 and 2018 excluding 2015. Let's run this through a model

In [8]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

In [9]:
X_cv, X_test, y_cv, y_test = train_test_split(X, y, test_size=.2,
                                                   random_state = 1999)

lm_pipe = Pipeline([
    ('imp', SimpleImputer()),
    ('std_scl', StandardScaler()), 
    ('lm', LinearRegression())])

kf = KFold(n_splits=5)

for train_index, test_index in kf.split(X_cv):
    X_train, X_val = X_cv[train_index], X_cv[test_index]
    y_train, y_val = y_cv[train_index], y_cv[test_index]
    
    imp = SimpleImputer(missing_values=np.nan)
    X_train = imp.fit_transform(X_train)
    
    scl = StandardScaler()
    X_train = scl.fit_transform(X_train)
    
    lm = LinearRegression()
    lm.fit(X_train, y_train)
    y_val_pred = lm.predict(X_val)
    
    print('r^2:\t', r2_score(y_val_pred, y_val))
    


r^2:	 -3.8187266071850283
r^2:	 -3.3449344206280047
r^2:	 -2.7715030835199306
r^2:	 -3.1133590243370772
r^2:	 -3.2436160308814834


In [11]:
from sklearn.model_selection import GridSearchCV
    
parameters = {'lass__alpha': [.0095, .015]}

lass_pipe = Pipeline([
    ('std_scl', StandardScaler()),
    ('lass', Lasso(max_iter=100000))
])

gcv = GridSearchCV(lass_pipe, parameters, cv=5, verbose=0)
gcv.fit(X_train, y_train)

print(gcv.best_score_, '\n', gcv.best_params_)

0.4058933812733617 
 {'lass__alpha': 0.0095}


In [12]:
parameters = {'ridge__alpha': [1, 4, 5,6, 15]}

ridge_pipe = Pipeline([
    ('std_scl', StandardScaler()),
    ('ridge', Ridge())
])

gcv = GridSearchCV(ridge_pipe, parameters, cv=5, verbose=1)
gcv.fit(X_train, y_train)

print(gcv.best_score_, '\n', gcv.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.40439576589225557 
 {'ridge__alpha': 15}


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:    0.2s finished


In [13]:
from sklearn.linear_model import ElasticNet

parameters = {'enet__alpha': [0.0001, 0.001, 0.01, 0.03, 0.06, 0.1, 0.15],
             'enet__l1_ratio': [0.01, .05, .1, .2, .3, .4, .5, .6, .7, .8, .9]}

enet_pipe = Pipeline([
    ('std_scl', StandardScaler()),
    ('enet', ElasticNet(max_iter=100000))
])

gcv = GridSearchCV(enet_pipe, parameters, cv=5, verbose=1)
gcv.fit(X_train, y_train)

print(gcv.best_score_, '\n', gcv.best_params_)

Fitting 5 folds for each of 77 candidates, totalling 385 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0.4061563305927372 
 {'enet__alpha': 0.03, 'enet__l1_ratio': 0.1}


[Parallel(n_jobs=1)]: Done 385 out of 385 | elapsed:   53.2s finished


In [14]:
from sklearn.ensemble import RandomForestRegressor

parameters = {
    'rf_reg__n_estimators': [400, 600, 800, 1000],
    'rf_reg__max_depth': [None]
}

rf_reg_pipe = Pipeline([
    ('std_scl', StandardScaler()),
    ('rf_reg', RandomForestRegressor())
])

gcv = GridSearchCV(rf_reg_pipe, parameters, cv=5, verbose=1)
gcv.fit(X_train, y_train)

print(gcv.best_score_, '\n', gcv.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  4.0min finished


0.4827403072623552 
 {'rf_reg__max_depth': None, 'rf_reg__n_estimators': 400}


In [15]:
best_params = gcv.best_params_

rf_reg_pipe = Pipeline([
    ('std_scl', StandardScaler()),
    ('rf_reg', RandomForestRegressor(max_depth=None, n_estimators=800))
])

rf_reg_pipe.fit(X_cv, y_cv)
y_test_pred = rf_reg_pipe.predict(X_test)

r2_score(y_test, y_test_pred)


0.5519006380550322

In [26]:
rf_reg_pipe.fit(X, y)
y_pred = rf_reg_pipe.predict(X)

df['Predicted Salary'] = np.round(np.exp(y_pred).astype(int), -4)
df['Salary Error'] = df['Salary'] - df['Predicted Salary']

df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Rk,Age,G,GS,MP,FG,FGA,FieldGoalPercent,3P,3PA,...,AST,STL,BLK,TOV,PF,PTS,Salary,MPperG,Predicted Salary,Salary Error
Name,Team,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Paul Pierce,BOS,2008,1,30,80,80,2874,6.4,13.8,0.464,1.8,4.6,...,4.5,1.3,0.5,2.8,2.5,19.7,16360094,35.925,15250000,1110094
Ray Allen,BOS,2008,2,32,73,73,2624,6.0,13.5,0.445,2.5,6.2,...,3.1,0.9,0.2,1.7,2.0,17.5,16000000,35.945205,14600000,1400000
Kevin Garnett,BOS,2008,3,31,71,71,2328,8.3,15.3,0.539,0.0,0.2,...,3.8,1.5,1.4,2.1,2.5,20.7,23750000,32.788732,20650000,3100000
Rajon Rondo,BOS,2008,4,21,77,77,2306,5.5,11.1,0.492,0.1,0.3,...,6.1,2.0,0.2,2.3,2.9,12.7,1229280,29.948052,1480000,-250720
Kendrick Perkins,BOS,2008,5,23,78,78,1912,4.0,6.6,0.615,0.0,0.0,...,1.6,0.6,2.1,2.4,4.6,10.2,4480912,24.512821,3440000,1040912


In [27]:
df_sal = df[['Salary', 'Predicted Salary', 'Salary Error']]
df_sal.sort_values('Salary Error', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Salary,Predicted Salary,Salary Error
Name,Team,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Kobe Bryant,LAL,2014,30453805,11020000,19433805
Mike Conley,MEM,2018,28530608,13230000,15300608
Luol Deng,LAL,2018,17190000,2420000,14770000
Michael Redd,MIL,2011,18300000,5650000,12650000
Chandler Parsons,MEM,2018,23112004,10580000,12532004
Rashard Lewis,WAS,2012,21136631,9690000,11446631
Blake Griffin,DET,2018,29727900,18450000,11277900
Otto Porter,WAS,2018,24773250,13660000,11113250
Andre Drummond,DET,2017,22116750,11070000,11046750
Gilbert Arenas,WAS,2009,14653466,3710000,10943466


In [29]:
# reset index
df_sal.reset_index(inplace=True)
# look at 2018
mask = (df_sal['Year'] == 2018)
df_sal[mask].sort_values('Salary Error', ascending=False).tail(10)


Unnamed: 0,index,Name,Team,Year,Salary,Predicted Salary,Salary Error
3499,3499,Jonathon Simmons,ORL,2018,6300000,7340000,-1040000
3674,3674,Zach Randolph,SAC,2018,12307692,13360000,-1052308
3503,3503,Nikola Vucevic,ORL,2018,12250000,13340000,-1090000
3601,3601,Rajon Rondo,NOP,2018,3300000,4490000,-1190000
3484,3484,Justin Holiday,CHI,2018,4615385,5810000,-1194615
3630,3630,Will Barton,DEN,2018,3533333,4900000,-1366667
3714,3714,T.J. Warren,PHO,2018,3152931,4600000,-1447069
3457,3457,Michael Beasley,NYK,2018,1471382,2940000,-1468618
3698,3698,Tyreke Evans,MEM,2018,3290000,5900000,-2610000
3685,3685,Dirk Nowitzki,DAL,2018,5000000,7640000,-2640000


## A second look at databases

In [5]:
# I edited the d

sea_adv = databases[2008]['SEA'][8].copy()
sea_sal = databases[2008]['SEA'][-1]

dfs, sals = database_to_stats_and_salaries(pos=8, year=2008, database=databases[2008])

df_adv = stats_salary_join(year=2008, dfs=dfs, targets=sals)

In [6]:
# error is from difference between per36 columns and advanced columns

X, y, df_adv = prepare_dataframe(df_adv) 
                                            

ValueError: list.remove(x): x not in list