In [1]:
from basketball import *

import numpy as np
import pickle
import pandas as pd

## Get Salary Cap info

In [2]:
url = 'https://www.basketball-reference.com/contracts/salary-cap-history.html'

tables = get_tables(url)
df_caps = tables[0]

df_caps.columns = ['Year', 'Salary Cap', 'Salary Cap (2015)']
df_caps.drop(0, inplace=True)
df_caps.reset_index(inplace=True, drop=True)

# Money to float
df_caps['Salary Cap'].replace('\D', '', regex=True, inplace=True)
df_caps['Salary Cap (2015)'].replace('\D', '', regex=True, inplace=True)

df_caps['Salary Cap'] = pd.to_numeric(df_caps['Salary Cap'])
df_caps['Salary Cap (2015)'] = pd.to_numeric(df_caps['Salary Cap (2015)'])

df_caps['Year'] = df_caps['Year'].apply(lambda x: x[:2] + x[-2:])

In [3]:
with open('df_caps.pickle', 'wb') as f:
    pickle.dump(df_caps, f)

In [4]:
df_caps.head()

Unnamed: 0,Year,Salary Cap,Salary Cap (2015)
0,1985,3600000,7934034.0
1,1986,4233000,9153509.0
2,1987,4945000,10317292.0
3,1988,6164000,12354015.0
4,1989,7232000,13829137.0


## Get Stats

In [5]:
# Advanced tables
with open('databases.pickle', 'rb') as f:
    databases = pickle.load(f)


yearly_dfs = []

for year in databases.keys():
    if year != 2015:
        adv_stats, adv_sals = database_to_stats_and_salaries(pos=8, year=year, database=databases[year])
        per36_stats, per_sals = database_to_stats_and_salaries(pos=6, year=year, database=databases[year])

        df_adv = stats_salary_join(year=year, dfs=adv_stats, targets=adv_sals)
        df_per_36 = stats_salary_join(year=year, dfs=per36_stats, targets=per_sals)
        
        # drop duplicate columns
        cols_to_drop = {'Salary', 'MP', 'G', 'Age'}
        cols_to_keep = list(set(df_per_36.columns) - cols_to_drop)
        df_per_36 = df_per_36[cols_to_keep]
        
        # concat and append the normal stats and advanced stats
        yearly_dfs.append(pd.concat([df_per_36, df_adv], join='inner', axis=1))
        
df_stats = pd.concat(yearly_dfs)
df_stats.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PTS,FG,STL,2PA,ORB,BLK,FT,Rk,TRB,FTA,...,OWS,DWS,WS,WS/48,Unnamed: 18_level_0,OBPM,DBPM,BPM,VORP,Salary
Name,Team,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Paul Pierce,BOS,2008,19.7,6.4,1.3,9.2,0.7,0.5,5.1,1,5.1,6.1,...,6.7,5.7,12.4,0.207,,3.3,1.5,4.7,4.9,16360094
Ray Allen,BOS,2008,17.5,6.0,0.9,7.3,1.0,0.2,2.9,2,3.7,3.3,...,5.6,4.1,9.7,0.177,,3.2,-0.1,3.1,3.4,16000000
Kevin Garnett,BOS,2008,20.7,8.3,1.5,15.1,2.1,1.4,4.2,3,10.1,5.2,...,6.6,6.2,12.9,0.265,,2.7,4.7,7.4,5.5,23750000
Rajon Rondo,BOS,2008,12.7,5.5,2.0,10.8,1.2,0.2,1.7,4,5.0,2.7,...,2.3,4.9,7.2,0.15,,-0.1,2.7,2.6,2.7,1229280
Kendrick Perkins,BOS,2008,10.2,4.0,0.6,6.5,2.7,2.1,2.1,5,8.9,3.4,...,1.9,4.3,6.2,0.156,,-1.7,4.3,2.6,2.2,4480912


## Joining Stats and Caps

In [6]:
# set index to join (year)
df_caps.set_index('Year', inplace=True)

# Loop through all (player, team, year) combos and get the salary cap for that year

caps = [df_caps.loc[str(ix[2]), 'Salary Cap'] for ix in df_stats.index] 
poc = [df_stats.loc[ix]['Salary'] / df_caps.loc[str(ix[2]), 'Salary Cap'] for ix in df_stats.index]

df_stats['Percentage of Cap'] = poc
df_stats['Caps'] = caps


In [7]:
df_stats.head() # integrated cap info into our dataframe

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,PTS,FG,STL,2PA,ORB,BLK,FT,Rk,TRB,FTA,...,WS,WS/48,Unnamed: 16_level_0,OBPM,DBPM,BPM,VORP,Salary,Percentage of Cap,Caps
Name,Team,Year,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Paul Pierce,BOS,2008,19.7,6.4,1.3,9.2,0.7,0.5,5.1,1,5.1,6.1,...,12.4,0.207,,3.3,1.5,4.7,4.9,16360094,0.294088,55630000
Ray Allen,BOS,2008,17.5,6.0,0.9,7.3,1.0,0.2,2.9,2,3.7,3.3,...,9.7,0.177,,3.2,-0.1,3.1,3.4,16000000,0.287615,55630000
Kevin Garnett,BOS,2008,20.7,8.3,1.5,15.1,2.1,1.4,4.2,3,10.1,5.2,...,12.9,0.265,,2.7,4.7,7.4,5.5,23750000,0.426928,55630000
Rajon Rondo,BOS,2008,12.7,5.5,2.0,10.8,1.2,0.2,1.7,4,5.0,2.7,...,7.2,0.15,,-0.1,2.7,2.6,2.7,1229280,0.022097,55630000
Kendrick Perkins,BOS,2008,10.2,4.0,0.6,6.5,2.7,2.1,2.1,5,8.9,3.4,...,6.2,0.156,,-1.7,4.3,2.6,2.2,4480912,0.080548,55630000


## Feature Engineering

In [8]:
def prepare_dataframe(df, features):
    df = df.copy()

    # let's drop players with low minutes played per game, say less than 10
    df['MPperG'] = df['MP'] / df['G']
    
    # drop blank columns
    df.drop('\xa0', inplace=True, axis=1)

    # We want over 10 minutes played per game and nonzero salary
    mask = ((df['MPperG'] > 10) & (df['Salary'] > 0))
    df = df[mask]

    X = df[features].values
    y = df['Percentage of Cap'].values
    
    return X, y, df

## Modelling

In [9]:
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

### Defining linear model

In [10]:
def run_linear_model(features, scale=False):
    X, y, df = prepare_dataframe(df_stats, features)

    X_cv, X_test, y_cv, y_test = train_test_split(X, y, test_size=.2,
                                                       random_state = 1999)

    kf = KFold(n_splits=5, shuffle=True, random_state=1999)

    train_scores = []
    test_scores = []

    for train_index, val_index in kf.split(X_cv):

        X_train, X_val = X_cv[train_index], X_cv[val_index]
        y_train, y_val = y_cv[train_index], y_cv[val_index]
        
        if scale:
            std = StandardScaler()
            X_train = std.fit_transform(X_train)
            X_val = std.transform(X_val)

        lm = LinearRegression()
        lm.fit(X_train, y_train)

        y_train_pred = lm.predict(X_train)
        y_val_pred = lm.predict(X_val)

        train_scores.append(np.round(r2_score(y_train, y_train_pred), 4))
        test_scores.append(np.round(r2_score(y_val, y_val_pred), 4))
        
    lm.fit(X_cv, y_cv)
    
    # pair names of features with their value then sort descending by absolute value
    coefficients = sorted(list(zip(features, np.round(lm.coef_, 4))), key=lambda x: -np.abs(x[1]))
    
    print('Training Scores:\t', train_scores)
    print('Test Scores:\t\t', test_scores)
        
    return train_scores, test_scores, coefficients, df 

### Advanced Features

In [11]:
adv_features = ['Age','PER', 'TS%', '3PAr', 'FTr', 'ORB%', 'DRB%',
       'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%',
       'WS/48', 'BPM', 'VORP']

train_scores, test_scores, coeffs, df = run_linear_model(adv_features, scale=True)

Training Scores:	 [0.4796, 0.4731, 0.4573, 0.4722, 0.4716]
Test Scores:		 [0.4137, 0.4528, 0.5083, 0.4552, 0.459]


In [12]:
coeffs

[('WS/48', -0.5283),
 ('TS%', -0.3675),
 ('3PAr', -0.0529),
 ('FTr', 0.0316),
 ('STL%', -0.0276),
 ('TRB%', 0.0237),
 ('ORB%', -0.0179),
 ('PER', 0.0136),
 ('DRB%', -0.0111),
 ('VORP', 0.0105),
 ('BPM', 0.0092),
 ('Age', 0.0063),
 ('BLK%', -0.0063),
 ('AST%', -0.0024),
 ('TOV%', 0.0023)]

In [13]:
np.mean(test_scores)

0.45780000000000004

### Feature Selection with Lasso

In [14]:
df_stats.columns

Index(['PTS', 'FG', 'STL', '2PA', 'ORB', 'BLK', 'FT', 'Rk', 'TRB', 'FTA',
       'FG%', '3PA', '3P', 'AST', 'TOV', 'DRB', '3P%', '2P', 'FT%', 'GS',
       '2P%', 'PF', 'FGA', 'Rk', 'Age', 'G', 'MP', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', ' ',
       'OWS', 'DWS', 'WS', 'WS/48', ' ', 'OBPM', 'DBPM', 'BPM', 'VORP',
       'Salary', 'Percentage of Cap', 'Caps'],
      dtype='object', name=0)

In [15]:
features = ['TRB', 'FT', 'FG', 'FTA', 'DRB', 'STL', 'PTS',
       '2P%', '3P', '2PA', '2P', 'BLK', '3PA', 'AST', 'FGA', 'FG%', 
       'ORB', 'PF', 'TOV', 'Age', 'PER', 'TS%', '3PAr', 'FTr',
       'ORB%', 'DRB%', 'TRB%', 'AST%', 'STL%', 'BLK%', 'TOV%', 'USG%', 
       'WS/48', 'OBPM', 'DBPM', 'BPM', 'VORP']

X, y, df = prepare_dataframe(df_stats, features)

X_cv, X_test, y_cv, y_test = train_test_split(X, y, test_size=.2,
                                                       random_state = 1999)

In [16]:
# examine dataframe for nans

# df.isna().sum()

In [17]:
from sklearn.model_selection import GridSearchCV

lass_pipe = Pipeline([
    ('std_scl', StandardScaler()),
    ('lass', Lasso())
])

param_grid = {
    'lass__alpha': [.001, .01, .1, 1, 10]
}

gcv = GridSearchCV(lass_pipe, param_grid, cv=5)
gcv.fit(X_cv, y_cv)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('std_scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lass', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'lass__alpha': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [18]:
coefficients = sorted(list(zip(features, gcv.best_estimator_.named_steps['lass'].coef_)), key=lambda x: -np.abs(x[1]))
coefficients

[('Age', 0.02520252140196472),
 ('VORP', 0.01661087168283929),
 ('2PA', 0.015085552640407531),
 ('BPM', 0.013426769088038647),
 ('PF', -0.012574586309815623),
 ('USG%', 0.00903511701669181),
 ('DRB%', 0.008494112207228061),
 ('STL', -0.008060425053359533),
 ('FTA', 0.006285482057753485),
 ('TOV%', 0.00524888363945384),
 ('TS%', -0.0036022560834619113),
 ('ORB', -0.0035592394204430828),
 ('AST', -0.002504729778789116),
 ('WS/48', -0.0010859498213744373),
 ('BLK', 0.0005236545454175395),
 ('TRB', 0.0),
 ('FT', 0.0),
 ('FG', 0.0),
 ('DRB', 0.0),
 ('PTS', 0.0),
 ('2P%', -0.0),
 ('3P', -0.0),
 ('2P', 0.0),
 ('3PA', -0.0),
 ('FGA', 0.0),
 ('FG%', -0.0),
 ('TOV', 0.0),
 ('PER', 0.0),
 ('3PAr', -0.0),
 ('FTr', 0.0),
 ('ORB%', -0.0),
 ('TRB%', 0.0),
 ('AST%', -0.0),
 ('STL%', -0.0),
 ('BLK%', 0.0),
 ('OBPM', 0.0),
 ('DBPM', 0.0)]

### Last Coefficients left standing

In [19]:
best_tuples = [('Age', 0.025202521401964714),
('VORP', 0.01661087168283929),
('2PA', 0.015085552640407535),
('BPM', 0.013426769088038647),
('PF', -0.012574586309815628),
('USG%', 0.009035117016691805),
('DRB%', 0.008494112207228063),
('STL', -0.008060425053359528),
('FTA', 0.006285482057753487),
('TOV%', 0.005248883639453836),
('TS%', -0.003602256083461905),
('ORB', -0.0035592394204430806),
('AST', -0.002504729778789113),
('WS/48', -0.0010859498213744486),
('BLK', 0.0005236545454175383)]

best_coefs = [t[0] for t in best_tuples]
best_coefs

['Age',
 'VORP',
 '2PA',
 'BPM',
 'PF',
 'USG%',
 'DRB%',
 'STL',
 'FTA',
 'TOV%',
 'TS%',
 'ORB',
 'AST',
 'WS/48',
 'BLK']

In [20]:
tr_scores, te_scores, coeffs, df_lin = run_linear_model(best_coefs)

Training Scores:	 [0.4828, 0.4781, 0.4636, 0.4779, 0.474]
Test Scores:		 [0.4242, 0.4561, 0.5078, 0.4553, 0.4724]


In [21]:
np.mean(te_scores)

0.46316000000000007

In [22]:
coeffs

[('WS/48', -0.1234),
 ('TS%', -0.1211),
 ('STL', -0.026),
 ('PF', -0.0114),
 ('VORP', 0.0097),
 ('BPM', 0.0089),
 ('ORB', -0.0084),
 ('Age', 0.0063),
 ('2PA', 0.0061),
 ('FTA', 0.0052),
 ('AST', -0.0044),
 ('TOV%', 0.0021),
 ('DRB%', 0.0016),
 ('USG%', 0.0012),
 ('BLK', -0.0005)]

### Let's take a look at df_lin

In [23]:
import seaborn as sns

sns.heatmap(df_lin[best_coefs].corr())

<matplotlib.axes._subplots.AxesSubplot at 0x1993bef5588>

In [24]:
df_lin[best_coefs + ['Percentage of Cap']].corr().sort_values('Percentage of Cap', ascending=False)

Unnamed: 0_level_0,Age,VORP,2PA,BPM,PF,USG%,DRB%,STL,FTA,TOV%,TS%,ORB,AST,WS/48,BLK,Percentage of Cap
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Percentage of Cap,0.304116,0.491251,0.365955,0.427939,-0.262131,0.41661,0.203888,-0.010354,0.38375,-0.070244,0.192478,0.01889,0.176791,0.363371,0.068804,1.0
VORP,0.038925,1.0,0.247444,0.841163,-0.261569,0.361824,0.19996,0.249961,0.421765,-0.084682,0.418376,0.047904,0.28751,0.697643,0.113526,0.491251
BPM,0.045325,0.841163,0.176789,1.0,-0.224003,0.243227,0.252109,0.306245,0.33336,-0.152563,0.606885,0.131627,0.215957,0.857533,0.217993,0.427939
USG%,-0.123269,0.361824,0.768327,0.243227,-0.281692,1.0,-0.002609,0.069298,0.661431,-0.198987,0.106723,-0.143822,0.335388,0.212825,-0.12872,0.41661
FTA,-0.139645,0.421765,0.623788,0.33336,-0.033976,0.661431,0.240847,0.030766,1.0,-0.04642,0.284567,0.209463,0.118793,0.42487,0.139465,0.38375
2PA,-0.174568,0.247444,1.0,0.176789,-0.028426,0.768327,0.296278,-0.027689,0.623788,-0.156922,0.052151,0.295301,0.09277,0.245136,0.196593,0.365955
WS/48,0.050774,0.697643,0.245136,0.857533,-0.095399,0.212825,0.34004,0.077332,0.42487,-0.255899,0.748246,0.278023,0.056812,1.0,0.255959,0.363371
Age,1.0,0.038925,-0.174568,0.045325,-0.069825,-0.123269,-0.01277,-0.046386,-0.139645,0.053878,0.020197,-0.112968,0.085471,0.050774,-0.077768,0.304116
DRB%,-0.01277,0.19996,0.296278,0.252109,0.339699,-0.002609,1.0,-0.16327,0.240847,0.041778,0.168161,0.716368,-0.370922,0.34004,0.576305,0.203888
TS%,0.020197,0.418376,0.052151,0.606885,-0.042859,0.106723,0.168161,-0.067697,0.284567,-0.077733,1.0,0.110645,-0.06047,0.748246,0.144762,0.192478


### Running on full data

Let's run the model on the testing set

In [25]:
X, y, df = prepare_dataframe(df_stats, best_coefs)

X_cv, X_test, y_cv, y_test = train_test_split(X, y, test_size=.2,
                                                       random_state = 1999)

In [26]:
lm = LinearRegression()
lm.fit(X_cv, y_cv)
y_cv_pred = lm.predict(X_cv)
y_test_pred = lm.predict(X_test)


print('Training Score:\t\t', r2_score(y_cv, y_cv_pred))
print('Testing Score:\t\t', r2_score(y_test, y_test_pred))

Training Score:		 0.47449275208314656
Testing Score:		 0.5140028931488684


It is interesting that this is performing better on the testing set

Let's train the model on all the data

In [27]:
lm.fit(X, y)
y_pred = lm.predict(X)

r2_score(y, y_pred)

0.4836456929692491

In [39]:
df['Predicted Percentage of Cap'] = np.round(y_pred, 4)
df['Predicted Percentage of Cap Error'] = np.round(df['Predicted Percentage of Cap'] - df['Percentage of Cap'], 4)

df['Predicted Salary'] = np.round(df['Predicted Percentage of Cap'] * df['Caps'], -3)
df['Predicted Salary Error'] = np.round(df['Predicted Salary'] - df['Salary'], -3)

In [47]:
df.reset_index(inplace=True)

results = ['Name', 'Team', 'Year', 'Salary', 'Predicted Salary Error', 'Percentage of Cap', 'Predicted Percentage of Cap', 'Predicted Percentage of Cap Error']

df[results].head()

Unnamed: 0,Name,Team,Year,Salary,Predicted Salary Error,Percentage of Cap,Predicted Percentage of Cap,Predicted Percentage of Cap Error
0,Paul Pierce,BOS,2008,16360094,-4995000.0,0.294088,0.2043,-0.0898
1,Ray Allen,BOS,2008,16000000,-6877000.0,0.287615,0.164,-0.1236
2,Kevin Garnett,BOS,2008,23750000,-9386000.0,0.426928,0.2582,-0.1687
3,Rajon Rondo,BOS,2008,1229280,2754000.0,0.022097,0.0716,0.0495
4,Kendrick Perkins,BOS,2008,4480912,131000.0,0.080548,0.0829,0.0024


In [57]:
df[results][df['Year'] == 2014].sort_values('Predicted Percentage of Cap Error').head(15)

Unnamed: 0,Name,Team,Year,Salary,Predicted Salary Error,Percentage of Cap,Predicted Percentage of Cap,Predicted Percentage of Cap Error
2441,Kobe Bryant,LAL,2014,30453805,-20396000.0,0.51899,0.1714,-0.3476
2207,Amar'e Stoudemire,NYK,2014,21679893,-14914000.0,0.369466,0.1153,-0.2542
2187,Joe Johnson,BRK,2014,21466718,-13088000.0,0.365833,0.1428,-0.223
2189,Deron Williams,BRK,2014,18466130,-12522000.0,0.314697,0.1013,-0.2134
2280,Chris Bosh,MIA,2014,19067500,-11034000.0,0.324946,0.1369,-0.188
2246,Derrick Rose,CHI,2014,17632688,-10580000.0,0.300494,0.1202,-0.1803
2390,Chris Paul,LAC,2014,18668431,-9766000.0,0.318145,0.1517,-0.1664
2488,Eric Gordon,NOP,2014,14283844,-9220000.0,0.243423,0.0863,-0.1571
2456,Dwight Howard,HOU,2014,20513178,-9159000.0,0.349583,0.1935,-0.1561
2201,Carmelo Anthony,NYK,2014,22407474,-8694000.0,0.381865,0.2337,-0.1482


In [56]:
### The average error was 60% of the mean salary

df[results].groupby('Name').mean()['Predicted Percentage of Cap Error'].std() / df[results].groupby('Name').mean()['Predicted Percentage of Cap'].mean()

0.5813459674609499