In [1]:
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor

In [2]:
nba = pd.read_csv("third_season_advanced.csv")
uni = pd.read_csv("last_ncaa_season.csv")
ratio = pd.read_csv("player_ratio_scores.csv")
athletics = pd.read_csv("final_athletics.csv")

In [3]:
print("Uni shape:", uni.shape)
print("NBA shape:", nba.shape)
print("Ratio shape:", ratio.shape)

Uni shape: (334, 30)
NBA shape: (249, 27)
Ratio shape: (334, 9)


In [4]:
nba.pos = nba.pos.apply(lambda x: x.split(",")[0])
nba.pos.unique()

array(['PF', 'SF', 'SG', 'PG', 'C'], dtype=object)

In [5]:
uni.columns

Index(['player_name', 'season', 'school_name', 'conf_abbr', 'g', 'gs', 'mp',
       'per', 'ts_pct', 'efg_pct', 'fg3a_per_fga_pct', 'fta_per_fga_pct',
       'pprod', 'orb_pct', 'drb_pct', 'trb_pct', 'ast_pct', 'stl_pct',
       'blk_pct', 'tov_pct', 'usg_pct', 'ows', 'dws', 'ws', 'ws_per_40',
       'obpm', 'dbpm', 'bpm', 'years', 'player_id'],
      dtype='object')

In [6]:
uni.head()

Unnamed: 0,player_name,season,school_name,conf_abbr,g,gs,mp,per,ts_pct,efg_pct,...,usg_pct,ows,dws,ws,ws_per_40,obpm,dbpm,bpm,years,player_id
0,Anthony Davis,2011-12,Kentucky,SEC,40,40,1281,35.1,0.654,0.628,...,18.8,5.9,4.1,9.9,0.31,9.1,8.1,17.2,1,anthony-davis-5
1,Michael Kidd-Gilchrist,2011-12,Kentucky,SEC,40,39,1245,21.2,0.57,0.511,...,20.6,3.4,2.6,6.0,0.192,4.1,3.6,7.7,1,michael-kidd-gilchrist-1
2,Bradley Beal,2011-12,Florida,SEC,37,37,1267,22.0,0.575,0.525,...,23.0,3.6,2.1,5.7,0.18,5.4,2.9,8.4,1,bradley-beal-1
3,Dion Waiters,2011-12,Syracuse,Big East,37,0,891,26.3,0.565,0.534,...,26.7,3.2,1.7,5.0,0.223,6.0,4.8,10.8,2,dion-waiters-1
4,Thomas Robinson,2011-12,Kansas,Big 12,39,39,1242,27.4,0.549,0.512,...,30.0,3.7,3.5,7.2,0.231,5.4,3.4,8.8,3,thomas-robinson-2


In [7]:
athletics.head()

Unnamed: 0,position,body_fat_pct,hand_length,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan,player_id
0,PF,7.9,229,216,2064,2096,2743,221.8,2273,anthony-davis-5
1,SF,7.0,229,260,1975,2019,2654,232.8,2134,michael-kidd-gilchrist-1
2,SG,6.0,216,229,1911,1949,2540,201.8,2032,bradley-beal-1
3,SG,8.5,216,241,1892,1930,2489,221.0,2013,dion-waiters-1
4,PF,5.0,248,267,2026,2051,2692,244.2,2216,thomas-robinson-2


In [8]:
trad_df = uni[["player_id"]].merge(nba[["age", "pos", "player_id"]], how = 'inner', on = 'player_id')
trad_df.rename(columns={"age": "nba_3rd_year_age"}, inplace=True)

trad_df["last_uni_age"] = trad_df.nba_3rd_year_age - 3

trad_df

Unnamed: 0,player_id,nba_3rd_year_age,pos,last_uni_age
0,anthony-davis-5,21,PF,18
1,michael-kidd-gilchrist-1,21,SF,18
2,bradley-beal-1,21,SG,18
3,dion-waiters-1,23,SG,20
4,thomas-robinson-2,23,PF,20
...,...,...,...,...
243,keita-bates-diop-1,25,SF,22
244,chimezie-metu-1,23,C,20
245,alize-johnson-1,24,PF,21
246,malik-milton-1,24,SG,21


In [9]:
uni = uni.merge(trad_df[["player_id", "pos", "last_uni_age"]], how = 'inner', on = 'player_id')
uni.head()

Unnamed: 0,player_name,season,school_name,conf_abbr,g,gs,mp,per,ts_pct,efg_pct,...,dws,ws,ws_per_40,obpm,dbpm,bpm,years,player_id,pos,last_uni_age
0,Anthony Davis,2011-12,Kentucky,SEC,40,40,1281,35.1,0.654,0.628,...,4.1,9.9,0.31,9.1,8.1,17.2,1,anthony-davis-5,PF,18
1,Michael Kidd-Gilchrist,2011-12,Kentucky,SEC,40,39,1245,21.2,0.57,0.511,...,2.6,6.0,0.192,4.1,3.6,7.7,1,michael-kidd-gilchrist-1,SF,18
2,Bradley Beal,2011-12,Florida,SEC,37,37,1267,22.0,0.575,0.525,...,2.1,5.7,0.18,5.4,2.9,8.4,1,bradley-beal-1,SG,18
3,Dion Waiters,2011-12,Syracuse,Big East,37,0,891,26.3,0.565,0.534,...,1.7,5.0,0.223,6.0,4.8,10.8,2,dion-waiters-1,SG,20
4,Thomas Robinson,2011-12,Kansas,Big 12,39,39,1242,27.4,0.549,0.512,...,3.5,7.2,0.231,5.4,3.4,8.8,3,thomas-robinson-2,PF,20


In [10]:
off_features = ['last_uni_age', 'pos', 'per','ts_pct','fg3a_per_fga_pct','fta_per_fga_pct','orb_pct','ast_pct','tov_pct','usg_pct','ows','obpm']
def_features = ['last_uni_age', 'pos', 'stl_pct','blk_pct','dws','drb_pct','dbpm']
athletics_features = list(athletics)[1:-1]

In [11]:
# --- Check which features are unused

[col for col in nba.columns if col not in off_features + def_features]

['player_id',
 'name',
 'season',
 'age',
 'team_id',
 'games',
 'minutes_played',
 'trb_pct',
 'ws',
 'bpm',
 'uni_url']

In [12]:
ratio = ratio[(ratio.player_id.isin(nba.player_id)) & (ratio.ratio_off.isna() == False)]
ratio

Unnamed: 0,player_id,player_name,pos,off_score,def_score,uni_off_score,uni_def_score,ratio_off,ratio_def
0,anthony-davis-5,Anthony Davis,PF,3.96,3.34,3.68,3.79,1.08,0.88
1,michael-kidd-gilchrist-1,Michael Kidd-Gilchrist,SF,2.68,1.89,2.73,1.96,0.98,0.96
2,bradley-beal-1,Bradley Beal,SG,2.73,1.71,2.93,1.99,0.93,0.86
3,dion-waiters-1,Dion Waiters,SG,2.36,1.49,2.97,2.04,0.79,0.73
4,thomas-robinson-2,Thomas Robinson,PF,2.68,2.17,3.03,2.67,0.88,0.81
...,...,...,...,...,...,...,...,...,...
240,keita-bates-diop-1,Keita Bates-Diop,SF,2.36,1.80,3.17,2.28,0.74,0.79
241,chimezie-metu-1,Chimezie Metu,C,2.67,1.42,2.69,1.96,0.99,0.72
242,alize-johnson-1,Alize Johnson,PF,3.29,1.94,3.05,1.67,1.08,1.16
243,malik-milton-1,Shake Milton,SG,2.91,1.23,3.11,1.47,0.94,0.84


In [13]:
# --- Add athletics info to uni df

uni = uni.merge(athletics, how = 'inner', on = 'player_id')
uni.head()

Unnamed: 0,player_name,season,school_name,conf_abbr,g,gs,mp,per,ts_pct,efg_pct,...,last_uni_age,position,body_fat_pct,hand_length,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan
0,Anthony Davis,2011-12,Kentucky,SEC,40,40,1281,35.1,0.654,0.628,...,18,PF,7.9,229,216,2064,2096,2743,221.8,2273
1,Michael Kidd-Gilchrist,2011-12,Kentucky,SEC,40,39,1245,21.2,0.57,0.511,...,18,SF,7.0,229,260,1975,2019,2654,232.8,2134
2,Bradley Beal,2011-12,Florida,SEC,37,37,1267,22.0,0.575,0.525,...,18,SG,6.0,216,229,1911,1949,2540,201.8,2032
3,Dion Waiters,2011-12,Syracuse,Big East,37,0,891,26.3,0.565,0.534,...,20,SG,8.5,216,241,1892,1930,2489,221.0,2013
4,Thomas Robinson,2011-12,Kansas,Big 12,39,39,1242,27.4,0.549,0.512,...,20,PF,5.0,248,267,2026,2051,2692,244.2,2216


In [14]:
dataset = uni[uni.player_id.isin(ratio.player_id)]
dataset = dataset.merge(ratio[["player_id", "ratio_off", "ratio_def"]])

dataset.head()

Unnamed: 0,player_name,season,school_name,conf_abbr,g,gs,mp,per,ts_pct,efg_pct,...,body_fat_pct,hand_length,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan,ratio_off,ratio_def
0,Anthony Davis,2011-12,Kentucky,SEC,40,40,1281,35.1,0.654,0.628,...,7.9,229,216,2064,2096,2743,221.8,2273,1.08,0.88
1,Michael Kidd-Gilchrist,2011-12,Kentucky,SEC,40,39,1245,21.2,0.57,0.511,...,7.0,229,260,1975,2019,2654,232.8,2134,0.98,0.96
2,Bradley Beal,2011-12,Florida,SEC,37,37,1267,22.0,0.575,0.525,...,6.0,216,229,1911,1949,2540,201.8,2032,0.93,0.86
3,Dion Waiters,2011-12,Syracuse,Big East,37,0,891,26.3,0.565,0.534,...,8.5,216,241,1892,1930,2489,221.0,2013,0.79,0.73
4,Thomas Robinson,2011-12,Kansas,Big 12,39,39,1242,27.4,0.549,0.512,...,5.0,248,267,2026,2051,2692,244.2,2216,0.88,0.81


In [15]:
X_off = dataset[off_features + athletics_features]
X_off = pd.get_dummies(X_off, columns=["pos"], prefix = "pos")

y_off = dataset.ratio_off

In [16]:
X_def = dataset[def_features + athletics_features]
X_def = pd.get_dummies(X_def, columns=["pos"], prefix = "pos")

y_def = dataset.ratio_def

In [17]:
off_scaler = RobustScaler().fit(X_off)
def_scaler = RobustScaler().fit(X_def)

X_off_scaled = off_scaler.transform(X_off)
X_def_scaled = def_scaler.transform(X_def)

In [18]:
X_off_train, X_off_test, y_off_train, y_off_test = train_test_split(X_off_scaled, y_off, test_size=0.2)
X_def_train, X_def_test, y_def_train, y_def_test = train_test_split(X_def_scaled, y_def, test_size=0.2)

In [19]:
lr_off = LinearRegression().fit(X_off_train, y_off_train)
lr_def = LinearRegression().fit(X_def_train, y_def_train)

In [20]:
print("LR off score:", round(lr_off.score(X_off_test, y_off_test), 3))
print("LR def score:", round(lr_def.score(X_def_test, y_def_test), 3))

LR off score: 0.446
LR def score: -0.363


In [21]:
def evaluate(model, type_ = "off"):
    """
    Fit and evaluate a specific ML model on off / def dataset
    """
    
    print(f"Fitting {type(model).__name__} on {type_} dataset\n")
    
    if type_ != "off":
        model_ = model.fit(X_def_train, y_def_train)
        y_preds = model_.predict(X_def_test)
        
        df_preds_baseline = pd.DataFrame({"pred": [y_off_train.mean()]*len(y_off_test), "true": y_off_test})
        df_preds = pd.DataFrame({"pred": y_preds, "true": y_def_test})
        
        print(f"\tModel score: {round(model_.score(X_def_test, y_def_test), 3)}\n")
        
    else:
        model_ = model.fit(X_off_train, y_off_train)
        y_preds = model_.predict(X_off_test)
        
        df_preds_baseline = pd.DataFrame({"pred": [y_def_train.mean()]*len(y_def_test), "true": y_def_test})
        df_preds = pd.DataFrame({"pred": y_preds, "true": y_off_test})
        
        print(f"\tModel score: {round(model_.score(X_off_test, y_off_test), 3)}\n")

    
    df_preds["squared_error"] = (df_preds.true - df_preds.pred)**2
    df_preds_baseline["squared_error"] = (df_preds_baseline.true - df_preds_baseline.pred)**2

    print(f"\tBaseline MSE: {round(df_preds_baseline.squared_error.mean(), 3)}")
    print(f"\tModel MSE: {round(df_preds.squared_error.mean(), 3)}\n")

In [22]:
# --- Quickly evaluate regression models

evaluate(model=LinearRegression(), type_="def")
evaluate(model=LinearRegression(), type_="off")

evaluate(model=RandomForestRegressor(), type_="off")

Fitting LinearRegression on def dataset

	Model score: -0.363

	Baseline MSE: 0.041
	Model MSE: 0.05

Fitting LinearRegression on off dataset

	Model score: 0.446

	Baseline MSE: 0.041
	Model MSE: 0.023

Fitting RandomForestRegressor on off dataset

	Model score: 0.264

	Baseline MSE: 0.041
	Model MSE: 0.03



In [34]:
def_features

['last_uni_age', 'pos', 'stl_pct', 'blk_pct', 'dws', 'drb_pct', 'dbpm']

In [38]:
pd.get_dummies(dataset, columns=["pos"]).corr().iloc[-6].sort_values()

dws                -0.365974
dbpm               -0.302345
ws                 -0.261840
ws_per_40          -0.259282
bpm                -0.256200
stl_pct            -0.219066
per                -0.186509
g                  -0.161895
ows                -0.143717
years              -0.141834
last_uni_age       -0.133486
mp                 -0.128208
blk_pct            -0.117517
gs                 -0.112716
obpm               -0.105088
pprod              -0.104068
drb_pct            -0.075258
ts_pct             -0.067898
trb_pct            -0.048404
pos_SG             -0.046126
hand_width         -0.046125
usg_pct            -0.037912
efg_pct            -0.030115
pos_SF             -0.021268
weight             -0.004896
ast_pct            -0.003023
orb_pct            -0.002143
standing_reach      0.001079
fg3a_per_fga_pct    0.003969
fta_per_fga_pct     0.006927
pos_C               0.014503
wingspan            0.014849
pos_PF              0.017661
height_wo_shoes     0.019841
height_w_shoes

In [30]:
y_def_preds = lr_def.predict(X_def_test)

pd.DataFrame({"pred": y_def_preds, "true": y_def_test})

Unnamed: 0,pred,true
25,0.811948,1.1
83,0.938823,0.89
165,1.192253,0.96
6,1.15151,1.22
45,0.799304,0.83
138,0.776755,1.15
58,0.815117,1.21
129,0.742849,0.87
68,0.841559,0.66
153,0.658501,1.09
