In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_validate, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.preprocessing import RobustScaler, MinMaxScaler

In [2]:
file = "../nba_forecast/data/player_ratio_scores.csv"
file_2 = "../nba_forecast/data/last_ncaa_season.csv"
file_3 = "../nba_forecast/data/final_athletics.csv"
ratios = pd.read_csv(file)
features = pd.read_csv(file_2)
athletics = pd.read_csv(file_3)

### Common Dataset Update

In [3]:
final_file = '../nba_forecast/data/common_dataset.csv'
common_dataset_updated = pd.read_csv(final_file)

In [4]:
common_dataset_updated.tail()

Unnamed: 0,player_name,season,school_name,conf_abbr,mp,per,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,...,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan,ratio_off,ratio_def,conf_rating,gs_pct
240,Keita Bates-Diop,2017-18,Ohio State,Big Ten,1125,27.5,0.577,0.544,0.357,0.274,...,216,2013,2045,2705,223.8,2216,0.74,0.79,0.552393,1.0
241,Chimezie Metu,2017-18,USC,Pac-12,1053,23.5,0.574,0.538,0.102,0.388,...,235,2045,2070,2743,219.6,2146,0.99,0.72,0.550836,0.97
242,Alize Johnson,2017-18,Missouri State,MVC,1028,24.1,0.528,0.481,0.365,0.363,...,248,2013,2032,2616,216.6,2051,1.08,1.16,0.527264,1.0
243,Shake Milton,2017-18,SMU,AAC,800,24.2,0.606,0.551,0.471,0.405,...,241,1943,1968,2527,207.2,2153,0.94,0.84,0.475291,1.0
244,Kostas Antetokounmpo,2017-18,Dayton,A-10,438,13.7,0.575,0.584,0.149,0.634,...,241,2057,2096,2794,194.8,2191,1.0,1.62,0.505994,0.21


In [5]:
ratios_pro = ratios.dropna()

In [6]:
ratios_pro.tail()

Unnamed: 0,player_id,player_name,pos,off_score,def_score,uni_off_score,uni_def_score,ratio_off,ratio_def
240,keita-bates-diop-1,Keita Bates-Diop,SF,2.36,1.8,3.17,2.28,0.74,0.79
241,chimezie-metu-1,Chimezie Metu,C,2.67,1.42,2.69,1.96,0.99,0.72
242,alize-johnson-1,Alize Johnson,PF,3.29,1.94,3.05,1.67,1.08,1.16
243,malik-milton-1,Shake Milton,SG,2.91,1.23,3.11,1.47,0.94,0.84
244,kostas-antetokounmpo-1,Kostas Antetokounmpo,PF,1.86,2.34,1.86,1.44,1.0,1.62


In [7]:
common_dataset_updated[['off_score','def_score','uni_off_score','uni_def_score']] = ratios_pro[['off_score','def_score','uni_off_score','uni_def_score']]

In [8]:
common_dataset_updated.to_csv('../nba_forecast/data/common_dataset_updated.csv', index=False)

### Risk Data Cleaning

In [10]:
all_data = ratios.merge(features, how='inner', on='player_id')
all_data = all_data.merge(athletics, how='inner', on='player_id')

In [11]:
all_data.drop(columns=['bpm','ws_per_40','ws','trb_pct','pprod','conf_abbr','school_name','season','player_name_y','ratio_def','ratio_off','uni_def_score','uni_off_score','def_score','off_score','pos','player_name_x', 'gs','g','player_id'],inplace=True)


### Setting 3rd Season: 1/0

In [12]:
all_data_df = all_data

In [13]:
all_data_df['3rd_NBA_season'] = ratios['ratio_off']
a = all_data_df['3rd_NBA_season']
all_data_df['3rd_NBA_season'] = a.where(a > 0, other= 0)
all_data_df['3rd_NBA_season'] = a.where(a <= 0, other= 1)
all_data_df['3rd_NBA_season'] = [int(i) for i in all_data_df['3rd_NBA_season']]
all_data_df

Unnamed: 0,mp,per,ts_pct,efg_pct,fg3a_per_fga_pct,fta_per_fga_pct,orb_pct,drb_pct,ast_pct,stl_pct,...,position,body_fat_pct,hand_length,hand_width,height_wo_shoes,height_w_shoes,standing_reach,weight,wingspan,3rd_NBA_season
0,1281,35.1,0.654,0.628,0.059,0.602,11.6,25.6,7.5,2.5,...,PF,7.90,229,216,2064,2096,2743,221.8,2273,1
1,1245,21.2,0.570,0.511,0.156,0.589,10.3,17.3,10.8,1.9,...,SF,7.00,229,260,1975,2019,2654,232.8,2134,1
2,1267,22.0,0.575,0.525,0.473,0.440,4.8,20.3,12.7,2.5,...,SG,6.00,216,229,1911,1949,2540,201.8,2032,1
3,891,26.3,0.565,0.534,0.317,0.331,2.4,8.8,21.2,4.6,...,SG,8.50,216,241,1892,1930,2489,221.0,2013,1
4,1242,27.4,0.549,0.512,0.027,0.462,11.1,33.0,13.1,2.0,...,PF,5.00,248,267,2026,2051,2692,244.2,2216,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
329,1091,20.3,0.599,0.552,0.454,0.292,1.4,14.9,19.6,1.8,...,SG,8.45,216,254,1937,1981,2591,206.4,2026,0
330,995,25.6,0.568,0.550,0.056,0.338,13.2,20.9,9.6,3.1,...,PF,5.55,235,229,2057,2089,2769,215.4,2254,0
331,1064,23.7,0.553,0.517,0.417,0.257,8.0,19.9,15.2,2.1,...,PF,5.15,235,260,1994,2026,2718,211.6,2222,0
332,1096,20.3,0.577,0.552,0.335,0.174,9.1,25.8,7.8,1.1,...,PF,10.90,229,267,2121,2146,2781,254.0,2134,0


In [26]:
all_data_df.to_csv('../nba_forecast/data/risk_dataset.csv')

### Scaling + Model Fitting

In [14]:
scaler = RobustScaler()
mm_scaler = MinMaxScaler()

In [15]:
X = all_data_df[['mp','per','ts_pct','efg_pct','fg3a_per_fga_pct','fta_per_fga_pct',
            'orb_pct','drb_pct','ast_pct','stl_pct','blk_pct','tov_pct','usg_pct','ows','dws','obpm','dbpm','years']]
y = all_data_df[['3rd_NBA_season']]

In [16]:
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scaled,y, train_size=0.8)

In [17]:
model = LogisticRegression(max_iter=300)
cv_results = cross_validate(model, X_train, y_train, cv=5, scoring=['precision'])

  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)


In [18]:
cv_results

{'fit_time': array([0.03613091, 0.01407099, 0.01518679, 0.01164722, 0.01130605]),
 'score_time': array([0.00936317, 0.00169516, 0.00179482, 0.00187278, 0.00140905]),
 'test_precision': array([0.74074074, 0.7755102 , 0.75      , 0.75      , 0.76470588])}

In [19]:
base_score = cv_results['test_precision'].mean()
base_score

0.7561913654350629

In [20]:
y_train = y_train.squeeze()

In [21]:
proba_0, proba_1 = cross_val_predict(LogisticRegression(),X_train, y_train.to_list(),method = "predict_proba").T
precision, recall, thresholds = precision_recall_curve(y_train, proba_1)

### Threshold Changing

In [22]:
df_precision = pd.DataFrame({"precision" : precision[:-1], "threshold" : thresholds})
new_threshold = df_precision[df_precision['precision'] >= 0.75]['threshold'].min()
new_threshold

0.3789109813284838

In [23]:
model = LogisticRegression()
model.fit(X_train, y_train)

def custom_predict(X_train, custom_threshold):
    probs = model.predict_proba(X_train)
    return probs

custom_prediction = custom_predict(X_test, custom_threshold=new_threshold)
predictions = pd.DataFrame(custom_prediction, columns=['0','1'])

In [24]:
predictions

Unnamed: 0,0,1
0,0.179033,0.820967
1,0.272458,0.727542
2,0.282737,0.717263
3,0.313732,0.686268
4,0.305635,0.694365
...,...,...
62,0.247119,0.752881
63,0.228109,0.771891
64,0.227505,0.772495
65,0.269126,0.730874
