In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
import xgboost as xgb
import shap
import pickle

In [None]:
# Seed NumPy's global random generator so code below that draws from it is repeatable on a full re-run
np.random.seed(123)

In [None]:
# Load the stats table; the path is relative, so the CSV must sit in the working directory
data = pd.read_csv("Pts Ast Reb 2.csv")

In [None]:
# Keeping only players with more than 10 rows in the data (presumably one row per game),
# i.e. filtering out players with 10 or fewer
data = data.groupby('Player_(POI_Game_Stats)').filter(lambda x: len(x) > 10)

In [None]:
# Dropping points and assists: the target modelled in this notebook is total rebounds (TRB)
data = data.drop(['PTS','AST'], axis=1)


In [None]:
# Keeping only year age of players and dropping number of days
def year_age(row,column):
    """Return the whole-year part of the age stored in ``row[column]``.

    Parameters
    ----------
    row : mapping or pandas.Series
        A record that can be indexed by ``column``.
    column : str
        Name of the age field. The value is presumably formatted as
        ``"<years>-<days>"``; only the leading run of digits is used.

    Returns
    -------
    int
        The leading integer of the value.

    Raises
    ------
    ValueError
        If the value does not start with a digit.
    """
    import re

    # Parse the leading digits rather than slicing two characters, so ages that
    # are not exactly two digits work, and so does a value that is already an
    # int (which happens when the conversion cell is re-run).
    value = str(row[column])
    match = re.match(r'\s*(\d+)', value)
    if match is None:
        raise ValueError("cannot read an age in years from {!r}".format(value))
    return int(match.group(1))

In [None]:
# Convert both age columns to whole years, one column at a time
for age_column in ('Age_(POI_Game_Stats)', 'Age_(DOI_Game_Stats)'):
    data[age_column] = data.apply(year_age, args=(age_column,), axis=1)

In [None]:
# Converting the Date column to pandas datetime type
data['Date'] = pd.to_datetime(data['Date'])

In [None]:
# Sorting by date in ascending (chronological) order, so that the shifted rolling mean
# computed in the next code block covers each player's previous X games

data = data.sort_values('Date')

In [None]:
# Getting trend for average number of rebounds in the past i games
def past_X_games(i,df):
    """Return each player's mean TRB over the ``i`` rows preceding every row.

    Parameters
    ----------
    i : int
        Size of the rolling window, in rows (games).
    df : pandas.DataFrame
        Must contain the columns 'Player_(POI_Game_Stats)' and 'TRB'. The
        window follows row order within each player, so the caller is
        responsible for sorting the frame first.

    Returns
    -------
    pandas.DataFrame
        A single column named ``REB_<i>``, indexed exactly like ``df`` so it
        can be joined straight back. Values are NaN until a player has ``i``
        earlier rows.
    """
    column_name = "REB_{}".format(i)
    # transform() always returns a result aligned to df's own index. The former
    # groupby.apply(...).reset_index().set_index('index') chain only worked when
    # apply did not prepend the group key and the index was unnamed.
    # shift() moves the window back one row so the current game is excluded.
    rolling_mean = df.groupby('Player_(POI_Game_Stats)')['TRB'].transform(
        lambda games: games.shift().rolling(i).mean()
    )
    return rolling_mean.rename(column_name).to_frame()

In [None]:
# Window sizes (in games) used for the rolling averages built in the next cell
moving_days = [3,5,7,10]

In [None]:
# Calculating average rebounds in the past 3, 5, 7 and 10 games
for i in moving_days:
    window_av = past_X_games(i,data)
    data = data.join(window_av,how='left')

# A player's earliest games have no complete history, so their rolling averages are NaN.
# LinearRegression and the Keras models further down cannot be trained on NaN inputs,
# so rows without a full set of rolling features are dropped here.
data = data.dropna(subset=["REB_{}".format(i) for i in moving_days])

In [None]:
# Keeping only the month of the date, as an abbreviated month name ('%b')

data['Date'] = data['Date'].dt.strftime('%b')

In [None]:
# Keeping only the year the season began

def season(row):
    """Return the first four characters of ``row['Season']`` converted to int."""
    season_label = row['Season']
    start_year = season_label[:4]
    return int(start_year)

In [None]:
# Apply season() row by row so the Season column holds the integer start year
data['Season'] = data.apply(season,axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import VarianceThreshold

In [None]:
# A single encoder instance; it is re-fitted for each column it is applied to below
le = LabelEncoder()

In [None]:
# Creating binary variables for Position and Home/Away since they both only have 2 options

data['Pos'] = le.fit_transform(data['Pos'])
data['Home/Away'] = le.fit_transform(data['Home/Away'])

In [None]:
# Dropping the player, team, opponent and Season columns so they are not used as features
data = data.drop(['Player_(POI_Game_Stats)','Tm_(POI_Game_Stats)','Opp','Player_(DOI_Game_Stats)', 'Season'], axis=1)

In [None]:
# Creating dummy variables for categorical data, i.e. the month of the game played

dummy = pd.get_dummies(data['Date'])
# The text Date column is removed here; the dummy columns are joined back in the next cell
data = data.drop('Date',axis=1)



In [None]:
# Attach the month dummy columns; the join aligns on the shared row index
data = data.join(dummy, how='left')


In [None]:
# Drop every column whose absolute correlation with rebounds (TRB) is not above 0.1
# (a NaN correlation also counts as "not above", so such columns are dropped too)

weak_columns = [
    name
    for name in data.columns.values
    if not abs(data['TRB'].corr(data[name])) > 0.1
]
data = data.drop(weak_columns, axis=1)

In [None]:
# Features are every remaining column except the target; the target is total rebounds (TRB)
X = data.drop(['TRB'],axis=1)
y = data['TRB']

In [None]:
# Filter out features that are largely constant or change minimally

sel = VarianceThreshold()
vt = sel.fit(X)
# The fitted selector is only used for its per-column variances; the 0.2 cut-off is applied by hand
X = X.iloc[:, vt.variances_ > 0.2] 

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score,RandomizedSearchCV, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error

In [None]:
# Creating Simple Linear Regression Model and Gradient Boosting Regression Model

# Initial boosting settings; learning rate, tree count, depth and child weight are tuned by grid search further down
gbr = xgb.XGBRegressor(max_depth=5,n_estimators=250,learning_rate=0.01)
lr = LinearRegression()

In [None]:
# 70/30 train/test split; no random_state is passed here, so repeatability presumably relies on the NumPy seed set at the top
X_train,X_test ,y_train,y_test = train_test_split(X,y,test_size=0.3,train_size=0.7)

In [None]:
# Fit both models on the training split
gbr.fit(X_train, y_train)
lr.fit(X_train,y_train)

In [None]:
# Test-set mean squared error of the gradient boosting model
mean_squared_error(y_test,gbr.predict(X_test))


In [None]:
# Test-set mean squared error of the linear regression model
mean_squared_error(y_test,lr.predict(X_test))


In [None]:
# Feature importances of the fitted gradient boosting model: one row per feature, values in column 0

feature_importances = pd.DataFrame(gbr.feature_importances_, index=X.columns.values)

In [None]:
# Names of the features whose importance is greater than 0.01

above_cutoff = feature_importances[0] > 0.01
important_features = feature_importances.loc[above_cutoff].index.tolist()

In [None]:
# Bar chart of every feature's importance
feature_importances.plot(kind='bar')

In [None]:
# Keep only the columns of X that are listed in important_features

X_ = X[X.columns.intersection(important_features)]

In [None]:
# Train/test split (70/30) with only the important features; this overwrites the earlier split variables

X_train,X_test ,y_train,y_test = train_test_split(X_,y,test_size=0.3,train_size=0.7)


In [None]:
# Refit the boosting model on the reduced feature set and report its test-set mean squared error
gbr.fit(X_train, y_train)
mean_squared_error(y_test,gbr.predict(X_test))

In [None]:
# Hyperparameter tuning the number of estimators and the learning rate

parameters = {"n_estimators": [100,250,500,750], "learning_rate": [0.05,0.1,0.2]}

# 3-fold cross-validated search starting from the existing gbr estimator.
# The score is *negative* MSE, so values closer to zero are better.
grid = GridSearchCV(gbr,parameters, scoring='neg_mean_squared_error',cv=3)

grid.fit(X_,y)

print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']

# One line per parameter combination: mean score (std across folds) and the parameters used
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
    
# We see ideal learning rate is 0.05 and number of estimators is 100

In [None]:
# Hyperparameter tuning the max depth and the min child weight of the model

parameters = {"max_depth":[3,5,7,9], "min_child_weight":[1,3,5]}

# A fresh estimator fixed at the learning rate and tree count chosen by the previous search.
# The score is *negative* MSE, so values closer to zero are better.
grid = GridSearchCV(estimator=xgb.XGBRegressor(n_estimators=100,learning_rate=0.05),param_grid=parameters, scoring='neg_mean_squared_error',cv=3)

grid.fit(X_,y)                    

print("Best: %f using %s" % (grid.best_score_, grid.best_params_))

means = grid.cv_results_['mean_test_score']
stds = grid.cv_results_['std_test_score']
params = grid.cv_results_['params']

# One line per parameter combination: mean score (std across folds) and the parameters used
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
    
# We see ideal max depth is 3 and child weight is 5

In [None]:
# Final model, built with the hyperparameter values reported by the two grid searches above
gbr = xgb.XGBRegressor(n_estimators=100,learning_rate=0.05,max_depth=3,min_child_weight=5,importance_type='gain',booster='gbtree')

In [None]:
# Fit the final model on the important-feature matrix X_. The hyperparameters were tuned on X_,
# and the SHAP cells below explain rows of X_train, which was split from X_ and so only has
# those columns; fitting on the wider X would leave the model and the explained data mismatched.
gbr.fit(X_,y)

In [None]:
# Tree explainer for the final boosting model; SHAP values are computed for the rows of X_train
explainer = shap.TreeExplainer(gbr)
shap_values = explainer.shap_values(X_train)

In [None]:
# Initialise SHAP's JavaScript support, presumably needed for the interactive force plot below to render
shap.initjs()

In [None]:
# Force plot of the SHAP values for the first row of X_train

shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])


In [None]:
# Summary plot of the SHAP values across all rows and columns of X_train
shap.summary_plot(shap_values, X_train)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Model with one hidden layer whose width equals the number of features
def baseline_model():
    """Build and compile a network: one ReLU layer of feature-count width, then a single output unit.

    The feature count is read from the notebook-level frame ``X``.
    The model is compiled with mean absolute error loss and the Adam optimizer.
    """
    n_features = len(X.columns.values)
    layers = [
        Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Model with a feature-width layer followed by one hidden layer of half that width
def Double_model():
    """Build and compile a network with layer widths n -> n // 2 -> 1.

    ``n`` is the number of columns of the notebook-level frame ``X``.
    The model is compiled with mean absolute error loss and the Adam optimizer.
    """
    n_features = len(X.columns.values)
    # A layer needs a whole number of units; 0.5 * n is a float, so use integer
    # floor division and never go below one unit.
    half_width = max(1, n_features // 2)
    # create model
    model = Sequential()
    model.add(Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'))
    model.add(Dense(half_width, kernel_initializer='normal',activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Model with a feature-width layer followed by two hidden layers (double width, then half width)
def Triple_model():
    """Build and compile a network with layer widths n -> 2n -> n // 2 -> 1.

    ``n`` is the number of columns of the notebook-level frame ``X``.
    The model is compiled with mean absolute error loss and the Adam optimizer.
    """
    n_features = len(X.columns.values)
    # A layer needs a whole number of units; 0.5 * n is a float, so use integer
    # floor division and never go below one unit.
    half_width = max(1, n_features // 2)
    # create model
    model = Sequential()
    model.add(Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'))
    model.add(Dense(2*n_features, kernel_initializer='normal',activation='relu'))
    model.add(Dense(half_width, kernel_initializer='normal',activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    # Compile model
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

# Model with a feature-width layer followed by one hidden layer twice that width
def FunStuff_model():
    """Build and compile a network with layer widths n -> 2n -> 1.

    ``n`` is the number of columns of the notebook-level frame ``X``.
    The model is compiled with mean absolute error loss and the Adam optimizer.
    """
    n_features = len(X.columns.values)
    layers = [
        Dense(n_features, input_dim=n_features, kernel_initializer='normal', activation='relu'),
        Dense(2*n_features, kernel_initializer='normal', activation='relu'),
        Dense(1, kernel_initializer='normal'),
    ]
    model = Sequential(layers)
    model.compile(loss='mean_absolute_error', optimizer='adam')
    return model

In [None]:
# Testing the First Neural Net: standardise the features, then 3-fold cross-validate baseline_model
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=baseline_model, epochs=10, batch_size=500, verbose=0)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=3)
# This scorer returns *negative* MSE; flip the sign so the number printed as "MSE" really is the MSE
results = -cross_val_score(pipeline, X, y, cv=kfold, scoring="neg_mean_squared_error")
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [None]:
# Testing the 2nd Neural Net: standardise the features, then 3-fold cross-validate Double_model
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=Double_model, epochs=10, batch_size=500, verbose=0)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=3)
# This scorer returns *negative* MSE; flip the sign so the number printed as "MSE" really is the MSE
results = -cross_val_score(pipeline, X, y, cv=kfold, scoring="neg_mean_squared_error")
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [None]:
# Testing the 3rd Neural Net: standardise the features, then 3-fold cross-validate Triple_model
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=Triple_model, epochs=10, batch_size=500, verbose=0)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=3)
# This scorer returns *negative* MSE; flip the sign so the number printed as "MSE" really is the MSE
results = -cross_val_score(pipeline, X, y, cv=kfold, scoring="neg_mean_squared_error")
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [None]:
# Testing the 4th Neural Net: standardise the features, then 3-fold cross-validate FunStuff_model
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(build_fn=FunStuff_model, epochs=10, batch_size=500, verbose=0)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=3)
# This scorer returns *negative* MSE; flip the sign so the number printed as "MSE" really is the MSE
results = -cross_val_score(pipeline, X, y, cv=kfold, scoring="neg_mean_squared_error")
print("Standardized: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [None]:
'''The Gradient Boosting Regression Model with tuned hyperparameters 
    performed the best, so that's the one that will be used in the website'''