In [50]:
'''Notes:
3. This pickle file should added to the flask site with the ability to import all of the days players'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
%matplotlib inline

## Data Cleanup

In [85]:
os.chdir(r'F:\Projects\NBA\DB_Files')
df = pd.read_csv('games_export.csv')

In [86]:
#Add how well they did in the last game
df['Last_Game_Points'] = df.groupby('Name').PTS.shift(1)

In [87]:
#The rolling average method returns a multi-index dataframe. This needs to be added to the dataframe with the indexs dropped. 
average = df.groupby('Name')['PTS'].rolling(5).mean()
df["Average"] = average.reset_index(level=0, drop=True)

In [54]:
df.columns

Index(['Name', 'FTM', '3P%', '3PA', 'TOV', 'PF', 'REB', '3PM', 'FG%', 'FGA',
       'OREB', 'FT%', 'FTA', 'FGM', 'DREB', 'PTS', 'MIN', '+/-', 'AST', 'STL',
       'BLK', 'Date', 'Game', '1QH', '2QH', '3QH', '4QH', '1QA', '2QA', '3QA',
       '4QA', 'Total_H', 'Total_A', 'Ref1', 'Ref2', 'Ref3', 'W/L', 'Home',
       'Away', 'Fantasy_Score', 'Team', 'Years', 'Pos', 'Ht', 'Wt',
       'Last_Game_Points', 'Average'],
      dtype='object')

In [55]:
#This will remove everyone from the data frame who has played less than 4 games
#names = df.groupby('Name')
#df = names.filter(lambda x: len(x) > 4)

In [88]:
df['Date'] = pd.to_datetime(df['Date'])

In [89]:
#The function to fix the game number issue
def game_number_fix(row):
    if row['Game'] > 800:
        return row['Game'] - 99
    else:
        return row['Game']   

In [90]:
df['game_number'] = df.apply(lambda row: game_number_fix(row),axis=1)

In [91]:
def home(row):
    if row['Home'] == row['Team']:
        return row['Home']
    else: 
        return row['Away']

In [92]:
#Add a column for whether they are home or away
df['Home/Away'] = np.NAN
df['Home/Away'] = df.apply(lambda row: home(row), axis=1)

In [93]:
def opposing_team(row):
    if row['Home'] == row['Team']:
        return row['Away']
    else: 
        return row['Home'] 

In [94]:
#Add a column for whether they are home or away
df['Opponent'] = np.NAN
df['Opponent'] = df.apply(lambda row: opposing_team(row), axis=1)

In [63]:
df.drop(columns=['Home', 'Away','Game', 'MIN', 'Date','FTM', '3P%', '3PA', 'TOV', 'PF', 'REB', '3PM', 'FG%', 'FGA',
       'OREB', 'FT%', 'FTA', 'FGM', 'DREB','+/-', 'AST', 'STL',
       'BLK', '1QH', '2QH', '3QH', '4QH', '1QA', '2QA', '3QA',
       '4QA', 'Total_H', 'Total_A', 'W/L','Ht','Fantasy_Score'], axis=1,inplace=True)

In [64]:
df.columns

Index(['Name', 'PTS', 'Ref1', 'Ref2', 'Ref3', 'Team', 'Years', 'Pos', 'Wt',
       'Last_Game_Points', 'Average', 'game_number', 'Home/Away', 'Opponent'],
      dtype='object')

In [95]:
df_dummies = pd.get_dummies(df)

In [96]:
df_dummies = df_dummies.fillna(0)

## Machine Learning Section

In [67]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [68]:
X = df_dummies.drop(columns=['PTS'])
y = df_dummies['PTS']

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [70]:
regr = RandomForestRegressor()

In [71]:
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [72]:
predictions = regr.predict(X_test)

In [73]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

MAE: 4.05876014198783
MSE: 28.89946374239351
RMSE: 5.375822145718133


## Put all predicts and actuals in a df

In [74]:
df_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions, 'Actual Index': y_test.index})  

In [75]:
df_comparison['Difference'] = np.sqrt((df_comparison['Actual'] - df_comparison['Predicted']) ** 2)

In [76]:
df_comparison.nlargest(5,'Difference')
#The index is the actual index

Unnamed: 0,Actual,Predicted,Actual Index,Difference
13094,56,26.8,13094,29.2
17195,41,12.7,17195,28.3
367,41,12.7,367,28.3
13431,47,18.9,13431,28.1
2842,50,23.5,2842,26.5


In [77]:
problem = df_comparison.nlargest(5,'Difference').iloc[0][2]

In [78]:
problem = X_test.loc[problem]
problem = problem.where(problem >= 1)
problem = problem.dropna()
problem

Years                        3.0
Wt                         235.0
Last_Game_Points            15.0
Average                     28.0
game_number               1021.0
Name_KarlAnthony Towns       1.0
Ref1_Smith"                  1.0
Ref2_Lewis"                  1.0
Ref3_Orr                     1.0
Team_MIN                     1.0
Pos_C                        1.0
Home/Away_MIN                1.0
Opponent_ATL                 1.0
Name: 13094, dtype: float64

In [98]:
test = df[df['game_number'] == 968]
test[['Name', 'REB']]

Unnamed: 0,Name,REB
272,Allen Crabbe,5
2717,Caris LeVert,5
4361,DAngelo Russell,4
4554,DeMarre Carroll,6
5508,Dwayne Bacon,6
5975,Dwight Howard,30
6915,Frank Kaminsky,4
8790,Jarrett Allen,9
10314,Joe Harris,6
10481,Jeremy Lamb,7


In [80]:
regr.score(X_test, y_test)

0.5510006751144264

In [81]:
feature_importances = pd.DataFrame(regr.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
Average,0.590117
game_number,0.053744
Last_Game_Points,0.049135
Wt,0.019864
Years,0.016105
Pos_F,0.002651
Pos_G,0.002546
Ref3_Taylor,0.001930
Opponent_NOP,0.001898
Ref3_Orr,0.001687
