In [180]:
'''Notes:
1. Need to add the previous seasons average to the csv file. Also get injury status. National TV if possible.
After that, the machine learning section should be broken out to its own module and then called on all of the important
features in determining the fan duel fantasy score. The players year should be entered, but this should be a dummy variable
instead of an actual int value. 

2. a function should be created to add up all of these predicted values and a fantasy score calculated(This function is 
already done in the other data cleanup notebook). This should then be applied against a salary amount in order to calculate the
potential value of each player. 

3. This should be imported into the flask site with the ability to import all of the days players'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
%matplotlib inline

## Data Cleanup

In [181]:
os.chdir(r'F:\Projects\NBA\DB_Files')
df = pd.read_csv('games_export.csv')

In [182]:
#Add how well they did in the last game
df['Last_Game_Points'] = df.groupby('Name').PTS.shift(1)

In [183]:
#The rolling average method returns a multi-index dataframe. This needs to be added to the dataframe with the indexs dropped. 
average = df.groupby('Name')['PTS'].rolling(5).mean()
df["Average"] = average.reset_index(level=0, drop=True)

In [184]:
df

Unnamed: 0,Name,FTM,3P%,3PA,TOV,PF,REB,3PM,FG%,FGA,...,Ref2,Ref3,W/L,Home,Away,Fantasy_Score,Position,Team,Last_Game_Points,Average
0,Aaron Brooks,0,100.0,1,0,0,1,1,66.7,3,...,"Kogut""""",Orr,INDIANA,IND,MIN,11.0,Guard,MIN,,
1,Aaron Brooks,0,0.0,1,0,0,1,0,0.0,1,...,"Malloy""""",Goldenberg,DETROIT,MIN,DET,1.0,Guard,MIN,5.0,
2,Aaron Brooks,0,0.0,0,0,0,0,0,0.0,0,...,"Blair""""",Cutler,MINNESOTA,DAL,MIN,0.0,Guard,MIN,0.0,
3,Aaron Brooks,0,0.0,0,0,0,1,0,100.0,1,...,"Scott""""",Ervin,MINNESOTA,CHA,MIN,3.0,Guard,MIN,0.0,
4,Aaron Brooks,1,0.0,1,0,0,0,0,50.0,2,...,"Collins""""",Petraitis,GOLDEN,MIN,GSW,3.5,Guard,MIN,2.0,2.0
5,Aaron Brooks,0,100.0,1,0,1,0,1,50.0,2,...,"Dalen""""",Nansel,MINNESOTA,MIN,DAL,6.5,Guard,MIN,3.0,1.6
6,Aaron Brooks,0,0.0,1,0,0,0,0,0.0,1,...,"Blair""""",Williams,CHARLOTTE,MIN,CHA,0.0,Guard,MIN,3.0,1.6
7,Aaron Brooks,0,100.0,1,0,2,1,1,25.0,4,...,"Maddox""""",Adair,MIAMI,MIA,MIN,9.0,Guard,MIN,0.0,2.2
8,Aaron Brooks,0,100.0,1,1,1,0,1,100.0,2,...,"Smith""""",Holtkamp,MINNESOTA,PHX,MIN,6.0,Guard,MIN,3.0,2.8
9,Aaron Brooks,0,0.0,0,0,1,1,0,0.0,1,...,"Forte""""",Sterling,WASHINGTON,WAS,MIN,4.0,Guard,MIN,5.0,2.2


In [185]:
#This will remove everyone from the data frame who has played less than 4 games
#names = df.groupby('Name')
#df = names.filter(lambda x: len(x) > 4)

In [186]:
df['Date'] = pd.to_datetime(df['Date'])

In [187]:
#The function to fix the game number issue
def game_number_fix(row):
    if row['Game'] > 800:
        return row['Game'] - 99
    else:
        return row['Game']   

In [188]:
df['game_number'] = df.apply(lambda row: game_number_fix(row),axis=1)

In [189]:
def home(row):
    if row['Home'] == row['Team']:
        return row['Home']
    else: 
        return row['Away']

In [190]:
#Add a column for whether they are home or away
df['Home/Away'] = np.NAN
df['Home/Away'] = df.apply(lambda row: home(row), axis=1)

In [191]:
def opposing_team(row):
    if row['Home'] == row['Team']:
        return row['Away']
    else: 
        return row['Home'] 

In [192]:
#Add a column for whether they are home or away
df['Opponent'] = np.NAN
df['Opponent'] = df.apply(lambda row: opposing_team(row), axis=1)

In [193]:
df.drop(columns=['Home', 'Away','Game', 'MIN', 'Date','FTM', '3P%', '3PA', 'TOV', 'PF', 'REB', '3PM', 'FG%', 'FGA',
       'OREB', 'FT%', 'FTA', 'FGM', 'DREB','+/-', 'AST', 'STL',
       'BLK', '1QH', '2QH', '3QH', '4QH', '1QA', '2QA', '3QA',
       '4QA', 'Total_H', 'Total_A', 'W/L','Fantasy_Score'], axis=1,inplace=True)

In [194]:
df.columns

Index(['Name', 'PTS', 'Ref1', 'Ref2', 'Ref3', 'Position', 'Team',
       'Last_Game_Points', 'Average', 'game_number', 'Home/Away', 'Opponent'],
      dtype='object')

In [195]:
df_dummies = pd.get_dummies(df)

In [196]:
df_dummies = df_dummies.fillna(0)

## Machine Learning Section

In [197]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [198]:
X = df_dummies.drop(columns=['PTS'])
y = df_dummies['PTS']

In [199]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [200]:
regr = RandomForestRegressor()

In [201]:
regr.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [202]:
predictions = regr.predict(X_test)

In [203]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

MAE: 4.04467680608365
MSE: 28.777476552598223
RMSE: 5.364464237237324


## Put all predicts and actuals in a df

In [204]:
df_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': predictions, 'Actual Index': y_test.index})  

In [205]:
df_comparison['Difference'] = np.sqrt((df_comparison['Actual'] - df_comparison['Predicted']) ** 2)

In [206]:
df_comparison.nlargest(5,'Difference')
#The index is the actual index

Unnamed: 0,Actual,Predicted,Actual Index,Difference
1078,33,1.7,1078,31.3
31,41,9.8,31,31.2
1738,29,2.3,1738,26.7
3050,50,23.5,3050,26.5
14543,29,3.2,14543,25.8


In [207]:
problem = X_test.loc[1078]
problem = problem.where(problem >= 1)
problem = problem.dropna()
problem

game_number                8.0
Name_Anthony Davis         1.0
Ref1_Corbin""              1.0
Ref2_Zielinski""           1.0
Ref3_Twardoski             1.0
Position_Forward-Center    1.0
Team_NOP                   1.0
Home/Away_NOP              1.0
Opponent_MEM               1.0
Name: 1078, dtype: float64

In [208]:
test = df[df['game_number'] == 8]
test[['Name', 'PTS']]

Unnamed: 0,Name,PTS
912,Andrew Harrison,2
1078,Anthony Davis,33
2135,Brandan Wright,10
2790,Chandler Parsons,6
2887,Cheick Diallo,0
3916,Dante Cunningham,7
4090,Darius Miller,0
4908,DeMarcus Cousins,28
5606,Dillon Brooks,19
7129,ETwaun Moore,11


In [209]:
regr.score(X_test, y_test)

0.5562737464517187

In [212]:
feature_importances = pd.DataFrame(regr.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
feature_importances

Unnamed: 0,importance
Average,5.895806e-01
Last_Game_Points,5.626684e-02
game_number,5.163104e-02
Position_Guard,3.831504e-03
Position_Forward,3.150005e-03
Position_Center,2.221053e-03
Opponent_PHI,2.091109e-03
Ref3_Taylor,2.084639e-03
Opponent_ATL,1.826832e-03
"Ref1_Mauer""""",1.796807e-03
