In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns
%matplotlib inline

## Data Cleanup

In [264]:
os.chdir(r'F:\Projects\NBA\DB_Files')
df = pd.read_csv('games_export.csv')

In [265]:
#Add how well they did in the last game
df['Last_Game_Points'] = df.groupby('Name').PTS.shift(1)

In [266]:
#The rolling average method returns a multi-index dataframe. This needs to be added to the dataframe with the indexs dropped. 
average = df.groupby('Name')['PTS'].rolling(5).mean()
df["Average"] = average.reset_index(level=0, drop=True)

In [267]:
df

Unnamed: 0,Name,FTM,3P%,3PA,TOV,PF,REB,3PM,FG%,FGA,...,Ref2,Ref3,W/L,Home,Away,Fantasy_Score,Position,Team,Last_Game_Points,Average
0,Aaron Brooks,0,100.0,1,0,0,1,1,66.7,3,...,"Kogut""""",Orr,INDIANA,IND,MIN,11.0,Guard,MIN,,
1,Aaron Brooks,0,0.0,1,0,0,1,0,0.0,1,...,"Malloy""""",Goldenberg,DETROIT,MIN,DET,1.0,Guard,MIN,5.0,
2,Aaron Brooks,0,0.0,0,0,0,0,0,0.0,0,...,"Blair""""",Cutler,MINNESOTA,DAL,MIN,0.0,Guard,MIN,0.0,
3,Aaron Brooks,0,0.0,0,0,0,1,0,100.0,1,...,"Scott""""",Ervin,MINNESOTA,CHA,MIN,3.0,Guard,MIN,0.0,
4,Aaron Brooks,1,0.0,1,0,0,0,0,50.0,2,...,"Collins""""",Petraitis,GOLDEN,MIN,GSW,3.5,Guard,MIN,2.0,2.0
5,Aaron Brooks,0,100.0,1,0,1,0,1,50.0,2,...,"Dalen""""",Nansel,MINNESOTA,MIN,DAL,6.5,Guard,MIN,3.0,1.6
6,Aaron Brooks,0,0.0,1,0,0,0,0,0.0,1,...,"Blair""""",Williams,CHARLOTTE,MIN,CHA,0.0,Guard,MIN,3.0,1.6
7,Aaron Brooks,0,100.0,1,0,2,1,1,25.0,4,...,"Maddox""""",Adair,MIAMI,MIA,MIN,9.0,Guard,MIN,0.0,2.2
8,Aaron Brooks,0,100.0,1,1,1,0,1,100.0,2,...,"Smith""""",Holtkamp,MINNESOTA,PHX,MIN,6.0,Guard,MIN,3.0,2.8
9,Aaron Brooks,0,0.0,0,0,1,1,0,0.0,1,...,"Forte""""",Sterling,WASHINGTON,WAS,MIN,4.0,Guard,MIN,5.0,2.2


In [274]:
#This will remove everyone from the data frame who has played less than 4 games
names = df.groupby('Name')
df = names.filter(lambda x: len(x) > 4)

In [275]:
df['Date'] = pd.to_datetime(df['Date'])

In [276]:
df.dtypes

Name                        object
FTM                          int64
3P%                        float64
3PA                          int64
TOV                          int64
PF                           int64
REB                          int64
3PM                          int64
FG%                        float64
FGA                          int64
OREB                         int64
FT%                        float64
FTA                          int64
FGM                          int64
DREB                         int64
PTS                          int64
MIN                         object
+/-                          int64
AST                          int64
STL                          int64
BLK                          int64
Date                datetime64[ns]
Game                         int64
1QH                          int64
2QH                          int64
3QH                          int64
4QH                          int64
1QA                          int64
2QA                 

In [277]:
#The function to fix the game number issue
def game_number_fix(row):
    if row['Game'] > 800:
        return row['Game'] - 99
    else:
        return row['Game']   

In [278]:
df['game_number'] = df.apply(lambda row: game_number_fix(row),axis=1)

In [279]:
def home(row):
    if row['Home'] == row['Team']:
        return row['Home']
    else: 
        return row['Away']

In [280]:
#Add a column for whether they are home or away
df['Home/Away'] = np.NAN
df['Home/Away'] = df.apply(lambda row: home(row), axis=1)

In [281]:
def opposing_team(row):
    if row['Home'] == row['Team']:
        return row['Away']
    else: 
        return row['Home'] 

In [282]:
#Add a column for whether they are home or away
df['Opponent'] = np.NAN
df['Opponent'] = df.apply(lambda row: opposing_team(row), axis=1)

In [283]:
df.drop(columns=['game_number','Game', 'MIN', 'Date','FTM', '3P%', '3PA', 'TOV', 'PF', 'REB', '3PM', 'FG%', 'FGA',
       'OREB', 'FT%', 'FTA', 'FGM', 'DREB','+/-', 'AST', 'STL',
       'BLK', '1QH', '2QH', '3QH', '4QH', '1QA', '2QA', '3QA',
       '4QA', 'Total_H', 'Total_A', 'W/L','Fantasy_Score'], axis=1,inplace=True)

In [284]:
df.columns

Index(['Name', 'PTS', 'Ref1', 'Ref2', 'Ref3', 'Home', 'Away', 'Position',
       'Team', 'Last_Game_Points', 'Average', 'Home/Away', 'Opponent'],
      dtype='object')

In [285]:
df_dummies = pd.get_dummies(df)

In [286]:
df_dummies = df_dummies.fillna(0)

In [252]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
scaled_features = StandardScaler().fit_transform(df_dummies)

In [253]:
#Keep the original index
scaled_features_df = pd.DataFrame(scaled_features, index=df_dummies.index, columns=df_dummies.columns)

In [254]:
scaled_features_df

Unnamed: 0,PTS,Last_Game_Points,Average,Name_Aaron Brooks,Name_Aaron Gordon,Name_Aaron Harrison,Name_Aaron Jackson,Name_Abdel Nader,Name_Adreian Payne,Name_Al Horford,...,Opponent_OKC,Opponent_ORL,Opponent_PHI,Opponent_PHX,Opponent_POR,Opponent_SAC,Opponent_SAS,Opponent_TOR,Opponent_UTA,Opponent_WAS
0,-0.627764,-1.220881,-1.378798,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
1,-1.252637,-0.599971,-1.378798,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
2,-1.252637,-1.220881,-1.378798,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
3,-1.002688,-1.220881,-1.378798,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
4,-0.877713,-0.972517,-1.082674,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
5,-0.877713,-0.848335,-1.141899,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
6,-1.252637,-0.848335,-1.141899,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
7,-0.877713,-1.220881,-1.053062,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
8,-0.627764,-0.848335,-0.964225,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,5.448793,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,-0.182428
9,-1.252637,-0.599971,-1.053062,28.211700,-0.048455,-0.019406,-0.006468,-0.04294,-0.014463,-0.052214,...,-0.180337,-0.186429,-0.184620,-0.183527,-0.184135,-0.190593,-0.182428,-0.184620,-0.188104,5.481610


## Machine Learning Section

In [287]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [288]:
X = df_dummies.drop(columns=['PTS'])
y = df_dummies['PTS']

In [289]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [290]:
lm = LinearRegression()

In [291]:
lm.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [292]:
print(lm.intercept_)

-47949622758.51043


In [293]:
predictions = lm.predict(X_test)

In [294]:
from sklearn import metrics

In [295]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

MAE: 4.093041346884447
MSE: 29.214213476312146
RMSE: 5.405017435338404


## Machine Learning Section(Scaler)

In [255]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [256]:
X = scaled_features_df.drop(columns=['PTS'])
y = scaled_features_df['PTS']

In [257]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [258]:
lm = LinearRegression()

In [259]:
lm.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [260]:
print(lm.intercept_)

-42512623502.475845


In [261]:
predictions = lm.predict(X_test)

In [262]:
from sklearn import metrics

In [263]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

MAE: 301777549249.8779
MSE: 3.696054082511381e+26
RMSE: 19225124401447.656


## Put all predicts and actuals in a df

In [297]:
df_assess = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})  

In [300]:
df_assess['Difference'] = df_assess['Actual'] - df_assess['Predicted']

In [302]:
df_assess.nlargest(5,'Difference')

Unnamed: 0,Actual,Predicted,Difference
31,41,9.678192,31.321808
4867,52,24.96833,27.03167
9387,60,33.298019,26.701981
4911,39,12.998543,26.001457
15301,34,8.237076,25.762924


## Look for maximum and figure out what's going on

In [239]:
#get the predictions index value of the maximum predict value
maximum = predictions.max() 
np.nonzero(predictions == maximum)

(array([6007], dtype=int64),)

In [240]:
predictions[6007]

2221647724.546646

In [241]:
#This will tell what the particular row is using for input data
problem = X_test.iloc[6007]
problem = problem.where(problem >= 1)
problem = problem.dropna()
problem

Name_Chinanu Onuaku    1.0
Ref1_Foster""          1.0
Ref2_Tiven""           1.0
Ref3_Orr               1.0
Home_HOU               1.0
Away_SAC               1.0
Position_Center        1.0
Team_HOU               1.0
Home/Away_HOU          1.0
Opponent_SAC           1.0
Name: 2932, dtype: float64

In [242]:
y_test.iloc[6007]

4

In [243]:
problem_list = list(problem.index)

In [244]:
coeff_df = pd.DataFrame(lm.coef_,X.columns,columns=['Coefficient'])
coeff_df.loc[problem_list]

Unnamed: 0,Coefficient
Name_Chinanu Onuaku,663957700.0
"Ref1_Foster""""",692802900.0
"Ref2_Tiven""""",508653800.0
Ref3_Orr,680664900.0
Home_HOU,-3500033000.0
Away_SAC,-6996489000.0
Position_Center,-1676202000.0
Team_HOU,3080355000.0
Home/Away_HOU,2195958000.0
Opponent_SAC,-3022978000.0


In [273]:
df[df['Name'] == "Chinanu Onuaku"]

Unnamed: 0,Name,FTM,3P%,3PA,TOV,PF,REB,3PM,FG%,FGA,...,Ref2,Ref3,W/L,Home,Away,Fantasy_Score,Position,Team,Last_Game_Points,Average


In [296]:
len(df)

23838