In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Set the inline backend's figure format to SVG.
%config InlineBackend.figure_format = 'svg'
# Install a blanket 'ignore' filter: no Python warnings are shown after this line.
import warnings; warnings.simplefilter('ignore')

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LassoCV, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model, metrics
import xgboost as xgb

In [None]:
# Historical data
data = pd.read_csv(filepath_or_buffer='ShortTermPred.csv')
# Data to predict
data2019 = pd.read_csv(filepath_or_buffer='2019_alldata.csv')

In [None]:
## Keep only the first three characters of each day name (same format as in data)
data2019['DayOfWeek'] = data2019['DayOfWeek'].str[:3]

In [None]:
## Parse the date column of each frame and use it as that frame's index

# Historical data: the dates live in the column named '0'
data['0'] = pd.to_datetime(data['0'], format='%Y-%m-%d')
data = data.set_index('0')
data.head()  # not the last expression of the cell, so nothing is displayed

# 2019 data: the dates live in the column named 'Dates'
data2019['Dates'] = pd.to_datetime(data2019['Dates'], format='%Y-%m-%d')
data2019 = data2019.set_index('Dates')


In [None]:
# data = data[data.index.year > 2000]  ## Uncomment to test whether using only recent data works better

In [None]:
## Variables that are not used
unused_columns = ['precipitation', 'year', 'Soccergame', 'Stadium']
data = data.drop(columns=unused_columns)

In [None]:
## Dummy coding for multinomial variables

# Each (source column, dummy prefix) pair is expanded into indicator columns
# appended at the end of the frame, and the source column is removed.
# The same encoding is applied to the historical and the 2019 frames.
for source_col, dummy_prefix in (('DayOfWeek', 'Day'), ('VisitingTeam_Team', 'Opponent')):
    data = pd.concat(
        [data.drop(columns=source_col), pd.get_dummies(data[source_col], prefix=dummy_prefix)],
        axis=1,
    )
    data2019 = pd.concat(
        [data2019.drop(columns=source_col), pd.get_dummies(data2019[source_col], prefix=dummy_prefix)],
        axis=1,
    )

In [None]:
#### Need to add columns of 0's for teams they haven't played in 2019

# Columns present in the historical frame but absent from the 2019 frame.
# Walk `data.columns` (instead of taking a set difference) so the added
# columns always appear in the same, deterministic order.
missingcolumns = [col for col in data.columns if col not in data2019.columns]

# Zero-filled block with one row per 2019 row, sharing data2019's index.
missingcolumn_df = pd.DataFrame(0.0, index=data2019.index, columns=missingcolumns)

# Attach the zero columns positionally rather than with an index merge:
# merging on the date index is a many-to-many join, so any repeated date
# in the index (e.g. two games on the same day) would multiply rows.
data2019 = data2019.copy()
for col in missingcolumns:
    data2019[col] = missingcolumn_df[col].to_numpy()

In [None]:
###################### Assign X and Y
# Historical data: 'Attendance' is the target, every other column is a predictor
X = data.drop(columns='Attendance')
y = data['Attendance']

# 2019 data: same target / predictor split
X2019 = data2019.drop(columns='Attendance')
y2019 = data2019['Attendance']

In [None]:
##Numerical variables need to be scaled
Numerical_cols = [
    'GameNumber', 'WinLossRatio', 'WinLossRatioLast10', 'month',
    'GamesBack', 'NYA_WinLossRatio', 'BOS_WinLossRatio', 'BAL_WinLossRatio',
]
# Everything else in `data` except the numerical columns and the target
Other_cols = data.columns.drop(Numerical_cols + ['Attendance'])

In [None]:
#Create training and testing sets (20% of rows held out, fixed seed)
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [None]:
## Scale numerical predictors, historical data
# The scaler is fitted on the training rows only and then applied to both splits
scaler = StandardScaler()
scaler.fit(X_train[Numerical_cols])
X_train_scaled = scaler.transform(X_train[Numerical_cols])
X_holdout_scaled = scaler.transform(X_holdout[Numerical_cols])

In [None]:
## Scale numerical predictors, 2019 data
# Reuses the already-fitted scaler; nothing is re-fitted on the 2019 rows
numerical_2019 = X2019[Numerical_cols]
data2019_scaled = scaler.transform(numerical_2019)

In [None]:
##Merge numerical and categorical variables
# Resulting column order in every matrix: scaled Numerical_cols first, then Other_cols
X_train_final = np.concatenate((X_train_scaled, X_train[Other_cols]), axis=1)
X_holdout_final = np.concatenate((X_holdout_scaled, X_holdout[Other_cols]), axis=1)

X2019_final = np.concatenate((data2019_scaled, data2019[Other_cols]), axis=1)

In [None]:
###Linear regression model
# Fit on the training matrix, then predict the holdout rows
LinRegression = LinearRegression().fit(X_train_final, y_train)
regpreds = LinRegression.predict(X_holdout_final)

In [None]:
###Linear regression model correlation coefficient
# Import pearsonr from the public `scipy.stats` namespace; the
# `scipy.stats.stats` module path used previously is deprecated.
from scipy.stats import pearsonr

# In-sample check: correlation between training-set predictions and training targets
regtrainpreds = LinRegression.predict(X_train_final)
print(pearsonr(regtrainpreds, y_train))

In [None]:
##Linear regression scatterplot
# Holdout actuals and linear-regression predictions side by side,
# indexed like y_holdout
dfpreds_df = pd.DataFrame({'yholdout': y_holdout, 'regpreds': regpreds})

plt.scatter(x=y_holdout, y=regpreds);

In [None]:
##Linear regression coefficient importance
# The model was fitted on X_train_final, whose columns are the scaled
# Numerical_cols followed by Other_cols. Label the coefficients in that same
# order; labelling with data.columns would attach names to the wrong
# coefficients whenever the original column order differs.
feature_names = list(Numerical_cols) + list(Other_cols)
reg_importance = pd.DataFrame(
    data=LinRegression.coef_,
    columns=['coefficients'],
    index=feature_names,
)
reg_importance.sort_values('coefficients', ascending=False)

In [None]:
########## Lasso Regression

# 200 candidate penalties, log-spaced from 1e-6 to 1e2, selected by 5-fold CV
alphas = np.logspace(start=-6, stop=2, num=200)
lasso_cv = LassoCV(
    alphas=alphas,
    cv=5,
    verbose=True,
)
lasso_cv.fit(X_train_final, y_train)

In [None]:
##Lasso correlation coefficient
# In-sample check: training-set predictions against training targets
lassotrainpreds = lasso_cv.predict(X_train_final)
print(pearsonr(lassotrainpreds, y_train))

In [None]:
# Holdout predictions from the cross-validated lasso, stored next to the
# columns already in dfpreds_df, then the pairwise correlation table
lassopreds = lasso_cv.predict(X_holdout_final)
dfpreds_df = dfpreds_df.assign(LassoPreds=lassopreds)
dfpreds_df.corr()

In [None]:
##Lasso scatterplot
# Actual holdout attendance on x, lasso prediction on y
plt.scatter(x=y_holdout, y=lassopreds)
plt.title(label="Predicted versus actual attendance");

In [None]:
##Lasso feature importance
# The model was fitted on X_train_final, whose columns are the scaled
# Numerical_cols followed by Other_cols. Label the coefficients in that same
# order; labelling with data.columns would attach names to the wrong
# coefficients whenever the original column order differs.
feature_names = list(Numerical_cols) + list(Other_cols)
lasso_importance = pd.DataFrame(
    data=lasso_cv.coef_,
    columns=['coefficients'],
    index=feature_names,
)
lasso_importance.sort_values('coefficients', ascending=False)

In [None]:
##### Random forest

# Fix the seed so the fitted forest is reproducible from run to run.
# Estimators cloned from `forest` (e.g. by a grid search) inherit the
# same random_state parameter.
forest = RandomForestRegressor(random_state=17)
forest.fit(X_train_final, y_train)

In [None]:
# In-sample check: training-set predictions of the default forest against training targets
foresttrainpreds = forest.predict(X_train_final)
print(pearsonr(foresttrainpreds, y_train))

In [None]:
## Random forest grid search
# Exhaustive 10-fold cross-validated search over max_depth,
# min_samples_leaf and max_features
forest_params = {
    'max_depth': list(range(5, 31, 5)),
    'min_samples_leaf': [5, 10, 15],
    'max_features': list(range(10, 46, 5)),
}

locally_best_forest = GridSearchCV(estimator=forest, param_grid=forest_params, cv=10)
locally_best_forest.fit(X_train_final, y_train)

In [None]:
### Best hyperparameters for random forest
# Displayed as a (best parameter dict, best cross-validation score) pair
(
    locally_best_forest.best_params_,
    locally_best_forest.best_score_,
)

In [None]:
##Random forest scatterplot and performance metrics
forestpreds = locally_best_forest.predict(X_holdout_final)
forestpreds_df = pd.DataFrame(columns=['yholdout', 'forestpreds'])
forestpreds_df['yholdout'] = y_holdout
forestpreds_df['forestpreds'] = forestpreds

# Mean absolute error, and mean absolute error relative to actual attendance.
# NOTE: percerror is a fraction (it is not multiplied by 100).
actual = np.asarray(y_holdout, dtype=float)
abs_errors = np.abs(forestpreds - actual)
diffscores = round(float(abs_errors.mean()), 2)
percerror = round(float((abs_errors / actual).mean()), 2)

rr = round(metrics.r2_score(y_holdout, forestpreds), 2)

# Label the axes through the Axes object. Assigning strings to
# plt.ylabel / plt.xlabel (as was done before) replaces the pyplot functions
# instead of labelling the plot; the Axes methods set the labels and do not
# depend on those pyplot attributes.
fig, ax = plt.subplots()
ax.scatter(y_holdout, forestpreds)
ax.set_ylabel('Prediction')
ax.set_xlabel('Actual')
ax.set_title("R squared = " + str(rr) +
             '\n Average error in attendance prediction = ' + str(diffscores) +
             '\n Percentage error = ' + str(percerror))
plt.show()

In [None]:
##Compare correlation coefficients of different models
# Add the tuned forest's holdout predictions, then show the pairwise correlations
dfpreds_df = dfpreds_df.assign(ForestPreds=forestpreds)
dfpreds_df.corr()

In [None]:
##Estimate importance of random forest coefficients
# The forest was fitted on X_train_final, whose columns are the scaled
# Numerical_cols followed by Other_cols. Label the importances in that same
# order; labelling with data.columns would attach names to the wrong
# features whenever the original column order differs.
feature_names = list(Numerical_cols) + list(Other_cols)
rf_importance = pd.DataFrame(
    data=locally_best_forest.best_estimator_.feature_importances_,
    columns=['coefficients'],
    index=feature_names,
)
rf_importance.sort_values('coefficients', ascending=False)

In [None]:
# Predictions for the 2019 rows from the cross-validated lasso and the tuned forest
lassocvpreds2019 = lasso_cv.predict(X2019_final)
forestpreds2019 = locally_best_forest.predict(X2019_final)

In [None]:
### Create dataframe for web app

###Get day of week, month
import calendar

# 2019 targets and forest predictions, indexed like y2019
forestpreds2019_df = pd.DataFrame({'y2019': y2019, 'forestpreds2019': forestpreds2019})
forestpreds2019_df['month'] = forestpreds2019_df.index.month

# Full weekday name for every row, derived from the datetime index
dayofweek = [calendar.day_name[stamp.weekday()] for stamp in forestpreds2019_df.index]

# Month number -> month name; months outside March..October map to NaN
monthdict = {
    3: 'March',
    4: 'April',
    5: 'May',
    6: 'June',
    7: 'July',
    8: 'August',
    9: 'September',
    10: 'October',
}
forestpreds2019_df['monthletter'] = forestpreds2019_df['month'].map(monthdict)

In [None]:
forestpreds2019_df['lassopreds'] = lassocvpreds2019
forestpreds2019_df['day'] = dayofweek

# Opponent label per row, written as (opponent, number of consecutive rows).
# Expanded in order, the runs produce 81 labels that are assigned
# positionally to the rows of forestpreds2019_df.
opponent_runs = [
    ("Tigers", 4),
    ("Orioles", 3),
    ("Rays", 3),
    ("Giants", 2),
    ("Athletics", 3),
    ("Twins", 3),
    ("White Sox", 3),
    ("Red Sox", 4),
    ("Padres", 3),
    ("Yankees", 3),
    ("D-backs", 3),
    ("Angels", 4),
    ("Royals", 4),
    ("Red Sox", 3),
    ("Orioles", 3),
    ("Indians", 3),
    ("Rays", 3),
    ("Yankees", 4),
    ("Rangers", 3),
    ("Mariners", 3),
    ("Braves", 2),
    ("Astros", 3),
    ("Red Sox", 3),
    ("Yankees", 3),
    ("Orioles", 3),
    ("Rays", 3),
]
forestpreds2019_df['opponent'] = [
    team for team, games in opponent_runs for _ in range(games)
]



In [None]:
# Write the 2019 prediction table (index included) to CSV
forestpreds2019_df.to_csv(path_or_buf='2019finalpredictions.csv')