In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn import preprocessing
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix

from sklearn.model_selection import cross_val_score
from sklearn.utils import resample
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from IPython.display import display
import pandas as pd
%matplotlib inline

import seaborn as sns
sns.set(style='whitegrid')
pd.set_option('display.width', 1500)
pd.set_option('display.max_columns', 100)

import warnings
warnings.filterwarnings('ignore')

# import stuff for modeling
from sklearn.neighbors import KNeighborsRegressor
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV

In [2]:
data_train = pd.read_csv('../train_data/data_train.csv', index_col = 0)
data_train = data_train.dropna()
data_train_dummy = pd.get_dummies(data_train, columns = ['weather'])
x_train = data_train_dummy.drop(['goal_diff'], axis = 1)
y_train = data_train_dummy[['goal_diff']]

In [21]:
data_test = pd.read_csv('../test_data/data_test.csv', index_col = 0)
data_test = data_train.dropna()
data_test_dummy = pd.get_dummies(data_test, columns = ['weather'])
x_test = data_test_dummy.drop(['goal_diff'], axis = 1)
y_test = data_test_dummy[['goal_diff']]

Start off with KNN, using cross validation

In [3]:
neighbors = [1, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 100]
knn_cv_scores = []

# perform 5-fold cross validation
for k in neighbors:
    knn = KNeighborsRegressor(n_neighbors = k)
    scores = cross_val_score(knn, x_train, y_train, cv = 5, scoring = 'r2')
    knn_cv_scores.append(scores.mean())

In [4]:
knn_cv_scores

[-0.65120082048891592,
 0.028425328744255872,
 0.13517035939482239,
 0.17089338049954059,
 0.18978256157432555,
 0.19405485086106861,
 0.19312757077678383,
 0.19399264740284011,
 0.19849491417313356,
 0.2007823669038232,
 0.20372280263116327,
 0.21018572471828464]

Now, try making a linear model, using cross validation

In [16]:
linear_reg_scores = []
linear_reg_cv_scores = cross_val_score(LinearRegression(), sm.add_constant(x_train), y_train, cv = 5, scoring = 'r2')
linear_reg_cv_scores

array([ 0.21951063,  0.24728907,  0.28615433,  0.13306209,  0.18650881])

In [17]:
# quick OLS model
ols_model = sm.OLS(endog = y_train, exog = sm.add_constant(x_train), hasconst=True).fit()
r2_score(y_train, ols_model.predict(sm.add_constant(x_train)))

0.26874869374924337

In [24]:
# generate list of lambdas
lambdas = [0.1, 0.5, 1, 5, 10, 50, 100]

# fit Ridge Model
fit_ridge = RidgeCV(alphas = lambdas, cv = 5).fit(x_train, y_train)

# fit Lasso Model
fit_lasso = LassoCV(alphas = lambdas, cv = 5, max_iter = 100000).fit(x_train, y_train)

ypredict_ridge = fit_ridge.predict(x_test)

ypredict_lasso = fit_lasso.predict(x_test)

print("ridge", r2_score(y_test, ypredict_ridge))
print("lasso", r2_score(y_test, ypredict_lasso))

ridge 0.26676448349
lasso 0.264543361969
