In [14]:
import warnings
# Install a blanket 'ignore' filter: with no category or module given, it
# applies to every warning raised after this cell runs.
# NOTE(review): this also hides deprecation and solver warnings from the
# libraries used below — confirm that is intended.
warnings.filterwarnings('ignore')

In [15]:
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from collections import OrderedDict
%matplotlib inline

# Regularized Regression

Now that we have our baseline results, we would like to fit the model itself, treating the parameters $\theta$ and $\gamma$ as quantities to be estimated. We can do so by reformulating the baseline model as follows:

$$ Y_{um} = \mu + \bar{\theta} \cdot I_u + \bar{\gamma} \cdot I_m$$

Where $\bar{\theta}$ is a vector of coefficients for users who have made ratings and $\bar{\gamma}$ is a vector of coefficients for restaurants for which ratings have been made. We multiply these by the indicator variables $I_u$ and $I_m$, respectively, for the u-th user and m-th restaurant; these indicators form the columns of the feature matrix.

We implement this by constructing an $N$ by $U + M + 1$ matrix, where $N$ is the number of reviews, $U$ is the total number of reviewers, and $M$ is the total number of restaurants (we include an additional column for the intercept).

We will run this matrix through a multiple linear regression to compare results with the baseline method, and we will also run it through Ridge and Lasso regularization. We evaluate each model with both $R^2$ and $RMSE$, since the data contains a lot of noise that will likely be overrepresented by $R^2$ alone. Regularization should help relative to the plain linear regression on this matrix, because the number of features included in the regression has expanded greatly.

## Run regularization on full universe of reviews

In [16]:
## Read the train and test review splits from the OH data files
train_df = pd.read_csv("Data/states/train/OH/train_150.csv")
test_df = pd.read_csv("Data/states/test/OH/test_150.csv")

# Report (rows, columns) for each split, one per line
print(train_df.shape, test_df.shape, sep="\n")

(3925, 13)
(942, 13)


In [17]:
# Keep only the two identifier columns and the rating column from each split.
train_small = train_df[['user_id','business_id','review_score']]
test_small = test_df[['user_id','business_id','review_score']]

# Number of distinct users in the training data. We should have more than this
# number of observations in any subsampled data set.
# Kept as the cell's final expression so the notebook displays the value.
len(train_df.user_id.unique())

In [18]:
## One-hot encode user_id and business_id, separately for train and test.
## Every level is kept (drop_first=False); all other columns pass through unchanged.
train_dummies = pd.get_dummies(
    data=train_small,
    columns=['user_id','business_id'],
    drop_first=False,
)
test_dummies = pd.get_dummies(
    data=test_small,
    columns=['user_id','business_id'],
    drop_first=False,
)

In [19]:
# Preview the first five rows of the encoded training frame
train_dummies.head(n=5)

Unnamed: 0,review_score,user_id_3Uv0dGI2IXJb2OUj8R2GJA,user_id_5QFws6LKMCZCgKHl8WR1jQ,user_id_CcOgdfEJxgrxTAwag5k18Q,user_id_H_-K6erSJYtzg3ZEvOg3EQ,user_id_NfU0zDaTMEQ4-X9dbQWd9A,user_id_PrwnAL82LL4Ewt_wJpHWCA,user_id_QaN-nccbLZPWzownQYgTVQ,user_id_RlpkcJqctkKXl-LO1IAtig,user_id_RylA6VZUTRuMGBu4nHKbCw,...,business_id_zW2Nzu38bB5nlOhhim-O5A,business_id_zYbEKtLeosxhTzF4zSRIyA,business_id_zc0sUY7iWuJB93AHWKy_xw,business_id_zhBkNLn2KPnh5-NIueXVHA,business_id_zl3Y1_DprpVzY3Izad4M-Q,business_id_zlZQM-cJPVW7FHJsYTvyYg,business_id_zluk4cL7Ch-uRlRply42ZQ,business_id_zm3w7U26kDxREFDSLJRBgQ,business_id_zo9fKM_Sty6qGztXKoMPmQ,business_id_zzSYBWuv_fXGtSgsO-6_1g
0,3.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
## Align train and test on one shared set of indicator columns.
## The two splits were one-hot encoded separately, so each one lacks the columns
## for users/businesses that appear only in the other split.
train_cols = pd.DataFrame(columns=train_dummies.columns)
test_cols = pd.DataFrame(columns=test_dummies.columns)

# Empty frame holding the ordered union of columns: every train column first,
# then the columns that occur only in test (sort=False keeps that order).
all_cols = pd.DataFrame(columns=train_cols.columns.union(test_cols.columns, sort=False))

# reindex adds each missing column as NaN; fillna then turns those (and any
# other NaN) into 0. This is used instead of DataFrame.append, which pandas 2.0
# removed. astype(float) gives a single numeric dtype across the original and
# the added columns.
train = train_dummies.reindex(columns=all_cols.columns).fillna(0.).astype(float)
test = test_dummies.reindex(columns=all_cols.columns).fillna(0.).astype(float)

# Both matrices must expose identical columns in identical order, otherwise a
# model fit on one cannot be scored on the other.
assert train.columns.equals(test.columns)

print(train.shape)
print(test.shape)

(3925, 1938)
(942, 1938)


In [21]:
## Separate the target (review_score) from the feature columns in each split;
## these are the inputs for the linear, Ridge, and Lasso regressions
y_train_all = train['review_score']
X_train_all = train.drop(columns=['review_score'])

y_test_all = test['review_score']
X_test_all = test.drop(columns=['review_score'])

In [22]:
## Fit an unregularized linear regression (with intercept) on the training matrix
baseline_all = LinearRegression(fit_intercept=True).fit(X_train_all, y_train_all)

# Estimator .score() on each split (R^2 for regressors)
print(
    'Linear Regression Train Score:',
    baseline_all.score(X_train_all, y_train_all),
)
print(
    'Linear Regression Score:',
    baseline_all.score(X_test_all, y_test_all),
)

# Root-mean-squared error on each split
print(
    'Linear Regression Train RMSE:',
    sqrt(mean_squared_error(y_train_all, baseline_all.predict(X_train_all))),
)
print(
    'Linear Regression Test RMSE:',
    sqrt(mean_squared_error(y_test_all, baseline_all.predict(X_test_all))),
)

Linear Regression Train Score: 0.568596188694
Linear Regression Score: -7.99233209172e+26
Linear Regression Train RMSE: 0.6304631614809212
Linear Regression Test RMSE: 27961194912001.582


We see here that, because of all of the added indicator features for every user and every restaurant, this model is significantly overfitting to the training set: the training $R^2$ is about 0.57, while the test $R^2$ is hugely negative. Our next step is regularization to correct for this overfitting.

In [23]:
# Candidate regularization strengths shared by the Ridge and Lasso searches
lambdas = [.001,.005,1,5,10,50,100,500,1000]

# Ridge: choose the penalty from `lambdas` by 5-fold cross-validation, then
# report score and RMSE on both splits
clf = RidgeCV(alphas=lambdas, fit_intercept=True, cv=5).fit(X_train_all, y_train_all)

print("----")
print(
    'Ridge Train Score',
    clf.score(X_train_all, y_train_all),
)
print(
    'Ridge Test Score',
    clf.score(X_test_all, y_test_all),
)
print(
    'Ridge Train RMSE:',
    sqrt(mean_squared_error(y_train_all, clf.predict(X_train_all))),
)
print(
    'Ridge Test RMSE:',
    sqrt(mean_squared_error(y_test_all, clf.predict(X_test_all))),
)

# Lasso: same candidate grid and fold count, same report
clfl = LassoCV(alphas=lambdas, fit_intercept=True, cv=5).fit(X_train_all, y_train_all)

print("----")
print(
    'Lasso Train Score',
    clfl.score(X_train_all, y_train_all),
)
print(
    'Lasso Test Score',
    clfl.score(X_test_all, y_test_all),
)
print(
    'Lasso Train RMSE:',
    sqrt(mean_squared_error(y_train_all, clfl.predict(X_train_all))),
)
print(
    'Lasso Test RMSE:',
    sqrt(mean_squared_error(y_test_all, clfl.predict(X_test_all))),
)

----
Ridge Train Score 0.316232352454
Ridge Test Score 0.0995977594321
Ridge Train RMSE: 0.7937285478990413
Ridge Test RMSE: 0.9385064244062807
----
Lasso Train Score 0.104427996576
Lasso Test Score 0.0524151551981
Lasso Train RMSE: 0.9083808726779814
Lasso Test RMSE: 0.9627821269653825


As we see, Ridge does much better than Lasso. This is because we do not want to zero out features, as Lasso does; we simply want to penalize the magnitude of each coefficient. This method still does not do quite as well as the baseline model from the previous section, but the RMSE values of that model and of Ridge are comparable.