In [100]:
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.model_selection import GridSearchCV
from collections import OrderedDict
%matplotlib inline

# Regularized Regression

Now that we have our baseline results, we would like to fit the model directly, treating $\theta$ and $\gamma$ as parameters to be estimated. We can do so by rewriting the baseline model as follows:

$$ Y_{um} = \mu + \bar{\theta} \cdot I_u + \bar{\gamma} \cdot I_m$$

Where $\bar{\theta}$ is a vector of coefficients for the users who have made ratings and $\bar{\gamma}$ is a vector of coefficients for the restaurants that have been rated. We multiply these by the indicator vectors $I_u$ and $I_m$, respectively, which select the $u$-th user and the $m$-th restaurant; these indicators form the columns of the feature matrix.

The best way to implement this is to construct an $N \times (U + M + 1)$ matrix, where $N$ is the number of reviews, $U$ is the total number of reviewers, and $M$ is the total number of restaurants (the additional column is for the intercept).

We will run this matrix through a multiple linear regression to compare the results with the baseline method (**we expect to get the same results as before**), and we will also fit it with Ridge and Lasso regularization. Regularization should help, because the number of features in this regression has grown greatly.

## Run regularization on full universe of reviews

In [105]:
## Load the train and test review files (the "OH" state files named in the paths below)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/states/train/OH/train_150.csv",
        "Data/states/test/OH/test_150.csv",
    )
)
# Report (rows, columns) of each split, train first.
print(train_df.shape, test_df.shape, sep="\n")

(3925, 13)
(942, 13)


In [106]:
def subsample(df, size, random_state=120717):
    """Draw `size` rows from `df` while keeping at least one row per user.

    The frame is shuffled once; the first shuffled row of every distinct
    ``user_id`` is kept, and the sample is then topped up with rows drawn
    without replacement from everything that was not kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``user_id`` column. Any index is accepted.
    size : int
        Total number of rows to return.
    random_state : int, optional
        Seed used for both the shuffle and the top-up draw.

    Returns
    -------
    pandas.DataFrame
        `size` rows of `df` (original index labels preserved): the per-user
        rows first, followed by the top-up rows.

    Raises
    ------
    ValueError
        If `size` is smaller than the number of distinct users, or larger
        than the number of rows in `df`.
    """
    shuffled = df.sample(frac=1, random_state=random_state)

    # In shuffled order, the first occurrence of each user_id is a randomly
    # chosen review of that user. Selecting with a boolean mask avoids mixing
    # index labels with positions and avoids a Python loop over users.
    is_first_for_user = ~shuffled['user_id'].duplicated(keep='first')
    df_at_least_one = shuffled[is_first_for_user]
    remaining_obs = shuffled[~is_first_for_user]

    remaining_to_sample = size - len(df_at_least_one)
    if remaining_to_sample < 0:
        raise ValueError(
            "size=%d is smaller than the number of distinct users (%d)"
            % (size, len(df_at_least_one))
        )
    if remaining_to_sample > len(remaining_obs):
        raise ValueError(
            "size=%d is larger than the number of available rows (%d)"
            % (size, len(df))
        )

    top_up = remaining_obs.sample(n=remaining_to_sample, random_state=random_state)

    return pd.concat([df_at_least_one, top_up])
    
    

In [107]:
# Keep only the two identifier columns and the rating being modelled.
train_small = train_df[['user_id','business_id','review_score']]
test_small = test_df[['user_id','business_id','review_score']]

# Number of distinct users in the training data. We should have more than this
# number of observations in any subsampled data set. It is the cell's last
# expression so that the notebook actually displays it.
len(train_df.user_id.unique())

In [108]:
# Block to check how many unique users and observations we would have total, taking several different thresholds
#combined = train_df.append(test_df)
#test = combined.groupby('user_id').count()
#print(len(test[test['user_review_count']>100].business_id))
#test[test['user_review_count']>100].business_id.sum()

In [109]:
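# This doesn't work -- note that 250000 exceeds the 3925 rows in train_small, so that many rows cannot be drawn without replacement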
#sub_samp = subsample(train_small,250000)

In [110]:
## One-hot encode user_id and business_id, separately for the training and test sets.
## Each frame only receives indicator columns for the ids it actually contains.
train_dummies, test_dummies = (
    pd.get_dummies(split, columns=['user_id', 'business_id'], drop_first=False)
    for split in (train_small, test_small)
)

In [111]:
# Preview the first rows of the one-hot encoded training frame.
train_dummies.head()

Unnamed: 0,review_score,user_id_3Uv0dGI2IXJb2OUj8R2GJA,user_id_5QFws6LKMCZCgKHl8WR1jQ,user_id_CcOgdfEJxgrxTAwag5k18Q,user_id_H_-K6erSJYtzg3ZEvOg3EQ,user_id_NfU0zDaTMEQ4-X9dbQWd9A,user_id_PrwnAL82LL4Ewt_wJpHWCA,user_id_QaN-nccbLZPWzownQYgTVQ,user_id_RlpkcJqctkKXl-LO1IAtig,user_id_RylA6VZUTRuMGBu4nHKbCw,...,business_id_zW2Nzu38bB5nlOhhim-O5A,business_id_zYbEKtLeosxhTzF4zSRIyA,business_id_zc0sUY7iWuJB93AHWKy_xw,business_id_zhBkNLn2KPnh5-NIueXVHA,business_id_zl3Y1_DprpVzY3Izad4M-Q,business_id_zlZQM-cJPVW7FHJsYTvyYg,business_id_zluk4cL7Ch-uRlRply42ZQ,business_id_zm3w7U26kDxREFDSLJRBgQ,business_id_zo9fKM_Sty6qGztXKoMPmQ,business_id_zzSYBWuv_fXGtSgsO-6_1g
0,3.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# Empty frames that only carry column labels: one per split, plus their union.
# (Built with Index.union instead of DataFrame.append, which pandas 2.0 removed.)
train_cols = pd.DataFrame(columns=train_dummies.columns)
test_cols = pd.DataFrame(columns=test_dummies.columns)
all_cols = pd.DataFrame(columns=train_cols.columns.union(test_cols.columns))

# Give both splits the same columns in the same order. An indicator column
# that exists only in the other split is introduced as missing and then set
# to 0, i.e. "this user/business does not occur in this row". The cast makes
# the whole design matrix a single float dtype.
train = train_dummies.reindex(columns=all_cols.columns).fillna(0.).astype(float)
test = test_dummies.reindex(columns=all_cols.columns).fillna(0.).astype(float)
print(train.shape)
print(test.shape)

(3925, 1938)
(942, 1938)


In [113]:
# Separate the indicator features from the target (review_score) for each split.
X_train_all, y_train_all = train.drop(columns='review_score'), train['review_score']
X_test_all, y_test_all = test.drop(columns='review_score'), test['review_score']

In [114]:
# Sanity check: test features and test target must have the same number of rows.
print(X_test_all.shape, y_test_all.shape, sep="\n")

(942, 1937)
(942,)


In [115]:
# Unregularized least-squares fit on the full indicator matrix; this is the
# reference point for the Ridge and Lasso fits.
# NOTE(review): the design matrix holds a complete set of user and business
# indicators (drop_first=False) together with an intercept, so its columns are
# linearly dependent. That is presumably why the recorded coefficients and
# test score blow up -- confirm before relying on this fit.
baseline_all = LinearRegression(fit_intercept=True).fit(X_train_all, y_train_all)

print('Baseline Intercept:', baseline_all.intercept_)
print('Baseline Coefficients:', baseline_all.coef_)
# .score() is the coefficient of determination (R^2) of the predictions.
print('Baseline Train Score:', baseline_all.score(X_train_all, y_train_all))
print('Baseline Test Score:', baseline_all.score(X_test_all, y_test_all))

Baseline Intercept: -3.64458158917e+12
Baseline Coefficients: [  8.14422317e+10  -1.24945923e+14   7.45084593e+13 ...,   3.56313936e+12
   3.56313936e+12   3.56313936e+12]
Baseline Train Score: 0.568596188694
Baseline Test Score: -7.99233209172e+26


In [116]:
# Implement RidgeCV and LassoCV
# Candidate regularization strengths; each model picks one by cross-validation.
lambdas = [.001,.005,1,5,10,50,100,500,1000]

# The `normalize` keyword is intentionally not passed: it was removed in
# scikit-learn 1.2, and it was ignored whenever fit_intercept=False, so
# dropping it leaves the fitted models unchanged.
clf = RidgeCV(alphas=lambdas, fit_intercept=False)
clf.fit(X_train_all, y_train_all)
# Column positions ordered by increasing |coefficient|; X_train_all.columns[si]
# lists the features from least to most influential.
si = np.argsort(np.abs(clf.coef_))

print("----")
print('Ridge Train Score', clf.score(X_train_all, y_train_all))
print('Ridge Test Score', clf.score(X_test_all, y_test_all))

clfl = LassoCV(alphas=lambdas, fit_intercept=False)
clfl.fit(X_train_all, y_train_all)

print("----")
print('Lasso Train Score', clfl.score(X_train_all, y_train_all))
print('Lasso Test Score', clfl.score(X_test_all, y_test_all))

----
Ridge Train Score 0.30892083752
Ridge Test Score 0.0866833443683
----
Lasso Train Score 0.105065079606
Lasso Test Score 0.0501492433174
