In [1]:
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
%matplotlib inline

  from pandas.core import datetools


# Regularized Regression

Now that we have our baseline results, we would like to run the actual fit of the model leaving parameters $\theta$ and $\gamma$ to be fit. We can do so by reconstructing the baseline model to the following:

$$ Y_{um} = \mu + \bar{\theta} \cdot I_u + \bar{\gamma} \cdot I_m$$

Where $\bar{\theta}$ is a vector of coefficients for users who have made ratings and $\bar{\gamma}$ is a coefficients for restaurants for which ratings have been made. We multiply these by indicator variables $I_u$ and $I_m$, respectively, for the u-th user and m-th restaurant to go in the feature matrix.  

The best way to implement that would be to construct an $N$ by $U + M + 1$ matrix, where the $N$ is the number of reviews, $U$ is the total number of reviewers, and $M$ is the total number of restaurants (we include an additional column for the intercept). 

We will run this matrix through a multiple linear regression to compare results with baseline method (**we expect to get the same results as before**), but we will also run this matrix through Ridge and Lasso regularization. This should help, as the number of features included in this regression has expanded greatly.

## Run regularization on full universe of reviews

In [64]:
## Load in test and train data for all markets
train_df = pd.read_csv("Data/states/train/PA/train_150.csv")
test_df = pd.read_csv("Data/states/test/PA/test_150.csv")
print(train_df.shape)
print(test_df.shape)

(4870, 13)
(1231, 13)


In [65]:
def subsample(df,size):
    ids = list(df.user_id.unique())
    
    df_reshuffle = df.sample(frac=1, random_state = 120717)
    
    random_indices = []
    
    for i in ids:
        id_obs = df_reshuffle[df_reshuffle['user_id']==i]
        random_indices.append(id_obs[:1].index.tolist())
    print(random_indices)
    random_indices_flat = list(chain.from_iterable(random_indices))
    
    df_at_least_one = df.iloc[random_indices_flat]
    
    remaining_obs = df_reshuffle[~df_reshuffle.isin(df_at_least_one)].dropna()
    
    remaining_to_sample = size - len(ids)
    
    top_up = remaining_obs.sample(n=remaining_to_sample, random_state = 120717)
    
    final_df = df_at_least_one.append(top_up)
    
    return final_df
    
    

In [66]:
# We should have more than this number of observations in any subsampled data set 
train_small = train_df[['user_id','business_id','review_score']]
len(train_df.user_id.unique())
test_small = test_df[['user_id','business_id','review_score']]

In [67]:
# Block to check how many unique users and observations we would have total, taking several different thresholds
#combined = train_df.append(test_df)
#test = combined.groupby('user_id').count()
#print(len(test[test['user_review_count']>100].business_id))
#test[test['user_review_count']>100].business_id.sum()

In [68]:
# This doesn't work
#sub_samp = subsample(train_small,250000)

In [69]:
## Create user and business dummies in test and training set
train_dummies = pd.get_dummies(train_small, columns=['user_id','business_id'], drop_first=False)
test_dummies = pd.get_dummies(test_small, columns=['user_id','business_id'], drop_first=False)

In [70]:
train_dummies.head()

Unnamed: 0,review_score,user_id_-Vu7L3U7-kxDyY1VHxw3zw,user_id_135DbbQnr3BEkQbBzZ9T1A,user_id_2jKzO_01d12oiu-2bOYcYg,user_id_4m9NXICYBC5i9t4aTt-I6w,user_id_4wp4XI9AxKNqJima-xahlg,user_id_5JVY32_bmTBfIGpCCsnAfw,user_id_6Ki3bAL0wx9ymbdJqbSWMA,user_id_7AGLlj5YzqdBfCPiZCa1mQ,user_id_8AwcaBJjiMpQ__FPxktwwQ,...,business_id_zaDZo74bWzds9MunE6XVxA,business_id_zdVDfA4S1EvkPUlfA7eJfg,business_id_zi6cB_bkswWPLD2k3IVtyg,business_id_ziJsGjXvidzZWC1I0-SOSg,business_id_zjOfUAyqyH_BPe7EkFBrEQ,business_id_zkW7OAv8Cnb-3SxEBOubTQ,business_id_zmoQ2eIDyeKlKVGA8p9esQ,business_id_zuUWG7OdMv6awFRYhEYT5Q,business_id_zxSfGIhK3hH3vVz_pS5eaA,business_id_zzwhN7x37nyjP0ZM8oiHmw
0,3.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [71]:
train_cols = pd.DataFrame(columns = train_dummies.columns)
test_cols = pd.DataFrame(columns = test_dummies.columns)
all_cols = train_cols.append(test_cols)
all_cols

train = all_cols.append(train_dummies)
train = train.fillna(0.)
test = all_cols.append(test_dummies)
test = test.fillna(0.)
print(train.shape)
print(test.shape)

(4870, 1974)
(1231, 1974)


In [72]:
X_train_all = train.drop('review_score', axis=1)
y_train_all = train['review_score']

X_test_all = test.drop('review_score', axis=1)
y_test_all = test['review_score']

In [73]:
print(X_test_all.shape)
print(y_test_all.shape)

(1231, 1973)
(1231,)


In [74]:
baseline_all = LinearRegression(fit_intercept=True)
baseline_all.fit(X_train_all, y_train_all)

print('Baseline Intercept:', baseline_all.intercept_)
print('Baseline Coefficients:', baseline_all.coef_)
print('Baseline Train Score:', baseline_all.score(X_train_all, y_train_all))
print('Baseline Test Score:', baseline_all.score(X_test_all, y_test_all))

Baseline Intercept: 3.7310321558e+12
Baseline Coefficients: [ -4.64286729e+10  -4.64286729e+10  -4.64286729e+10 ...,  -3.68460348e+12
  -3.68460348e+12  -3.68460348e+12]
Baseline Train Score: 0.546363450256
Baseline Test Score: -3.0876366392e+26


In [75]:
# Implement RidgeCV and LassoCV
lambdas = [.001,.005,1,5,10,50,100,500,1000]

clf = RidgeCV(alphas=lambdas, fit_intercept=False, normalize=True)
clf.fit(X_train_all, y_train_all)
si= np.argsort(np.abs(clf.coef_))

print("----")
#print(clf.coef_, data_train.columns)
print('Ridge Train Score', clf.score(X_train_all, y_train_all))
print('Ridge Test Score', clf.score(X_test_all, y_test_all))
#print('')
#for i,x in enumerate(clf.coef_[si]):
#    print(data_train.columns[si[i]], x)
#    if abs(x)<.01:
#        print(i,x, X_train.columns[i])
        
clfl = LassoCV(cv=10, alphas=lambdas, fit_intercept=False, normalize=False)
#preprocessing.StandardScaler()
clfl.fit(X_train_all, y_train_all)

print("----")
#print(clf.coef_)
print('Lasso Train Score', clfl.score(X_train_all, y_train_all))
print('Lasso Test Score', clfl.score(X_test_all, y_test_all))
#print('')
#si= np.argsort(np.abs(clfl.coef_))
#for i,x in enumerate(clfl.coef_[si]):
#    print(data_train.columns[si][i], x)
    #if abs(x)<.01:
        #print(i,x, X_train.columns[i])

----
Ridge Train Score 0.321330051605
Ridge Test Score 0.108080876198
----
Lasso Train Score 0.0977368611093
Lasso Test Score 0.0649921333855


## Run regularization for each of the eight markets