In [1]:
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
%matplotlib inline

  from pandas.core import datetools


# Regularized Regression

Now that we have our baseline results, we would like to fit the model directly, treating the parameters $\theta$ and $\gamma$ as coefficients to be estimated. We can do so by rewriting the baseline model as follows:

$$ Y_{um} = \mu + \bar{\theta} \cdot I_u + \bar{\gamma} \cdot I_m$$

where $\bar{\theta}$ is a vector of coefficients for users who have made ratings and $\bar{\gamma}$ is a vector of coefficients for restaurants for which ratings have been made. We multiply these by the indicator vectors $I_u$ and $I_m$, respectively, which mark the $u$-th user and the $m$-th restaurant in the feature matrix.  

The best way to implement this is to construct an $N$ by $U + M + 1$ matrix, where $N$ is the number of reviews, $U$ is the total number of reviewers, and $M$ is the total number of restaurants (the additional column is for the intercept). 

We will run this matrix through a multiple linear regression to compare the results with the baseline method (**we expect to get the same results as before**), and we will also fit it with Ridge and Lasso regularization. Regularization should help, as the number of features included in this regression has expanded greatly.

## Run regularization on full universe of reviews

In [2]:
# Read the train and test review tables (all markets) and report their dimensions.
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
for split_frame in (train_df, test_df):
    print(split_frame.shape)

(1409493, 13)
(535585, 13)


In [14]:
# Scratch check of the "rows in t2 that are not in t1" trick:
# mask every cell of t2 that also appears in t1 (matched on index label and
# column), then drop the rows that ended up containing NaN.
t1 = train_df.iloc[:10]
t2 = train_df.iloc[:15]
t2.where(~t2.isin(t1)).dropna()


Unnamed: 0,review_date,business_longitude,business_id,business_categories,business_name,business_state,review_score,user_id,user_average_rating,business_review_count,business_average_rating,business_latitude,user_review_count
10,2012-04-30,-115.119257,p5rpYtxS5xPQjt3MXYVEwA,"['Vegetarian', 'Restaurants', 'Burgers', 'Vega...",Greens and Proteins,NV,5.0,---1lKK3aKOuomHnwAkAow,3.97,600.0,4.0,36.026232,245.0
11,2015-11-30,-115.134574,Vg1C_1eqwIwkZLIXGMTW3g,"['Nightlife', 'British', 'Bars', 'Sports Bars'...",Crown & Anchor British Pub,NV,1.0,---1lKK3aKOuomHnwAkAow,3.97,366.0,3.5,36.101467,245.0
12,2010-11-05,-115.285177,CWNMLT-ppaUjLMmrnYDPVg,"['Hawaiian', 'Asian Fusion', 'Restaurants']",Roy's Restaurant,NV,5.0,---1lKK3aKOuomHnwAkAow,3.97,97.0,4.0,36.158503,245.0
13,2012-05-01,-115.139584,Xspg78dOvuedvuPEUwZBjw,"['Coffee & Tea', 'Sandwiches', 'Restaurants', ...",The Beat Coffeehouse & Records,NV,4.0,---1lKK3aKOuomHnwAkAow,3.97,329.0,4.0,36.169108,245.0
14,2010-12-02,-115.312478,WOO81gScY3_VpaIfXFAKpw,"['French', 'Restaurants']",Technique Restaurant,NV,4.0,---1lKK3aKOuomHnwAkAow,3.97,30.0,4.0,36.188111,245.0


In [25]:
def subsample(df, size, random_state=None):
    """Draw ``size`` rows from ``df`` so that every user appears at least once.

    One randomly chosen row is kept for each distinct ``user_id``; the rest of
    the sample is drawn without replacement from the rows not already kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``user_id`` column. Any index is accepted.
    size : int
        Total number of rows to return.
    random_state : int or numpy.random.RandomState, optional
        Forwarded to ``DataFrame.sample`` for both the shuffle and the top-up
        draw so that a run can be reproduced. ``None`` (the default) keeps the
        draw unseeded.

    Returns
    -------
    pandas.DataFrame
        ``size`` rows of ``df`` with their original index labels and dtypes:
        first one row per user (in shuffled order), then the top-up rows.

    Raises
    ------
    ValueError
        If ``size`` is smaller than the number of distinct users, or larger
        than the number of rows in ``df``.
    """
    # Shuffle once so that "first row per user" is a random row per user.
    shuffled = df.sample(frac=1, random_state=random_state)

    # False on each user's first occurrence in the shuffled order, True on
    # every later row. Splitting by this positional mask (instead of looking
    # rows up by index label) works for any index and avoids a per-user scan
    # of the whole frame.
    is_repeat = shuffled.duplicated(subset='user_id', keep='first')
    one_per_user = shuffled[~is_repeat]
    remaining = shuffled[is_repeat]

    n_top_up = size - len(one_per_user)
    if n_top_up < 0:
        raise ValueError(
            "size ({}) is smaller than the number of distinct users ({})".format(
                size, len(one_per_user)))
    if n_top_up > len(remaining):
        raise ValueError(
            "size ({}) is larger than the number of rows available ({})".format(
                size, len(shuffled)))
    if n_top_up == 0:
        return one_per_user

    top_up = remaining.sample(n=n_top_up, random_state=random_state)

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    return pd.concat([one_per_user, top_up])
    
    

In [26]:
# Smoke-test subsample on the first 200 training rows, asking for 25 rows back.
testing1 = train_df.iloc[:200]
try_this = subsample(df=testing1, size=25)

In [28]:
# The subsample should contain exactly as many distinct users as its input.
print(testing1['user_id'].nunique(dropna=False))
try_this['user_id'].nunique(dropna=False)

15


15

In [54]:
# Keep only the three columns the user/business model needs, then count users.
train_small = train_df.loc[:, ['user_id', 'business_id', 'review_score']]
train_small['user_id'].nunique(dropna=False)

163377

In [None]:
# Draw 300,000 training rows while keeping at least one review per user.
sub_samp = subsample(df=train_small, size=300000)

In [58]:
# Display the object bound to `sub_samp`.
# NOTE(review): the recorded output `[[12]]` is not a DataFrame, and the cell
# that assigns `sub_samp` from `subsample(...)` has no execution count, so this
# output looks stale — re-run from a fresh kernel to confirm.
sub_samp

[[12]]

In [None]:
# Work with the first 500,000 training rows and show the resulting dimensions.
train_sample = train_df.iloc[:500000]
train_sample.shape

In [None]:
## One-hot encode user_id in the training sample (one indicator column per user,
## no level dropped).
# NOTE(review): the result is dense, so memory grows with rows x distinct users;
# consider get_dummies(..., sparse=True) if this does not fit.
train_users = pd.get_dummies(
    train_sample,
    columns=['user_id'],
)
# TODO: build the matching dummies for the test set (test_dummies).

In [18]:
# Show the (rows, columns) of the dummy-encoded training frame.
# NOTE(review): the recorded (10000, 1172) has 10,000 rows, which does not
# match a frame built from the `train_df[0:500000]` slice — the output looks
# stale; re-run to confirm.
train_users.shape

(10000, 1172)

In [4]:
# Baseline feature matrix: the user's and the business's average rating,
# taken from the full training set as a plain NumPy array.
X_train_all = train_df.loc[:, ['user_average_rating', 'business_average_rating']].values


array([[ 3.97,  4.  ],
       [ 3.97,  3.5 ],
       [ 3.97,  2.5 ],
       ..., 
       [ 3.86,  4.5 ],
       [ 3.86,  3.  ],
       [ 3.86,  3.  ]])

## Run regularization for each of the eight markets