In [1]:
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
%matplotlib inline

  from pandas.core import datetools


# Regularized Regression

Now that we have our baseline results, we would like to run the actual fit of the model leaving parameters $\theta$ and $\gamma$ to be fit. We can do so by reconstructing the baseline model to the following:

$$ Y_{um} = \mu + \bar{\theta} \cdot I_u + \bar{\gamma} \cdot I_m$$

Where $\bar{\theta}$ is a vector of coefficients for users who have made ratings and $\bar{\gamma}$ is a coefficients for restaurants for which ratings have been made. We multiply these by indicator variables $I_u$ and $I_m$, respectively, for the u-th user and m-th restaurant to go in the feature matrix.  

The best way to implement that would be to construct an $N$ by $U + M + 1$ matrix, where the $N$ is the number of reviews, $U$ is the total number of reviewers, and $M$ is the total number of restaurants (we include an additional column for the intercept). 

We will run this matrix through a multiple linear regression to compare results with baseline method (**we expect to get the same results as before**), but we will also run this matrix through Ridge and Lasso regularization. This should help, as the number of features included in this regression has expanded greatly.

## Run regularization on full universe of reviews

In [2]:
## Load in test and train data for all markets
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
print(train_df.shape)
print(test_df.shape)

(1409493, 13)
(535585, 13)


In [39]:
def subsample(df,size):
    ids = list(df.user_id.unique())
    
    df_reshuffle = df.sample(frac=1, random_state = 120717)
    
    random_indices = []
    
    for i in ids:
        id_obs = df_reshuffle[df_reshuffle['user_id']==i]
        random_indices.append(id_obs[:1].index.tolist())
    print(random_indices)
    random_indices_flat = list(chain.from_iterable(random_indices))
    
    df_at_least_one = df.iloc[random_indices_flat]
    
    remaining_obs = df_reshuffle[~df_reshuffle.isin(df_at_least_one)].dropna()
    
    remaining_to_sample = size - len(ids)
    
    top_up = remaining_obs.sample(n=remaining_to_sample, random_state = 120717)
    
    final_df = df_at_least_one.append(top_up)
    
    return final_df
    
    

In [42]:
testing1 = train_df[0:200]
try_this = subsample(testing1,25)

[[11], [54], [61], [69], [75], [93], [106], [109], [111], [123], [125], [131], [141], [144], [186]]


In [43]:
try_this

Unnamed: 0,review_date,business_longitude,business_id,business_categories,business_name,business_state,review_score,user_id,user_average_rating,business_review_count,business_average_rating,business_latitude,user_review_count
11,2015-11-30,-115.134574,Vg1C_1eqwIwkZLIXGMTW3g,"['Nightlife', 'British', 'Bars', 'Sports Bars'...",Crown & Anchor British Pub,NV,1.0,---1lKK3aKOuomHnwAkAow,3.97,366.0,3.5,36.101467,245.0
54,2017-06-18,-111.925112,KwufI-XUy8Vs2cnJ8n7F6w,"['Mexican', 'Restaurants', 'Fast Food']",El Pollo Loco,AZ,5.0,--2HUmLkcNHZp0xw6AMBPg,4.6,15.0,4.5,33.582505,28.0
61,2012-12-13,-111.921845,K0hEFxUJiwsiLOnIm4tHOw,"['Gastropubs', 'Restaurants', 'Nightlife', 'Am...",Old Town Whiskey,AZ,4.0,--3WaS23LcIXtxyFULJHTA,3.93,39.0,4.0,33.494032,213.0
69,2008-01-08,-115.173516,cyV6Y8eOgpDTtdJQnB6FaA,"['Bars', 'Arcades', 'Restaurants', 'Arts & Ent...",ESPN Zone,NV,3.0,--4q8EyqThydQm-eKZpS-A,3.42,67.0,3.5,36.103001,400.0
75,2015-01-13,-112.067783,dIqMQYs6tmowQUygS8sQ6A,"['Food', 'Breakfast & Brunch', 'American (New)...",JoBot Coffee & Diner,AZ,5.0,--4rAAfZnEIAKJE80aIiYg,2.46,426.0,4.0,33.458356,25.0
93,2017-01-30,-79.390158,oOGLDf2rzeCPS7UQ8hhPlQ,"['Sushi Bars', 'Asian Fusion', 'Restaurants', ...",SU&BU,ON,4.0,--BumyUHiO_7YsHurb9Hkw,3.87,93.0,4.0,43.647245,38.0
106,2012-10-15,-111.940251,4K5NUsinIgtq-yuxdp2HdA,"['Restaurants', 'Breakfast & Brunch', 'Mexican...",Fuzzy's Taco Shop,AZ,4.0,--CIuK7sUpaNzalLAlHJKA,3.1,293.0,3.5,33.425965,188.0
109,2013-07-17,-115.116974,u1RQFcA4Br_peVRh_WBOsQ,"['Barbeque', 'Restaurants', 'Comfort Food']",Top Notch Barbeque,NV,5.0,--DxiDMQgN08E5gTM0aj7Q,3.46,117.0,4.0,36.020264,13.0
111,2008-10-14,-80.080827,b_XIKJ2nNzksuWhfMTEehQ,"['Bars', 'Belgian', 'Brasseries', 'American (N...",Creekhouse,PA,5.0,--EMqnd727rtC0G5Oc-Mrg,3.82,85.0,3.5,40.440278,28.0
123,2014-01-23,-115.149449,WDYE-OCXNgKyXGNuBjMgEw,"['American (New)', 'Restaurants']",Table 34,NV,5.0,--HCoE1ghaAlcaAfshICgw,4.71,215.0,4.0,36.057451,45.0


In [54]:
train_small = train_df[['user_id','business_id','review_score']]
len(train_small.user_id.unique())

163377

In [None]:
sub_samp = subsample(train_small,300000)

In [58]:
sub_samp

[[12]]

In [None]:
train_sample = train_df[0:500000]
train_sample.shape

In [None]:
## Create user and business dummies in test and training set
train_users = pd.get_dummies(train_sample, columns=['user_id'], drop_first=False)
#test_dummies = 

In [18]:
train_users.shape

(10000, 1172)

In [4]:
X_train_all = train_df[['user_average_rating', 'business_average_rating']].values


array([[ 3.97,  4.  ],
       [ 3.97,  3.5 ],
       [ 3.97,  2.5 ],
       ..., 
       [ 3.86,  4.5 ],
       [ 3.86,  3.  ],
       [ 3.86,  3.  ]])

## Run regularization for each of the eight markets