In [48]:
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
%matplotlib inline

# Regularized Regression

Now that we have our baseline results, we would like to run the actual fit of the model leaving parameters $\theta$ and $\gamma$ to be fit. We can do so by reconstructing the baseline model to the following:

$$ Y_{um} = \mu + \bar{\theta} \cdot I_u + \bar{\gamma} \cdot I_m$$

Where $\bar{\theta}$ is a vector of coefficients for users who have made ratings and $\bar{\gamma}$ is a coefficients for restaurants for which ratings have been made. We multiply these by indicator variables $I_u$ and $I_m$, respectively, for the u-th user and m-th restaurant to go in the feature matrix.  

The best way to implement that would be to construct an $N$ by $U + M + 1$ matrix, where the $N$ is the number of reviews, $U$ is the total number of reviewers, and $M$ is the total number of restaurants (we include an additional column for the intercept). 

We will run this matrix through a multiple linear regression to compare results with baseline method (**we expect to get the same results as before**), but we will also run this matrix through Ridge and Lasso regularization. This should help, as the number of features included in this regression has expanded greatly.

## Run regularization on full universe of reviews

In [2]:
## Load in test and train data for all markets
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
print(train_df.shape)
print(test_df.shape)

(1409493, 13)
(535585, 13)


In [39]:
def subsample(df,size):
    ids = list(df.user_id.unique())
    
    df_reshuffle = df.sample(frac=1, random_state = 120717)
    
    random_indices = []
    
    for i in ids:
        id_obs = df_reshuffle[df_reshuffle['user_id']==i]
        random_indices.append(id_obs[:1].index.tolist())
    print(random_indices)
    random_indices_flat = list(chain.from_iterable(random_indices))
    
    df_at_least_one = df.iloc[random_indices_flat]
    
    remaining_obs = df_reshuffle[~df_reshuffle.isin(df_at_least_one)].dropna()
    
    remaining_to_sample = size - len(ids)
    
    top_up = remaining_obs.sample(n=remaining_to_sample, random_state = 120717)
    
    final_df = df_at_least_one.append(top_up)
    
    return final_df
    
    

In [95]:
# We should have more than this number of observations in any subsampled data set 
train_small = train_df[['user_id','business_id','review_score']]
len(train_df.user_id.unique())

163377

In [96]:
# Block to check how many unique users and observations we would have total, taking several different thresholds
combined = train_df.append(test_df)
test = combined.groupby('user_id').count()
print(len(test[test['user_review_count']>100].business_id))
test[test['user_review_count']>100].business_id.sum()

1383


239444

In [82]:
# This doesn't work
sub_samp = subsample(train_small,250000)

KeyboardInterrupt: 

In [None]:
## Create user and business dummies in test and training set
train_users = pd.get_dummies(train_sample, columns=['user_id'], drop_first=False)
#test_dummies = 

## Run regularization for each of the eight markets