In [9]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
%matplotlib inline

# Baseline Model

At its most basic level, a recommendation system takes information previously collected on both users and items and pairs them to predict how much a user would like a new item. Oftentimes, a simple baseline model is surprisingly effective, and improvements on the baseline are hard-won.

Here, we define our baseline as a simple multiple linear regression with two predictors: the average rating given by user $u$ and the average rating received by restaurant $m$. It predicts the rating that user $u$ would give restaurant $m$. The model is as follows:

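$$\hat{Y}_{um} = \hat{\mu} + \hat{\beta}_{1}\,\hat{\theta}_{u} + \hat{\beta}_{2}\,\hat{\gamma}_{m}$$

Where $\hat{\theta}_{u}$ is the average rating for user $u$, $\hat{\gamma}_{m}$ is the average rating for restaurant $m$, $\hat{\mu}$ is the intercept, and $\hat{\beta}_{1}$ and $\hat{\beta}_{2}$ are the regression coefficients fitted for the user and restaurant averages, respectively.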


In our analysis, we will first run this baseline model on ratings from all 8 states and provinces of interest (NV, AZ, ON, NC, OH, PA, QC, and WI). Then we will run the model on each of them separately to see how well it predicts within each market.

## Run baseline model on full universe of reviews

In [6]:
# Load the train and test review data covering all markets,
# then report each frame's (rows, columns) shape as a quick sanity check.
train_path, test_path = "Data/train.csv", "Data/test.csv"
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
for split_df in (train_df, test_df):
    print(split_df.shape)

(1409493, 13)
(535585, 13)


In [11]:
# Preview the first five training rows to eyeball the column contents.
train_df.head(n=5)

Unnamed: 0,review_date,business_longitude,business_id,business_categories,business_name,business_state,review_score,user_id,user_average_rating,business_review_count,business_average_rating,business_latitude,user_review_count
0,2011-05-01,-115.279668,nK7JeIqdBli3umEhBIh33g,"['Italian', 'Restaurants']",Lucio Ristorante,NV,3.0,---1lKK3aKOuomHnwAkAow,3.97,31,4.0,36.143983,245
1,2013-03-11,-115.26685,S599hCA4kJJO3_b6SRFKoA,"['Restaurants', 'Mexican', 'Breakfast & Brunch...",Michoacan Gourmet Mexican Restaurant,NV,4.0,---1lKK3aKOuomHnwAkAow,3.97,324,3.5,36.271357,245
2,2010-10-17,-115.19223,2BbFeotL85cIaBjSq1SWiA,"['American (New)', 'Restaurants']",Springs Cafe by Wolfgang Puck,NV,1.0,---1lKK3aKOuomHnwAkAow,3.97,23,2.5,36.168099,245
3,2016-01-19,-115.189844,gTlDDzDEHyDQ6iwjNhpI6A,"['Restaurants', 'Indian']",Mount Everest India's Cuisine,NV,5.0,---1lKK3aKOuomHnwAkAow,3.97,1067,4.5,36.143973,245
4,2012-07-21,-115.151632,AxeQEz3-s9_1TyIo-G7UQw,"['Restaurants', 'Barbeque', 'Thai']",Thai Original BBQ Restaurant,NV,3.0,---1lKK3aKOuomHnwAkAow,3.97,87,4.0,36.153504,245


In [10]:
# Show the column index, then display summary statistics via describe().
# NOTE(review): the recorded output shows a max business_longitude of about +115,
# which looks out of place next to the other (negative) longitudes -- confirm
# whether some rows carry a sign error.
column_index = train_df.columns
print(column_index)
summary_stats = train_df.describe()
summary_stats

Index(['review_date', 'business_longitude', 'business_id',
       'business_categories', 'business_name', 'business_state',
       'review_score', 'user_id', 'user_average_rating',
       'business_review_count', 'business_average_rating', 'business_latitude',
       'user_review_count'],
      dtype='object')


Unnamed: 0,business_longitude,review_score,user_average_rating,business_review_count,business_average_rating,business_latitude,user_review_count
count,1409493.0,1409493.0,1409493.0,1409493.0,1409493.0,1409493.0,1409493.0
mean,-100.4127,3.705296,3.737982,388.74,3.698366,37.54085,174.193
std,16.56028,1.242746,0.5116403,720.7097,0.5801115,4.069533,388.9996
min,-115.6796,1.0,1.0,3.0,1.0,33.13728,0.0
25%,-115.1364,3.0,3.47,66.0,3.5,33.61141,21.0
50%,-111.9258,4.0,3.77,170.0,4.0,36.11304,56.0
75%,-80.08083,5.0,4.06,395.0,4.0,41.48515,172.0
max,115.0868,5.0,5.0,6979.0,5.0,48.34344,11065.0


In [13]:
# Predictors: the user's average rating and the business's average rating.
# Response: the rating recorded for the individual review.
feature_cols = ['user_average_rating', 'business_average_rating']
target_col = 'review_score'

X_train_all, y_train_all = train_df[feature_cols], train_df[target_col]
X_test_all, y_test_all = test_df[feature_cols], test_df[target_col]

In [17]:
# Fit the baseline regression on the training split (fit() returns the estimator),
# then report its parameters and the value of score() on both splits.
baseline_all = LinearRegression(fit_intercept=True).fit(X_train_all, y_train_all)

train_score = baseline_all.score(X_train_all, y_train_all)
test_score = baseline_all.score(X_test_all, y_test_all)

baseline_report = [
    ('Baseline Intercept:', baseline_all.intercept_),
    ('Baseline Coefficients:', baseline_all.coef_),
    ('Baseline Train Score:', train_score),
    ('Baseline Test Score:', test_score),
]
for label, value in baseline_report:
    print(label, value)

Baseline Intercept: -2.0590063094
Baseline Coefficients: [ 0.8109637  0.7389574]
Baseline Train Score: 0.266157747505
Baseline Test Score: 0.272368710332


## Run baseline model for each of the eight markets