In [9]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
%matplotlib inline

# Baseline Model

At its most basic level, a recommendation system takes previously collected information about both users and items and pairs the two to predict how much a user would like a new item. Oftentimes, a simple baseline built this way is surprisingly effective, and improvements on it are hard-won.

Here, we define our baseline as a multiple linear regression that uses each user's average rating and the average rating of a given restaurant, $m$, to predict the rating each user would give $m$. The model is as follows:

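$$\hat{Y}_{um} = \hat{\mu} + \hat{\beta}_{1}\hat{\theta}_{u} + \hat{\beta}_{2}\hat{\gamma}_{m}$$

Where $\hat{\theta}_{u}$ is the average rating for user $u$, $\hat{\gamma}_{m}$ is the average rating for restaurant $m$, $\hat{\beta}_{1}$ and $\hat{\beta}_{2}$ are the fitted regression coefficients on those two averages, and $\hat{\mu}$ is the intercept.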


In our analysis, we will first run this baseline model on ratings from all eight markets of interest (NV, AZ, ON, NC, OH, PA, QC, and WI). Then we will run the model on each market separately to see how well it predicts within a single market.

## Run baseline model on full universe of reviews

In [28]:
## Load the train and test review splits and report their (rows, columns) shapes.
## NOTE(review): the active paths read the PA train_150 / test_150 files, not the
## top-level Data/train.csv / Data/test.csv pair (presumably the all-market
## splits), which is kept below for reference -- confirm which one is intended.
#train_df = pd.read_csv("Data/train.csv")
#test_df = pd.read_csv("Data/test.csv")
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Data/states/train/PA/train_150.csv",
        "Data/states/test/PA/test_150.csv",
    )
)
print(train_df.shape, test_df.shape, sep="\n")

(4870, 13)
(1231, 13)


In [29]:
# Preview the first rows of the training frame to sanity-check the parsed columns.
train_df.head()

Unnamed: 0,review_date,business_longitude,business_id,business_categories,business_name,business_state,review_score,user_id,user_average_rating,business_review_count,business_average_rating,business_latitude,user_review_count
0,2010-12-30,-79.923402,v3BLVPLRo2Egeak29B9lug,"['Restaurants', 'Indian']",Coriander India Grill,PA,3.0,-Vu7L3U7-kxDyY1VHxw3zw,3.3,126,3.0,40.432151,548
1,2010-06-23,-79.957143,MBirSnTW4pt2k7Ny6KC72w,"['Chinese', 'Szechuan', 'Restaurants']",Szechuan Express,PA,1.0,-Vu7L3U7-kxDyY1VHxw3zw,3.3,61,3.0,40.441793,548
2,2010-11-30,-79.95612,jd1rD7jr-_zI46Oxh5lDLA,"['Restaurants', 'Pizza', 'Italian']",Antoon's Pizza,PA,2.0,-Vu7L3U7-kxDyY1VHxw3zw,3.3,19,3.5,40.440158,548
3,2011-03-16,-79.97998,t5DQeB3dORfNwx9otGUrSA,"['Restaurants', 'Sports Bars', 'Bars', 'Wine B...",Casey's Draft House,PA,2.0,-Vu7L3U7-kxDyY1VHxw3zw,3.3,26,3.5,40.428944,548
4,2010-09-18,-79.968631,7mU3l5VjH1IxsXcxBxUblg,"['Bars', 'Restaurants', 'Nightlife', 'Mexican']",Emilliano's Mexican Restaurant and Bar,PA,2.0,-Vu7L3U7-kxDyY1VHxw3zw,3.3,208,3.5,40.427722,548


In [30]:
# List the column labels, then show describe()'s summary statistics.
# print() is used for the labels because only the cell's last expression
# is rendered automatically.
print(train_df.columns)
train_df.describe()

Index(['review_date', 'business_longitude', 'business_id',
       'business_categories', 'business_name', 'business_state',
       'review_score', 'user_id', 'user_average_rating',
       'business_review_count', 'business_average_rating', 'business_latitude',
       'user_review_count'],
      dtype='object')


Unnamed: 0,business_longitude,review_score,user_average_rating,business_review_count,business_average_rating,business_latitude,user_review_count
count,4870.0,4870.0,4870.0,4870.0,4870.0,4870.0,4870.0
mean,-79.989342,3.665708,3.783821,129.050719,3.65883,40.438113,753.461396
std,0.069442,0.959412,0.219365,159.652129,0.601462,0.045704,318.613522
min,-80.262913,1.0,3.3,3.0,1.0,40.206497,255.0
25%,-80.01689,3.0,3.66,32.0,3.5,40.427769,532.0
50%,-79.984218,4.0,3.79,76.5,3.5,40.44159,708.0
75%,-79.949205,4.0,3.95,157.0,4.0,40.455916,861.0
max,-79.661028,5.0,4.21,1353.0,5.0,40.690182,1452.0


In [31]:
# Define X as user average and business average, Y as given rating for test and train data
X_train_all = train_df[['user_average_rating', 'business_average_rating']]
y_train_all = train_df['review_score']
X_test_all = test_df[['user_average_rating', 'business_average_rating']]
y_test_all = test_df['review_score']

In [32]:
# Fit linear regression using training data and test using the testing data
baseline_all = LinearRegression(fit_intercept=True)
baseline_all.fit(X_train_all, y_train_all)

print('Baseline Intercept:', baseline_all.intercept_)
print('Baseline Coefficients:', baseline_all.coef_)
print('Baseline Train Score:', baseline_all.score(X_train_all, y_train_all))
print('Baseline Test Score:', baseline_all.score(X_test_all, y_test_all))

Baseline Intercept: -1.27978884079
Baseline Coefficients: [ 0.66961267  0.65917324]
Baseline Train Score: 0.206271909893
Baseline Test Score: 0.187776186611


----
Ridge Train Score 0.200682664465
Ridge Test Score 0.180413592917
----
Lasso Train Score 0.200681495722
Lasso Test Score 0.180447540241


## Run baseline model for each of the eight markets