In [7]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from math import sqrt
import statsmodels.api as sm
from statsmodels.api import OLS
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
%matplotlib inline

# Baseline Model

At its most basic level, a recommendation system takes information previously collected on both users and items and pairs them together to predict how much a user would like new items. Oftentimes, a simple baseline built on this information is surprisingly effective, and improvements over it are hard-won.

Here, we define our baseline as a simple multiple linear regression with two predictors, the average rating given by each user, $u$, and the average rating received by a given restaurant, $m$, to predict what user $u$ would rate restaurant $m$. The model is as follows:

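$$\hat{Y}_{um} = \hat{\mu} + \hat{\beta}_{1}\,\hat{\theta}_{u} + \hat{\beta}_{2}\,\hat{\gamma}_{m}$$

where $\hat{\theta}_{u}$ is the average rating for user $u$, $\hat{\gamma}_{m}$ is the average rating for restaurant $m$, $\hat{\beta}_{1}$ and $\hat{\beta}_{2}$ are the fitted regression coefficients, and $\hat{\mu}$ is the intercept.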


In our analysis, we will run this baseline model on ratings from Ohio, as it provides an in-between representation of larger and smaller markets.

## Run baseline model on the full universe of Ohio reviews

In [2]:
# Load the train and test review splits for the Ohio (OH) market.
# To run on all markets instead, point the two paths below at
# "Data/train.csv" and "Data/test.csv".
TRAIN_PATH = "Data/states/train/OH/train_150.csv"
TEST_PATH = "Data/states/test/OH/test_150.csv"

train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)

# Print (rows, columns) for each split as a quick sanity check.
print(train_df.shape)
print(test_df.shape)

(3925, 13)
(942, 13)


In [3]:
# Preview the first five training rows to check the column layout.
train_df.head(n=5)

Unnamed: 0,review_date,business_longitude,business_id,business_categories,business_name,business_state,review_score,user_id,user_average_rating,business_review_count,business_average_rating,business_latitude,user_review_count
0,2012-08-09,-81.688974,HNs2Nf-trqFTDtho4vhfmA,"['Bars', 'Lounges', 'Restaurants', 'American (...",The South Side,OH,3.0,3Uv0dGI2IXJb2OUj8R2GJA,3.85,275,3.5,41.482026,482
1,2011-09-27,-81.57972,SP7H3zPArNvbHKQW0c_gpA,"['Restaurants', 'Thai', 'Asian Fusion']",High Thai'd,OH,2.0,3Uv0dGI2IXJb2OUj8R2GJA,3.85,100,4.0,41.510991,482
2,2011-06-20,-81.729861,YgHp9MdZ1vVdYyMEro4TtQ,"['Bars', 'Barbeque', 'Pizza', 'American (New)'...",XYZ the Tavern,OH,4.0,3Uv0dGI2IXJb2OUj8R2GJA,3.85,181,4.0,41.484139,482
3,2014-04-16,-81.73041,wmstf9dw0-kN3YThIxx8eQ,"['Irish', 'Bars', 'Pubs', 'Nightlife', 'Restau...",Stone Mad Pub,OH,4.0,3Uv0dGI2IXJb2OUj8R2GJA,3.85,126,3.5,41.486707,482
4,2010-08-02,-81.690048,Xny0n0s98TpP82sQxfgIMQ,"['Polish', 'Nightlife', 'Restaurants', 'Americ...",Sokolowski's University Inn,OH,3.0,3Uv0dGI2IXJb2OUj8R2GJA,3.85,368,4.5,41.484752,482


In [4]:
# Print the column index, then display summary statistics of the training data
# as the cell's rich output.
column_index = train_df.columns
print(column_index)

summary_stats = train_df.describe()
summary_stats

Index(['review_date', 'business_longitude', 'business_id',
       'business_categories', 'business_name', 'business_state',
       'review_score', 'user_id', 'user_average_rating',
       'business_review_count', 'business_average_rating', 'business_latitude',
       'user_review_count'],
      dtype='object')


Unnamed: 0,business_longitude,review_score,user_average_rating,business_review_count,business_average_rating,business_latitude,user_review_count
count,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0,3925.0
mean,-81.671409,3.696051,3.826395,116.180637,3.691338,41.470231,731.208662
std,0.142339,0.960004,0.192418,133.248942,0.59844,0.076287,439.526213
min,-82.226472,1.0,3.32,3.0,1.0,41.108641,309.0
25%,-81.761182,3.0,3.7,33.0,3.5,41.459052,464.0
50%,-81.690421,4.0,3.78,74.0,4.0,41.484801,609.0
75%,-81.580318,4.0,3.97,148.0,4.0,41.500613,868.0
max,-81.072826,5.0,4.17,896.0,5.0,41.764307,1952.0


In [5]:
# Predictors: the user average rating and the business average rating columns.
# Target: the review score column. The same columns are used for both splits.
feature_cols = ['user_average_rating', 'business_average_rating']
target_col = 'review_score'

X_train_all, y_train_all = train_df[feature_cols], train_df[target_col]
X_test_all, y_test_all = test_df[feature_cols], test_df[target_col]

In [8]:
# Fit the linear-regression baseline on the training split, then report how
# well it fits both the training and the test split.
baseline_all = LinearRegression(fit_intercept=True)
baseline_all.fit(X_train_all, y_train_all)

# Predict once per split; the RMSE lines below reuse these predictions.
train_pred_all = baseline_all.predict(X_train_all)
test_pred_all = baseline_all.predict(X_test_all)

print('Baseline Intercept:', baseline_all.intercept_)
print('Baseline Coefficients:', baseline_all.coef_)
# LinearRegression.score returns the coefficient of determination (R^2).
print('Baseline Train Score:', baseline_all.score(X_train_all, y_train_all))
print('Baseline Test Score:', baseline_all.score(X_test_all, y_test_all))
# Root-mean-squared error per split, labelled so the printed numbers are
# unambiguous (they were previously printed without any label).
print('Baseline Train RMSE:', sqrt(mean_squared_error(y_train_all, train_pred_all)))
print('Baseline Test RMSE:', sqrt(mean_squared_error(y_test_all, test_pred_all)))

Baseline Intercept: -1.71779400589
Baseline Coefficients: [ 0.81630482  0.62046353]
Baseline Train Score: 0.196211027717
Baseline Test Score: 0.189254189855
0.8605750453011812
0.8905561821699436
