In [1]:
# Baseline estimation
# Ruf = mu + Bu + Bf, where
# Ruf = Predicted restaurant rating for restaurant f by user u
# mu = The rating of the avg restaurant review (in Phoenix)
# Bu = User u's avg rating - mu
# Bf = Avg. rating for restaurant f - mu

import csv
from sets import Set

# 1. Preparing data

# Read every row of the Phoenix business file into memory.
# Forward slashes are accepted by open() on Windows as well as on POSIX
# systems, and avoid backslashes being read as string escapes.
# The with-block closes the file as soon as all rows have been read.
with open('../Data/Input/phx.csv') as phx_business:
    phx_csv = csv.reader(phx_business)
    phx_data = list(phx_csv)

# Storing business ids of Phx restaurants in a set.
# The id is taken from column 15 of every row read above (first row included).
phx_b_ids = {row[15] for row in phx_data}



In [2]:
# 2. Splitting the review data into a training set and a test set
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split

# Fixed seed so that the split, and every statistic derived from it further
# down, is the same on each run of the notebook.
SPLIT_SEED = 42

# Forward slashes work on Windows and POSIX alike and cannot be mistaken for
# string escapes.
df = pd.read_csv('../Data/Input/Phx_Rating.csv')

# Hold out 20% of the reviews for evaluation; the remaining 80% are used to
# estimate mu, Bu and Bf.
train, test = train_test_split(df, test_size = 0.2, random_state = SPLIT_SEED)

In [3]:
# We have an 80/20 training to test set ratio:
# print the number of reviews in each part, training set first.
for subset in (train, test):
    print(len(subset))

150819
37705


In [8]:
# Preview the first five training reviews to check the column layout.
train.head(n=5)

Unnamed: 0,user_id,review_id,business_id,stars
109944,xeEzA63SazODlR6qsf81uw,_oShmRlICA_rGR8hRM8cvg,VY_tvNUCCXGXQeSvJl757Q,4
100066,tSX0aSxqVbQJHsEQswcBgA,zcozfkF_XvCeFAcfh1GYIw,6L9McpPjh3UzZxvR1cf1Lw,5
62586,JIHbz6V2TJ6ho0yUaBsckQ,YFk441mFgnmm9Z5E_X6xjA,7SO_rX1F6rQEl-5s3wZxgQ,5
47637,GfPg4wBCFbIXIRlbnKH31A,Fr17dFbZ2uUf4TxLN20hvQ,VVeogjZya58oiTxK7qUjAQ,5
185477,WehCZQtQu96-VL6RawV_BQ,F4B2xXlkHD5UMYELs6hGFw,O_wVKStXEt3rgZc6MjROqA,5


In [9]:
# 3. Calculating mu
# mu is the mean star rating over all reviews in the training set.

# Sum the rating column in one vectorised call and select it by name, rather
# than walking the frame row by row and indexing each row by position.
tot_ratings = int(train['stars'].sum())
mu = float(tot_ratings) / float(len(train))
print("The average rating for a review is %f" % mu)

The average rating for a review is 3.804620


In [10]:
# 4. Setting up to calculate Bu
# user_dict has the form {user_id: (no. of reviews by the user, sum of all ratings)}

# One grouped aggregation yields both numbers per user. sort=False keeps the
# users in order of first appearance in the training set.
user_stats = train.groupby('user_id', sort=False)['stars'].agg(['count', 'sum'])

# itertuples() yields (user_id, count, sum); store plain Python ints.
user_dict = {
    user_id: (int(n_reviews), int(star_sum))
    for user_id, n_reviews, star_sum in user_stats.itertuples()
}
print("Total no. of unique users in the training set = %d" % len(user_dict))
# A sample entry can be inspected with: user_dict['t95D1tnWvAOy2sxXnI3GUA']

Total no. of unique users in the training set = 56724


In [12]:
# 5. Setting up to calculate Bf
# restr_dict has the form {business_id: (no. of ratings for the restaurant, sum of all ratings)}

# One grouped aggregation yields both numbers per restaurant. sort=False keeps
# the restaurants in order of first appearance in the training set.
restr_stats = train.groupby('business_id', sort=False)['stars'].agg(['count', 'sum'])

# itertuples() yields (business_id, count, sum); store plain Python ints.
restr_dict = {
    business_id: (int(n_ratings), int(star_sum))
    for business_id, n_ratings, star_sum in restr_stats.itertuples()
}
print("Total no. of unique restaurants in the training set = %d" % len(restr_dict))

Total no. of unique restaurants in the training set = 2919


In [13]:
# Calculating the rating of the avg. restaurant, and avg. user.
# Both are unweighted means of per-entity means: each restaurant (or user)
# contributes its own average rating once, however many reviews it has.

# Mean over restaurants of (sum of ratings / no. of ratings).
avg_restr_rat = 0
for n_ratings, star_sum in restr_dict.values():
    avg_restr_rat += float(star_sum) / float(n_ratings)
avg_restr_rat = avg_restr_rat / len(restr_dict)

# Mean over users of (sum of ratings / no. of reviews).
avg_user_rat = 0
for n_reviews, star_sum in user_dict.values():
    avg_user_rat += float(star_sum) / float(n_reviews)
avg_user_rat = avg_user_rat / len(user_dict)

print(avg_restr_rat)
print(avg_user_rat)

3.4411581483
3.82454696971


In [15]:
# 6. Calculating RMSE and MAE
# Every test review is scored twice: with the baseline prediction
# mu + Bu + Bf, and with the constant prediction mu as a reference point.

# Damping weights. Each user's average is computed as if the user had written
# USER_PRIOR_WEIGHT extra reviews rated avg_user_rat, and each restaurant's
# average as if it had RESTR_PRIOR_WEIGHT extra ratings of avg_restr_rat, so
# that entities with very few reviews are pulled towards the typical value.
USER_PRIOR_WEIGHT = 7
RESTR_PRIOR_WEIGHT = 20

rmse_par = 0     # running sum of squared errors, baseline prediction
mae_par = 0      # running sum of absolute errors, baseline prediction
mu_rms_par = 0   # running sum of squared errors, predicting mu
mu_mae_par = 0   # running sum of absolute errors, predicting mu

# Columns are selected by name instead of indexing each row by position.
test_reviews = zip(test['user_id'], test['business_id'], test['stars'])
for user_id, business_id, real_rating in test_reviews:

    if user_id in user_dict:
        n_user_reviews, user_star_sum = user_dict[user_id]
        # Damped user average (shrunk towards avg_user_rat)
        user_avg = ((avg_user_rat * USER_PRIOR_WEIGHT) + float(user_star_sum)) / (USER_PRIOR_WEIGHT + float(n_user_reviews))
        Bu = user_avg - mu
    else:
        # User not seen in training: no user offset
        Bu = 0

    if business_id in restr_dict:
        n_restr_ratings, restr_star_sum = restr_dict[business_id]
        # Damped restaurant average (shrunk towards avg_restr_rat)
        restr_avg = ((avg_restr_rat * RESTR_PRIOR_WEIGHT) + float(restr_star_sum)) / (RESTR_PRIOR_WEIGHT + float(n_restr_ratings))
        Bf = restr_avg - mu
    else:
        # Restaurant not seen in training: no restaurant offset
        Bf = 0

    # This is our predicted rating
    pred_rating = mu + Bu + Bf
    rmse_par += (real_rating - pred_rating) ** 2
    mae_par += abs(real_rating - pred_rating)
    mu_rms_par += (real_rating - mu) ** 2
    mu_mae_par += abs(real_rating - mu)


test_len = len(test)
rmse = (rmse_par / test_len) ** 0.5
mae = mae_par / test_len
mu_err = (mu_rms_par / test_len) ** 0.5
mu_abs_err = mu_mae_par / test_len
print("Root mean square error for baseline prediction = %f" % rmse)
print("Mean absolute error for baseline prediction = %f" % mae)
print("RMS Error when you predict mean (mu) for every instance = %f" % mu_err)
print("MAE Error when you predict mean (mu) for every instance = %f" % mu_abs_err)

Root mean square error for baseline prediction = 1.175559
Mean absolute error for baseline prediction = 0.945651
RMS Error when you predict mean (mu) for every instance = 1.276848
MAE Error when you predict mean (mu) for every instance = 1.035440
