In [1]:
# Import necessary packages
from sklearn.model_selection import train_test_split
import pandas as pd
import math

# <center>Reading Yelp Dataset</center>

In [2]:
# Load the Yelp ratings table.
# A forward slash is used as the path separator because it is accepted on
# Windows, Linux and macOS alike; a backslash only works on Windows.
ratings_df = pd.read_csv("Yelp/ratings.csv")

# <center>Splitting the Dataset</center>

In [3]:
# Split the Yelp ratings into a train and a test set:
#   80% of the rows are used for training,
#   20% of the rows are used for testing.
# random_state is fixed so the same split is produced on every run.
#
# train_test_split returns each split with the same type as its input,
# so both results are already DataFrames and need no conversion.
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [4]:
# Preview the first rows of the training split.
train_df.head()

Unnamed: 0,User ID,Business ID,Rating,Date
13033,4027,397,4,2/1/2014
12113,3374,608,3,10/15/2013
21100,5040,220,5,10/18/2015
20763,6884,799,5,9/26/2015
4438,948,686,2,10/17/2010


In [5]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,User ID,Business ID,Rating,Date
21643,7280,450,5,11/16/2015
17597,3547,83,3,2/27/2015
29819,9022,1395,5,4/21/2017
29342,8229,333,1,3/23/2017
11453,3400,1089,5,7/30/2013


# <center>Global Mean</center>

In [6]:
# Global mean (mu): the arithmetic average of every rating
# in the training set.
global_mean = train_df.loc[:, "Rating"].mean()

print('The global mean is', round(global_mean, 3))

The global mean is 3.598


# <center>Create Deviation Function</center>

In [7]:
# find_deviations computes, for every distinct ID (a user or an item),
# its mean rating and how far that mean lies from the global mean.
# User deviation = the user's mean rating minus the global mean.
# Item deviation = the item's mean rating minus the global mean.

def find_deviations(ID, rating, df, global_mean):
    """Return each distinct ID's mean rating and its deviation from the global mean.

    Parameters
    ----------
    ID : str
        Name of the column in ``df`` to group by (e.g. a user or item id column).
    rating : str
        Name of the column in ``df`` holding the rating values.
    df : pandas.DataFrame
        Ratings table containing at least the ``ID`` and ``rating`` columns.
    global_mean : float
        Value subtracted from every per-ID mean rating.

    Returns
    -------
    pandas.DataFrame
        Columns ``[ID, 'Mean Rating', 'Deviation']`` with one row per distinct
        ID. Each row keeps the index label of that ID's first occurrence in ``df``.
    """
    # Mean rating of every group, indexed by ID.
    mean_by_id = df.groupby(ID)[rating].mean()

    # Broadcast the group means back onto the original rows so the result
    # shares df's index.
    dev_df = pd.DataFrame({
        ID: df[ID],
        'Mean Rating': df[ID].map(mean_by_id),
    })
    dev_df['Deviation'] = dev_df['Mean Rating'] - global_mean

    # Rows that share an ID are identical at this point; keep one per ID.
    return dev_df.drop_duplicates()

# <center>Calculate User Deviation </center>

In [8]:
# b_u: every user's mean-rating deviation from the global mean,
# computed from the training set.
user_dev = find_deviations(
    ID="User ID",
    rating="Rating",
    df=train_df,
    global_mean=global_mean,
)

user_dev.head()

Unnamed: 0,User ID,Mean Rating,Deviation
13033,4027,4.5,0.902416
12113,3374,3.0,-0.597584
21100,5040,3.875,0.277416
20763,6884,4.75,1.152416
4438,948,3.0,-0.597584


# <center>Calculate Item Deviation</center>

In [9]:
# b_i: every item's (business's) mean-rating deviation from the global mean,
# computed from the training set.
item_dev = find_deviations(
    ID="Business ID",
    rating="Rating",
    df=train_df,
    global_mean=global_mean,
)

item_dev.head()

Unnamed: 0,Business ID,Mean Rating,Deviation
13033,397,3.277778,-0.319807
12113,608,2.666667,-0.930918
21100,220,4.25,0.652416
20763,799,3.571429,-0.026156
4438,686,4.333333,0.735749


# <center>Predict Ratings Using Baseline Model</center>
<center>$\hat{r}_{ui} = \mu + b_{u} +  b_{i}$ </center><br>
<center>$\hat{r}_{ui}$ = predicted rating for user u on item i</center> <br>
<center>$\mu$ = global_mean </center><br>
<center>$b_{u}$ = user u's average rating deviation from global_mean </center><br>
<center>$b_{i}$ = item i's average rating deviation from global_mean </center><br>

In [10]:
# Baseline prediction for every test row: global_mean + b_u + b_i.
# A user or business that never appears in the training set has no learned
# deviation, so its term falls back to 0.

# Renumber the rows 0..n-1; the shuffled labels left over from the split
# are not needed past this point.
test_df = test_df.reset_index(drop=True)

# ID -> deviation lookup tables. Keeping only the first row per ID gives
# "first match wins" semantics and the unique index that Series.map requires.
user_bias_by_id = user_dev.drop_duplicates('User ID').set_index('User ID')['Deviation']
item_bias_by_id = item_dev.drop_duplicates('Business ID').set_index('Business ID')['Deviation']

# One vectorised lookup per column replaces a per-row scan of both
# deviation tables. IDs missing from a table map to NaN, which is then
# replaced by 0.
user_bias = test_df['User ID'].map(user_bias_by_id).fillna(0)
item_bias = test_df['Business ID'].map(item_bias_by_id).fillna(0)

# The two deviations are summed first and then added to the global mean.
Predicted_Ratings = (global_mean + (user_bias + item_bias)).tolist()

test_df['Predicted Ratings'] = Predicted_Ratings

In [11]:
# Preview the first test rows alongside their predicted ratings.
test_df.head()

Unnamed: 0,User ID,Business ID,Rating,Date,Predicted Ratings
0,7280,450,5,11/16/2015,5.331708
1,3547,83,3,2/27/2015,3.350941
2,9022,1395,5,4/21/2017,3.68813
3,8229,333,1,3/23/2017,4.777416
4,3400,1089,5,7/30/2013,4.765625


# <center>Calculate the Mean Absolute Error</center>

In [12]:
# Mean Absolute Error: the average of |actual rating - predicted rating|
# taken over every row of the test set.
difference = (test_df["Rating"] - test_df["Predicted Ratings"]).abs()
result = difference.sum() / test_df.shape[0]
print('The MAE =', result)

The MAE = 1.091803004865211
