In [104]:
import pandas as pd
import numpy as np
import math

In [2]:
rate = pd.read_csv('./ratings_small.csv')

In [3]:
rate_mtrx = rate.pivot(index='userId', columns='movieId', values='rating')
##  select the first 7 movies only
rate_mtrx = rate_mtrx.reset_index()[['userId',1,2,3,4,5,6,7]]

In [69]:
### take a look at the first 10 rows
rate_mtrx.head(10)

movieId,userId,1,2,3,4,5,6,7
0,1,,,,,,,
1,2,,,,,,,
2,3,,,,,,,
3,4,,,,,,,
4,5,,,4.0,,,,
5,6,,,,,,,
6,7,3.0,,,,,,
7,8,,,,,,,
8,9,4.0,,,,,,
9,10,,,,,,,


In [4]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(rate_mtrx, test_size=0.2)

In [5]:
total_sum = (train[1].sum(skipna=True) +
      train[2].sum(skipna=True) +
      train[3].sum(skipna=True) +
      train[4].sum(skipna=True) +
      train[5].sum(skipna=True) +
      train[6].sum(skipna=True) +
      train[7].sum(skipna=True))

total_entries = (~np.isnan(train)).sum(1).sum()

train_mean = total_sum/total_entries

print("training raw average: ", train_mean)

training raw average:  1.7935393258426966


In [99]:
### calculating rmse for training data
stan_err = []
for i in train.columns[1:8]:
    for j in train[i]:
        if  np.isnan(j) == False:
            stan_err.append((j-train_mean)**2)
            
train_rmse = math.sqrt(sum(stan_err)/len(stan_err))

print("training RMSE: ", train_rmse)

training RMSE:  2.061338333323324


In [100]:
stan_err = []
for i in test.columns[1:8]:
    for j in test[i]:
        if  np.isnan(j) == False: 
            stan_err.append((j-train_mean)**2)

test_rmse = math.sqrt(sum(stan_err)/len(stan_err))

print("testing RMSE: ", test_rmse)

testing RMSE:  2.100959174821407


In order to find the baseline predictor, we need to find the movie bias for each movie and user bias for each user

In [46]:
### calculate and store movie bias for each movie
movie_bias = []
for movie in train.columns[1:8]:
    movie_bias.append(train[movie].mean(skipna=True)-train_mean)
    
print(movie_bias)

[2.0989606741573033, 1.6175717852684144, 1.4064606741573036, 0.591076058772688, 1.4964606741573034, 2.0770489094514213, 1.513278855975485]


In [47]:
### calculate and store user bias for each user
user_bias = []
for user in range(len(train)):
    user_bias.append(train.iloc[user,1:8].mean(skipna=True)-train_mean)
    
print(len(user_bias))

536


In [71]:
### baseline predictor = user bias + movie bias + raw avg
### calculate baseline predictor for each combination of user and movie; we store the values to df DataFrame
df_mtx = np.matrix([user_bias]).reshape(len(user_bias),1) + np.matrix([movie_bias]).reshape(1,len(movie_bias)) + train_mean
df = pd.DataFrame(df_mtx)

print(df.head(10))

          0         1         2         3         4         5         6
0       NaN       NaN       NaN       NaN       NaN       NaN       NaN
1       NaN       NaN       NaN       NaN       NaN       NaN       NaN
2       NaN       NaN       NaN       NaN       NaN       NaN       NaN
3       NaN       NaN       NaN       NaN       NaN       NaN       NaN
4  5.432294  4.950905  4.739794  3.924409  4.829794  5.410382  4.846612
5  7.098961  6.617572  6.406461  5.591076  6.496461  7.077049  6.513279
6  6.098961  5.617572  5.406461  4.591076  5.496461  6.077049  5.513279
7       NaN       NaN       NaN       NaN       NaN       NaN       NaN
8       NaN       NaN       NaN       NaN       NaN       NaN       NaN
9       NaN       NaN       NaN       NaN       NaN       NaN       NaN


Some predictor values are greater than 5(the max rating), so we need to cap these value at 5

In [40]:
### capping baseline predictor values at 5
df.loc[df[0]>5,0]=5
df.loc[df[1]>5,1]=5
df.loc[df[2]>5,2]=5
df.loc[df[3]>5,3]=5
df.loc[df[4]>5,4]=5
df.loc[df[5]>5,5]=5
df.loc[df[6]>5,6]=5

In [118]:
### calculating rmse for training data
train2 = train.iloc[:,1:8]

se = (df - train2)**2
total_count = (~np.isnan(se)).sum(1).sum()

new_rmse = np.sqrt((se.sum(1).sum())/total_count)

print("new RMSE: " + str(new_rmse))

new rmse: 1.9508534075383046


In [122]:
improvement = (1 - new_rmse/train_rmse)*100
print("RMSE is improved by " + str(round(improvement)) + "%")

RMSE is improved by 5.0%
