In [1]:
# Standard library
import os
import urllib.request
from datetime import datetime
from io import StringIO

# Third-party: numerics, data handling, plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# NOTE: the `tf` alias is rebound later in this notebook by `%store -r tf`.
import tensorflow as tf
from sklearn.metrics import mean_squared_error

# Surprise: data containers, accuracy metrics and the baseline-only predictor
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import cross_validate

# Report success only once every import above has completed.
print("import of packages successful")

import of packages successful


## 2. Baseline Estimates

* Let $u$ and $v$ be two users and $i$ and $j$ two films;

* we define:
  *  $r_{ui}$ as the rating by user $u$ on film $i$;
  *  $\hat{r}_{ui}$ as the **predicted** rating of $r_{ui}$;


* In the Netflix data, $99\%$ of ratings are missing; the $(u,i)$ pairs for which $r_{ui}$ is known are stored in the set

\begin{equation}
\mathcal{K} = \{(u,i) \quad \vert \quad   r_{ui} \quad \mbox{is known}\}.
\end{equation}

\\

* In rating data, some users systematically give higher ratings than others, and some movies systematically receive higher ratings than others;

* In Section 2.1 of the [article](https://www.cs.rochester.edu/twiki/pub/Main/HarpSeminar/Factorization_Meets_the_Neighborhood-_a_Multifaceted_Collaborative_Filtering_Model.pdf) these tendencies are considered as baseline ratings $b_{ui}$ and are defined as

\begin{equation}
b_{ui} = \mu + b_u + b_i,
\end{equation}

\\

* where:
  * $\mu$ is the overall average rating;
  * $b_u$ is the observed deviation of user $u$ from the average;
  * $b_i$ is the observed deviation of movie $i$ from the average;

\\



* Estimates of $b_u$s and $b_i$s are obtained by the minimization of the regularized MSE loss function

\begin{equation}
\sum_{(u,i) \in \mathcal{K}} (r_{ui} - b_{ui})^2 + \lambda_1 \left( \sum_{u} b_u^2 + \sum_{i} b_i^2 \right),
\end{equation}

\\

  - where $\lambda_1 \left( \sum_{u} b_u^2 + \sum_{i} b_i^2 \right)$ is the regularization term to avoid overfitting. The penalty coefficient is $\lambda_1 = 0.02$.

\\

* Write a Python function **baseline_estimator** to estimate $b_u$ for every user $u$ and $b_i$ for every film $i$ appearing in $\mathcal{K}$;

\\

* The problem above can easily be transformed into a linear regression problem, and the regularized MSE can be minimized by gradient descent.


In [2]:
%store -r tf
%store -r df
%store -r R_pd1
df = df
tf = tf
Rf = R_pd1

In [3]:
# When loading from a DataFrame the Reader only needs the rating scale (1 to 5 here).
reader = Reader(rating_scale=(1, 5))

# load_from_df expects the columns in the order: user id, item id, rating.
train_data = Dataset.load_from_df(Rf, reader)
# Train on every rating in Rf; no hold-out split is made in this cell.
trainset = train_data.build_full_trainset()


# Baseline estimates b_ui = mu + b_u + b_i, fitted by stochastic gradient descent.
print('Baseline Estimators Using SGD')
bsl_options = {
    'method': 'sgd',
    'learning_rate': .00005,
    # lambda_1 of the regularized loss described above. Stated explicitly so the
    # code matches the text instead of silently relying on the library default.
    'reg': 0.02,
}
algo = BaselineOnly(bsl_options=bsl_options)

# Fit the user and item biases on the full trainset; predictions are made in later cells.
algo.fit(trainset)

Baseline Estimators Using SGD
Estimating biases using sgd...


<surprise.prediction_algorithms.baseline_only.BaselineOnly at 0x7f4d365d8790>

In [4]:
# Raw ids of the single (user, film) pair to score, given here as strings.
# NOTE(review): the trainset was built from a DataFrame (Rf), so raw ids keep
# that frame's dtype -- confirm they are strings there, otherwise this pair may
# not match any known user/item.
uid = "2442"
iid = "1"

# Score that pair; r_ui=4 is the reference rating that verbose=True echoes
# alongside the estimate.
pred = algo.predict(uid=uid, iid=iid, r_ui=4, verbose=True)


user: 2442       item: 1          r_ui = 4.00   est = 3.69   {'was_impossible': False}


In [5]:
# Turn every row of the restored `tf` object into a plain tuple (index excluded),
# the form handed to algo.test in the next cell.
testset = list(map(tuple, tf.itertuples(index=False)))

In [6]:
# Score every entry of the test set with the fitted baseline model.
predictions = algo.test(testset)

# Show the first prediction as a quick sanity check.
first_prediction = predictions[0]
print(first_prediction)

# accuracy.rmse prints the score and returns it; leaving the call as the last
# expression also displays the unrounded value as the cell output.
accuracy.rmse(predictions)

user: 30878      item: 1          r_ui = 4.00   est = 3.71   {'was_impossible': False}
RMSE: 0.8822


0.8821615718481657

In [7]:
# Persist the test tuples and the full trainset with IPython's %store magic so
# they can be restored later via `%store -r`.
%store testset trainset

Stored 'testset' (list)
Stored 'trainset' (Trainset)
