In [1]:
import os
import re
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import defaultdict

from surprise.prediction_algorithms.algo_base import AlgoBase
from surprise import accuracy
from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor, SVD, SVDpp
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
# from surprise.model_selection import LeaveOneOut


In [2]:
class MovieLensData:
    """
    Movie Lens Data
    """
    def __init__(self, users_path, ratings_path, movies_path, genre_path):
        self.users_path = users_path
        self.ratings_path = ratings_path
        self.movies_path = movies_path
        self.genre_path = genre_path
        
    def read_user_data(self):
        """
        read user data, set user_data
        """
        user_columns = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
        self.user_data = pd.read_csv(self.users_path, sep='|', names=user_columns)
        return self.user_data

    def read_ratings_data(self):
        """
        read ratings data, set ratings_data
        
        """
        ratings_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
        ratings_df = pd.read_csv(self.ratings_path, sep='\t', names=ratings_columns)
        ratings_df.drop( "unix_timestamp", inplace = True, axis = 1 )
        self.ratings_data_df = ratings_df
        reader = Reader(rating_scale=(1, 5))
        self.ratings_data = Dataset.load_from_df(ratings_df, reader=reader)
        
        return self.ratings_data

    def clean_title(self, title):
        """
        auxiliary function for readings movie data
        """
        return re.sub("[\(\[].*?[\)\]]", "",title)

    def process_genre(self, series):
        """
        auxiliary function for readings movie data
        """        
        genres = series.index[6:-2]    
        text = []
        for i in genres:
            if series[i] == 1:
                text.append(i)
                break
        return ", ".join(text)
    
    def read_movies_data(self):
        """
        read movies data, set movie_data
        
        """
        self.movie_id_to_name = {}
        self.name_to_movie_id = {}
        
        genre_df = pd.read_csv(self.genre_path, sep='|', encoding='latin-1')
        genre_columns = ["unknown"] + list(genre_df[genre_df.columns[0]].values)

        movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
        self.movie_data = pd.read_csv(self.movies_path, sep='|', names=movie_columns+genre_columns,
                     encoding='latin-1')
        self.movie_data['title'] = self.movie_data['title'].apply(self.clean_title)
        self.movie_data['title'] = self.movie_data['title'].str.strip()
        self.movie_data['genre'] = self.movie_data.apply(self.process_genre,axis=1)
        
        
        for index, row in self.movie_data.iterrows():
            movie_id = int(row['movie_id'])
            movie_name = row['title']
            self.movie_id_to_name[movie_id] = movie_name
            self.name_to_movie_id[movie_name] = movie_id
            
        return self.movie_data
    
    def get_user_ratings(self, user):
        """
        select ratings for a certain user
        Args
            user: user for which to return the ratings
        Returns
            the ratings for a certain user
        """
        user_ratings = []
        hit_user = False
        user_ratings = self.ratings_data_df.loc[self.ratings_data_df.user_id==user]
        user_ratings = user_ratings[['movie_id', 'rating']]

        return user_ratings     
    
    def get_popularity_ranks(self):
        ratings = defaultdict(int)
        rankings = defaultdict(int)
        for index, row in self.ratings_data_df.iterrows():
            movie_id = int(row['movie_id'])
            ratings[movie_id] += 1
        rank = 1
        for movie_id, rating_count in sorted(ratings.items(), key=lambda x: x[1], reverse=True):
            rankings[movie_id] = rank
            rank += 1
        return rankings
    
    def get_movie_name(self, movie_id):
        if movie_id in self.movie_id_to_name:
            return self.movie_id_to_name[movie_id]
        else:
            return ""
        
    def get_movie_id(self, movie_name):
        if movie_name in self.name_to_movie_id:
            return self.name_to_movie_id[movie_name]
        else:
            return 0

In [3]:
path = "../ml-100k"
movie_lens_data = MovieLensData(
    users_path = os.path.join(path, "u.user"),
    ratings_path = os.path.join(path, "u.data"), 
    movies_path = os.path.join(path, "u.item"), 
    genre_path = os.path.join(path, "u.genre") 
    )

In [4]:
data = movie_lens_data.read_ratings_data()
train_set, test_set = train_test_split(data, test_size=0.25, random_state=42)

In [6]:
# MySVDpp
class MySVDpp(AlgoBase):

    def __init__(self, n_factors=20, n_epochs=20, init_mean=0, init_std_dev=.1,
                 gamma_all=.007, lambda_all=.02,
                 gamma1=None, gamma2=None, gamma3=None,
                 lambda1=None, lambda2=None, lambda3=None,
                 random_state=None,
                 testset=None
                 ):
        self.gamma1 = gamma1 if gamma1 is not None else gamma_all
        self.gamma2 = gamma2 if gamma2 is not None else gamma_all
        self.gamma3 = gamma3 if gamma3 is not None else gamma_all
        self.lambda1 = lambda1 if lambda1 is not None else lambda_all
        self.lambda2 = lambda2 if lambda2 is not None else lambda_all
        self.lambda3 = lambda3 if lambda3 is not None else lambda_all
        self.n_factors = n_factors
        self.n_epochs = n_epochs
        self.init_mean = init_mean
        self.init_std_dev = init_std_dev
        self.random_state = random_state
        self.testset = testset

        AlgoBase.__init__(self)

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.sgd(trainset)
        return self

    def sgd(self, trainset):
        rng = np.random.RandomState(self.random_state)
        self.BU = np.zeros(trainset.n_users, dtype=np.double)
        self.BI = np.zeros(trainset.n_items, dtype=np.double)
        self.P = rng.normal(self.init_mean, self.init_std_dev,
                            size=(trainset.n_users, self.n_factors))
        self.Q = rng.normal(self.init_mean, self.init_std_dev,
                            size=(trainset.n_items, self.n_factors))
        self.Y = rng.normal(self.init_mean, self.init_std_dev,
                            size=(trainset.n_items, self.n_factors))
        Z = np.zeros(self.n_factors, dtype=np.double)

        g1 = self.gamma1
        g2 = self.gamma2
        g3 = self.gamma3
        l1 = self.lambda1
        l2 = self.lambda2
        l3 = self.lambda3

        max_Iu_length = 0
        for u in range(trainset.n_users):
            max_Iu_length = max(max_Iu_length, len(trainset.ur[u]))
        Iu = [0]*max_Iu_length

        self.RMSE = list()
        self.MAE = list()
        for current_epoch in range(self.n_epochs):
            print(' processing epoch %d' % current_epoch, flush=True)

            for u, i, r in trainset.all_ratings():

                # items rated by u.
                for k, (j, _) in enumerate(trainset.ur[u]):
                    Iu[k] = j
                nu = len(trainset.ur[u])

                nuq = np.sqrt(nu)

                # compute user implicit feedback
                Pu = self.P[u, :].copy()
                Qi = self.Q[i, :].copy()

                Z[:] = 0
                for k in range(nu):
                    Z += self.Y[Iu[k], :]
                Z /= nuq
                Z += Pu

                # compute current error
                err = r - (self.trainset.global_mean +
                           self.BU[u] + self.BI[i] + np.dot(Qi, Z))

                # update biases
                self.BU[u] += g1 * (err - l1 * self.BU[u])
                self.BI[i] += g1 * (err - l1 * self.BI[i])

                # update factors
                self.P[u, :] += g2 * (err * Qi - l2 * Pu)
                self.Q[i, :] += g2 * (err * Z - l2 * Qi)
                nueq = err * Qi / nuq
                for k in range(nu):
                    j = Iu[k]
                    self.Y[j, :] += g3 * (nueq - l3 * self.Y[j, :])

            predictions = self.test(self.testset)
            rmse = accuracy.rmse(predictions, verbose=True)
            mae = accuracy.mae(predictions, verbose=True)
            self.RMSE.append(rmse)
            self.MAE.append(mae)
            # print('  err %lf %lf' % (rmse, mae), flush=True)
            print('', flush=True)

    def estimate(self, u, i):

        est = self.trainset.global_mean

        if self.trainset.knows_user(u):
            est += self.BU[u]

        if self.trainset.knows_item(i):
            est += self.BI[i]

        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            nu = len(self.trainset.ur[u])  # nb of items rated by u
            u_impl = (sum(self.Y[j]
                      for (j, _) in self.trainset.ur[u]) / np.sqrt(nu))
            est += np.dot(self.Q[i], self.P[u] + u_impl)

        return est

    def draw(self):
        plt.plot(range(self.n_epochs), self.RMSE)
        plt.title('RMSE')
        # plt.xlabel('Number of Epochs')
        # plt.ylabel('RMSE')
        plt.legend()
        plt.grid()
        # plt.show()
        plt.savefig('./rmse.png')
        plt.clf()

        plt.plot(range(self.n_epochs), self.MAE)
        plt.title('MAE')
        # plt.xlabel('Number of Epochs')
        # plt.ylabel('MAE')
        plt.legend()
        plt.grid()
        # plt.show()
        plt.savefig('./mae.png')
        plt.clf()


In [7]:
algo_np = MySVDpp(testset=test_set)
algo_np.fit(train_set)

 processing epoch 0


RMSE: 0.9968
MAE:  0.7999

 processing epoch 1
RMSE: 0.9719
MAE:  0.7737

 processing epoch 2
RMSE: 0.9609
MAE:  0.7632

 processing epoch 3
RMSE: 0.9544
MAE:  0.7570

 processing epoch 4
RMSE: 0.9497
MAE:  0.7526

 processing epoch 5
RMSE: 0.9459
MAE:  0.7489

 processing epoch 6
RMSE: 0.9423
MAE:  0.7456

 processing epoch 7
RMSE: 0.9388
MAE:  0.7423

 processing epoch 8
RMSE: 0.9353
MAE:  0.7390

 processing epoch 9
RMSE: 0.9320
MAE:  0.7358

 processing epoch 10
RMSE: 0.9290
MAE:  0.7329

 processing epoch 11
RMSE: 0.9265
MAE:  0.7304

 processing epoch 12
RMSE: 0.9246
MAE:  0.7283

 processing epoch 13
RMSE: 0.9233
MAE:  0.7268

 processing epoch 14
RMSE: 0.9224
MAE:  0.7257

 processing epoch 15
RMSE: 0.9220
MAE:  0.7250

 processing epoch 16
RMSE: 0.9220
MAE:  0.7246

 processing epoch 17
RMSE: 0.9224
MAE:  0.7245

 processing epoch 18
RMSE: 0.9231
MAE:  0.7246

 processing epoch 19
RMSE: 0.9241
MAE:  0.7250



<__main__.MySVDpp at 0x7fb8bcea2610>

In [8]:
algo_np.draw()

No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


No artists with labels found to put in legend.  Note that artists whose label start with an underscore are ignored when legend() is called with no argument.


<Figure size 640x480 with 0 Axes>

In [10]:
# algo_svd = SVDpp(n_factors=20)
algo_svd = SVDpp(n_factors=20, n_epochs=20, verbose=True)
algo_svd.fit(train_set)
RMSE = list()
MAE = list()
for _ in range(20):
    predictions = algo_svd.test(test_set)
    rmse = accuracy.rmse(predictions, verbose=True)
    mae = accuracy.mae(predictions, verbose=True)
    RMSE.append(rmse)
    MAE.append(mae)
    # print('  err %lf %lf' % (rmse, mae), flush=True)
    print('', flush=True)

 processing epoch 0
 processing epoch 1
 processing epoch 2
 processing epoch 3
 processing epoch 4
 processing epoch 5
 processing epoch 6
 processing epoch 7
 processing epoch 8
 processing epoch 9
 processing epoch 10
 processing epoch 11
 processing epoch 12
 processing epoch 13
 processing epoch 14
 processing epoch 15
 processing epoch 16
 processing epoch 17
 processing epoch 18
 processing epoch 19
RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

RMSE: 0.9250
MAE:  0.7256

