In [1]:
import numpy as np
import pandas as pd

In [2]:
import utilities # codeTimer context manager.
import data_preparation # Load dataset and build required matrices.
import factorisation # WALS factorisation.
import recommender # Recommender system.

## Factorisation

In [3]:
# Fix NumPy's global random seed before any project code runs, so the
# results of this notebook are repeatable across runs.
np.random.seed(17)

# Load the dataset through the project's data_preparation module; the two
# returned objects are handed straight to the recommender constructor.
# (presumably movies and ratings tables — TODO confirm against data_preparation)
mov, rat = data_preparation.importDataset()
rec = recommender.recommenderSystem(mov, rat)
# Report the prediction error before any factorisation has been performed
# (baseline to compare with the errors printed by the factorisation cell).
rec.predictionError()

The dataframe contains 610 users and 2999 items.
Prediction error: 1133575640.90861


In [4]:
# Hyper-parameters passed to the factorisation. `reg_lambda` is reused later
# when a new user is added.
# (presumably the regularisation weight and the number of WALS iterations —
# TODO confirm against recommender.performFactorisation)
reg_lambda = 0.1
n_iter = 1

# codeTimer is a context manager that reports the elapsed time of the block.
# NOTE(review): the recorded output of this cell shows identical "Test error"
# and "Train error" values — confirm performFactorisation evaluates the two
# errors on distinct sets.
with utilities.codeTimer("WALS factorisation"):
    rec.performFactorisation(reg_lambda, n_iter)

Test error: 1893085.9978277823
Train error: 1893085.9978277823
Executed 'WALS factorisation'.  Elapsed time: 37.711401s


## Recommendation

In [5]:
def recommend(rec_system, user_id):
    """
    Ask the recommender system for its answer to a query about `user_id`.

    Thin convenience wrapper: forwards `user_id` to
    `rec_system.answerQuery` and returns that call's result unchanged.
    """
    answer = rec_system.answerQuery(user_id)
    return answer
        
def bestRated(rec_system, user_id):
    """
    Return the movies rated by `user_id`, highest rating first.

    The rows come from `rec_system.getUserMovies(user_id)` and are ordered
    by their "Rating" column in descending order.
    """
    rated = rec_system.getUserMovies(user_id)
    ranked = rated.sort_values(by = "Rating", ascending = False)
    return ranked

In [6]:
# Example user for this demo; the same `user_id` is reused by the
# bestRated cell that follows.
user_id = 4
# Display the first 10 rows of the recommender's answer for this user.
recommend(rec, user_id).head(10)

Unnamed: 0,MovieID,Prediction,Title,Genres,AVG_Rating
350,378,3.99,Cliffhanger (1993),Action|Adventure|Thriller,3.034653
128,138,3.98,Die Hard: With a Vengeance (1995),Action|Crime|Thriller,3.555556
1164,1209,3.66,Air Force One (1997),Action|Thriller,3.344828
218,234,3.53,"Madness of King George, The (1994)",Comedy|Drama,3.758065
507,551,3.49,James and the Giant Peach (1996),Adventure|Animation|Children|Fantasy|Musical,3.44898
336,364,3.19,Beverly Hills Cop III (1994),Action|Comedy|Crime|Thriller,2.720339
1090,1135,3.17,Liar Liar (1997),Comedy,3.033784
308,334,3.11,Speed (1994),Action|Romance|Thriller,3.52924
1179,1224,3.07,"Game, The (1997)",Drama|Mystery|Thriller,3.701299
8,9,3.07,GoldenEye (1995),Action|Adventure|Thriller,3.496212


In [7]:
# First 10 rows of the user's own rated movies, highest rating first, to
# compare by eye against the recommendations shown for the same user.
bestRated(rec, user_id).head(10)

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
15876,508,4,Adventure|Drama|Western,Dances with Wolves (1990),5.0
13038,413,4,Drama,In the Name of the Father (1993),5.0
14108,461,4,Drama|War,Schindler's List (1993),5.0
2626,52,4,Comedy|Drama|Romance,"Postman, The (Postino, Il) (1994)",5.0
16731,513,4,Animation|Children|Fantasy|Musical,Pinocchio (1940),5.0
16585,512,4,Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast (1991),5.0
6565,211,4,Crime|Drama,Heavenly Creatures (1994),5.0
16507,511,4,Animation|Children|Drama|Fantasy|Musical,Snow White and the Seven Dwarfs (1937),5.0
7862,257,4,Comedy|Crime|Drama|Thriller,Pulp Fiction (1994),5.0
7606,251,4,Crime|Drama,Once Were Warriors (1994),5.0


## TBD: test recommendation

## Similar items
Some suggestions:
* 911: Star Wars Episode VI
* 786: Dumbo
* 957: The Shining
* 474: Blade Runner

In [8]:
# 474 is the Blade Runner entry from the list of suggested IDs in the
# markdown cell preceding this one.
rec.suggestSimilar(474)

Unnamed: 0,MovieID,Title,Genres,Similarity
474,474,Blade Runner (1982),Action|Sci-Fi|Thriller,1.0
706,706,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,0.733741
2393,2393,Galaxy Quest (1999),Adventure|Comedy|Sci-Fi,0.70312
1305,1305,Dark City (1998),Adventure|Film-Noir|Sci-Fi|Thriller,0.692184
833,833,"Crying Game, The (1992)",Drama|Romance|Thriller,0.67973
896,896,One Flew Over the Cuckoo's Nest (1975),Drama,0.672173
2233,2233,Time Bandits (1981),Adventure|Comedy|Fantasy|Sci-Fi,0.671319
2350,2350,"Fisher King, The (1991)",Comedy|Drama|Fantasy|Romance,0.66943
2316,2316,Adventures of Buckaroo Banzai Across the 8th D...,Adventure|Comedy|Sci-Fi,0.668396
1485,1485,Metropolis (1927),Drama|Sci-Fi,0.665558


## New user recommendation

In [9]:
# Re-seed NumPy's global generator so this cell behaves the same on every run.
np.random.seed(17)

# Ask the recommender to generate a new user.
# (the argument is presumably the number of movies the user rates — TODO
# confirm against recommender.generateNewUser)
new_user, new_user_id = rec.generateNewUser(8)
# Display the shape of rec.R before the new user is added.
np.shape(rec.R)

(610, 2999)

In [10]:
# Add the generated user to the recommender, timing the call and reusing the
# `reg_lambda` value set for the main factorisation.
# NOTE(review): this mutates `rec`; re-running the cell presumably adds the
# user again — confirm before re-executing out of order.
with utilities.codeTimer("New user factorisation"):
    rec.addNewUser(new_user, reg_lambda)
# Display the shape of rec.R again to see the effect of adding the user.
np.shape(rec.R)

Executed 'New user factorisation'.  Elapsed time: 30.383409s


(611, 2999)

In [11]:
# Display the first 10 rows of the recommender's answer for the new user.
recommend(rec, new_user_id).head(10)

Too few movies! Most poular movies will be suggested.


Unnamed: 0,MovieID,Title,Genres,AVG_Rating,Counts
314,314,Forrest Gump (1994),Comedy|Drama|Romance|War,4.164134,329
277,277,"Shawshank Redemption, The (1994)",Crime|Drama,4.429022,317
257,257,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307
510,510,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.16129,279
1938,1939,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.192446,278
224,224,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231076,251
418,418,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.75,238
97,97,Braveheart (1995),Action|Drama|War,4.031646,237
507,507,Terminator 2: Judgment Day (1991),Action|Sci-Fi,3.970982,224
461,461,Schindler's List (1993),Drama|War,4.225,220


In [12]:
# Display the new user's own ratings, highest first (at most 10 rows).
bestRated(rec, new_user_id).head(10)

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
59364,36,610,Drama,"Cry, the Beloved Country (1995)",5.0
59365,92,610,Comedy,Happy Gilmore (1996),4.0
59367,691,610,Comedy|Drama|Romance,"Apartment, The (1960)",4.0
59368,1220,610,Crime|Drama|Mystery|Thriller,Kiss the Girls (1997),4.0
59371,2684,610,Drama|Musical,Center Stage (2000),4.0
59370,1867,610,Drama|Romance,My Name Is Joe (1998),3.0
59366,515,610,Adventure|Western,"Wild Bunch, The (1969)",1.0
59369,1342,610,Comedy|Drama,The Players Club (1998),0.5


## Cold start problem
If a new user has rated fewer than 10 movies, the most popular movies that the user has not yet seen will be recommended.

In [None]:
# NOTE(review): this unexecuted cell repeats, statement for statement, the
# "New user recommendation" cells earlier in the notebook. Running the
# notebook top to bottom would call generateNewUser/addNewUser a second time
# on the same `rec` — confirm that is intended or drop the duplicate.
np.random.seed(17)

new_user, new_user_id = rec.generateNewUser(8)
# This is not the last expression of the cell, so its value is not displayed.
np.shape(rec.R)

# Add the generated user, timing the call and reusing `reg_lambda` from the
# factorisation cell.
with utilities.codeTimer("New user factorisation"):
    rec.addNewUser(new_user, reg_lambda)

In [None]:

# Display the first 10 rows of the recommender's answer for the new user.
recommend(rec, new_user_id).head(10)

# OLD CODE, TO BE REFACTORED

## Test recommendation

In the _ml-latest-small_ dataset, `MovieId` values do not increase contiguously: even though fewer than 10000 movies are present, the index goes up to ~19000. To fix this inconvenience and make the dataframe indexing more intuitive, a more appropriate index has been built. If necessary, the original IDs could be recovered by storing a two-column conversion dataframe.

In [None]:
# Build the R matrix split into training and test sets.
# NOTE(review): `R` is not defined in the visible part of this notebook; this
# legacy cell needs it to exist as a 2-D NumPy array before it can run.

# NOTE(review): `coo_matrix` is not used in this cell; the import is kept in
# case later legacy cells rely on it.
from scipy.sparse import coo_matrix

np.random.seed(17)

# Fraction of the entries of R that is held out for the test set.
p_test = 0.2

# Random boolean mask with the same shape as R; True marks a test-set entry.
# The probabilities in `p` are matched positionally with the values in `a`,
# so False gets 1 - p_test and True gets p_test. (They were previously
# swapped, which sent ~80% of the entries to the test set instead of ~20%.)
mask = np.random.choice(a = [False, True], size = R.size,
                        p = [1 - p_test, p_test]).reshape(R.shape)
train_mask = np.invert(mask)

# Test matrix: the masked entries of R, zero everywhere else.
R_test = np.zeros(R.shape)
R_test[mask] = R[mask]

# Training matrix: the complementary entries of R, zero everywhere else.
R_train = np.zeros(R.shape)
R_train[train_mask] = R[train_mask]

# Sanity check: the two matrices partition R. This holds for any mask, so it
# verifies the construction but not the train/test proportion.
print("R_test and R_train are correctly built: {}"
      .format(np.array_equal(R, R_test + R_train)))

In [None]:
# Compares recommendation on test set with actual ratings.

def recommendTest(user_id, R_test, predicted_ratings, ratings_df = None):
    """
    Compare the predictions for a user's test-set movies with the ratings
    recorded for that user.

    Parameters
    ----------
    user_id : int
        Row index of the user in `R_test` and `predicted_ratings`; also the
        value matched against the `UserID` column of the ratings dataframe.
    R_test : numpy.ndarray
        Test matrix; the non-zero entries of row `user_id` select the movies
        to report.
    predicted_ratings : numpy.ndarray
        Predicted ratings, indexed the same way as `R_test`.
    ratings_df : pandas.DataFrame, optional
        Ratings table with at least `MovieID` and `UserID` columns. When
        omitted, the notebook-level `R_df` is used, as before.

    Returns
    -------
    pandas.DataFrame
        One row per test-set movie of the user that also appears in the
        ratings table, with the `MovieID` and `Prediction` columns followed
        by the remaining columns of the ratings table, sorted by
        `Prediction` in descending order. Predictions are rounded to two
        decimals.
    """
    if ratings_df is None:
        # Backward-compatible fallback on the notebook-level ratings table.
        ratings_df = R_df

    pred = np.around(predicted_ratings, 2)[user_id]
    # Movies held out for this user (non-zero entries of the test matrix).
    idx = np.where(R_test[user_id] != 0)[0]
    # Build the frame from typed arrays so the column dtypes stay numeric
    # even when the user has no test-set movie.
    recom_df = pd.DataFrame({'MovieID': idx, 'Prediction': pred[idx]})

    # Attach the recorded ratings, then keep only this user's rows.
    recom_df = pd.merge(recom_df, ratings_df, on = "MovieID", how = "inner")
    recom_df = recom_df[(recom_df.UserID == user_id)]
    recom_df = recom_df.sort_values(by = 'Prediction', ascending = False)

    return recom_df

In [None]:
#user_id = 6
#recommendTest(user_id, R_test, predicted_ratings).head(10)

## Cold start problem
If a new user has rated fewer than 10 movies, the most popular movies that the user has not yet seen will be recommended.