In [1]:
import numpy as np
import pandas as pd

In [2]:
import utilities # codeTimer context manager.
import data_preparation # Load dataset and build required matrices.
import factorisation # WALS factorisation.
import recommender # Recommender system.

## Factorisation

In [3]:
# Fix NumPy's global seed so the stochastic steps below are repeatable.
np.random.seed(17)

# Load the dataset; the two returned objects are handed straight to the
# recommender (presumably movies and ratings tables — confirm in data_preparation).
mov, rat = data_preparation.importDataset()
rec = recommender.recommenderSystem(mov, rat)
# Baseline error reported before performFactorisation() is called in the next cell.
rec.predictionError()

The dataframe contains 610 users and 2999 items.
Prediction error: 1133575640.90861


In [4]:
# Arguments forwarded to performFactorisation(); reg_lambda is also reused by
# the new-user cell further down.
# presumably the regularisation weight and the number of WALS iterations
# (the recorded output shows four train/test error pairs) — confirm in recommender.
reg_lambda = 0.1
n_iter = 4

# NOTE(review): the recorded output reports identical Test and Train errors at
# every iteration — verify the train/test split inside performFactorisation().
with utilities.codeTimer("WALS factorisation"):
    rec.performFactorisation(reg_lambda, n_iter)

Test error: 1893085.9978277823
Train error: 1893085.9978277823
Test error: 900696.2858679722
Train error: 900696.2858679722
Test error: 726924.1252320689
Train error: 726924.1252320689
Test error: 668523.4250739744
Train error: 668523.4250739744
Executed 'WALS factorisation'.  Elapsed time: 174.159575s


## Recommendation

In [5]:
def recommend(rec_system, user_id):
    """Return the result of ``rec_system.answerQuery(user_id)`` unchanged.

    Parameters
    ----------
    rec_system : object exposing an ``answerQuery(user_id)`` method.
    user_id : identifier forwarded to ``answerQuery``.
    """
    recommendations = rec_system.answerQuery(user_id)
    return recommendations
        
def bestRated(rec_system, user_id):
    """Return the user's movies ordered by the "Rating" column, highest first.

    The frame comes from ``rec_system.getUserMovies(user_id)``; only its row
    order is changed (the original index labels are kept).
    """
    return rec_system.getUserMovies(user_id).sort_values(by = "Rating", ascending = False)

In [6]:
# Pick a user and show the first ten rows returned by recommend().
user_id = 4
recommend(rec, user_id).head(10)

Unnamed: 0,MovieID,Prediction,Title,Genres,AVG_Rating
0,418,3.33,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.75
1,234,3.1,"Madness of King George, The (1994)",Comedy|Drama,3.758065
2,551,3.03,James and the Giant Peach (1996),Adventure|Animation|Children|Fantasy|Musical,3.44898
3,192,2.83,Disclosure (1994),Drama|Thriller,3.538462
4,334,2.6,Speed (1994),Action|Romance|Thriller,3.52924
5,785,2.53,Mary Poppins (1964),Children|Comedy|Fantasy|Musical,3.887324
6,16,2.51,Sense and Sensibility (1995),Drama|Romance,3.776119
7,31,2.51,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,3.983051
8,505,2.43,Ghost (1990),Comedy|Drama|Fantasy|Romance|Thriller,3.434783
9,287,2.4,Star Trek: Generations (1994),Adventure|Drama|Sci-Fi,3.393519


In [7]:
# For comparison: the same user's own movies, highest "Rating" first (top ten).
bestRated(rec, user_id).head(10)

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
15876,508,4,Adventure|Drama|Western,Dances with Wolves (1990),5.0
13038,413,4,Drama,In the Name of the Father (1993),5.0
14108,461,4,Drama|War,Schindler's List (1993),5.0
2626,52,4,Comedy|Drama|Romance,"Postman, The (Postino, Il) (1994)",5.0
16731,513,4,Animation|Children|Fantasy|Musical,Pinocchio (1940),5.0
16585,512,4,Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast (1991),5.0
6565,211,4,Crime|Drama,Heavenly Creatures (1994),5.0
16507,511,4,Animation|Children|Drama|Fantasy|Musical,Snow White and the Seven Dwarfs (1937),5.0
7862,257,4,Comedy|Crime|Drama|Thriller,Pulp Fiction (1994),5.0
7606,251,4,Crime|Drama,Once Were Warriors (1994),5.0


## TBD: test recommendation

## Similar items

In [8]:
# Some MovieIDs worth trying as the argument of suggestSimilar():
#
# 911: Star Wars Episode VI
# 786: Dumbo
# 957: The Shining
# 474: Blade Runner

# List the items most similar to Blade Runner (MovieID 474).
rec.suggestSimilar(474)

Unnamed: 0,MovieID,Title,Genres,Similarity
474,474,Blade Runner (1982),Action|Sci-Fi|Thriller,1.0
706,706,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,0.652995
896,896,One Flew Over the Cuckoo's Nest (1975),Drama,0.630796
969,969,Back to the Future (1985),Adventure|Comedy|Sci-Fi,0.628644
915,915,Alien (1979),Horror|Sci-Fi,0.625188
863,863,Monty Python and the Holy Grail (1975),Adventure|Comedy|Fantasy,0.603547
900,900,Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure,0.603292
939,939,"Terminator, The (1984)",Action|Sci-Fi|Thriller,0.592732
902,902,Aliens (1986),Action|Adventure|Horror|Sci-Fi,0.570372
973,973,Akira (1988),Action|Adventure|Animation|Sci-Fi,0.568143


# OLD CODE, TO BE REFACTORED

## Test recom

In the _ml-latest-small_ dataset, `MovieId` values are not contiguous: even though fewer than 10000 movies are present, the index goes up to ~19000. To fix this inconvenience and make dataframe indexing more intuitive, a more appropriate index has been built. If necessary, the original IDs can be recovered by storing a two-column conversion dataframe.

In [None]:
# Build R matrix and dividing training/test sets.

from scipy.sparse import coo_matrix

np.random.seed(17)

# Dividing training and test set.

# Test set percentage.
p_test = 0.2

# Random boolean, TRUE for test set.
# np.random.choice pairs p[i] with a[i], so True (test) must get p_test and
# False (train) must get 1 - p_test; the previous order sent 80% of the cells
# to the test set.
mask = np.random.choice(a = [False, True], size = R.size,
                        p = [1 - p_test, p_test]).reshape(R.shape)

# Test matrix keeps only the masked cells; every other cell stays 0.
R_test = np.zeros(R.shape)
R_test[mask] = R[mask]

# Training matrix keeps the complementary cells.
R_train = np.zeros(R.shape)
R_train[~mask] = R[~mask]

# Are the R_test and R_train matrices correctly built?
# The two matrices are disjoint, so their sum must reproduce R exactly.
print("R_test and R_train are correctly built: {}"
      .format(np.array_equal(R, R_test + R_train)))

In [None]:
# Compares recommendation on test set with actual ratings.

def recommendTest(user_id, R_test, predicted_ratings):
    """
    Pair the predictions for a user's test-set items with the rows of `R_df`.

    The items considered are the positions where ``R_test[user_id]`` is
    non-zero. Their predictions (rounded to two decimals) are merged with the
    notebook-level `R_df` dataframe on "MovieID", restricted to rows whose
    "UserID" equals `user_id`, and returned sorted by "Prediction", highest
    first.
    """
    user_predictions = np.around(predicted_ratings, 2)[user_id]
    # Unseen movies (test set ones)
    held_out = np.where(R_test[user_id] != 0)[0]
    rows = [(movie, user_predictions[movie]) for movie in held_out]
    candidates = pd.DataFrame(rows, columns = ['MovieID', 'Prediction'])

    return (
        candidates
        .merge(R_df, on = "MovieID", how = "inner")
        .loc[lambda merged: merged.UserID == user_id]
        .sort_values(by = 'Prediction', ascending = False)
    )

In [None]:
#user_id = 6
#recommendTest(user_id, R_test, predicted_ratings).head(10)

## Suggesting similar items

## Recommendation for a new user

In [None]:
from random import sample, choice

def generateNewUser(R, n_movies):
    """Build a random ratings row for a synthetic user.

    `n_movies` distinct column positions of `R` are drawn without replacement
    and each receives a rating picked uniformly from 0.5, 1.0, ..., 5.0; every
    other position stays 0.

    Returns
    -------
    (new_user, new_user_id) : the 1-D ratings vector (one entry per column of
        `R`) and ``len(R)``, i.e. the row index the user gets once appended.
    """
    n_items = np.shape(R)[1]
    rating_scale = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

    # Get indices of watched movies.
    watched = sample(range(n_items), n_movies)

    ratings_row = np.zeros(n_items)
    for movie_idx in watched:
        ratings_row[movie_idx] = choice(rating_scale)

    return ratings_row, len(R)


def updateMatrices(new_user, R, C, X):
    """Append a new user to R, rebuild the weights and extend X by one random row.

    The incoming `C` is not read: the returned weight matrix is recomputed from
    the enlarged R via ``buildWeightMatrix(R, alpha = 10)``. The new row of X is
    drawn with ``np.random.rand`` and has as many entries as X has columns.

    Returns the tuple ``(R, C, X)`` of new arrays; the arguments are not
    modified in place.
    """
    # Adding new user to R matrix.
    stacked_ratings = np.vstack((R, new_user))
    rebuilt_weights = buildWeightMatrix(stacked_ratings, alpha = 10)
    fresh_factors = np.random.rand(np.shape(X)[1])

    return stacked_ratings, rebuilt_weights, np.vstack((X, fresh_factors))


def updateDataFrame(R_df, new_user):
    """Return `R_df` extended with the non-zero ratings of `new_user`.

    Parameters
    ----------
    R_df : DataFrame with at least a "UserID" column; the new rows are given
        the columns MovieID, UserID, Genres, Title and Rating.
    new_user : 1-D sequence of ratings; position i is used as MovieID i and a
        value of 0 means "not rated" (such rows are dropped).

    Returns
    -------
    A new DataFrame sorted by ["MovieID", "UserID"]. The new user's id is
    ``R_df["UserID"].max() + 1``. Genres and titles are looked up in the
    notebook-level `movies_df` dataframe with an inner join, so rated
    positions with no match there are dropped.
    """
    new_df = pd.DataFrame(new_user, columns=["Rating"])
    new_df["MovieID"] = range(0, len(new_user))
    new_df["UserID"] = R_df["UserID"].max() + 1
    new_df = new_df[new_df["Rating"] != 0]
    new_df = pd.merge(new_df, movies_df, on = "MovieID", how = "inner")
    new_df = new_df[['MovieID', 'UserID', 'Genres', 'Title', 'Rating']]

    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # equivalent and behaves the same with ignore_index = True.
    R_df = pd.concat([R_df, new_df], ignore_index = True).sort_values(by = ["MovieID", "UserID"])

    return R_df

In [None]:
def newUserSinglePassWALS(new_user, R, C, X, Y, reg_lambda):
    """Re-solve rows of the user-factor matrix X in place, keeping Y fixed.

    For every processed row u the regularised weighted least-squares system

        (Y^T diag(C[u]) Y + reg_lambda * I) x_u = Y^T diag(C[u]) R[u]

    is solved and the solution is written to ``X[u]``.

    Parameters
    ----------
    new_user : 1-D ratings vector, stacked under a local copy of R.
    R, C : ratings and weight matrices, indexed as [user, item].
    X : user factors (M x K), modified in place.
    Y : item factors (N x K), read only.
    reg_lambda : scalar multiplying the K x K identity added to the system.

    Returns None. Assumes R, C and Y are plain ndarrays (not np.matrix), since
    the weighting relies on element-wise broadcasting — TODO confirm.
    """
    # Updating R matrix. This only rebinds the local name; the caller's array
    # is left untouched.
    # NOTE(review): the driver cell already appends new_user through
    # updateMatrices(), so the row ends up stacked twice here — only the first
    # M rows are read below, but confirm which of the two calls is intended.
    R = np.vstack((R, new_user))

    M = np.shape(X)[0]
    K = np.shape(X)[1]
    ridge = reg_lambda * np.eye(K)

    # Perform user matrix optimisation.
    # NOTE(review): the loop starts at 1, so row 0 of X is never re-solved, and
    # every other row is recomputed rather than just the new user's — confirm
    # whether range(M) or only the last row was intended.
    for u in range(1, M):
        # Y.T * C[u] scales column i of Y.T by the weight C[u, i], which equals
        # Y.T @ diag(C[u]) without materialising the N x N diagonal matrix
        # (and without the `multi_dot` helper, which this notebook never imports).
        weighted_Yt = Y.T * C[u, :]
        # A X_u = b
        A = np.dot(weighted_Yt, Y) + ridge
        b = np.dot(weighted_Yt, R[u, :])
        X[u, :] = np.linalg.solve(A, b)

In [None]:
# Creating new user and updating both matrices and the R_df dataframe.

np.random.seed(17)

# Synthetic user with 130 randomly rated movies.
new_user, new_user_id = generateNewUser(R, 130)

# NOTE: this cell rebinds R, C, X and R_df, so running it twice appends the
# user twice — re-create those objects before re-running.
R, C, X = updateMatrices(new_user, R, C, X)
R_df = updateDataFrame(R_df, new_user)

# codeTimer lives in the `utilities` module (imported as a module at the top of
# the notebook), so it has to be referenced through it.
with utilities.codeTimer("New user recomputation"):
    newUserSinglePassWALS(new_user, R, C, X, Y, reg_lambda)

In [None]:
# Display the ratings matrix after the new-user update in the previous cell.
R

In [None]:
# NOTE(review): `predict` and `recommendNew` are not defined in the visible
# notebook — presumably they lived in a removed cell or module; confirm before
# running this cell on a fresh kernel.
predicted_ratings = predict(X, Y)
recommendNew(new_user_id, R, predicted_ratings).head(10)

In [None]:
# The new user's 15 highest ratings, to compare against the recommendations.
R_df[R_df['UserID'] == new_user_id].sort_values(by = 'Rating', ascending = False).head(15)

## Cold start problem

In [None]:
# Scratch cell: the `test = R` alias is commented out, so `test` is not defined
# for the np.insert cell below.
#test = R
R

In [None]:
# Check the length of the new user's ratings vector.
np.shape(new_user)

In [None]:
# np.insert(arr, obj, values, axis) needs both the insertion index and the
# values; the original call omitted the index and used `test`, which is never
# assigned (its definition is commented out). Index len(R) appends new_user as
# the last row and returns a new array, leaving R itself unchanged.
np.insert(R, len(R), new_user, axis = 0)

In [None]:
# Append new_user as an extra row of R.
# NOTE(review): this rebinds R, so every re-run of the cell adds another copy
# of the row (and the new-user driver cell above already appended it once).
R = np.vstack((R, new_user))

In [None]:
# Plot the new user's ratings vector against item position.
# NOTE(review): `plt` is only imported in the cell after this one, so this cell
# fails on a fresh top-to-bottom run — move the matplotlib import to the top.
plt.plot(new_user)

In [None]:
import matplotlib.pyplot as plt

## Classify test commit.

In [None]:
class RecommenderSystem:
    """Placeholder for the class-based recommender (not implemented yet)."""

    def __init__(self):
        # TODO: decide on the constructor arguments and state; the original
        # stub ended at `def __init__(self, )` with no colon or body, which is
        # a SyntaxError.
        pass

In [None]:
def recommend(rec_system, user_id):
    """Print the recommendations for `user_id`, at most 10 rows, all columns.

    The frame comes from ``rec_system.answer_query(user_id)``. Nothing is
    returned; the result is only printed.

    NOTE(review): this redefinition shadows the earlier `recommend`, which
    returns the dataframe and calls `answerQuery` (camelCase) — confirm which
    method name the recommender class actually exposes.
    """
    recom_df = rec_system.answer_query(user_id)
    with pd.option_context('display.max_rows', 10,
                           'display.max_columns', None):
        # Print the frame fetched above (the original printed the undefined
        # name `df`).
        print(recom_df)