In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import utilities # codeTimer context manager and saving/loading utilities.
import data_preparation # Load dataset and build required matrices.
import factorisation # WALS factorisation.
import recommender # Recommender system.

## Loading dataset and creating recommender system

In [3]:
# Seed NumPy's global generator so that this cell is repeatable.
np.random.seed(17)

# Number of latent factors handed to the recommender.
k = 100

# Load the dataset (presumably the movie table plus train and test ratings --
# TODO confirm against data_preparation) and build the recommender on top of it.
mov, rat, rat_test = data_preparation.importDataset()
rec = recommender.recommenderSystem(mov, rat, rat_test, k)

# Report the prediction error of the system as it stands right after construction.
print(f"Prediction MAE: {rec.predictionError()}")

The dataframe contains 610 users and 9721 items.
Prediction MAE: 46.3893943883063


A pre-trained recommender system can be loaded using the following cell. The pre-trained system has been trained using the whole dataset with $k = 100$ latent factors and 10 iterations.

In order to save disk space, the saved system consists only of the item and user embedding matrices. This requires the user to call the previous cell, building the remaining components of the system. 

In [4]:
#utilities.loadRecSys(rec, "rec.npz")

## Exploratory analysis

In [None]:
# Density is the share of cells of rec.R that are non-zero (i.e. hold a rating);
# sparsity is its complement, the share of empty cells.
# The previous version printed the density under the "sparsity" label.
n_rated = np.count_nonzero(rec.R)
density = n_rated / np.prod(rec.R.shape)
sparsity = 1 - density
print("Matrix sparsity: {}%".format(sparsity*100))

In [None]:
# Distribution of the number of ratings per user.
from matplotlib.ticker import MaxNLocator, PercentFormatter
plt.rcParams.update({'font.size': 13})

# One entry per row of rec.R (one row per user): how many non-zero cells it has.
n_ratings = [np.count_nonzero(user_row) for user_row in rec.R]
n_users = np.shape(rec.R)[0]

fig, ax = plt.subplots()

ax.hist(n_ratings, bins = 150)

ax.set_title("Number of ratings histogram")
ax.set_ylabel("Users")
# Express bar heights as a share of all users. A formatter is used instead of
# set_yticklabels so the labels always follow the actual tick positions
# (fixed labels go stale as soon as the ticks are recomputed).
ax.yaxis.set_major_formatter(PercentFormatter(xmax = n_users, decimals = 0))
ax.set_xlim(0, 600)
ax.set_xlabel("Number of ratings")
ax.xaxis.set_major_locator(MaxNLocator(integer = True))

plt.savefig("plots/n_ratings.pdf", transparent = True)
plt.show()

In [None]:
# Relative frequency of each rating value.
# Imported here so the cell does not depend on an import made in another cell.
from matplotlib.ticker import PercentFormatter

# Keep only the non-zero cells of rec.R, i.e. the ratings actually given.
rates = rec.R.flatten()
rates = rates[rates != 0]
unique, counts = np.unique(rates, return_counts = True)

fig, ax = plt.subplots()
ax.bar(unique, counts / counts.sum(), width = 0.4)

ax.set_title("Distribution of ratings")

# Bar heights are fractions in [0, 1]; render them as percentages with a
# formatter so the labels stay attached to the real tick positions.
ax.yaxis.set_major_formatter(PercentFormatter(xmax = 1, decimals = 0))

ax.set_xlabel("Rating")
# rcParams is global state: this setting also applies to figures created later.
plt.rcParams['xtick.labelsize'] = 11
# One tick per distinct rating value. This fixes the tick positions, so no
# separate locator is needed on the x axis.
ax.set_xticks(list(unique))

plt.savefig("plots/ratings_dist.pdf", transparent = True)
plt.show()

## Selecting best regularisation coefficient with CV

Expect ~ 3 hours for the execution of the following cell.

In [None]:
# Reseed NumPy's global generator before the cross-validation run.
np.random.seed(17)

# Candidate lambda values to compare.
reg_lambda = [0.05, 0.15, 0.30]

# Number of CV folds and number of iterations passed to bestLambdaCV.
n_folds = 4
n_iter = 4

# reg_lambda is required to be a list.
# best_lambda and errors are reused by later cells (errors is plotted next).
with utilities.codeTimer("Best regression lambda CV"):
    best_lambda, errors = rec.bestLambdaCV(n_folds, n_iter, reg_lambda)

In [None]:
# Draw one error curve per candidate lambda from the CV results.

# Iteration numbers, 1-based, for the horizontal axis.
x = list(range(1, n_iter + 1))

fig, ax = plt.subplots()
for i, _ in enumerate(reg_lambda):
    # The series stored at position 1 of each entry is what gets plotted.
    ax.plot(x, errors[i][1])

ax.set(title = "kFoldCV error", ylabel = "MAE", xlabel = "Iteration")
ax.xaxis.set_major_locator(MaxNLocator(integer = True))
# Legend entries follow the plotting order, i.e. the order of reg_lambda.
ax.legend(reg_lambda, title = r"$\lambda$ values")

plt.savefig("plots/TestErrorCV.pdf", transparent = True)
plt.show()

## Factorisation

Expect ~ 50 minutes for the execution of the following cell.

In [None]:
# Lambda and iteration count used for the final factorisation.
# NOTE(review): this rebinds reg_lambda (a list in the CV cell) to a float and
# changes n_iter, so the CV plotting cell can no longer be re-run after this
# cell without re-running the CV cell first.
#reg_lambda = best_lambda
reg_lambda = 0.15
n_iter = 10

# train / test are the two values returned by performFactorisation; test is
# plotted per iteration in the next cell.
with utilities.codeTimer("WALS factorisation"):
    train, test = rec.performFactorisation(reg_lambda, n_iter)

In [None]:
# Plot the `test` series returned by the factorisation, one point per iteration.

# Iteration numbers, 1-based, matching the length of `test`.
x = list(range(1, len(test) + 1))

fig, ax = plt.subplots()
ax.plot(x, test)

#ax.set_ylim(0, 0.9)
ax.set(title = "WALS factorisation", ylabel = "MAE", xlabel = "Iteration")
ax.xaxis.set_major_locator(MaxNLocator(integer = True))

plt.savefig("plots/WALS_train.pdf", transparent = True)
plt.show()

Once the recommender system has been trained, it can be saved to file using the following cell.

In [None]:
#utilities.saveRecSys(rec, "rec.npz")

## Recommendation

In [5]:
def recommend(rec_system, user_id):
    """Return whatever ``rec_system.answerQuery`` yields for ``user_id``.

    Thin convenience wrapper: the query is forwarded unchanged to the
    recommender system and its result is handed back to the caller.
    """
    recommendations = rec_system.answerQuery(user_id)
    return recommendations
        
def bestRated(rec_system, user_id):
    """Return the movies of ``user_id`` ordered from highest to lowest rating.

    The rows come from ``rec_system.getUserMovies`` and are sorted on their
    "Rating" column in descending order.
    """
    return rec_system.getUserMovies(user_id).sort_values("Rating", ascending = False)

In [6]:
# Show the first 10 rows of the recommendation table for an existing user.
user_id = 2
recommend(rec, user_id).head(10)

Unnamed: 0,MovieID,Prediction,Title,Genres,AVG_Rating
2633,2674,3.78,Gladiator (2000),Action|Adventure|Drama,3.909677
480,483,3.55,"Nightmare Before Christmas, The (1993)",Animation|Children|Fantasy|Musical,3.559783
944,957,3.28,"Shining, The (1980)",Horror,4.074766
889,902,2.88,Aliens (1986),Action|Adventure|Horror|Sci-Fi,3.964286
1037,1055,2.6,Star Trek VI: The Undiscovered Country (1991),Action|Mystery|Sci-Fi,3.353659
939,952,2.48,"Day the Earth Stood Still, The (1951)",Drama|Sci-Fi|Thriller,3.82
6115,6242,2.43,Little Miss Sunshine (2006),Adventure|Comedy|Drama,3.847826
913,926,2.39,Amadeus (1984),Drama,4.184211
1418,1438,2.38,Rain Man (1988),Drama,3.781915
2269,2302,2.37,Dogma (1999),Adventure|Comedy|Fantasy,3.614865


In [8]:
# For comparison: the same user's movies sorted by rating, highest first (first 10 rows).
bestRated(rec, user_id).head(10)

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
63308,3734,2,Action|Sci-Fi|Thriller,Hangar 18 (1980),5.0
46993,2141,2,Adventure|Sci-Fi|Thriller,Saturn 3 (1980),5.0
60071,3335,2,Comedy|Horror,The Lair of the White Worm (1988),5.0
54483,2765,2,Action|Adventure|Sci-Fi|Thriller,"Road Warrior, The (Mad Max 2) (1981)",5.0
18806,656,2,Action|Adventure|Sci-Fi|Thriller,Escape from L.A. (1996),5.0
49044,2280,2,Horror|Sci-Fi,Piranha (1978),4.5
31927,1190,2,Action|Adventure|Fantasy,Conan the Barbarian (1982),4.5
40058,1701,2,Action|Horror|Sci-Fi|Thriller,"Thing, The (1982)",4.0
27769,974,2,Action|Adventure|Fantasy,Highlander (1986),3.5
29401,1054,2,Adventure|Sci-Fi,Star Trek: The Motion Picture (1979),3.0


## Similar items
Some suggestions:
* 911: Star Wars Episode VI
* 3638: The Lord of the Rings: The Fellowship of the Ring
* 957: The Shining
* 474: Blade Runner

In [11]:
# Items similar to the movie with MovieID 3638.
# NOTE(review): the stored output lists MovieID 911 with similarity 1.0, so it
# looks like it was produced with a different argument -- re-run to refresh.
rec.suggestSimilar(3638)

\begin{tabular}{rllr}
\toprule
 MovieID &                                              Title &                            Genres &  Similarity \\
\midrule
     911 &  Star Wars: Episode VI - Return of the Jedi (1983) &           Action|Adventure|Sci-Fi &    1.000000 \\
     898 &  Star Wars: Episode V - The Empire Strikes Back... &           Action|Adventure|Sci-Fi &    0.695422 \\
     510 &                   Silence of the Lambs, The (1991) &             Crime|Horror|Thriller &    0.610512 \\
     615 &               Independence Day (a.k.a. ID4) (1996) &  Action|Adventure|Sci-Fi|Thriller &    0.589441 \\
     224 &          Star Wars: Episode IV - A New Hope (1977) &           Action|Adventure|Sci-Fi &    0.552946 \\
    2674 &                                   Gladiator (2000) &            Action|Adventure|Drama &    0.548113 \\
\bottomrule
\end{tabular}



## Assessing results

The following two cells may take a few minutes to run. By default there are 10 test observations. If this is the case, the mean precision and the mean recall at 10 are expected to be the same.

In [7]:
# Time the computation of the mean precision at 10 and print the result.
with utilities.codeTimer("Mean precision"):
    print(f"Mean precision at 10: {rec.meanPrecision(10)}")

Mean precision at 10: 0.02786885245901636
Executed 'Mean precision'.  Elapsed time: 108.889906s


In [8]:
# Time the computation of the mean recall at 10 and print the result.
with utilities.codeTimer("Mean recall"):
    print(f"Mean recall at 10: {rec.meanRecall(10)}")

Mean recall at 10: 0.02786885245901636
Executed 'Mean recall'.  Elapsed time: 112.532156s


## New user recommendation

In [None]:
# Reseed so the generated user is repeatable.
np.random.seed(17)

# Generate a new user; the argument is presumably the number of ratings to
# generate -- TODO confirm against recommender.generateNewUser.
new_user, new_user_id = rec.generateNewUser(50)
# Shape of rec.R at this point (addNewUser is only called in a later cell).
np.shape(rec.R)

In [10]:
# Lambda passed to addNewUser in the following cells.
reg_lambda = 0.15
# Kept as the last expression of the cell so that the id is actually displayed;
# a bare expression followed by another statement produces no output.
new_user_id

In [11]:
# Add the generated user to the system, timing the call.
with utilities.codeTimer("New user factorisation"):
    rec.addNewUser(new_user, reg_lambda)
# Shape of rec.R after the call (displayed as the cell result).
np.shape(rec.R)

Executed 'New user factorisation'.  Elapsed time: 0.754067s


(611, 9721)

In [12]:
# First 10 rows of the recommendation table for the newly added user.
recommend(rec, new_user_id).head(10)

Too few movies! Most poular movies will be suggested.


Unnamed: 0,MovieID,Title,Genres,AVG_Rating,Counts
314,314,Forrest Gump (1994),Comedy|Drama|Romance|War,4.173913,322
277,277,"Shawshank Redemption, The (1994)",Crime|Drama,4.431746,315
257,257,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197068,307
1933,1939,"Matrix, The (1999)",Action|Sci-Fi|Thriller,4.18251,263
224,224,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231076,251
97,97,Braveheart (1995),Action|Drama|War,4.031646,237
509,510,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.146552,232
418,418,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.742009,219
0,0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.92093,215
2216,2226,Fight Club (1999),Action|Crime|Drama|Thriller,4.258216,213


In [13]:
# The new user's own movies sorted by rating, highest first (first 10 rows).
bestRated(rec, new_user_id).head(10)

Unnamed: 0,MovieID,UserID,Genres,Title,Rating
94740,6436,610,Adventure|Children|Sci-Fi,"Last Mimzy, The (2007)",4.5
94739,5893,610,Drama,Dear Wendy (2005),4.0
94736,293,610,Mystery|Thriller,Underneath (1995),3.0
94737,2513,610,Comedy|Drama|Romance,Bull Durham (1988),2.0
94741,6538,610,Comedy,"Brice Man, The (Brice de Nice) (2005)",2.0
94742,7878,610,Drama|Thriller,96 Minutes (2011),1.5
94743,8497,610,Comedy,"Inbetweeners 2, The (2014)",1.5
94738,2536,610,Adventure|Children|Comedy|Musical,"Muppet Movie, The (1979)",1.0


## Cold start problem
If a new user has rated fewer than 10 movies, the most popular and unseen movies will be recommended.

In [14]:
# Reseed so the generated user is repeatable.
np.random.seed(17)

# Generate a user with the argument 8 (presumably the number of ratings, i.e.
# below the 10-movie threshold mentioned in the text -- TODO confirm).
new_user, new_user_id = rec.generateNewUser(8)
# Printed explicitly: a bare expression in the middle of a cell is evaluated
# but never displayed, so the shape was silently dropped before.
print(np.shape(rec.R))

# reg_lambda is the value set in an earlier cell of this notebook.
with utilities.codeTimer("New user factorisation"):
    rec.addNewUser(new_user, reg_lambda)

Executed 'New user factorisation'.  Elapsed time: 0.621690s


In [17]:
# Print the first 10 recommendations for the new user as a LaTeX table (index column omitted).
print(recommend(rec, new_user_id).head(10).to_latex(index = False))

Too few movies! Most poular movies will be suggested.
\begin{tabular}{rllrr}
\toprule
 MovieID &                                      Title &                                       Genres &  AVG\_Rating &  Counts \\
\midrule
     314 &                        Forrest Gump (1994) &                     Comedy|Drama|Romance|War &    4.173913 &     322 \\
     277 &           Shawshank Redemption, The (1994) &                                  Crime|Drama &    4.431746 &     315 \\
     257 &                        Pulp Fiction (1994) &                  Comedy|Crime|Drama|Thriller &    4.197068 &     307 \\
    1939 &                         Matrix, The (1999) &                       Action|Sci-Fi|Thriller &    4.182510 &     263 \\
     224 &  Star Wars: Episode IV - A New Hope (1977) &                      Action|Adventure|Sci-Fi &    4.231076 &     251 \\
      97 &                          Braveheart (1995) &                             Action|Drama|War &    4.031646 &     237 \\
     510