In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
import pickle

#### Let's collect some recommendations for a new users that loves Disney Movies! 

In [4]:
# for calculating recommendations

disney_movies = [
    4470, 48, 594, 27619, 152081, 595, 616, 1029, 
    596, 4016, 1033, 134853, 2018, 588, 364, 26999, 
    75395,2085, 1907, 2078, 1032, 177765
]

In [5]:
# for calculating recommendations: movies that the user likes!
query = [4470, 48, 594, 27619, 152081, 595, 616, 1029, 
    596, 4016, 1033]


# for testing the recommender after getting some recommendations
relevant_items = [
    134853, 2018, 588, 364, 26999, 
    75395,2085, 1907, 2078, 1032, 177765
]

# Non Negative Matrix Factorization for Recommender Systems
---
1. Model Development: Training

    - process the data
    - train the NMF model
    - inspect the P and Q matrix
    - calculate the reconstruction loss


2. Model Deployment: Inference

    - save the model on hard drive
    - construct a new user vector
    - calculate the reconstructed vector
    - produce recommendations


---
## 1. Model Development

In [6]:
ratings = pd.read_csv('./data/ml-latest-small/ratings.csv')
movies = pd.read_csv('./data/ml-latest-small/movies.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
movies.set_index('movieId').loc[query]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4470,Ariel (1988),Drama
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
616,"Aristocats, The (1970)",Animation|Children
1029,Dumbo (1941),Animation|Children|Drama|Musical
596,Pinocchio (1940),Animation|Children|Fantasy|Musical
4016,"Emperor's New Groove, The (2000)",Adventure|Animation|Children|Comedy|Fantasy


### Preprocessing

- TODO: filter out movies rated by less than 20 (/ 50 / 100 ...) users
- create a sparse user item matrix

In [None]:
# TODO: calculate the number of ratings per movie
ratings_per_movie = ratings.???
ratings_per_movie

In [None]:
# TODO: filter for movies with more than 20 ratings and extract the index
popular_movies = ratings_per_movie.???
popular_movies

In [None]:
# TODO: filter the ratings matrix and only keep the popular movies
ratings = ratings.loc[???]
ratings.head()

In [10]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [11]:
# Initialize a sparse user-item rating matrix 
# (data, (row_ind, col_ind)
R = csr_matrix((ratings['rating'], (ratings['userId'], ratings['movieId'])))
R

<611x193610 sparse matrix of type '<class 'numpy.float64'>'
	with 100836 stored elements in Compressed Sparse Row format>

In [15]:
# user with id 1 has watched 232 movies
R[1, ]

<1x193610 sparse matrix of type '<class 'numpy.float64'>'
	with 232 stored elements in Compressed Sparse Row format>

In [17]:
pd.DataFrame(R.todense())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,193600,193601,193602,193603,193604,193605,193606,193607,193608,193609
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Training

- initialize the model
- fit it on the user item matrix
- optionally, tune the number of components (hidden features): what happens if you set the number of components to a really low number?
- decrease the `tol` to train for a longer time

In [18]:
# initialize the unsupervised model
# 55 hidden features
model = NMF(n_components=55, init='nndsvd', max_iter=10000, tol=0.0001, verbose=2)

# fit it to the user-item rating matrix
model.fit(R)

# initialzed P, Q matrix with random values
# iterate and optimize the values stored in P and Q

violation: 1.0
violation: 0.293047792597504
violation: 0.19525562134711788
violation: 0.14608487639534865
violation: 0.11746849932263326
violation: 0.09629445182764722
violation: 0.07500075376272272
violation: 0.058037262577932396
violation: 0.046649356791845446
violation: 0.03787891243982954
violation: 0.03149046978328459
violation: 0.027351128268505574
violation: 0.02441172521628436
violation: 0.022221651802614116
violation: 0.02044011596846136
violation: 0.01876733766381424
violation: 0.017351483470425134
violation: 0.01611996563063832
violation: 0.015145635000948343
violation: 0.014231900268949757
violation: 0.013319585236061327
violation: 0.012435851169165393
violation: 0.011551258720972799
violation: 0.010635701337740151
violation: 0.009730329771087972
violation: 0.00887600637398399
violation: 0.008183961139138935
violation: 0.00751223671641134
violation: 0.0069654730324271484
violation: 0.0066092978402461245
violation: 0.006329757598910644
violation: 0.006066365044694439
violati

NMF(init='nndsvd', max_iter=10000, n_components=55, tol=0.001, verbose=2)

### Model inspection

#### the hidden features

In [19]:
# user-genre matrix
P = model.transform(R)

violation: 1.0
violation: 0.812184692828531
violation: 0.24565095914032445
violation: 0.09172361979755307
violation: 0.03819104300548128
violation: 0.020845641452666326
violation: 0.01260939916633755
violation: 0.00806206779057964
violation: 0.005213707567702452
violation: 0.00340117784762628
violation: 0.0022504085571787864
violation: 0.0015926970810198073
violation: 0.0011462956567310785
violation: 0.0008524647978960203
Converged at iteration 15


In [20]:
P.shape, R.shape

((611, 55), (611, 193610))

In [22]:
P[1, :]

array([0.        , 0.        , 0.        , 0.70997187, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.42276628, 0.        , 0.        , 0.7580365 , 0.        ,
       0.        , 0.        , 0.37625262, 0.        , 0.        ,
       0.        , 0.        , 1.284188  , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.08745916, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.05220307, 0.        , 0.        ,
       0.        , 0.        , 0.46938669, 0.        , 0.        ,
       0.        , 0.72999867, 0.10082466, 0.        , 0.        ])

In [23]:
P.max()

14.558539016510496

In [24]:
# movie-genre matrix
Q = model.components_

In [26]:
P.shape, Q.shape

((611, 55), (55, 193610))

In [None]:
# do the shapes match?


#### the reconstruction error

$$
L(R, \hat{R}) = \sqrt{\sum_i\sum_j(R_{ij}-\hat{R}_{ij})^2} = \sqrt{\sum_i\sum_j(R_{ij}-PQ_{ij})^2}
$$

In [27]:
model.reconstruction_err_

762.0727674566688

In [None]:
# R -> encoding -> P -> decoding -> Rhat
R_hat = ???

In [None]:
# TODO: how to caluclate the loss manually?
???

---
## 2. Model deployment: Make recommendations for a new user

### Save the trained model on your hard drive

In [28]:
with open('./nmf_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

In [29]:
!ls

collaborative_filtering_matrix_factorization.ipynb  nmf_recommender.pkl
data						    PCA_Clustering
exploratory_analysis_movies.ipynb		    PCA_movies.ipynb
Intro2UnsupervisedLearning_Warmup.ipynb


### Read the model from hard drive

In [31]:
with open('./nmf_recommender.pkl', 'rb') as file:
    model = pickle.load(file)
model.reconstruction_err_

762.0727674566688

### Receive a user query from the website

In [32]:
query

[4470, 48, 594, 27619, 152081, 595, 616, 1029, 596, 4016, 1033]

In [36]:
R.shape[1]

193610

### Construct a user vector

we need the same input as was used during training!

In [37]:
# new user vector: needs to have the same format as the training data

# pre fill it with zeros (= missing values)
print(R.shape[1])
user_vec = np.repeat(0, 193610)

# fill in the ratings that arrived from the query
user_vec[query] = 5
user_vec

193610


array([0, 0, 0, ..., 0, 0, 0])

### Calculate the score

1. transform the user vector to the hidden feature space (encoding) 
2. inverse transform into the sparse representation (decoding)

$$
\hat{r}_{ij} = p_i' \cdot q_j 
$$

In [39]:
model.inverse_transform(model.transform([user_vec]))

violation: 1.0
violation: 0.8925692473515198
violation: 0.05714234399643375
violation: 0.0016372225123111213
violation: 7.074954084789194e-05
Converged at iteration 6


array([[0.        , 0.4394966 , 0.33987123, ..., 0.        , 0.        ,
        0.        ]])

In [40]:
scores = model.inverse_transform(model.transform([user_vec]))

violation: 1.0
violation: 0.8925692473515198
violation: 0.05714234399643375
violation: 0.0016372225123111213
violation: 7.074954084789194e-05
Converged at iteration 6


In [41]:
scores.shape

(1, 193610)

In [42]:
R.shape

(611, 193610)

### Give recommendations

In [44]:
scores

array([[0.        , 0.4394966 , 0.33987123, ..., 0.        , 0.        ,
        0.        ]])

In [46]:
# convert the scores into a pandas series
scores = pd.Series(scores[0])
scores

0         0.000000
1         0.439497
2         0.339871
3         0.098730
4         0.002881
            ...   
193605    0.000000
193606    0.000000
193607    0.000000
193608    0.000000
193609    0.000000
Length: 193610, dtype: float64

In [47]:
# give a zero score to movies the user has allready seen
scores[query] = 0

In [49]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

364     0.715999
588     0.610827
1028    0.560190
1035    0.550817
2078    0.525882
          ...   
4247    0.000000
4275    0.000000
4249    0.000000
4274    0.000000
0       0.000000
Length: 193610, dtype: float64

In [50]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([364, 588, 1028, 1035, 2078, 919, 1073, 34, 1282, 2081], dtype='int64')

In [None]:
# TODO: precision@10: fraction of recommendations that are relevant
recommendations.???

In [52]:
movies.set_index('movieId').loc[recommendations]

Unnamed: 0,title,genres
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
1028,Mary Poppins (1964),Children|Comedy|Fantasy|Musical
1035,"Sound of Music, The (1965)",Musical|Romance
2078,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
919,"Wizard of Oz, The (1939)",Adventure|Children|Fantasy|Musical
1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
34,Babe (1995),Children|Drama
1282,Fantasia (1940),Animation|Children|Fantasy|Musical
2081,"Little Mermaid, The (1989)",Animation|Children|Comedy|Musical|Romance


In [53]:
relevant_items

[134853, 2018, 588, 364, 26999, 75395, 2085, 1907, 2078, 1032, 177765]

---
## 3. Project Task: NMF recommender function

- Collect different user queries for "typical" users (e.g. a horror movie buff) and evaluate the algorithm
- Set the number of components to a very low number (e.g. 2). What happens to the recommendations?
- Implement a recommender function that recommends movies to a new user based on the NMF model!

In [None]:
# collaborative filtering = look at ratings only!
def recommend_nmf(query, model, k=10):
    """
    Filters and recommends the top k movies for any given input query based on a trained NMF model. 
    Returns a list of k movie ids.
    """
    # 1. candiate generation
    
    # construct a user vector
    
   
    # 2. scoring
    
    # calculate the score with the NMF model
    
    
    # 3. ranking
    
    # set zero score to movies allready seen by the user
    
    # return the top-k highst rated movie ids or titles
    
    return [364, 372, 43, 34, 243]

In [None]:
# recommender.py
# from recommender import recommend_nmf