In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise
from scipy.sparse import csr_matrix
import sklearn
import pickle

In [66]:
# DRY - Don't repeat yourself!
# place a utils.py in the same folder as the notebook
# from utils import example_query, create_user_vector, create_rating_matrix

# for calculating recommendations
example_query = {
    # movieId, rating
    4470:5, 
    48:5,
    594:5,
    27619:5,
    152081:5,
    595:5,
    616:5,
    1029:5
}

# Neighborhood Based Filtering for Recommender Systems
---

> The key idea is that the rating of u for a new item i is likely to be similar to that of another user v,if u and v have rated other items in a similar way. Likewise,u is likely to rate two items i and j in a similar fashion, if other users have given similar ratings to these two items.

##### Use ratings of similar users (or items) to predict what you like! But: How can we measure similarity/distance? 

- Cosine Similariy/Distance (works good for sparse high dimensional data)
- Jaccard Similarity/Distance (only works on binarized vectors)
- Pearson Correlation/Distance (cosine similarity on centered vectors)
- Euclidian Distance/Similarity (not good for sparse high dimensional data)

You find many more metrics here:https://docs.scipy.org/doc/scipy/reference/spatial.distance.html

In [67]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [68]:
# movies liked by our test user
movies.set_index('movieId').loc[example_query.keys()]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4470,Ariel (1988),Drama
48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance
594,Snow White and the Seven Dwarfs (1937),Animation|Children|Drama|Fantasy|Musical
27619,"Lion King 1½, The (2004)",Adventure|Animation|Children|Comedy
152081,Zootopia (2016),Action|Adventure|Animation|Children|Comedy
595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
616,"Aristocats, The (1970)",Animation|Children
1029,Dumbo (1941),Animation|Children|Drama|Musical


---
## 1. Model Development

### Preprocessing (same as for the NMF model!)

- filter out movies rated by less than 20/ 50 / 100 ... users
- filter out movies with an average rating lower than 2
- create a sparse user item matrix

In [69]:
# place a utils.py in the same folder as the notebook
# from utils import ratings, get_ratings_matrix

# R = get_ratings_matrix(ratings)

In [70]:
# calculate the number of ratings per movie
ratings_per_movie = ratings.groupby('movieId')['userId'].count()

# filter for movies with more than 20 ratings and extract the index
popular_movies = ratings_per_movie.loc[ratings_per_movie > 20].index

# filter the ratings matrix and only keep the popular movies
ratings = ratings.loc[ratings['movieId'].isin(popular_movies)]

# Initialize a sparse user-item rating matrix
# (data, (row_ind, col_ind)
R = csr_matrix(
    (ratings['rating'], (ratings['userId'], ratings['movieId'])))

R.shape

(611, 168253)

### Training (new!)

- initialize the model: pick a distance metric
- fit it to the user item matrix: only stores the data and doesn't do further. all the calculations take place later!

In [93]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
R_std = sc.fit_transform(R)

ValueError: Cannot center sparse matrices: pass `with_mean=False` instead. See docstring for motivation and alternatives.

In [71]:
# which metrics can we use for sparse matrics?
sorted(sklearn.neighbors.VALID_METRICS_SPARSE['brute'])

['cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan', 'precomputed']

In [72]:
# initialize the unsupervised model
model = NearestNeighbors(metric='cosine')

# fit it to the user-item rating matrix
model.fit(R)

### Save the trained model on your hard drive

In [73]:
with open('./distance_recommender.pkl', 'wb') as file:
    pickle.dump(model, file)

---
## 2. Model deployment: Make recommendations for a new user

### Read the model from hard drive

In [74]:
with open('./distance_recommender.pkl', 'rb') as file:
    model = pickle.load(file)

In [75]:
# if you have loaded the model inside the utils.py you can also write:
# from utils import model

### Receive a user query

In [76]:
example_query

{4470: 5, 48: 5, 594: 5, 27619: 5, 152081: 5, 595: 5, 616: 5, 1029: 5}

### Construct a user vector (same as before!)

we need the same input as was used during training!

In [77]:
# from utils import make_user_vector
# user_vec = make_user_vector(query=example_query, length=168253)

In [78]:
# new user vector: needs to have the same format as the training data
# pre fill it with zeros
user_vec = np.repeat(0, 168253)

# fill in the ratings that arrived from the query
user_vec[list(example_query.keys())] = list(example_query.values())

In [79]:
user_vec[list(example_query.keys())]

array([5, 5, 5, 5, 5, 5, 5, 5])

### Calculate the score (new!)

1. find the neighborhood of $n$ similar users
2. use their ratings to calculate a score

In [80]:
# calculates the distances to all other users in the data!
distances, userIds = model.kneighbors([user_vec], n_neighbors=10, return_distance=True)

# sklearn returns a list of predictions - extract the first and only value of the list
distances = distances[0]
userIds = userIds[0]

In [81]:
distances, userIds

(array([0.8059715 , 0.83263452, 0.8346348 , 0.85242967, 0.85242967,
        0.8526059 , 0.85274623, 0.85989566, 0.86081367, 0.86770655]),
 array([476,  43, 563,   5, 170, 484,  58, 235,  20, 216]))

In [82]:
# only look at ratings for users that are similar!
neighborhood = ratings.set_index('userId').loc[userIds]
neighborhood

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
476,1,4.0,835021447
476,2,4.0,835021693
476,10,3.0,835021420
476,11,3.0,835021635
476,32,4.0,835021513
...,...,...,...
216,3996,4.0,982169907
216,4002,3.0,975212110
216,4023,3.0,982169946
216,4025,2.0,982169965


In [83]:
# calculate the summed up rating for each movie
# summing up introduces a bias for popular movies
# averaging introduces bias for movies only seen by few users in the neighboorhood
scores = neighborhood.groupby('movieId')['rating'].sum()
scores

movieId
1         20.5
2         12.0
3          8.0
5         12.0
7         10.0
          ... 
106920     3.0
112552     4.5
117529     3.5
119145     4.0
134853     4.0
Name: rating, Length: 543, dtype: float64

### Give recommendations (same as before!)

In [84]:
example_query.keys()

dict_keys([4470, 48, 594, 27619, 152081, 595, 616, 1029])

In [85]:
# give a zero score to movies the user has allready seen
allready_seen = scores.index.isin(example_query.keys())
scores.loc[allready_seen] = 0

In [86]:
# sort the scores from high to low 
scores = scores.sort_values(ascending=False)
scores

movieId
588     43.5
364     37.0
34      33.0
356     32.0
318     31.0
        ... 
595      0.0
1029     0.0
48       0.0
616      0.0
594      0.0
Name: rating, Length: 543, dtype: float64

In [87]:
# get the movieIds of the top 10 entries
recommendations = scores.head(10).index
recommendations

Int64Index([588, 364, 34, 356, 318, 596, 597, 457, 590, 150], dtype='int64', name='movieId')

In [88]:
# let's see the recommendations!
movies.set_index('movieId').loc[recommendations]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
34,Babe (1995),Children|Drama
356,Forrest Gump (1994),Comedy|Drama|Romance|War
318,"Shawshank Redemption, The (1994)",Crime|Drama
596,Pinocchio (1940),Animation|Children|Fantasy|Musical
597,Pretty Woman (1990),Comedy|Romance
457,"Fugitive, The (1993)",Thriller
590,Dances with Wolves (1990),Adventure|Drama|Western
150,Apollo 13 (1995),Adventure|Drama|IMAX


---
## 3. Project Task: neighborhood based recommender function

- Collect different example queries for "typical" users (e.g. a horror movie buff) and try out the algorithm
- Set the number of neighbors to a very high or low number. What happens to the recommendations?
- Implement a recommender function that recommends movies to a new user based on the NearestNeighbor model!


- ⭐ **Bonus**: Calculate the score using a weighted sum or average. Use the distances to the other users as weights
- ⭐ **Bonus**: Use the method to find and recommend similar movies! Hint: Run the model on the transposed user item rating matrix.
- ⭐ **Bonus**: First use NMF to reduce the dimensionality of the sparse user item matrix. Then run neighborhood based recommendation on the dense matrix.

In [165]:
def recommend_neighborhood(query, model, ratings, k=10):

    user_vec = np.repeat(0, 168253)
    user_vec[list(query.keys())] = list(query.values())

    distances, userIds = model.kneighbors([user_vec], n_neighbors=10, return_distance=True)

    distances = distances[0]
    userIds = userIds[0]

    neighborhood = ratings.set_index('userId').loc[userIds]
    factors = np.array(distances)[neighborhood.index.factorize()[0]]
    neighborhood['rating'] *= 1-factors
    scores = neighborhood.groupby('movieId')['rating'].sum()
    

    allready_seen = scores.index.isin(query.keys())
    scores.loc[allready_seen] = 0
    scores = scores.sort_values(ascending=False)

    recommendations = scores.head(k).index

    result = movies.set_index('movieId').loc[recommendations]

    #print ("user_vec: ", user_vec, " distances: ", distances, " userIds: ", userIds, " neighborhood: ", neighborhood, " scores: ", scores, " already_seen: ", allready_seen, " recommendations: ", recommendations)

    return result

In [166]:
test_query = {1: 5, 20: 5, 3: 5, 4: 5, 5: 5, 7:5}

In [167]:
# which movies are in the query?
movies.set_index('movieId').loc[test_query.keys()]

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
20,Money Train (1995),Action|Comedy|Crime|Drama|Thriller
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy
7,Sabrina (1995),Comedy|Romance


In [168]:
recommend_neighborhood(test_query, model, ratings, k = 10)

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
25,Leaving Las Vegas (1995),Drama|Romance
733,"Rock, The (1996)",Action|Adventure|Thriller
141,"Birdcage, The (1996)",Comedy
95,Broken Arrow (1996),Action|Adventure|Thriller
736,Twister (1996),Action|Adventure|Romance|Thriller
1356,Star Trek: First Contact (1996),Action|Adventure|Sci-Fi|Thriller
