In [109]:
import warnings
warnings.filterwarnings('ignore')

In [110]:
import collections
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

### Load pre-saved movieID to movie name mapping

In [111]:
movie_mapping = pd.read_csv('../data/interim/movie_mapping.csv')

In [112]:
movie_mapping.head()

Unnamed: 0,movie id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [113]:
movie_ratings = pd.read_csv('../data/interim/movie_ratings.csv')

In [114]:
movie_ratings.head()

Unnamed: 0,user_id,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
0,1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


### Convert the pivot table into a sparse matrix

In [115]:
csr_matr = csr_matrix(movie_ratings.values)
csr_matr

<943x1665 sparse matrix of type '<class 'numpy.float64'>'
	with 100636 stored elements in Compressed Sparse Row format>

### Initialize k nearest neighbours

In [116]:
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(csr_matr)
knnPickle = open('../models/knn_model_full.pkl', 'wb')
pickle.dump(knn_model, knnPickle)
knnPickle.close()

### I will load a u.data table to use it as a lookup table to get user -> item -> rating.

In [117]:
data = pd.read_csv('../data/raw/ml-100k/u.data', sep = '\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
data.drop(columns=['timestamp'], inplace=True)
data.head()

Unnamed: 0,user_id,item_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


### In 'watched' dict I will store: user_id: {movie_id: rating} - records.

In [118]:
watched = collections.defaultdict(dict)
for i in data.values.tolist():
    watched[i[0]][i[1]] = i[2]

def recommend_movies(user_id, knn, k):
    """
    Recommend movies to a user based on their similarity to other users.

    Args:
        user_id (int): The ID of the user for whom to recommend movies.
        knn (KNN): The KNN model used to find nearest neighbors.
        k (int): The number of nearest neighbors to consider.

    Returns:
        list: A list of tuples, where each tuple contains the predicted rating and the movie ID of a recommended movie.
    """

    # get nearest neigbours of the specified user
    distances, indices = knn.kneighbors(movie_ratings.iloc[user_id-1, :]\
                        .values.reshape(1, -1), n_neighbors = k)
    
    # get films that user has already watched
    user_watched = set(watched[movie_ratings.index[user_id-1]])

    # get movies that were watched by similar users
    neighbours_watched = {}

    for i in range(0, len(distances.flatten())):
        neighbours_watched[movie_ratings.index[indices.flatten()[i]]] = watched[movie_ratings.index[indices.flatten()[i]]].copy()

        for key, v in neighbours_watched[movie_ratings.index[indices.flatten()[i]]].items():
            neighbours_watched[movie_ratings.index[indices.flatten()[i]]][key] = [1 - distances.flatten()[i], v]

    # get movies that were not watched by similar users
    unwatched_films = []
    for u in neighbours_watched:
        a = neighbours_watched[u].keys() - user_watched.intersection(neighbours_watched[u].keys())
        for f in a:
            unwatched_films.append(f)
    
    # Find unwatched films that are common among neighbours
    common_unwatched = [item for item, count in collections.Counter(unwatched_films).items() if count > 1]
    
    # Predict rating the user would give for the unwatched films
    common_unwatched_rating = []
    for f in common_unwatched:
        m = []
        w = []

        for u in neighbours_watched:
            if neighbours_watched[u].get(f) is not None:
                m.append(neighbours_watched[u].get(f)[0]*neighbours_watched[u].get(f)[1])
                w.append(neighbours_watched[u].get(f)[0])

        # calculate predicted rating by taking the weighted average, where the weight is the distance of the neighbour from the user
        common_unwatched_rating.append([np.sum(m)/np.sum(w), f])
    common_unwatched_rating = sorted(common_unwatched_rating, reverse=True)

    return common_unwatched_rating

### Check the algorithm

In [119]:
user_id = 197
k = 10

recommended_movies = recommend_movies(user_id, knn_model, k)
print(f'{k} best recommendations to user {user_id} based on what similar users liked:')
for f in recommended_movies[:k]:
    print('\t', movie_mapping.loc[movie_mapping['movie id'] == f[1], 'movie title'].values[0])

10 best recommendations to user 197 based on what similar users liked:
	 Casablanca (1942)
	 Rock, The (1996)
	 Twelve Monkeys (1995)
	 Rear Window (1954)
	 Long Kiss Goodnight, The (1996)
	 Fugitive, The (1993)
	 Eraser (1996)
	 True Lies (1994)
	 Heat (1995)
	 To Kill a Mockingbird (1962)
