In [619]:
import collections
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

### Load the same movie mapping

In [620]:
movie_mapping = pd.read_csv('../data/interim/movie_mapping.csv')
movie_mapping.head()

Unnamed: 0,movie id,movie title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


### Load train and test data which contains, 'userID', 'itemID', 'rating', 'timestamp' columns.

In [621]:
u1base = pd.read_csv('../data/raw/ml-100k/u1.base', sep='\t', encoding='ISO-8859-1')
u1test = pd.read_csv('../data/raw/ml-100k/u1.test', sep='\t', encoding='ISO-8859-1')

u1base.columns = ['userID', 'itemID', 'rating', 'timestamp']
u1test.columns = ['userID', 'itemID', 'rating', 'timestamp']

u1base.drop(['timestamp'], axis=1, inplace=True)
u1test.drop(['timestamp'], axis=1, inplace=True)

### Remap itemID records to the corresponding movie names and average the rating of the movie if it was rated by the user more than once.

In [622]:
key_value_dict = movie_mapping.set_index('movie id')['movie title'].to_dict()

u1base['itemID'] = u1base['itemID'].map(key_value_dict)
u1test['itemID'] = u1test['itemID'].map(key_value_dict)

u1base = u1base.groupby(by=['userID', 'itemID'], as_index=False).agg({"rating":"mean"})
u1base.head()

u1base.sample(5)

Unnamed: 0,userID,itemID,rating
11164,210,Apt Pupil (1998),5.0
75145,895,Star Trek: First Contact (1996),3.0
76548,908,Some Like It Hot (1959),3.0
38340,521,GoodFellas (1990),3.0
54526,680,"Close Shave, A (1995)",5.0


### Calculate the number of occurrences of each user in train and test datasets

In [623]:
counts_base = u1base['userID'].value_counts()

In [624]:
counts_test = u1test['userID'].value_counts()

### users_to_check - the list of user_ids which I will use to calculate metrics. For evaluation stage I will only keep users which have rated at least 200 movies in the train set, and at least 200 movies in the test set.

In [625]:
users_to_check = []
for value in counts_base.items():
    if value[1] > 200:
        users_to_check.append(value[0])

for value in counts_test.items():
    if value[0] in users_to_check and value[1] < 200:
        users_to_check.remove(value[0])

print(len(users_to_check))

69


### Create a pivot table as in the previous notebook

In [626]:
movie_ratings = u1base.pivot(
    index='userID',
     columns='itemID',
      values='rating').fillna(0)

movie_ratings = movie_ratings.rename_axis(None, axis="columns").reset_index()

movie_ratings.head()

Unnamed: 0,userID,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),...,Wyatt Earp (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
0,1,0.0,0.0,0.0,5.0,0.0,0.0,3.0,4.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0


### Compute csr matrix and fit knn_model

In [627]:
csr_matr = csr_matrix(movie_ratings.values)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(csr_matr)
knnPickle = open('../models/knn_model_ubase1.pkl', 'wb')
pickle.dump(knn_model, knnPickle)
knnPickle.close()

### This part was taken from the previous notebook

In [628]:
data = pd.read_csv('../data/raw/ml-100k/u.data', sep = '\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])
data.drop(columns=['timestamp'], inplace=True)

watched = collections.defaultdict(dict)
for i in data.values.tolist():
    watched[i[0]][i[1]] = i[2]

def recommend_movies(user_id, knn, k):
    """
    Recommend movies to a user based on their similarity to other users.

    Args:
        user_id (int): The ID of the user for whom to recommend movies.
        knn (KNN): The KNN model used to find nearest neighbors.
        k (int): The number of nearest neighbors to consider.

    Returns:
        list: A list of tuples, where each tuple contains the predicted rating and the movie ID of a recommended movie.
    """

    # get nearest neigbours of the specified user
    distances, indices = knn.kneighbors(movie_ratings.iloc[user_id-1, :]\
                        .values.reshape(1, -1), n_neighbors = k)
    
    # get films that user has already watched
    user_watched = set(watched[movie_ratings.index[user_id-1]])

    # get movies that were watched by similar users
    neighbours_watched = {}

    for i in range(0, len(distances.flatten())):
        neighbours_watched[movie_ratings.index[indices.flatten()[i]]] = watched[movie_ratings.index[indices.flatten()[i]]].copy()

        for key, v in neighbours_watched[movie_ratings.index[indices.flatten()[i]]].items():
            neighbours_watched[movie_ratings.index[indices.flatten()[i]]][key] = [1 - distances.flatten()[i], v]

    # get movies that were not watched by similar users
    unwatched_films = []
    for u in neighbours_watched:
        a = neighbours_watched[u].keys() - user_watched.intersection(neighbours_watched[u].keys())
        for f in a:
            unwatched_films.append(f)
    
    # Find unwatched films that are common among neighbours
    common_unwatched = [item for item, count in collections.Counter(unwatched_films).items() if count > 1]
    
    # Predict rating the user would give for the unwatched films
    common_unwatched_rating = []
    for f in common_unwatched:
        m = []
        w = []

        for u in neighbours_watched:
            if neighbours_watched[u].get(f) is not None:
                m.append(neighbours_watched[u].get(f)[0]*neighbours_watched[u].get(f)[1])
                w.append(neighbours_watched[u].get(f)[0])

        # calculate predicted rating by taking the weighted average, where the weight is the distance of the neighbour from the user
        common_unwatched_rating.append([np.sum(m)/np.sum(w), f])
    common_unwatched_rating = sorted(common_unwatched_rating, reverse=True)

    return common_unwatched_rating

### Calculate similarity of 2 lists by converting them into sets and get their intersection, then I just normilize this value by <br> len(list1) + len(list2) - len(intersection).

In [629]:
def calculate_similarity(list1, list2):
    """
    Calculate the similarity between two lists.

    Args:
        list1 (list): The first input list.
        list2 (list): The second input list.

    Returns:
        float: The similarity between the two lists.
    """
    intersection = set(list1) & set(list2)
    similarity = len(intersection) / (len(list1) + len(list2) - len(intersection))
    return similarity

In [630]:
similarities = []
predicted_ratings = []
precisions = []
for user in users_to_check:
    k = 10
    recommended_movies_with_rating = recommend_movies(user, knn_model, k)
    recommended_movies = [movie_mapping.loc[movie_mapping['movie id'] == f[1], 'movie title'].values[0] for f in recommended_movies_with_rating]
    recommended_movies_ratings = [f[0] for f in recommended_movies_with_rating]

    # get ratings of top k recommended films
    for rate in recommended_movies_ratings[:k]:
        predicted_ratings.append(rate)

    # calculate precision for each user
    relevant_recs_num = 0
    for rate in recommended_movies_ratings:
        if rate >= 4: relevant_recs_num += 1
    precisions.append(relevant_recs_num / len(recommended_movies))
 
    # get films that user has rated but this data was not provided to the model
    test_movies = []
    for value in u1test.itertuples():
        user_id = value[1]
        movie_title = value[2]
        rating = value[3]
        if user_id == user and rating >= 4:
            test_movies.append((movie_title, rating))
        
    # sort this data by the rating
    test_movies = sorted(test_movies, key=lambda x: x[1], reverse=True)
    # and calculate similarity using set intersection
    similarities.append(calculate_similarity(recommended_movies, [mov[0] for mov in test_movies]))

print('Mean average precision @k of the recommended films:\n\t', np.mean(precisions))
print('\nMean rating of the recommended films:\n\t', np.mean(predicted_ratings))
print('\nMean similarity of the films that were not seen by the model, but user has rated them, and recommended films:\n\t', np.mean([s for s in similarities if s > 0]))

Mean average precision @k of the recommended films:
	 0.3816763311018658

Mean rating of the recommended films:
	 4.848630463698114

Mean similarity of the films that were not seen by the model, but user has rated them, and recommended films:
	 0.09290531851184022
