### This is a content-best recommender system implementation

In [93]:
import pandas as pd
import numpy as np
from pandas import Series, DataFrame

First, we load the movies and ratings data into Pandas

In [94]:
movies_df = pd.read_csv("./datasets/ml-latest-small/movies.csv")
ratings_df = pd.read_csv("./datasets/ml-latest-small/ratings.csv")

Since each user's ratings are stored in individual rows in ratings.csv, we convert the data to a matrix where each row represents a user and each column represents a movie. The value in each column represents the rating of that movie. Ratings range from 0.5 to 5.0. Entries of value 0.0 represent movies that have not yet been rated by a user.

In [95]:
user_matrix = ratings_df.pivot_table(
    index="userId",
    columns="movieId",
    values="rating",
    fill_value=0
)

By default, the genres of a movie are stored as a pipe-seperated list (e.g. Action|Adventure|Drama). We cannot work with the data in this format, though, so we One-hot encode the genres. This creates a binary matrix, where the entries have values of zero or one). For each genre, a column is added to the matrix. A value of zero means that a movie does not contain a genre, and a value of one means that it does.

In [110]:
item_matrix: DataFrame = movies_df.copy()

item_matrix["genres_list"] = item_matrix["genres"].apply(lambda x: x.split("|"))

MOVIE_GENRES = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)"
]

for genre in MOVIE_GENRES:
    item_matrix[genre] = item_matrix["genres_list"].apply(lambda x: 1 if genre in x else 0)

item_matrix = item_matrix.drop(columns=["title", "genres", "genres_list"])

item_matrix = item_matrix.set_index('movieId')
item_matrix

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193583,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
193587,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


The default ratings do not contain all of the movies, so we add any missing movies to ensure that the dimensions are compatible.

In [111]:
user_movies = set(user_matrix.columns)
item_movies = set(item_matrix.index)
missing_movies = item_movies - user_movies

for movie in missing_movies:
    user_matrix[movie] = 0.0

user_matrix = user_matrix[item_matrix.index]

Now that the matrices are prepared, we select the rating vector for a given user.

In [112]:
user_id = 389

user_rating_matrix: Series = user_matrix.loc[user_id]
user_rating_matrix

movieId
1         5.0
2         0.0
3         0.0
4         0.0
5         4.0
         ... 
193581    0.0
193583    0.0
193585    0.0
193587    0.0
193609    0.0
Name: 389, Length: 9742, dtype: float64

We then calculate the dot product between the user's rating vector and each row of the movie matrix. This operation results in a weighted vector, where each entry equals the sum of each genre for each movie, weighted by the user's rating of that movie.

In [113]:
preference_vector: Series = user_rating_matrix.dot(item_matrix)

# Display the resulting vector
preference_vector.sort_values(ascending=False)

Thriller              65.0
Action                58.0
Adventure             48.0
Comedy                44.0
Drama                 35.0
Romance               26.0
Sci-Fi                23.0
Crime                 21.0
Fantasy               13.0
Mystery               12.0
Animation              5.0
Children               5.0
Western                0.0
War                    0.0
Film-Noir              0.0
Musical                0.0
Horror                 0.0
Documentary            0.0
(no genres listed)     0.0
Name: 389, dtype: float64

Now that the preferences have been determined, we remove all movies that have already been rated by the user. This ensure that we only recommend new movies to the user.

In [114]:
user_rating_matrix = user_rating_matrix[user_rating_matrix == 0.0]
item_matrix = item_matrix[item_matrix.index.isin(user_rating_matrix.index)]
item_matrix

Unnamed: 0_level_0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
8,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193583,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
193585,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
193587,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


We normalize the preference vector such that the sum of all of its entries equals one.

In [115]:
preference_vector_norm = np.linalg.norm(preference_vector)

Now, we calculate the cosine similarity between the normalized preference vector and each of the movie vectors. This allows us to determine how similar each movie is to the user's preferences.

In [116]:
def calculate_cosine_similarity(movie_vector: Series):
    movie_vector_norm = np.linalg.norm(movie_vector)

    if movie_vector_norm == 0:
        return 0.0

    cosine_similarity = np.dot(preference_vector, movie_vector) / (preference_vector_norm * movie_vector_norm)

    return cosine_similarity

cosine_similarities = item_matrix.apply(calculate_cosine_similarity, axis=1)
cosine_similarities = pd.Series(cosine_similarities)

Now, we map the cosine similarity value of the preference vector and each of the movie vectors back to the original movie dataframe.

In [117]:
movie_recommendations_df: DataFrame = movies_df.copy()
movie_recommendations_df["score"] = cosine_similarities

sorted_movies_by_score = movie_recommendations_df.sort_values(by='score', ascending=False)

sorted_movies_by_score.head(n=10)

Unnamed: 0,movieId,title,genres,score
4956,7487,Henry & June (1990),Drama,0.918074
6564,55069,"4 Months, 3 Weeks and 2 Days (4 luni, 3 saptam...",Drama,0.878165
380,436,Color of Night (1994),Drama|Thriller,0.878165
4270,6234,Equus (1977),Drama|Mystery,0.875896
1429,1952,Midnight Cowboy (1969),Drama,0.875896
2835,3792,Duel in the Sun (1946),Drama|Romance|Western,0.875896
6503,53453,Starcrash (a.k.a. Star Crash) (1978),Action|Adventure|Fantasy|Sci-Fi,0.859946
8968,137345,That Demon Within (2014),Crime|Thriller,0.859946
4800,7153,"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy,0.845371
459,524,Rudy (1993),Drama,0.841568
