In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load MovieLens dataset
file_path = '../ml-latest-small/movies.csv'
df = pd.read_csv(file_path)

In [3]:
# Genres are typically in a pipe-separated string (e.g., "Action|Adventure|Sci-Fi")
# Split the genres into a list
df['genres'] = df['genres'].str.split('|')

# Create a set of all unique genres
unique_genres = sorted(set(genre for genres in df['genres'] for genre in genres))

# Create a dictionary to map genres to indices
genre_to_index = {genre: i for i, genre in enumerate(unique_genres)}

# Function to convert a list of genres to a binary vector
def genres_to_vector(genres):
    vector = [0] * len(unique_genres)
    for genre in genres:
        vector[genre_to_index[genre]] = 1
    return vector

# Create a sparse genre matrix where rows represent movies and columns represent genres
genre_matrix = csr_matrix([genres_to_vector(movie_genres) for movie_genres in df['genres']])

# Calculate cosine similarity matrix based on genres using NumPy
cosine_sim_matrix = cosine_similarity(genre_matrix, genre_matrix)

In [4]:
# Function to recommend movies based on a given movie
def recommend_movies(movie_id, num_recommendations=5):
    # Find the index of the given movie in the dataset
    movie_index = df[df['movieId'] == movie_id].index[0]
    
    # Get the cosine similarity scores for the given movie
    similarity_scores = cosine_sim_matrix[movie_index]
    
    # Get the indices of movies sorted by similarity (excluding the given movie itself)
    similar_movies_indices = np.argsort(similarity_scores)[::-1][1:num_recommendations + 1]
    
    # Get the movieIds of recommended movies
    recommended_movie_ids = df.iloc[similar_movies_indices]['movieId'].tolist()
    
    # Get the corresponding movie titles
    recommended_movies = df[df['movieId'].isin(recommended_movie_ids)][['movieId', 'title']]
    
    return recommended_movies

In [5]:
# Example: Recommend movies similar to a given movie (replace 1 with the desired movieId)
recommendations = recommend_movies(1, num_recommendations=5)
print(recommendations)

      movieId                                           title
1815     2294                                     Antz (1998)
3003     3754  Adventures of Rocky and Bullwinkle, The (2000)
3217     4016                Emperor's New Groove, The (2000)
6705    53121                          Shrek the Third (2007)
8928   136016                        The Good Dinosaur (2015)
