1. Importing dependecies

In [2]:
from scipy.spatial.distance import cosine
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
import os
import zipfile
from scipy.sparse import csr_matrix

2. Load Data

In [3]:
zip_file = 'ml-latest-small.zip'
data_dir = 'ml-latest-small'

with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall()

In [4]:
ratings = pd.read_csv(os.path.join(data_dir, 'ratings.csv'))
movies = pd.read_csv(os.path.join(data_dir, 'movies.csv'))

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
genre_data = movies.set_index('movieId')['genres'].str.split('|').to_dict()
print(genre_data)

{1: ['Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy'], 2: ['Adventure', 'Children', 'Fantasy'], 3: ['Comedy', 'Romance'], 4: ['Comedy', 'Drama', 'Romance'], 5: ['Comedy'], 6: ['Action', 'Crime', 'Thriller'], 7: ['Comedy', 'Romance'], 8: ['Adventure', 'Children'], 9: ['Action'], 10: ['Action', 'Adventure', 'Thriller'], 11: ['Comedy', 'Drama', 'Romance'], 12: ['Comedy', 'Horror'], 13: ['Adventure', 'Animation', 'Children'], 14: ['Drama'], 15: ['Action', 'Adventure', 'Romance'], 16: ['Crime', 'Drama'], 17: ['Drama', 'Romance'], 18: ['Comedy'], 19: ['Comedy'], 20: ['Action', 'Comedy', 'Crime', 'Drama', 'Thriller'], 21: ['Comedy', 'Crime', 'Thriller'], 22: ['Crime', 'Drama', 'Horror', 'Mystery', 'Thriller'], 23: ['Action', 'Crime', 'Thriller'], 24: ['Drama', 'Sci-Fi'], 25: ['Drama', 'Romance'], 26: ['Drama'], 27: ['Children', 'Drama'], 28: ['Drama', 'Romance'], 29: ['Adventure', 'Drama', 'Fantasy', 'Mystery', 'Sci-Fi'], 30: ['Crime', 'Drama'], 31: ['Drama'], 32: ['Mystery', 'Sci-F

3. Exploratory Data Analysis

In [8]:
num_ratings = len(ratings)
num_movies = ratings['movieId'].nunique()
num_users = ratings['userId'].nunique()

print(f'Number of ratings: {num_ratings}')
print(f'Number of unique movieId: {num_movies}')
print(f'Number of unique userId: {num_users}')

Number of ratings: 100836
Number of unique movieId: 9724
Number of unique userId: 610


In [9]:
mean_rating = ratings.groupby('userId')['rating'].mean()
print(f'Mean rating per user: {mean_rating.mean():.2f}')

Mean rating per user: 3.66


Count how many time each movie was rated

In [10]:
movie_ratings = ratings.merge(movies, on='movieId')
movie_ratings["title"].value_counts().head(10)

title
Forrest Gump (1994)                          329
Shawshank Redemption, The (1994)             317
Pulp Fiction (1994)                          307
Silence of the Lambs, The (1991)             279
Matrix, The (1999)                           278
Star Wars: Episode IV - A New Hope (1977)    251
Jurassic Park (1993)                         238
Braveheart (1995)                            237
Terminator 2: Judgment Day (1991)            224
Schindler's List (1993)                      220
Name: count, dtype: int64

Calculate average rating for each movie

In [11]:
mean_rating = ratings.groupby('movieId')['rating'].mean()
mean_rating.head()

movieId
1    3.920930
2    3.431818
3    3.259615
4    2.357143
5    3.071429
Name: rating, dtype: float64

We are interested to know the least and most popular movies

In [12]:
min_rating_movie = mean_rating.idxmin()
min_rating_movie_name = movies[movies["movieId"] == min_rating_movie]["title"].values[0]
min_rating_movie_rating = mean_rating[min_rating_movie]
print(f'Movie with the lowest average rating: {min_rating_movie_name} with an average rating of {min_rating_movie_rating:.2f}')

Movie with the lowest average rating: Gypsy (1962) with an average rating of 0.50


This movie is rated only by one user

In [13]:
ratings[ratings["movieId"] == min_rating_movie]

Unnamed: 0,userId,movieId,rating,timestamp
13633,89,3604,0.5,1520408880


In [14]:
max_rating_movie = mean_rating.idxmax()
max_rating_movie_name = movies[movies["movieId"] == max_rating_movie]["title"].values[0]
max_rating_movie_rating = mean_rating[max_rating_movie]
print(f'Movie with the highest average rating: {max_rating_movie_name} with an average rating of {max_rating_movie_rating:.2f}')

Movie with the highest average rating: Lamerica (1994) with an average rating of 5.00


We see that "Lamerica" has only two raitings, so we cannot assume that it is a popular movie

In [15]:
ratings[ratings["movieId"] == max_rating_movie]

Unnamed: 0,userId,movieId,rating,timestamp
13368,85,53,5.0,889468268
96115,603,53,5.0,963180003


To address this, we need to calculate bayesian average to fairly identify popularity of particular movie

In [16]:
movie_stats = ratings.groupby('movieId')['rating'].agg(['count', 'mean'])
movie_stats.head()

Unnamed: 0_level_0,count,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,215,3.92093
2,110,3.431818
3,52,3.259615
4,7,2.357143
5,49,3.071429


In [17]:
m = movie_stats['mean'].mean()
C = movie_stats['count'].mean()

print(f'Average rating of a movie: {m:.2f}')
print(f'Average number of ratings per movie: {C:.2f}')

Average rating of a movie: 3.26
Average number of ratings per movie: 10.37


In [18]:
def bayesian_avg(ratings):
    bayesian_avg = (C * m + ratings.sum()) / (C + ratings.count())
    return round(bayesian_avg, 3)

In [19]:
lamerica = pd.Series([5, 5])
print(bayesian_avg(lamerica))


3.543


In [20]:
bayesian_avg_ratings = ratings.groupby('movieId')['rating'].agg(bayesian_avg).reset_index()
bayesian_avg_ratings.columns = ['movieId', 'bayesian_avg']
bayesian_avg_ratings.head()

Unnamed: 0,movieId,bayesian_avg
0,1,3.891
1,2,3.417
2,3,3.26
3,4,2.898
4,5,3.105


In [21]:
movie_stats = movie_stats.merge(bayesian_avg_ratings, on='movieId')
movie_stats.head()

Unnamed: 0,movieId,count,mean,bayesian_avg
0,1,215,3.92093,3.891
1,2,110,3.431818,3.417
2,3,52,3.259615,3.26
3,4,7,2.357143,2.898
4,5,49,3.071429,3.105


In [22]:
movie_stats = movie_stats.merge(movies[['title', 'movieId']])
movie_stats.head()

Unnamed: 0,movieId,count,mean,bayesian_avg,title
0,1,215,3.92093,3.891,Toy Story (1995)
1,2,110,3.431818,3.417,Jumanji (1995)
2,3,52,3.259615,3.26,Grumpier Old Men (1995)
3,4,7,2.357143,2.898,Waiting to Exhale (1995)
4,5,49,3.071429,3.105,Father of the Bride Part II (1995)


Now we can sort this table to find most popular and least populat movie based on bayesian average

In [23]:
movie_stats.sort_values('bayesian_avg', ascending=False).head()

Unnamed: 0,movieId,count,mean,bayesian_avg,title
277,318,317,4.429022,4.392,"Shawshank Redemption, The (1994)"
659,858,192,4.289062,4.236,"Godfather, The (1972)"
2224,2959,218,4.272936,4.227,Fight Club (1999)
224,260,251,4.231076,4.193,Star Wars: Episode IV - A New Hope (1977)
46,50,204,4.237745,4.191,"Usual Suspects, The (1995)"


In [24]:
movie_stats.sort_values('bayesian_avg', ascending=True).head()

Unnamed: 0,movieId,count,mean,bayesian_avg,title
1172,1556,19,1.605263,2.19,Speed 2: Cruise Control (1997)
2679,3593,19,1.657895,2.224,Battlefield Earth (2000)
1372,1882,33,1.954545,2.267,Godzilla (1998)
1144,1499,27,1.925926,2.297,Anaconda (1997)
1988,2643,16,1.6875,2.307,Superman IV: The Quest for Peace (1987)


We can explore little bit about movies

In [25]:
movies['genres'] = movies['genres'].apply(lambda x: x.split('|'))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


We can calculate how many movies we have for each genre

In [26]:
genre_counts = movies.explode('genres')['genres'].value_counts()
genre_counts

genres
Drama                 4361
Comedy                3756
Thriller              1894
Action                1828
Romance               1596
Adventure             1263
Crime                 1199
Sci-Fi                 980
Horror                 978
Fantasy                779
Children               664
Animation              611
Mystery                573
Documentary            440
War                    382
Musical                334
Western                167
IMAX                   158
Film-Noir               87
(no genres listed)      34
Name: count, dtype: int64

4. Data Pre-Processing

Matrix for User-based approach

In [27]:
user_to_movie_df = ratings.pivot(
    index='userId',
     columns='movieId',
      values='rating').fillna(0)

In [28]:
user_to_movie_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


It is important to note that the user_to_movie_df DataFrame is a sparse matrix. This means that most of the values are zero. We can use the scipy.sparse.csr_matrix class to store the data more efficiently.

In [29]:
user_to_movie_sparse_df = csr_matrix(user_to_movie_df.values)

Matrix for Item-Based approach

In [30]:
movie_to_user_df = ratings.pivot(
    index='movieId',
     columns='userId',
      values='rating').fillna(0)

In [31]:
movie_to_user_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
movie_to_user_sparse_df = csr_matrix(movie_to_user_df.values)

Because KNN will return indices, we need to be able to man them to corresponding movieIds

In [33]:
# Mapping from movieId to matrix row index
movie_to_index = {movieId: idx for idx, movieId in enumerate(movie_to_user_df.index)}

# Mapping from matrix row index to movieId (reverse mapping)
index_to_movie = {idx: movieId for movieId, idx in movie_to_index.items()}

movieId_to_title = movies.set_index('movieId')['title'].to_dict()
title_to_movieId = movies.set_index('title')['movieId'].to_dict()

In [130]:
print("Movie ID to Matrix Index:")
for movieId, idx in movie_to_index.items():
    print(f"MovieID: {movieId}, Index: {idx}")

print("\nMatrix Index to Movie ID:")
for idx, movieId in index_to_movie.items():
    print(f"Index: {idx}, MovieID: {movieId}")

print("\nMovieID to Title:")
for movieId, title in movieId_to_title.items():
    print(f"MovieID: {movieId}, Title: {title}")

print("\nTitle to MovieID:")
for title, movieId in title_to_movieId.items():
    print(f"Title: {title}, MovieID: {movieId}")

Movie ID to Matrix Index:
MovieID: 1, Index: 0
MovieID: 2, Index: 1
MovieID: 3, Index: 2
MovieID: 4, Index: 3
MovieID: 5, Index: 4
MovieID: 6, Index: 5
MovieID: 7, Index: 6
MovieID: 8, Index: 7
MovieID: 9, Index: 8
MovieID: 10, Index: 9
MovieID: 11, Index: 10
MovieID: 12, Index: 11
MovieID: 13, Index: 12
MovieID: 14, Index: 13
MovieID: 15, Index: 14
MovieID: 16, Index: 15
MovieID: 17, Index: 16
MovieID: 18, Index: 17
MovieID: 19, Index: 18
MovieID: 20, Index: 19
MovieID: 21, Index: 20
MovieID: 22, Index: 21
MovieID: 23, Index: 22
MovieID: 24, Index: 23
MovieID: 25, Index: 24
MovieID: 26, Index: 25
MovieID: 27, Index: 26
MovieID: 28, Index: 27
MovieID: 29, Index: 28
MovieID: 30, Index: 29
MovieID: 31, Index: 30
MovieID: 32, Index: 31
MovieID: 34, Index: 32
MovieID: 36, Index: 33
MovieID: 38, Index: 34
MovieID: 39, Index: 35
MovieID: 40, Index: 36
MovieID: 41, Index: 37
MovieID: 42, Index: 38
MovieID: 43, Index: 39
MovieID: 44, Index: 40
MovieID: 45, Index: 41
MovieID: 46, Index: 42
Movi

5. Model 

KNN implemented with self implemented cosine similarity function

In [210]:
class KNN:
    def __init__(self, k=5, metric='cosine'):
        self.k = k
        self.metric = metric
        self.data = None
    
    def fit(self, X):
        # Store data as a sparse matrix (CSR format is efficient for row slicing and matrix operations)
        self.data = X

    def non_zero_cosine(self, input_vector):
        # Convert input_vector to COO format to access non-zero column indices and values
        input_vector_coo = input_vector.tocoo()
        input_vector_non_zero_cols = input_vector_coo.col
        input_vector_non_zero_vals = input_vector_coo.data

        distances = []

        # Iterate over each row in the matrix
        for i in range(self.data.shape[0]):
            # Extract values from the matrix row only at the non-zero indices of the input_vector
            row_values_at_input_non_zero_indices = np.array([self.data[i, idx] for idx in input_vector_non_zero_cols]).flatten()
            input_vector_values = input_vector_non_zero_vals

            # Check if there are values to compare
            if input_vector_values.size > 0 and row_values_at_input_non_zero_indices.size > 0:
                # Calculate norms for the vectors
                norm_test = np.linalg.norm(input_vector_values)
                norm_row = np.linalg.norm(row_values_at_input_non_zero_indices)

                # Handle cases where either norm is zero
                if norm_test == 0 or norm_row == 0:
                    cos_distance = 1  # Maximum cosine distance if one of the vectors is zero
                else:
                    # Calculate cosine similarity for values at the non-zero indices of input_vector
                    cos_similarity = np.dot(input_vector_values, row_values_at_input_non_zero_indices) / (norm_test * norm_row)
                    cos_distance = 1 - cos_similarity

                distances.append((cos_distance, i))

        # Return distances sorted by cosine distance
        return sorted(distances, key=lambda x: x[0])

    def _get_neighbors(self, distances):
        """
        Retrieve the top k neighbors based on the sorted list of distances.
        """
        # Get the top-k nearest neighbors
        top_k_distances = distances[:self.k]
        # Separate distances and indices for output
        top_k_indices = [idx for _, idx in top_k_distances]
        top_k_distances = [dist for dist, _ in top_k_distances]
        
        return top_k_indices, top_k_distances
    
    def kneighbors(self, X):
        """
        For each vector in X, compute the k nearest neighbors based on cosine distance.
        """
        # Ensure X is a sparse matrix
        X = csr_matrix(X)
        
        all_neighbors = []
        all_distances = []

        # Compute neighbors for each test vector
        for i in range(X.shape[0]):
            input_vector = X.getrow(i)
            distances = self.non_zero_cosine(input_vector)
            neighbors, distances = self._get_neighbors(distances)
            all_neighbors.append(neighbors)
            all_distances.append(distances)
        
        return np.array(all_distances), np.array(all_neighbors)
    
    def recommend(self, index, top_n=10):
        """
        Recommend movies to a specific user based on similar users, considering all k neighbors and their ratings.
        
        Parameters:
        user_id (int): ID of the user for whom recommendations are to be made.
        top_n (int): Number of movie recommendations to return.
        
        Returns:
        List[int]: List of recommended movie IDs.
        """
        # Get the user's interaction vector
        user_vector = self.data.getrow(index)
        
        # Find the k nearest neighbors for the user
        distances, neighbors = self.kneighbors(user_vector)
        
        # Get the nearest neighbor indices and their distances
        neighbor_ids = neighbors[0]
        neighbor_distances = distances[0]

        # Movies already watched by the user
        user_movie_indices = set(user_vector.nonzero()[1])

        # Dictionary to store movie scores
        movie_scores = {}

        # Iterate over each neighbor and their distance
        for idx, neighbor_id in enumerate(neighbor_ids):
            neighbor_vector = self.data.getrow(neighbor_id - 1)
            neighbor_movies = neighbor_vector.nonzero()[1]  # Movies watched by this neighbor
            neighbor_ratings = neighbor_vector.data         # Ratings given by the neighbor
            
            # Weight by the inverse of the distance (closer neighbors contribute more)
            weight = 1 / (neighbor_distances[idx] + 1e-9)  # Avoid division by zero

            # Iterate over the movies watched by the neighbor
            for i, movie in enumerate(neighbor_movies):
                rating = neighbor_ratings[i]
                if neighbor_id != user_id-1:
                    if movie not in user_movie_indices:  # Only consider movies the user hasn't seen
                        if movie not in movie_scores:
                            movie_scores[movie] = 0
                        # Add the weighted rating to the movie score
                        movie_scores[movie] += weight * rating
        # Sort movies by their weighted scores
        recommended_movie_ids = sorted(movie_scores, key=movie_scores.get, reverse=True)
        
        return recommended_movie_ids[:top_n]
    
    def recommend_by_movie(self, movie_id, index_to_movie, movie_to_index, top_n=10):
        """
        Recommend movies similar to a given movie based on user interactions.

        Parameters:
        movie_id (int): ID of the movie for which recommendations are to be made.
        index_to_movie (dict): Mapping from row index to movieId.
        movie_to_index (dict): Mapping from movieId to row index.
        top_n (int): Number of movie recommendations to return.

        Returns:
        List[int]: List of recommended movie IDs.
        """
        if movie_id not in movie_to_index:
            raise ValueError(f"Movie ID {movie_id} not found in the dataset.")
        
        # Get the row index for the input movie
        movie_idx = movie_to_index[movie_id]
        print(movie_idx)
        
        # Get the interaction vector for the movie
        movie_vector = self.data.getrow(movie_idx)
        
        # Compute similarities to other movies
        distances, neighbors = self.kneighbors(movie_vector)
        
        # Get the neighbor movie indices and distances
        neighbor_indices = neighbors[0]
        neighbor_distances = distances[0]

        # Exclude the input movie itself
        recommended_indices = [
            idx for idx in neighbor_indices if idx != movie_idx
        ][:top_n]

        # Map indices back to movie IDs
        recommended_movie_ids = [index_to_movie[idx] for idx in recommended_indices]

        return recommended_movie_ids
    
    def recommend_by_movie_with_genre(self, movie_id, index_to_movie, movie_to_index, genre_data, top_n=10):
        """
        Recommend movies similar to a given movie, with genre filtering.

        Parameters:
        movie_id (int): ID of the movie for which recommendations are to be made.
        index_to_movie (dict): Mapping from row index to movieId.
        movie_to_index (dict): Mapping from movieId to row index.
        genre_data (dict): Dictionary mapping movieId to genre.
        top_n (int): Number of movie recommendations to return.

        Returns:
        List[int]: List of recommended movie IDs.
        """
        if movie_id not in movie_to_index:
            raise ValueError(f"Movie ID {movie_id} not found in the dataset.")
        
        # Get the row index for the input movie
        movie_idx = movie_to_index[movie_id]
        print(movie_idx)
        
        # Get the interaction vector for the movie
        movie_vector = self.data.getrow(movie_idx)
        
        # Compute similarities to other movies
        distances, neighbors = self.kneighbors(movie_vector)
        
        # Get the neighbor movie indices and distances
        neighbor_indices = neighbors[0]
        neighbor_distances = distances[0]

        # Exclude the input movie itself
        recommended_indices = [
            idx for idx in neighbor_indices if idx != movie_idx
        ]

        # Map indices back to movie IDs
        recommended_movie_ids = [index_to_movie[idx] for idx in recommended_indices]

        # Filter recommendations by genre
        input_genres = genre_data[movie_id]  # Genres of the input movie
        filtered_recommendations = [
            movie_id for movie_id in recommended_movie_ids
            if any(genre in input_genres for genre in genre_data[movie_id])  # Check for overlap
        ]

        return filtered_recommendations[:top_n]

User-Based Approach

In [132]:
knn_user = KNN(k=5, metric='cosine')
knn_user.fit(user_to_movie_sparse_df)

In [36]:
user_id = 1
user_based_rec_movies = knn_user.recommend(user_id, top_n=11)

In [37]:
print(f"User-based recommendations for User {user_id}:")
for idx in user_based_rec_movies:
    print(f"  {movieId_to_title[index_to_movie[idx]]}")

User-based recommendations for User 1:
  Shawshank Redemption, The (1994)
  Lord of the Rings: The Fellowship of the Ring, The (2001)
  Casino (1995)
  Remember the Titans (2000)
  Meet the Parents (2000)
  Knight's Tale, A (2001)
  Godfather, The (1972)
  To Kill a Mockingbird (1962)
  Good Will Hunting (1997)
  U.S. Marshals (1998)
  Lethal Weapon 3 (1992)


Item-Based Approach

In [137]:
knn_movie = KNN(k=11, metric='cosine')
knn_movie.fit(movie_to_user_sparse_df)

In [138]:
movies_recom= knn_movie.recommend_by_movie(1, index_to_movie, movie_to_index, top_n=10)

0


In [140]:
print(movie_to_index[1])

0


In [142]:
print(f"Movie-based recommendations for Movie {movieId_to_title[1]}:")
for idx in movies_recom:
    print(f"  {movieId_to_title[index_to_movie[idx]]}")

Movie-based recommendations for Movie Toy Story (1995):
  Age of Innocence, The (1993)
  I Love Trouble (1994)
  Virtuosity (1995)
  Terminal Velocity (1994)
  Quiz Show (1994)
  Cinderella (1950)
  Cemetery Man (Dellamorte Dellamore) (1994)
  Love & Human Remains (1993)
  Band of the Hand (1986)
  Hunt for Red October, The (1990)


In [143]:
filtered_movies_recom = knn_movie.recommend_by_movie_with_genre(1, index_to_movie, movie_to_index, genre_data, top_n=10)

0


In [144]:
print(f"Movie-based recommendations for Movie {movieId_to_title[1]}:")
for idx in filtered_movies_recom:
    print(f"  {movieId_to_title[index_to_movie[idx]]}")

Movie-based recommendations for Movie Toy Story (1995):
  Age of Innocence, The (1993)
  Virtuosity (1995)
  Terminal Velocity (1994)
  Quiz Show (1994)
  Cinderella (1950)
  Love & Human Remains (1993)
  Hunt for Red October, The (1990)


In [211]:
user_ids = ratings['userId'].unique()
user_ids_test = np.random.choice(user_ids, size=60, replace=False)

In [244]:
user_to_movie_test = user_to_movie_df.loc[user_ids_test]
user_to_movie_test

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
264,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490,3.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
305,0.0,3.5,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [261]:
user_to_index_map = {userId: idx for idx, userId in enumerate(user_to_movie_test.index)}
index_to_movie_test = {idx: movieId for idx, movieId in enumerate(user_to_movie_test.columns)}
print(user_to_index_map)


{440: 0, 105: 1, 264: 2, 142: 3, 238: 4, 490: 5, 371: 6, 208: 7, 164: 8, 305: 9, 282: 10, 100: 11, 261: 12, 394: 13, 461: 14, 200: 15, 80: 16, 49: 17, 146: 18, 334: 19, 497: 20, 522: 21, 369: 22, 18: 23, 132: 24, 324: 25, 442: 26, 122: 27, 39: 28, 205: 29, 574: 30, 42: 31, 477: 32, 471: 33, 19: 34, 430: 35, 578: 36, 365: 37, 481: 38, 609: 39, 206: 40, 491: 41, 38: 42, 439: 43, 364: 44, 385: 45, 473: 46, 220: 47, 446: 48, 520: 49, 426: 50, 472: 51, 86: 52, 484: 53, 342: 54, 199: 55, 56: 56, 22: 57, 501: 58, 333: 59}


In [195]:
row = user_to_movie_test.loc[1]

# Print the row
print("Row with index=43:")
print(row[101])

KeyError: 1

In [246]:
test_change = user_to_movie_test.copy()

In [247]:
top_five = dict()

In [248]:
def process_row_debug(row, user_id, n=5):
    # Step 1: Find top n indices using nlargest
    top_n_indices = row.nlargest(n).index.tolist()
    
    # Store non-zero values before processing
    non_zero_before = row[row != 0].copy()
    print(f"Processing user {user_id}")
    print("Non-zero values before processing:", non_zero_before.to_dict())  # To show indices explicitly
    print(f"Top {n} indices: {top_n_indices}")
    
    # Step 2: Replace the identified indices with 0
    row[top_n_indices] = 0
    
    # Update the "non-zero before" for changes
    modified_non_zero_after = non_zero_before.copy()
    for idx in top_n_indices:
        modified_non_zero_after.loc[idx] = 0
    print("Non-zero values after processing (modified version of before):", modified_non_zero_after.to_dict())
    
    # Step 3: Store the indices in the dictionary
    top_five[user_id] = top_n_indices
    return row


In [176]:
process_row_debug(row, 43)
print(top_five)

Processing user 43
Non-zero values before processing: {1: 5.0, 3: 5.0, 5: 5.0, 7: 5.0, 8: 5.0, 10: 4.0, 11: 4.0, 23: 5.0, 29: 5.0, 34: 5.0, 47: 4.0, 48: 5.0, 57: 5.0, 60: 5.0, 79: 5.0, 95: 4.0, 102: 5.0, 107: 5.0, 110: 3.0, 150: 5.0, 153: 5.0, 158: 5.0, 161: 4.0, 165: 5.0, 168: 5.0, 173: 3.0, 174: 5.0, 185: 4.0, 193: 4.0, 208: 4.0, 217: 5.0, 225: 4.0, 231: 3.0, 236: 3.0, 238: 5.0, 253: 4.0, 256: 5.0, 259: 4.0, 261: 5.0, 262: 5.0, 271: 5.0, 276: 5.0, 277: 5.0, 288: 3.0, 292: 5.0, 296: 3.0, 300: 3.0, 316: 4.0, 317: 4.0, 318: 4.0, 325: 5.0, 329: 5.0, 339: 4.0, 343: 4.0, 344: 4.0, 349: 3.0, 350: 4.0, 351: 4.0, 355: 5.0, 356: 5.0, 362: 5.0, 364: 5.0, 367: 5.0, 374: 5.0, 377: 3.0, 380: 3.0, 382: 5.0, 410: 4.0, 413: 3.0, 419: 5.0, 420: 5.0, 432: 4.0, 434: 5.0, 435: 5.0, 442: 5.0, 454: 4.0, 457: 5.0, 480: 4.0, 484: 5.0, 500: 5.0, 502: 5.0, 519: 5.0, 520: 5.0, 532: 4.0, 539: 3.0, 542: 4.0, 552: 5.0, 553: 4.0, 575: 5.0, 587: 5.0, 588: 5.0, 589: 5.0, 590: 4.0, 592: 5.0, 594: 5.0, 595: 5.0, 596: 5

In [249]:
test_df = test_change.apply(lambda row: process_row_debug(row, row.name), axis=1)


Processing user 440
Non-zero values before processing: {29: 4.5, 193: 3.5, 194: 2.5, 307: 4.0, 532: 3.0, 541: 5.0, 671: 4.0, 924: 4.5, 1093: 3.0, 1186: 3.0, 1199: 4.5, 1214: 4.5, 1235: 4.5, 1274: 3.0, 1378: 3.5, 1653: 4.0, 1748: 4.5, 1921: 3.5, 1960: 4.0, 2359: 3.5, 2707: 4.0, 3174: 4.0, 3363: 4.5, 3386: 3.0, 3471: 4.5, 4361: 4.0, 4878: 4.5, 6502: 4.5, 6774: 3.5, 7361: 4.0, 8690: 4.0, 48774: 3.5, 60069: 4.0}
Top 5 indices: [541, 29, 924, 1199, 1214]
Non-zero values after processing (modified version of before): {29: 0.0, 193: 3.5, 194: 2.5, 307: 4.0, 532: 3.0, 541: 0.0, 671: 4.0, 924: 0.0, 1093: 3.0, 1186: 3.0, 1199: 0.0, 1214: 0.0, 1235: 4.5, 1274: 3.0, 1378: 3.5, 1653: 4.0, 1748: 4.5, 1921: 3.5, 1960: 4.0, 2359: 3.5, 2707: 4.0, 3174: 4.0, 3363: 4.5, 3386: 3.0, 3471: 4.5, 4361: 4.0, 4878: 4.5, 6502: 4.5, 6774: 3.5, 7361: 4.0, 8690: 4.0, 48774: 3.5, 60069: 4.0}
Processing user 105
Non-zero values before processing: {6: 4.0, 16: 4.5, 32: 3.5, 47: 5.0, 50: 5.0, 110: 3.5, 111: 4.5, 213: 4

In [250]:
test_df

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
440,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
105,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
264,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
238,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490,3.5,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
208,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
305,0.0,3.5,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [253]:
test_sparse_df = csr_matrix(test_df.values)

In [254]:
knn_test = KNN(k=10, metric='cosine')
knn_test.fit(test_sparse_df)

In [257]:
user_id = 261

user_based_rec_movies_test = knn_test.recommend(user_to_index_map[user_id], top_n=10)

[296, 356, 2194, 2858, 3949]

In [258]:
print(f"User-based recommendations for User {user_id}:")
for idx in user_based_rec_movies_test:
    print(f"  {movieId_to_title[index_to_movie_test[idx]]}, {index_to_movie_test[idx]}")

User-based recommendations for User 261:
  Dead Poets Society (1989), 1246
  Breakfast Club, The (1985), 1968
  Notting Hill (1999), 2671
  Wizard of Oz, The (1939), 919
  Maverick (1994), 368
  Tombstone (1993), 553
  Pretty Woman (1990), 597
  Grosse Pointe Blank (1997), 1500
  When Harry Met Sally... (1989), 1307
  Jerry Maguire (1996), 1393


In [88]:
indices_test = top_five[user_id]
print(f"Top 5 movies for User {user_id}:")
for idx in indices_test:
    print(f"  {movieId_to_title[idx]}")

Top 5 movies for User 40:
  Dolores Claiborne (1995)
  Like Water for Chocolate (Como agua para chocolate) (1992)
  Madness of King George, The (1994)
  Nobody's Fool (1994)
  Pulp Fiction (1994)
