# Importing Libraries

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import ParameterSampler

# Loading Data

In [2]:
def load_data(movies_path, ratings_path):
  movies = pd.read_csv(movies_path)
  ratings = pd.read_csv(ratings_path)
  return movies, ratings

movies_path = 'https://raw.githubusercontent.com/Bansal0527/Movie-Recomendation-System/master/Dataset/movies.csv'
ratings_path = 'https://raw.githubusercontent.com/Bansal0527/Movie-Recomendation-System/master/Dataset/ratings.csv'
movies, ratings = load_data(movies_path, ratings_path)

# Data Preprocessing

preprocess_movie_name : Preprocesses the input movie name by removing extra spaces and non-alphanumeric characters.

preprocess_movie_database : Preprocesses the movie database by removing extra spaces and non-alphanumeric characters from movie titles.

In [3]:
def preprocess_movie_name(movie_name):
  return re.sub(r'\W+', ' ', movie_name).strip()

def preprocess_movie_database(movies_df):
  movies_df['title'] = movies_df['title'].apply(lambda x: re.sub(r'\W+', ' ', x).strip())
  return movies_df

We wouldn’t want movies that were rated by a small number of users because it’s not credible enough. Similarly, users who have rated only a handful of movies should also not be taken into account.

To qualify a movie, a minimum of 10 users should have voted a movie.

To qualify a user, a minimum of 50 movies should have voted by the user.

In [4]:
param_space = {
    'min_user_votes': range(10, 101),
    'min_movie_votes': range(30, 101)
}
n_iter = 100
best_score = float('inf')
best_params = None
param_sampler = ParameterSampler(param_space, n_iter=n_iter, random_state=42)
for params in param_sampler:
  filtered_ratings = ratings.copy()
  user_counts = filtered_ratings['userId'].value_counts()
  movie_counts = filtered_ratings['movieId'].value_counts()
  filtered_ratings = filtered_ratings[filtered_ratings['userId'].isin(user_counts[user_counts >= params['min_user_votes']].index)]
  filtered_ratings = filtered_ratings[filtered_ratings['movieId'].isin(movie_counts[movie_counts >= params['min_movie_votes']].index)]
  sparsity = 1 - len(filtered_ratings) / (len(ratings) * len(movies))
  if sparsity < best_score:
    best_score = sparsity
    best_params = params

print("Best parameters:", best_params)
print("Best sparsity score:", best_score)

Best parameters: {'min_user_votes': 15, 'min_movie_votes': 31}
Best sparsity score: 0.9999416111038673


In [5]:
def preprocess_data(ratings, min_user_votes=15, min_movie_votes=31):
  user_counts = ratings['userId'].value_counts()
  movie_counts = ratings['movieId'].value_counts()
    
  ratings = ratings[ratings['userId'].isin(user_counts[user_counts >= min_user_votes].index)]
  ratings = ratings[ratings['movieId'].isin(movie_counts[movie_counts >= min_movie_votes].index)]
    
  final_dataset = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
  return final_dataset

final_dataset = preprocess_data(ratings)

In [6]:
movies['genres'] = movies['genres'].str.replace('|', ' ')

# Training the Model

We will apply Naive Bayes Model here using sklearn

In [7]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])

nb_model = MultinomialNB()
nb_model.fit(tfidf_matrix, movies['title'])

MultinomialNB()

# Movie Recommendation

This function takes a movie name, Naive Bayes model, TF-IDF vectorizer, movies dataframe, and optional parameter k (number of recommendations).

It preprocesses the input movie name and the movie database to handle variations in movie titles, computes the similarity between the input movie and other movies based on their genres using cosine similarity, and returns the top k recommended movies along with their genres.

In [8]:
def get_movie_recommendation_nb(movie_name, model, vectorizer, movies_df, k=10):
  movie_name = preprocess_movie_name(movie_name)
  movies_df = preprocess_movie_database(movies_df)
  movie_idx = movies_df[movies_df['title'].str.contains(movie_name, case=False)].index
  if len(movie_idx) > 0:
    movie_features = vectorizer.transform(movies_df.iloc[movie_idx]['genres'])
    similarity_scores = cosine_similarity(movie_features, tfidf_matrix)
    similar_movies_idx = np.argsort(similarity_scores[0])[::-1][1:k+1] 
    recommendations = movies_df.iloc[similar_movies_idx][['title', 'genres']]
    return recommendations
  else:
    return "No movies found. Please check your input."

# Results

In [9]:
movie = "Memento"
recommendations = get_movie_recommendation_nb(movie, nb_model, tfidf_vectorizer, movies)
print(recommendations)

                                               title            genres
1651                                   Saboteur 1942  Mystery Thriller
5215                                     Fear X 2003  Mystery Thriller
3441  Cat o Nine Tails The Gatto a nove code Il 1971  Mystery Thriller
2659                                 Jennifer 8 1992  Mystery Thriller
5695                                    Old Boy 2003  Mystery Thriller
7340                                     Buried 2010  Mystery Thriller
4401                                  Shattered 1991  Mystery Thriller
1119                             Absolute Power 1997  Mystery Thriller
5801                      Bunny Lake Is Missing 1965  Mystery Thriller
4902                              Secret Window 2004  Mystery Thriller
