# Importing Libraries

In [12]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Loading Data

In [3]:
def load_data(movies_path, ratings_path):
    movies = pd.read_csv(movies_path)
    ratings = pd.read_csv(ratings_path)
    return movies, ratings

movies_path = 'https://raw.githubusercontent.com/Rakshitx1/Movie-Recomendation-System/master/Dataset/movies.csv'
ratings_path = 'https://raw.githubusercontent.com/Rakshitx1/Movie-Recomendation-System/master/Dataset/ratings.csv'
movies, ratings = load_data(movies_path, ratings_path)

# Data Preprocessing

preprocess_movie_name : Preprocesses the input movie name by removing extra spaces and non-alphanumeric characters.

preprocess_movie_database : Preprocesses the movie database by removing extra spaces and non-alphanumeric characters from movie titles.

In [14]:
def preprocess_movie_name(movie_name):
    return re.sub(r'\W+', ' ', movie_name).strip()

def preprocess_movie_database(movies_df):
    movies_df['title'] = movies_df['title'].apply(lambda x: re.sub(r'\W+', ' ', x).strip())
    return movies_df

We wouldn’t want movies that were rated by a small number of users because it’s not credible enough. Similarly, users who have rated only a handful of movies should also not be taken into account.

To qualify a movie, a minimum of 10 users should have voted a movie.

To qualify a user, a minimum of 50 movies should have voted by the user.

In [15]:
def preprocess_data(ratings, min_user_votes=50, min_movie_votes=10):
    user_counts = ratings['userId'].value_counts()
    movie_counts = ratings['movieId'].value_counts()
    
    ratings = ratings[ratings['userId'].isin(user_counts[user_counts >= min_user_votes].index)]
    ratings = ratings[ratings['movieId'].isin(movie_counts[movie_counts >= min_movie_votes].index)]
    
    final_dataset = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
    return final_dataset

final_dataset = preprocess_data(ratings)

In [16]:
# Merge movie genres into a single string
movies['genres'] = movies['genres'].str.replace('|', ' ')

# Vectorize movie genres using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])

# Training the Model

We will apply Naive Bayes Model here using sklearn

In [17]:
# Vectorize movie genres using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])

# Train Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(tfidf_matrix, movies['title'])

MultinomialNB()

# Movie Recommendation

This function takes a movie name, Naive Bayes model, TF-IDF vectorizer, movies dataframe, and optional parameter k (number of recommendations).

It preprocesses the input movie name and the movie database to handle variations in movie titles, computes the similarity between the input movie and other movies based on their genres using cosine similarity, and returns the top k recommended movies along with their genres.

In [22]:
def get_movie_recommendation_nb(movie_name, model, vectorizer, movies_df, k=10):
    movie_name = preprocess_movie_name(movie_name)
    movies_df = preprocess_movie_database(movies_df)
    movie_idx = movies_df[movies_df['title'].str.contains(movie_name, case=False)].index
    if len(movie_idx) > 0:
        movie_features = vectorizer.transform(movies_df.iloc[movie_idx]['genres'])
        similarity_scores = cosine_similarity(movie_features, tfidf_matrix)
        similar_movies_idx = np.argsort(similarity_scores[0])[::-1][1:k+1] 
        recommendations = movies_df.iloc[similar_movies_idx][['title', 'genres']]
        return recommendations
    else:
        return "No movies found. Please check your input."

In [8]:
# # Example usage with output presented as an HTML table
# recommendations = get_movie_recommendation_nb('Iron Man 2', nb_model, tfidf_vectorizer, movies)
# html_table = recommendations.to_html(index=False)
# print(html_table)

# Results

In [24]:
movie = str(input("Enter Movie: "))
recommendations = get_movie_recommendation_nb(movie, nb_model, tfidf_vectorizer, movies)
print(recommendations)

                                               title            genres
1651                                   Saboteur 1942  Mystery Thriller
5215                                     Fear X 2003  Mystery Thriller
3441  Cat o Nine Tails The Gatto a nove code Il 1971  Mystery Thriller
2659                                 Jennifer 8 1992  Mystery Thriller
5695                                    Old Boy 2003  Mystery Thriller
7340                                     Buried 2010  Mystery Thriller
4401                                  Shattered 1991  Mystery Thriller
1119                             Absolute Power 1997  Mystery Thriller
5801                      Bunny Lake Is Missing 1965  Mystery Thriller
4902                              Secret Window 2004  Mystery Thriller
