# Importing Libraries

In [2]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Loading Data

In [3]:
def load_data(movies_path, ratings_path):
    movies = pd.read_csv(movies_path)
    ratings = pd.read_csv(ratings_path)
    return movies, ratings

movies_path = 'https://raw.githubusercontent.com/Rakshitx1/Movie-Recomendation-System/master/Dataset/movies.csv'
ratings_path = 'https://raw.githubusercontent.com/Rakshitx1/Movie-Recomendation-System/master/Dataset/ratings.csv'
movies, ratings = load_data(movies_path, ratings_path)

# Data Preprocessing

preprocess_movie_name : Preprocesses the input movie name by removing extra spaces and non-alphanumeric characters.

preprocess_movie_database : Preprocesses the movie database by removing extra spaces and non-alphanumeric characters from movie titles.

In [4]:
def preprocess_movie_name(movie_name):
    return re.sub(r'\W+', ' ', movie_name).strip()

def preprocess_movie_database(movies_df):
    movies_df['title'] = movies_df['title'].apply(lambda x: re.sub(r'\W+', ' ', x).strip())
    return movies_df

We wouldn’t want movies that were rated by a small number of users because it’s not credible enough. Similarly, users who have rated only a handful of movies should also not be taken into account.

To qualify a movie, a minimum of 10 users should have voted a movie.

To qualify a user, a minimum of 50 movies should have voted by the user.

In [5]:
def preprocess_data(ratings, min_user_votes=50, min_movie_votes=10):
    user_counts = ratings['userId'].value_counts()
    movie_counts = ratings['movieId'].value_counts()
    
    ratings = ratings[ratings['userId'].isin(user_counts[user_counts >= min_user_votes].index)]
    ratings = ratings[ratings['movieId'].isin(movie_counts[movie_counts >= min_movie_votes].index)]
    
    final_dataset = ratings.pivot(index='movieId', columns='userId', values='rating').fillna(0)
    return final_dataset

final_dataset = preprocess_data(ratings)

In [6]:
# Merge movie genres into a single string
movies['genres'] = movies['genres'].str.replace('|', ' ')

# Training the Model

We will apply Naive Bayes Model here using sklearn

In [7]:
# Vectorize movie genres using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'])

# Train Naive Bayes classifier
nb_model = MultinomialNB()
nb_model.fit(tfidf_matrix, movies['title'])

MultinomialNB()

# Movie Recommendation

This function takes a movie name, Naive Bayes model, TF-IDF vectorizer, movies dataframe, and optional parameter k (number of recommendations).

It preprocesses the input movie name and the movie database to handle variations in movie titles, computes the similarity between the input movie and other movies based on their genres using cosine similarity, and returns the top k recommended movies along with their genres.

In [8]:
def get_movie_recommendation_nb(movie_name, model, vectorizer, movies_df, k=10):
    movie_name = preprocess_movie_name(movie_name)
    movies_df = preprocess_movie_database(movies_df)
    movie_idx = movies_df[movies_df['title'].str.contains(movie_name, case=False)].index
    if len(movie_idx) > 0:
        movie_features = vectorizer.transform(movies_df.iloc[movie_idx]['genres'])
        similarity_scores = cosine_similarity(movie_features, tfidf_matrix)
        similar_movies_idx = np.argsort(similarity_scores[0])[::-1][1:k+1] 
        recommendations = movies_df.iloc[similar_movies_idx][['title', 'genres']]
        return recommendations
    else:
        return "No movies found. Please check your input."

In [9]:
# # Example usage with output presented as an HTML table
# recommendations = get_movie_recommendation_nb('Iron Man 2', nb_model, tfidf_vectorizer, movies)
# html_table = recommendations.to_html(index=False)
# print(html_table)

# Results

In [10]:
movie = str(input("Enter Movie: "))
recommendations = get_movie_recommendation_nb(movie, nb_model, tfidf_vectorizer, movies)
print(recommendations)

                                   title                genres
1586  Ever After A Cinderella Story 1998  Comedy Drama Romance
4906              Mad Dog and Glory 1993  Comedy Drama Romance
4912                    Jersey Girl 2004  Comedy Drama Romance
6781                   Wackness The 2008  Comedy Drama Romance
4969            Pride and Prejudice 1940  Comedy Drama Romance
6766                   Chaos Theory 2007  Comedy Drama Romance
4994               He Said She Said 1991  Comedy Drama Romance
8875                         5 to 7 2014  Comedy Drama Romance
8876                         Afonya 1975  Comedy Drama Romance
1384              Can t Hardly Wait 1998  Comedy Drama Romance
