<h1>Movie Recommendation System</h1>

<h2>Import all necessary libraries</h2>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from ast import literal_eval

<h2>Loading the data</h2>

In [2]:
# Path of the TMDB movies CSV inside the Kaggle input directory.
dataset_csv = '/kaggle/input/tmdb-movies-dataset-2023-930k-movies/TMDB_movie_dataset_v11.csv'
df = pd.read_csv(dataset_csv)
# Show the first rows as this cell's output.
df.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."
2,155,The Dark Knight,8.512,30619,Released,2008-07-16,1004558444,152,False,/nMKdUUepR0i5zn0y1T4CsSB5chy.jpg,...,The Dark Knight,Batman raises the stakes in his war on crime. ...,130.643,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,Welcome to a world without rules.,"Drama, Action, Crime, Thriller","DC Comics, Legendary Pictures, Syncopy, Isobel...","United Kingdom, United States of America","English, Mandarin","joker, sadism, chaos, secret identity, crime f..."
3,19995,Avatar,7.573,29815,Released,2009-12-15,2923706026,162,False,/vL5LR6WdxWPjLPFRLe133jXWsh5.jpg,...,Avatar,"In the 22nd century, a paraplegic Marine is di...",79.932,/kyeqWdyUXW608qlYkRqosgbbJyK.jpg,Enter the world of Pandora.,"Action, Adventure, Fantasy, Science Fiction","Dune Entertainment, Lightstorm Entertainment, ...","United States of America, United Kingdom","English, Spanish","future, society, culture clash, space travel, ..."
4,24428,The Avengers,7.71,29166,Released,2012-04-25,1518815515,143,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,...,The Avengers,When an unexpected enemy emerges and threatens...,98.082,/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg,Some assembly required.,"Science Fiction, Action, Adventure",Marvel Studios,United States of America,"English, Hindi, Russian","new york city, superhero, shield, based on com..."


<h2>Understanding the data</h2>

In [3]:
# Report the frame's (rows, columns) shape and the full list of column names.
frame_shape = df.shape
column_names = df.columns.tolist()
print(f"DataFrame Shape: {frame_shape}")
print("Columns:", column_names)

DataFrame Shape: (1182820, 24)
Columns: ['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date', 'revenue', 'runtime', 'adult', 'backdrop_path', 'budget', 'homepage', 'imdb_id', 'original_language', 'original_title', 'overview', 'popularity', 'poster_path', 'tagline', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'keywords']


<h2>Data Preprocessing</h2>

<h4>Checking null values in the dataframe</h4>

In [4]:
# Missing-value count for every column, displayed as the cell output.
null_counts = df.isnull().sum()
null_counts

id                            0
title                        13
vote_average                  0
vote_count                    0
status                        0
release_date             206653
revenue                       0
runtime                       0
adult                         0
backdrop_path            872219
budget                        0
homepage                1058399
imdb_id                  569002
original_language             0
original_title               13
overview                 246358
popularity                    0
poster_path              381932
tagline                 1017222
genres                   482322
production_companies     653650
production_countries     532988
spoken_languages         512948
keywords                 867885
dtype: int64

<h4>Filling null values in the feature columns with empty strings</h4>

In [5]:
# Replace missing values with '' in each column listed in `features`,
# one column at a time.
features = ['keywords', 'genres', 'overview', 'title', 'spoken_languages', 'production_countries']
for column_name in features:
    df[column_name] = df[column_name].fillna('')

<h4>Checking duplicates in the dataframe</h4>

In [6]:
# Number of rows flagged by DataFrame.duplicated(), displayed as the cell output.
duplicate_row_count = df.duplicated().sum()
duplicate_row_count

370

<h4>Removing duplicates from the dataframe</h4>

In [7]:
# Drop duplicate rows and rebuild a contiguous 0..n-1 index.
# Without the reset, the removed rows leave gaps in the index, so a row's
# label no longer equals its position; later cells address rows by position
# (Python lists and similarity-matrix rows), so labels must stay aligned.
# Reassigning instead of `inplace=True` also keeps the cell safe to re-run.
df = df.drop_duplicates().reset_index(drop=True)

<h2>Downloading necessary NLTK packages</h2>

In [8]:
# Download the NLTK 'stopwords' and 'wordnet' packages; they back the
# `stopwords.words(...)` call and the `WordNetLemmatizer` created below.
# The last call's return value is shown as the cell output.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

<h4>Initialising stop words and lemmatizer</h4>

In [9]:
# English stop words held in a set (used for membership tests in
# remove_stopwords), plus one shared lemmatizer instance.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
lemmatizer = WordNetLemmatizer()

<h2>Defining functions for removing stopwords and lemmatization</h2>

In [10]:
def remove_stopwords(text):
    """Return `text` without the tokens that appear in `stop_words`.

    The text is split on whitespace; each token is lower-cased for the
    membership test only, and the surviving tokens (original casing) are
    re-joined with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of `text`.

    Each token is passed through `lemmatizer.lemmatize` and the results are
    re-joined with single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

<h2>Implementing Jaccard Similarity function</h2>

In [11]:
def jaccard_similarity(set1, set2):
    """Jaccard index of two sets: |set1 ∩ set2| / |set1 ∪ set2|.

    Returns 0 when the union is empty (both inputs empty), which avoids a
    division by zero.
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        return 0
    shared_size = len(set1.intersection(set2))
    return shared_size / union_size

<h2>Safe Literal Evaluation</h2>

In [12]:
def safe_literal_eval(value):
    """Convert a raw feature cell into a list of values.

    Parameters
    ----------
    value : object
        A cell from one of the feature columns: a list (returned unchanged),
        a string, or anything else (e.g. NaN/None).

    Returns
    -------
    list
        * `value` itself when it already is a list;
        * the parsed list when the string is a Python list literal;
        * otherwise the comma-separated tokens of the string, stripped of
          surrounding whitespace with empty tokens dropped;
        * `[]` for non-string, non-list input and for empty strings.
    """
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        return []

    try:
        parsed = literal_eval(value)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # literal_eval documents all of these for malformed input.
        parsed = None

    # Only a real list literal counts as parsed. Strings such as "1984" or
    # "1984, 2001" evaluate to an int / tuple, which would otherwise leak
    # through as a non-list and be discarded downstream.
    if isinstance(parsed, list):
        return parsed

    # Plain comma-separated text, e.g. "Action, Science Fiction".
    return [token.strip() for token in value.split(',') if token.strip()]

# Run safe_literal_eval over every cell of the list-like feature columns.
features = ['keywords', 'genres', 'spoken_languages', 'production_countries']
for column_name in features:
    parsed_column = df[column_name].apply(safe_literal_eval)
    df[column_name] = parsed_column

<h2>Cleaning the data</h2>

In [13]:
def clean_data(x):
    """Normalise a list of feature values into lower-case, space-free tokens.

    Parameters
    ----------
    x : object
        Normally a list of strings (e.g. ["Science Fiction", "Drama"]).

    Returns
    -------
    list of str, or ''
        For a list: each element converted to text, with spaces removed and
        lower-cased, so multi-word values become single tokens
        ("Science Fiction" -> "sciencefiction").
        For anything that is not a list: the empty string.
    """
    if not isinstance(x, list):
        return ''
    # str() guards against non-string elements (e.g. numbers coming from a
    # parsed list literal), which have no .replace() method.
    return [str(item).replace(" ", "").lower() for item in x]

# Normalise every cell of the list-like feature columns with clean_data.
features = ['keywords', 'genres', 'spoken_languages', 'production_countries']
for column_name in features:
    cleaned_column = df[column_name].apply(clean_data)
    df[column_name] = cleaned_column

<h2>Calculating cosine similarity for overview and keywords</h2>

In [14]:
# Work on the first 30,000 rows only; everything below is built from this
# subset (presumably to keep the pairwise similarity matrices tractable).
df = df[:30000]

# Overview text -> TF-IDF vectors -> pairwise cosine similarity.
tfidf_overview = TfidfVectorizer(stop_words='english')
tfidf_overview_matrix = tfidf_overview.fit_transform(df['overview'])
cosine_sim_overview = cosine_similarity(tfidf_overview_matrix)

# Keyword lists are joined into one space-separated string per movie
# (non-list cells are passed through unchanged), then count-vectorised.
keywords_text = [
    kw if not isinstance(kw, list) else ' '.join(kw)
    for kw in df['keywords']
]
vector = CountVectorizer(stop_words='english')
vector_keywords_matrix = vector.fit_transform(keywords_text)
cosine_sim_keywords = cosine_similarity(vector_keywords_matrix)

<h2>Converting features into individual lists</h2>

In [15]:
# Plain Python lists, one per feature column, in the frame's row order.
movie_titles, movie_genres, movie_countries, movie_languages = (
    df[column_name].tolist()
    for column_name in ('title', 'genres', 'production_countries', 'spoken_languages')
)

<h2>Defining function to calculate combined similarity</h2>

In [16]:
def combined_similarity(idx1, idx2):
    """Weighted blend of five similarity signals for the movies at idx1 and idx2.

    Both arguments index the module-level similarity matrices and feature
    lists. The score is

        0.3 * overview cosine + 0.3 * keywords cosine
        + 0.2 * genres Jaccard + 0.1 * countries Jaccard + 0.1 * languages Jaccard
    """
    w_overview, w_keywords, w_genres, w_countries, w_languages = 0.3, 0.3, 0.2, 0.1, 0.1

    genres_a, genres_b = set(movie_genres[idx1]), set(movie_genres[idx2])
    countries_a, countries_b = set(movie_countries[idx1]), set(movie_countries[idx2])
    languages_a, languages_b = set(movie_languages[idx1]), set(movie_languages[idx2])

    # Accumulate term by term, in the order overview, keywords, genres,
    # countries, languages.
    score = w_overview * cosine_sim_overview[idx1, idx2]
    score = score + w_keywords * cosine_sim_keywords[idx1, idx2]
    score = score + w_genres * jaccard_similarity(genres_a, genres_b)
    score = score + w_countries * jaccard_similarity(countries_a, countries_b)
    score = score + w_languages * jaccard_similarity(languages_a, languages_b)
    return score

<h2>Implementing function to retrieve recommendations</h2>

In [17]:
def get_recommendations(movie_title, top_n=10):
    """Print the `top_n` movies most similar to `movie_title`.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in `df['title']`; the first matching row is used.
    top_n : int, default 10
        Number of recommendations to print.

    Returns
    -------
    str or None
        An error message when `movie_title` is not a string or is not found;
        otherwise None (the ranking is printed, not returned).
    """
    if not isinstance(movie_title, str):
        return "Error: Movie title must be a string."

    # Look the title up by POSITION, not by index label: the similarity
    # matrices and feature lists are addressed positionally, and the frame's
    # index labels differ from positions whenever rows have been dropped
    # without resetting the index.
    titles = df['title'].tolist()
    try:
        movie_index = titles.index(movie_title)
    except ValueError:
        return f"Error: '{movie_title}' not found in the dataset."

    similarities = [
        (i, combined_similarity(movie_index, i))
        for i in range(len(titles))
        if i != movie_index
    ]
    similarities.sort(key=lambda pair: pair[1], reverse=True)

    print(f"\nTop {top_n} recommendations for '{movie_title}':")
    for rank, (movie_idx, sim) in enumerate(similarities[:top_n], 1):
        print(f"{rank}. {titles[movie_idx]} (Similarity: {sim:.3f})")

<h2>Testing the recommendation system</h2>

In [18]:
# get_recommendations prints the ranking itself and returns None on success;
# it returns a value (an error message) only when the lookup fails. Print the
# return value just in that case, so no stray "None" appears in the output.
result = get_recommendations("Interstellar")
if result is not None:
    print(result)


Top 10 recommendations for 'Interstellar':
1. Destination Moon (Similarity: 0.413)
2. The Martian (Similarity: 0.402)
3. Marooned (Similarity: 0.393)
4. A.I. Artificial Intelligence (Similarity: 0.392)
5. Stowaway (Similarity: 0.388)
6. 1984 (Similarity: 0.386)
7. Gravity (Similarity: 0.375)
8. Finch (Similarity: 0.367)
9. Ex Machina (Similarity: 0.366)
10. Love (Similarity: 0.361)
None
