In [1]:
# Importing relevant libraries

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
import requests
import time
#from surprise import Reader, Dataset, SVD, evaluate
import warnings; warnings.simplefilter('ignore')
import os
from dotenv import load_dotenv
load_dotenv()

In [2]:
# Reading Dataset
# The CSV path is relative, so the file is looked up in the notebook's working directory.

md = pd.read_csv("TMDB_all_movies.csv")

In [3]:
# Non-null count per column of the raw dataset, shown to gauge how much data is missing
md.count()

id                         1078995
title                      1078982
vote_average               1078993
vote_count                 1078993
status                     1078993
release_date                955771
revenue                    1078993
runtime                    1078993
budget                     1078993
imdb_id                     612768
original_language          1078993
original_title             1078983
overview                    888240
popularity                 1078993
tagline                     163150
genres                      770057
production_companies        501789
production_countries        652361
spoken_languages            666815
cast                        723344
director                    886328
director_of_photography     265001
writers                     527621
producers                   348684
music_composer              108839
imdb_rating                 444042
imdb_votes                  444042
poster_path                 777162
dtype: int64

In [4]:
# Keeping only the movies whose status is 'Released'

md = md.loc[md['status'].eq('Released')]

In [5]:
# Dropping the columns that are not needed further on

cols_to_drop = [
    'status',
    'revenue',
    'runtime',
    'budget',
    'tagline',
    'production_companies',
    'production_countries',
    'spoken_languages',
]
md = md.drop(labels=cols_to_drop, axis='columns')


In [6]:
# Keeping only the movies that have an IMDB rating

md = md.dropna(subset=['imdb_rating'])

In [7]:
# Keeping only the movies IMDB rates at 6 or higher ('good') that also have
# at least 1000 IMDB votes (not 'obscure')

md = md.loc[md['imdb_rating'].ge(6) & md['imdb_votes'].ge(1000)]

In [8]:
# Non-null count per column after the status, rating and vote filters
md.count()

id                         35326
title                      35326
vote_average               35326
vote_count                 35326
release_date               35123
imdb_id                    35326
original_language          35326
original_title             35326
overview                   35075
popularity                 35326
genres                     34926
cast                       34617
director                   34905
director_of_photography    27270
writers                    32880
producers                  29906
music_composer             21608
imdb_rating                35326
imdb_votes                 35326
poster_path                34639
dtype: int64

In [9]:
# Dealing with missing values

In [10]:
# Discarding the movies that have no 'overview'

#md[md['overview'].isna()].sort_values(by='imdb_votes', ascending=False).head(10)
md = md.dropna(subset=['overview'])

In [11]:
# Discarding the movies that have no 'genres'

md = md.dropna(subset=['genres'])

In [12]:
# Discarding the movies that have no 'release_date'

#md[md['release_date'].isna()].sort_values(by='imdb_votes', ascending=False)
md = md.dropna(subset=['release_date'])

In [13]:
# 'imdb_votes' already serves as the popularity measure, so 'popularity' is dropped

md = md.drop('popularity', axis=1)

In [14]:
# IMDB ratings are used instead of TMDB ratings, so the TMDB 'vote_average' and
# 'vote_count' columns are dropped

md = md.drop(['vote_average', 'vote_count'], axis=1)

In [15]:
# Renaming columns
# An explicit old -> new mapping is used rather than assigning a positional list to
# `md.columns`: a positional list silently puts names on the wrong columns if the column
# order (or the set of dropped columns) ever changes. Columns not listed keep their name.

md = md.rename(columns={
    'id': 'tmdb_id',
    'director': 'directed_by',
    'director_of_photography': 'cinematography',
    'writers': 'written_by',
    'producers': 'produced_by',
    'music_composer': 'music_by',
})

In [16]:
# Putting the columns into their final order: ids, titles, overview, ratings, genres,
# cast and crew, then release date, language and poster

md = md.loc[:, [
    'imdb_id', 'tmdb_id',
    'title', 'original_title', 'overview',
    'imdb_rating', 'imdb_votes',
    'genres', 'cast',
    'directed_by', 'cinematography', 'written_by', 'produced_by', 'music_by',
    'release_date', 'original_language', 'poster_path',
]]

In [17]:
# Removing the rows in which every cast and crew column is empty

md = md.dropna(
    subset=['cast', 'directed_by', 'cinematography', 'written_by', 'produced_by', 'music_by'],
    how='all',
)

In [18]:
# Fetching missing cast and crew details using the OMDB API
# This cell selects the movies whose cast, director or writer is missing.

#md[md['cast'].isna()].count()
#md[md['directed_by'].isna()].sort_values(by='imdb_votes', ascending=False).head(10)
#md[md['cinematography'].isna()].count()
#md[md['written_by'].isna()].sort_values(by='imdb_votes', ascending=False).head(10)
#md[md['produced_by'].isna()].count()
#md[md['music_by'].isna()].count()

# .copy() makes this an independent frame: it is modified further down, and modifying
# a bare slice of `md` in place is what pandas' SettingWithCopyWarning warns about
missing_data = md[md[['cast', 'directed_by', 'written_by']].isna().any(axis=1)].copy()

# making a copy for comparison later
to_compare = missing_data.copy()

missing_data.count()

imdb_id              2220
tmdb_id              2220
title                2220
original_title       2220
overview             2220
imdb_rating          2220
imdb_votes           2220
genres               2220
cast                 1806
directed_by          2133
cinematography        745
written_by            221
produced_by          1325
music_by              538
release_date         2220
original_language    2220
poster_path          2108
dtype: int64

In [19]:
# API key is read from the environment (populated by load_dotenv() in the imports cell)
OMDB_API_KEY = os.getenv("OMDB_API_KEY")

def fetch_omdb_data(imdb_id, timeout=10):
    """Fetching data from OMDB API for a movie via its imdb_id

    Parameters
    ----------
    imdb_id : str
        IMDB identifier of the title to look up.
    timeout : float, optional
        Seconds to wait for OMDB before giving up. Without a timeout a single
        stalled connection would block the calling loop indefinitely.

    Returns
    -------
    dict or None
        The decoded JSON body when OMDB answers with HTTP 200, otherwise None.

    Raises
    ------
    requests.RequestException
        On network failures and timeouts; these are not swallowed, so a failed
        lookup is never mistaken for a valid "no data" answer.
    """
    # Passing the query via `params` lets requests URL-encode the values
    response = requests.get(
        "http://www.omdbapi.com/",
        params={"i": imdb_id, "apikey": OMDB_API_KEY},
        timeout=timeout,
    )
    if response.status_code == 200:
        return response.json()
    return None

# Querying OMDB once per movie and keeping every response, keyed by row label, so the
# same title is not requested a second time when the missing fields are filled in
omdb_responses = {
    idx: fetch_omdb_data(imdb_id)
    for idx, imdb_id in missing_data['imdb_id'].items()
}

# Dropping rows where OMDB API response key 'Type' is NOT 'movie' (or no response came back)
rows_to_drop = [
    idx for idx, omdb_data in omdb_responses.items()
    if not omdb_data or omdb_data.get('Type') != 'movie'
]

# Re-binding instead of dropping in place, so a slice of `md` is never mutated
missing_data = missing_data.drop(index=rows_to_drop)

# Fetching missing data for remaining rows (confirmed movies)
# Maps each column of missing_data to the OMDB response key that can fill it
omdb_fields = {'cast': 'Actors', 'directed_by': 'Director', 'written_by': 'Writer'}
for idx, row in missing_data.iterrows():
    omdb_data = omdb_responses[idx]
    for column, omdb_key in omdb_fields.items():
        value = omdb_data.get(omdb_key)
        # OMDB uses the literal string 'N/A' for fields it has no data for; writing it
        # into the frame would make the cell look filled and shield the row from the
        # later dropna on cast / director / writer
        if pd.isna(row[column]) and value and value != 'N/A':
            missing_data.at[idx, column] = value

missing_data.count()

imdb_id              974
tmdb_id              974
title                974
original_title       974
overview             974
imdb_rating          974
imdb_votes           974
genres               974
cast                 974
directed_by          974
cinematography       292
written_by           974
produced_by          586
music_by             218
release_date         974
original_language    974
poster_path          953
dtype: int64

In [52]:
# Updating md with the new data fetched via the OMDB API
# DataFrame.update works in place: rows are matched by index label and md's cells are
# overwritten with the non-NA values held in missing_data. Rows that were dropped from
# missing_data are left untouched in md.

md.update(missing_data)

In [56]:
# Keeping only the rows in which cast, director and writer are all present

md = md[md[['cast', 'directed_by', 'written_by']].notna().all(axis=1)]

md.shape

(33395, 17)

In [58]:
# Non-null count per column after the cast / director / writer clean-up
md.count()

imdb_id              33395
tmdb_id              33395
title                33395
original_title       33395
overview             33395
imdb_rating          33395
imdb_votes           33395
genres               33395
cast                 33395
directed_by          33395
cinematography       26753
written_by           33395
produced_by          29048
music_by             21250
release_date         33395
original_language    33395
poster_path          33030
dtype: int64

In [62]:
# Building the content-based recommendation system

In [64]:
# Implementing content-based filtering using Overviews, Cast, Crew, Keywords, and Genre