In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time
import warnings; warnings.simplefilter('ignore')
import os
import re
from collections import Counter
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Reading Dataset
# Loads the TMDB export (path is relative to the notebook's working directory)
# into `md`, the frame that the preprocessing cells below progressively filter.

md = pd.read_csv("TMDB_all_movies.csv")

## Data Preprocessing

In [4]:
# Keep only movies whose status is 'Released'; every other status is discarded.

md = md.loc[md['status'].eq('Released')]

In [5]:
# Remove the columns that are irrelevant to the rest of the notebook.

cols_to_drop = [
    'status',
    'revenue',
    'runtime',
    'budget',
    'tagline',
    'production_companies',
    'production_countries',
    'spoken_languages',
]
md = md.drop(cols_to_drop, axis=1)

In [6]:
list(md.columns)

['id',
 'title',
 'vote_average',
 'vote_count',
 'release_date',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'genres',
 'cast',
 'director',
 'director_of_photography',
 'writers',
 'producers',
 'music_composer',
 'imdb_rating',
 'imdb_votes',
 'poster_path']

In [7]:
# Drop movies that have no IMDB id (the id is what ties a row to IMDB/OMDB data).

md = md.dropna(subset=['imdb_id'])

In [8]:
# Filter out 'obscure' and 'bad' movies according to IMDB:
#   - obscure: fewer than 5000 votes for English-language titles,
#              fewer than 2000 votes for every other language
#   - bad:     IMDB rating below 6

is_english = md['original_language'] == 'en'
votes = md['imdb_votes']
enough_votes = (is_english & (votes >= 5000)) | (~is_english & (votes >= 2000))
well_rated = md['imdb_rating'] >= 6

md = md[enough_votes & well_rated]

In [9]:
# 'popularity' is dropped because 'imdb_votes' already serves as the popularity signal.

md = md.drop(['popularity'], axis=1)

In [10]:
# The TMDB rating columns ('vote_average', 'vote_count') are dropped: IMDB ratings are used instead.

md = md.drop(['vote_average', 'vote_count'], axis=1)

In [11]:
# Renaming columns
# An explicit old -> new mapping is used instead of assigning a positional list
# to `md.columns`: a positional list silently mislabels data if the column
# order ever changes upstream. errors='raise' makes a missing source column
# fail loudly rather than being skipped.

md = md.rename(
    columns={
        'id': 'tmdb_id',
        'director': 'directed_by',
        'director_of_photography': 'cinematography',
        'writers': 'written_by',
        'producers': 'produced_by',
        'music_composer': 'music_by',
    },
    errors='raise',
)

In [12]:
# Put the columns in their final order: identifiers, descriptive text,
# ratings, genres, credits, then release/language/poster metadata.

column_order = [
    'imdb_id',
    'tmdb_id',
    'title',
    'original_title',
    'overview',
    'imdb_rating',
    'imdb_votes',
    'genres',
    'cast',
    'directed_by',
    'written_by',
    'produced_by',
    'cinematography',
    'music_by',
    'release_date',
    'original_language',
    'poster_path',
]
md = md[column_order]

In [13]:
# Non-null values per column
md.notna().sum()

imdb_id              19115
tmdb_id              19115
title                19115
original_title       19115
overview             18976
imdb_rating          19115
imdb_votes           19115
genres               18966
cast                 18923
directed_by          18958
written_by           18489
produced_by          17317
cinematography       16347
music_by             13775
release_date         19028
original_language    19115
poster_path          18859
dtype: int64

### Missing Values

In [15]:
# Drop rows that have no 'overview'.
# To inspect the most-voted rows affected before dropping them:
#md[md['overview'].isna()].sort_values(by='imdb_votes', ascending=False).head(10)

md = md.dropna(subset=['overview'])

In [16]:
# Drop rows that have no 'genres'.

md = md.dropna(subset=['genres'])

In [17]:
# Drop rows that have no 'release_date'.

md = md.dropna(subset=['release_date'])

In [18]:
# Non-null values per column after removing the rows above
md.notna().sum()

imdb_id              18843
tmdb_id              18843
title                18843
original_title       18843
overview             18843
imdb_rating          18843
imdb_votes           18843
genres               18843
cast                 18742
directed_by          18806
written_by           18376
produced_by          17267
cinematography       16325
music_by             13760
release_date         18843
original_language    18843
poster_path          18686
dtype: int64

In [19]:
# Checking missing cast and crew data and preparing to fetch the missing data from the OMDB API.
# Only rows missing 'cast', 'directed_by' or 'written_by' are selected.

#md[md['cast'].isna()].count()
#md[md['directed_by'].isna()].sort_values(by='imdb_votes', ascending=False).head(10)
#md[md['cinematography'].isna()].count()
#md[md['written_by'].isna()].sort_values(by='imdb_votes', ascending=False).head(10)
#md[md['produced_by'].isna()].count()
#md[md['music_by'].isna()].count()

# .copy() makes `missing_data` an independent frame: it is written to cell by
# cell further down, and writing into a filtered slice of `md` is what pandas
# flags with SettingWithCopyWarning (hidden here by the global warnings filter).
missing_data = md[(md['cast'].isna()) | (md['directed_by'].isna()) | (md['written_by'].isna())].copy()

# making a copy for comparison later
to_compare = missing_data.copy()

missing_data.count()

imdb_id              535
tmdb_id              535
title                535
original_title       535
overview             535
imdb_rating          535
imdb_votes           535
genres               535
cast                 434
directed_by          498
written_by            68
produced_by          331
cinematography       206
music_by             146
release_date         535
original_language    535
poster_path          497
dtype: int64

In [48]:
# Using the OMDB API to fetch missing values

OMDB_API_KEY = os.getenv("OMDB_API_KEY")
# HTTPS so the API key in the query string is not sent in clear text.
OMDB_API_URL = "https://www.omdbapi.com/"

def fetch_omdb_data(imdb_id):
    """Fetch one title's record from the OMDB API (keeping 'N/A' as strings).

    Args:
        imdb_id: IMDB identifier of the title, sent as the `i` query parameter.

    Returns:
        The decoded JSON body when the request succeeds with HTTP 200,
        otherwise None (non-200 status, network/timeout error, or a body
        that is not valid JSON).
    """
    try:
        # `params` lets requests build and URL-encode the query string.
        response = requests.get(
            OMDB_API_URL,
            params={"i": imdb_id, "apikey": OMDB_API_KEY},
            timeout=5,
        )
        if response.status_code != 200:
            return None
        return response.json()
    except (requests.exceptions.RequestException, ValueError):
        # ValueError covers JSON decoding failures on requests versions where
        # the decode error is not a RequestException subclass.
        return None

# Fill the gaps in `missing_data` from OMDB and remember which rows OMDB
# confirms as movies. Rows whose lookup fails, or whose OMDB 'Type' is not
# 'movie', are left out of `valid_rows` and dropped by the final selection.
omdb_field_for = {
    'cast': 'Actors',
    'directed_by': 'Director',
    'written_by': 'Writer',
}

valid_rows = []
for row_label, record in missing_data.iterrows():
    payload = fetch_omdb_data(record['imdb_id'])
    time.sleep(0.1)  # Avoid rate limiting

    confirmed_movie = bool(payload) and payload.get('Type') == 'movie'
    if confirmed_movie:
        # Only empty cells are filled; an OMDB 'N/A' is stored as-is.
        for column, omdb_key in omdb_field_for.items():
            if omdb_key in payload and pd.isna(record[column]):
                missing_data.at[row_label, column] = payload[omdb_key]
        valid_rows.append(row_label)

# Keep only the rows that were confirmed as movies
missing_data = missing_data.loc[valid_rows]

missing_data.count()

imdb_id              482
tmdb_id              482
title                482
original_title       482
overview             482
imdb_rating          482
imdb_votes           482
genres               482
cast                 482
directed_by          482
written_by           482
produced_by          318
cinematography       203
music_by             144
release_date         482
original_language    482
poster_path          465
dtype: int64

In [50]:
# Updating md with the new data fetched via the OMDB API
# DataFrame.update modifies md in place, aligning on index and column labels
# and taking only the non-NA values from missing_data. Rows that were filtered
# out of missing_data (lookup failed or not a movie) are left unchanged in md.

md.update(missing_data)

In [54]:
# Drop every row that still has a missing value in any of the cast,
# director or writer columns, then show the resulting shape.

required_credit_cols = ['cast', 'directed_by', 'written_by']
md = md.dropna(subset=required_credit_cols)

md.shape

(18790, 17)

In [110]:
#md[(md['cast'] == 'N/A') | (md['directed_by'] == 'N/A') | (md['written_by'] == 'N/A')]

# Replacing 'N/A' values returned by the OMDB API with missing values (pd.NA)
# NOTE(review): this runs after the dropna on these same three columns, so rows
# whose OMDB value was 'N/A' stay in md with missing cast/director/writer values
# — confirm that keeping them in the export is intended.

md.loc[:, ['cast', 'directed_by', 'written_by']] = md[['cast', 'directed_by', 'written_by']].replace('N/A', pd.NA)

In [114]:
# Write the cleaned frame to the CSV used by the recommender system.
# index=False leaves the row index out of the file.

output_path = '../backend/data/good_cinema.csv'
md.to_csv(output_path, index=False)