## Cell: 1 importing requirements

In [1]:
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import requests
from IPython.display import display, HTML
import time

## Cell 2: loading and cleaning dataset

In [2]:
df = pd.read_csv('dataset/movies_metadata.csv', low_memory=False)
df['id'] = pd.to_numeric(df['id'], errors='coerce')
df.dropna(subset=['id'], inplace=True)
df['id'] = df['id'].astype(int)

df['vote_average'] = pd.to_numeric(df['vote_average'], errors='coerce')
df['vote_count'] = pd.to_numeric(df['vote_count'], errors='coerce')

df.drop_duplicates(subset=['id'], inplace=True)
df = df[['id', 'title', 'overview', 'genres', 'imdb_id', 
         'release_date', 'vote_average', 'vote_count']]

print(f"Loaded main metadata for {df.shape[0]} movies.")

Loaded main metadata for 45433 movies.


## Cell 3: Genre cleaning

In [3]:
def parse_genres(text):
    if pd.isna(text):
        return []
    try:
        L = []
        for i in ast.literal_eval(text):
            L.append(i['name'])
        return L
    except Exception as e:
        return []
df['genres_list'] = df['genres'].apply(parse_genres)

## Cell 4: Applying imdb's weighted rating formula

In [4]:
C = df['vote_average'].mean()
m = df['vote_count'].quantile(0.90) 
def weighted_rating(x, m=m, C=C):
    v = x['vote_count']
    R = x['vote_average']
    if v is None or R is None:
        return C

    return (v / (v + m)) * R + (m / (v + m)) * C

df['weighted_rating'] = df.apply(weighted_rating, axis=1)

## Cell 5:  genre rec function

In [5]:
def get_genre_recommendations(genre_name, top_n=10):
    def contains_genre(genre_list):
        return genre_name in genre_list

    genre_movies = df[df['genres_list'].apply(contains_genre)]
    
    if genre_movies.empty:
        print(f"No movies found for the genre: {genre_name}")
        return
    top_movies = genre_movies.sort_values('weighted_rating', ascending=False)
    return top_movies[['title', 'release_date', 'vote_average', 'weighted_rating']].head(top_n)

#### Cell 6: Testing

In [6]:
print("--- Top 10 Animation Movies (includes Anime) ---")
display(get_genre_recommendations('Animation'))

print("\n--- Top 10 'War' Movies ---")
display(get_genre_recommendations('War'))

print("\n--- Top 10 'Drama' Movies (includes Indian cinema) ---")
display(get_genre_recommendations('Drama'))

--- Top 10 Animation Movies (includes Anime) ---


Unnamed: 0,title,release_date,vote_average,weighted_rating
5481,Spirited Away,2001-07-20,8.3,8.196059
40251,Your Name.,2016-08-26,8.5,8.112548
9698,Howl's Moving Castle,2004-11-19,8.2,8.013007
2884,Princess Mononoke,1997-07-12,8.2,8.012327
359,The Lion King,1994-06-23,8.0,7.932911
30315,Inside Out,2015-06-09,7.9,7.847069
5553,Grave of the Fireflies,1988-04-16,8.2,7.835743
5833,My Neighbor Totoro,1988-04-16,8.0,7.798377
13724,Up,2009-05-13,7.8,7.751572
12704,WALL·E,2008-06-22,7.8,7.747103



--- Top 10 'War' Movies ---


Unnamed: 0,title,release_date,vote_average,weighted_rating
522,Schindler's List,1993-11-29,8.3,8.206643
24860,The Imitation Game,2014-11-14,8.0,7.937066
5857,The Pianist,2002-09-24,8.1,7.909743
13605,Inglourious Basterds,2009-08-18,7.9,7.84598
5553,Grave of the Fireflies,1988-04-16,8.2,7.835743
1165,Apocalypse Now,1979-08-15,8.0,7.832277
1919,Saving Private Ryan,1998-07-24,7.9,7.831223
1179,Full Metal Jacket,1987-06-26,7.9,7.767489
732,Dr. Strangelove or: How I Learned to Stop Worr...,1964-01-29,8.0,7.766503
43190,Band of Brothers,2001-09-09,8.2,7.733257



--- Top 10 'Drama' Movies (includes Indian cinema) ---


Unnamed: 0,title,release_date,vote_average,weighted_rating
314,The Shawshank Redemption,1994-09-23,8.5,8.445871
834,The Godfather,1972-03-14,8.5,8.425442
10309,Dilwale Dulhania Le Jayenge,1995-10-20,9.1,8.421477
12481,The Dark Knight,2008-07-16,8.3,8.265479
2843,Fight Club,1999-10-15,8.3,8.256387
522,Schindler's List,1993-11-29,8.3,8.206643
23673,Whiplash,2014-10-10,8.3,8.205408
2211,Life Is Beautiful,1997-12-20,8.3,8.187177
1178,The Godfather: Part II,1974-12-20,8.3,8.180082
1152,One Flew Over the Cuckoo's Nest,1975-11-18,8.3,8.164262


## Cell 7 credits and keywords merging

In [7]:
try:
    credits = pd.read_csv('dataset/credits.csv')
    credits['id'] = pd.to_numeric(credits['id'], errors='coerce')
    credits.dropna(subset=['id'], inplace=True)
    credits['id'] = credits['id'].astype(int)
except Exception as e:
    print(f"Error loading credits.csv: {e}")
try:
    keywords = pd.read_csv('dataset/keywords.csv')
    keywords['id'] = pd.to_numeric(keywords['id'], errors='coerce')
    keywords.dropna(subset=['id'], inplace=True)
    keywords['id'] = keywords['id'].astype(int)
except Exception as e:
    print(f"Error loading keywords.csv: {e}")
df_content = df.merge(credits, on='id').merge(keywords, on='id')
print(f"Created content based dataset with {df_content.shape[0]} movies.")

Created content based dataset with 46496 movies.


## Cell 8: cleaning cast and keywords

In [8]:
def get_top_3_cast(text):
    if pd.isna(text):
        return []
    try:
        L = []
        counter = 0
        for i in ast.literal_eval(text):
            if counter < 3:
                L.append(i['name'].replace(" ",""))
                counter += 1
            else:
                break
        return L
    except Exception as e:
        return []
def get_all_keywords(text):
    if pd.isna(text):
        return []
    try:
        L = []
        for i in ast.literal_eval(text):
            L.append(i['name'].replace(" ",""))
        return L
    except Exception as e:
        return []
df_content['cast_list'] = df_content['cast'].apply(get_top_3_cast)
df_content['keywords_list'] = df_content['keywords'].apply(get_all_keywords)
df_content['genres_list'] = df_content['genres_list'].apply(lambda x: [i.replace(" ","") for i in x])

## Cell 9: creating combination function

In [9]:
df_content['overview'] = df_content['overview'].fillna('').apply(lambda x: x.split())

df_content['combined_features'] = (df_content['genres_list'] * 3) + \
                                  (df_content['cast_list'] * 3) + \
                                  (df_content['keywords_list'] * 2) + \
                                  df_content['overview']

                                  
df_content['combined_string'] = df_content['combined_features'].apply(lambda x: " ".join(x).lower())

new_df = df_content[['id', 'title', 'combined_string', 'imdb_id']].copy()

## Cell 10: vectorizing

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
print("Starting vectorization...")
tfidf = TfidfVectorizer(max_features=10000, stop_words='english')
feature_matrix = tfidf.fit_transform(new_df['combined_string'])
new_df = new_df.reset_index()
indices = pd.Series(new_df.index, index=new_df['title']).drop_duplicates()

Starting vectorization...


## Cell 11: getting movie posters

In [11]:
def fetch_poster(movie_id):
    api_key = "64b7e3b5f132d55aa10fc2a4ad36ffc6"
    base_url = "https://image.tmdb.org/t/p/w500"
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}&language=en-US"
    placeholder = "https://via.placeholder.com/500x750.png?text=Poster+Not+Found"
    imdb_id = None
    
    for attempt in range(3):
        try:
            data = requests.get(url, timeout=5).json()
            
            if data.get('success') == False:
                print(f"  >! ERROR: TMDb API call failed. Message: {data.get('status_message')}")
                return placeholder, None

            imdb_id = data.get('imdb_id')
            poster_path = data.get('poster_path') 
            
            if poster_path:
                full_path = base_url + poster_path 
                return full_path, imdb_id
            else:
                return placeholder, imdb_id

        except requests.exceptions.ConnectionError as e:
            print(f"  >! attempt {attempt+1}: Network error ({e}). Retrying...")
            time.sleep(1)
        except Exception as e:
            print(f"  >! attempt {attempt+1}: An exception occurred: {e}. Retrying...")
            time.sleep(1)
    
    print(f"  >! All attempts failed for movie ID {movie_id}.")
    return placeholder, None

## Cell 12: Alternative poster finder

In [17]:
def fetch_poster_omdb(imdb_id):
    if not imdb_id or imdb_id == 'N/A':
        return "https://via.placeholder.com/500x750.png?text=Poster+Not+Found"
        
    api_key = "ed8618a4"
    url = f"http://www.omdbapi.com/?i={imdb_id}&apikey={api_key}"
    placeholder = "https://via.placeholder.com/500x750.png?text=Poster+Not+Found"

    for attempt in range(3):
        try:
            data = requests.get(url, timeout=5).json()
            poster_url = data.get('Poster')
            
            if poster_url and poster_url != "N/A":
                return poster_url
            else:
                return placeholder
                
        except Exception as e:
            print(f"  >! OMDb fallback failed: {e}. Retrying...")
            time.sleep(1)
    
    return placeholder

In [18]:
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

print("Creating a persistent and resilient network session...")

retry_strategy = Retry(
    total=3,
    status_forcelist=[429, 500, 502, 503, 504],
    backoff_factor=1
)

adapter = HTTPAdapter(max_retries=retry_strategy)
http_session = requests.Session()
http_session.mount("http://", adapter)
http_session.mount("https://", adapter)

print("Network session created.")

Creating a persistent and resilient network session...
Network session created.


## Cell 13 genre ids

In [19]:
api_key = "64b7e3b5f132d55aa10fc2a4ad36ffc6"
url = f"https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&language=en-US"
genre_map = {}

try:
    data = http_session.get(url, timeout=5).json()
    for genre in data['genres']:
        genre_map[genre['name']] = genre['id']
    print("TMDb Genre Map Loaded Successfully.")
except Exception as e:
    print(f"Error fetching genre map: {e}")
    print("Using a fallback map.")
    genre_map = {
        'Action': 28, 'Adventure': 12, 'Animation': 16, 'Comedy': 35, 'Crime': 80,
        'Documentary': 99, 'Drama': 18, 'Family': 10751, 'Fantasy': 14, 'History': 36,
        'Horror': 27, 'Music': 10402, 'Mystery': 9648, 'Romance': 10749,
        'Science Fiction': 878, 'TV Movie': 10770, 'Thriller': 53, 'War': 10752, 'Western': 37
    }

TMDb Genre Map Loaded Successfully.


### api recs

In [23]:
def get_api_recs(genre_ids, api_key, http_session, num_recs=5):
    api_recs = []
    if not genre_ids: 
        return []
    
    genre_id_str = ",".join(map(str, genre_ids))
    url = (
        f"https://api.themoviedb.org/3/discover/movie?api_key={api_key}"
        f"&with_genres={genre_id_str}&primary_release_date.gte=2018-01-01"
        f"&sort_by=popularity.desc&page=1"
    )
    
    try:
        data = http_session.get(url, timeout=5).json()
        for movie in data.get('results', []):
            if len(api_recs) < num_recs:
                api_recs.append({
                    'title': movie['title'],
                    'id': movie['id'],
                    'imdb_id': None 
                })
    except Exception as e:
        print(f"  >! API rec fetch failed: {e}")
    return api_recs

def get_brain_recs(idx, actual_title, new_df, feature_matrix):
    brain_recs = []
    seen = {actual_title}
    sims_arr = cosine_similarity(feature_matrix[idx], feature_matrix)
    sims = list(enumerate(sims_arr[0]))
    sims = sorted(sims, key=lambda x: x[1], reverse=True)

    for i in sims[1:]:
        if len(brain_recs) >= 5:
            break
        rec = new_df.iloc[i[0]]
        title = rec['title']
        if title not in seen:
            brain_recs.append({
                'title': title,
                'id': rec['id'],
                'imdb_id': rec['imdb_id']
            })
            seen.add(title)
    return brain_recs

def get_recs(search_title):
    print(f"\n--- Searching for: '{search_title}' ---")
    
    search_url = (
        f"https://api.themoviedb.org/3/search/movie?api_key={api_key}"
        f"&query={search_title}"
    )
    all_recs = []
    actual_title = search_title

    try:
        data = http_session.get(search_url, timeout=5).json()
        if not data.get('results'):
            print(f"No API results for '{search_title}'.")
            return [], actual_title
        
        movie = data['results'][0]
        actual_title = movie['title']
        api_id = movie['id']
        api_genre_ids = movie['genre_ids']
        
        print(f"Found '{actual_title}' on TMDb (ID: {api_id}).")

        matches = new_df[new_df['id'] == api_id]
        
        if not matches.empty:
            idx = matches.iloc[0]['index'] 
            print(f"Movie is in our dataset. Getting 5 brain recs...")
            all_recs.extend(get_brain_recs(idx, actual_title, new_df, feature_matrix))
            print(f"Getting 5 new API recs...")
            all_recs.extend(get_api_recs(api_genre_ids, api_key, http_session, num_recs=5))
        else:
            print(f"Movie is not in our dataset. Getting 10 new API recs...")
            all_recs.extend(get_api_recs(api_genre_ids, api_key, http_session, num_recs=10))

    except Exception as e:
        print(f"  >! API search failed: {e}")

    return all_recs, actual_title

In [25]:
movie_name = "the dark knight"

recommendations, found_title = get_recs(movie_name)

names = []
posters = []
placeholder = "https://via.placeholder.com/500x750.png?text=Poster+Not+Found"

print(f"\n--- Fetching {len(recommendations)} posters... ---")
for movie in recommendations:
    names.append(movie['title'])
    
    poster_url, fetched_imdb_id = fetch_poster(movie['id'])
    
    final_imdb_id = fetched_imdb_id if fetched_imdb_id else movie['imdb_id']

    if poster_url == placeholder and final_imdb_id:
        poster_url = fetch_poster_omdb(final_imdb_id)
        
    posters.append(poster_url)
print("--- Poster fetching complete. ---")


html_output = f"<h2>Recommendations for: {found_title}</h2><div style='display: flex; flex-flow: row wrap; justify-content: center;'>"
for name, poster in zip(names, posters):
    html_output += f"""
    <div style='margin: 10px; text-align: center; width: 150px;'>
        <img src="{poster}" width="150">
        <p style='font-size: 14px; font-weight: bold;'>{name}</p>
    </div>
    """
html_output += "</div>"
display(HTML(html_output))


--- Searching for: 'the dark knight' ---
Found 'The Dark Knight' on TMDb (ID: 155).
Movie is in our dataset. Getting 5 brain recs...
Getting 5 new API recs...

--- Fetching 10 posters... ---
  >! attempt 1: Network error (('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))). Retrying...
  >! attempt 1: Network error (('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))). Retrying...
  >! attempt 2: Network error (('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))). Retrying...
  >! attempt 3: Network error (('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))). Retrying...
  >! All attempts failed for movie ID 342917.
--- Poster fetching complete. ---


In [27]:
movie_name = "oppenheimer"

# --- 1. Get the list of 10 recommendations ---
recommendations, found_title = get_recs(movie_name)

# --- 2. Fetch posters for all 10 movies ---
names = []
posters = []
placeholder = "https://via.placeholder.com/500x750.png?text=Poster+Not+Found"

print(f"\n--- Fetching {len(recommendations)} posters... ---")
for movie in recommendations:
    names.append(movie['title'])
    
    poster_url, fetched_imdb_id = fetch_poster(movie['id'])
    
    final_imdb_id = fetched_imdb_id if fetched_imdb_id else movie['imdb_id']

    if poster_url == placeholder and final_imdb_id:
        poster_url = fetch_poster_omdb(final_imdb_id)
        
    posters.append(poster_url)
print("--- Poster fetching complete. ---")

# --- 3. Display the final HTML grid ---
html_output = f"<h2>Recommendations for: {found_title}</h2><div style='display: flex; flex-flow: row wrap; justify-content: center;'>"
for name, poster in zip(names, posters):
    html_output += f"""
    <div style='margin: 10px; text-align: center; width: 150px;'>
        <img src="{poster}" width="150">
        <p style='font-size: 14px; font-weight: bold;'>{name}</p>
    </div>
    """
html_output += "</div>"
display(HTML(html_output))


--- Searching for: 'oppenheimer' ---
Found 'Oppenheimer' on TMDb (ID: 872585).
Movie is not in our dataset. Getting 10 new API recs...

--- Fetching 10 posters... ---
  >! attempt 1: Network error (('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))). Retrying...
  >! attempt 1: Network error (('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))). Retrying...
  >! attempt 2: Network error (('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))). Retrying...
  >! attempt 3: Network error (('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))). Retrying...
  >! All attempts failed for movie ID 1185528.
  >! attempt 1: Network error (('Connection aborted.', Co