In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
key_path = "/content/drive/My Drive/colab_secrets/tmdb_api_key.txt"

#read the api key
with open(key_path, "r") as f:
  tmdb_api_key = f.read().strip()

print("API Key loaded successfully!")

API Key loaded successfully!


In [3]:
!pip install tmdbv3api

Collecting tmdbv3api
  Downloading tmdbv3api-1.9.0-py3-none-any.whl.metadata (8.0 kB)
Downloading tmdbv3api-1.9.0-py3-none-any.whl (25 kB)
Installing collected packages: tmdbv3api
Successfully installed tmdbv3api-1.9.0


In [4]:
# Import dependencies
import requests
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load API key from Google Drive secrets file
with open("/content/drive/MyDrive/colab_secrets/tmdb_api_key.txt", "r") as f:
    tmdb_api_key = f.read().strip()

BASE_URL = "https://api.themoviedb.org/3"

# Google Drive path for cache
cache_path = "/content/drive/My Drive/colab_secrets/movies.csv"

# Fetch genre lookup
def get_genre_mapping():
    url = f"{BASE_URL}/genre/movie/list?api_key={tmdb_api_key}&language=en-US"
    response = requests.get(url).json()
    return {g["id"]: g["name"] for g in response["genres"]}

genre_mapping = get_genre_mapping()

# Fetch top movies
def fetch_top_movies(n=1000):
    movies = []
    page = 1
    while len(movies) < n:
        url = f"{BASE_URL}/movie/popular?api_key={tmdb_api_key}&language=en-US&page={page}"
        response = requests.get(url)
        data = response.json()
        if "results" not in data:
            break

        for movie in data["results"]:
            genres = [genre_mapping.get(gid, "Unknown") for gid in movie.get("genre_ids", [])]
            if not genres:  # if empty, add Unknown
                genres = ["Unknown"]

            movies.append({
                "id": movie["id"],
                "title": movie["title"],
                "overview": movie.get("overview", ""),
                "genres": genres,
                "release_date": movie.get("release_date", "")
            })

            if len(movies) >= n:
                break

        page += 1
        if page > data.get("total_pages", page):  # stop if no more pages
            break

    return pd.DataFrame(movies[:n])

# Load from cache if exists, else fetch + save
if os.path.exists(cache_path):
    print("Loading movies from cache...")
    movies_df = pd.read_csv(cache_path)
    # Convert genres back from semicolon string to list
    movies_df["genres"] = movies_df["genres"].apply(
        lambda x: x.split(";") if pd.notna(x) and x != "" else ["Unknown"]
    )
else:
    print("Fetching movies from TMDB API...")
    movies_df = fetch_top_movies(1000)
    # Save genres as semicolon-separated string
    movies_df.to_csv(cache_path, index=False,
                     columns=["id", "title", "overview", "genres", "release_date"])
    # Reload properly
    movies_df = pd.read_csv(cache_path)
    movies_df["genres"] = movies_df["genres"].apply(
        lambda x: x.split(";") if pd.notna(x) and x != "" else ["Unknown"]
    )
    print(f"Saved to cache at {cache_path}")

print("Fetched movies:", movies_df.shape)
movies_df.head()


Loading movies from cache...
Fetched movies: (1000, 5)


Unnamed: 0,id,title,overview,genres,release_date
0,755898,War of the Worlds,Will Radford is a top analyst for Homeland Sec...,"[['Science Fiction', 'Thriller']]",2025-07-29
1,1061474,Superman,"Superman, a journalist in Metropolis, embarks ...","[['Science Fiction', 'Adventure', 'Action']]",2025-07-09
2,575265,Mission: Impossible - The Final Reckoning,Ethan Hunt and team continue their search for ...,"[['Action', 'Adventure', 'Thriller']]",2025-05-17
3,1234821,Jurassic World Rebirth,Five years after the events of Jurassic World ...,"[['Science Fiction', 'Adventure', 'Action']]",2025-07-01
4,1382406,Striking Rescue,A veteran Muay Thai expert goes on a take-no-p...,"[['Action', 'Crime', 'Thriller']]",2024-12-05


In [5]:
# Preprocess text separately for TF-IDF

# Fill missing values to avoid NaN issues
movies_df["overview"] = movies_df["overview"].fillna("")
movies_df["genres"] = movies_df["genres"].apply(lambda g: " ".join(g) if isinstance(g, list) else "")

# TF-IDF for overview
tfidf_overview = TfidfVectorizer(stop_words="english", max_features=5000)
tfidf_overview_matrix = tfidf_overview.fit_transform(movies_df["overview"])

# TF-IDF for genres (smaller vocab, so fewer features)
tfidf_genres = TfidfVectorizer(stop_words="english")
tfidf_genres_matrix = tfidf_genres.fit_transform(movies_df["genres"])

print("Overview TF-IDF shape:", tfidf_overview_matrix.shape)
print("Genres TF-IDF shape:", tfidf_genres_matrix.shape)


Overview TF-IDF shape: (1000, 5000)
Genres TF-IDF shape: (1000, 21)


In [6]:
# Build similarity matrix

from sklearn.metrics.pairwise import linear_kernel
from scipy.sparse import hstack

# Weighted combination of overview & genres
combined_matrix = hstack([0.7 * tfidf_overview_matrix, 0.3 * tfidf_genres_matrix])

# Compute cosine similarity
cosine_sim = linear_kernel(combined_matrix, combined_matrix)

print("Cosine similarity matrix shape:", cosine_sim.shape)

# Helper: Get recommendations
indices = pd.Series(movies_df.index, index=movies_df["title"]).drop_duplicates()

def recommend_movies(title, n=10):
    if title not in indices:
        return f"Movie '{title}' not found in dataset."

    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:n+1]  # skip the movie itself

    movie_indices = [i[0] for i in sim_scores]
    return movies_df.iloc[movie_indices][["title", "genres", "release_date"]]


Cosine similarity matrix shape: (1000, 1000)


In [7]:
# renaming for efficiency
movies_df = movies_df.rename(columns={"id": "movieId"})


In [8]:
# Ensure clean_title is lowercase for matching
movies_df["clean_title"] = movies_df["title"].str.lower().str.strip()


In [9]:
# Download MovieLens 1M dataset
!wget http://files.grouplens.org/datasets/movielens/ml-1m.zip

# Unzip
!unzip -o ml-1m.zip


--2025-08-26 10:44:16--  http://files.grouplens.org/datasets/movielens/ml-1m.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5917549 (5.6M) [application/zip]
Saving to: ‘ml-1m.zip’


2025-08-26 10:44:17 (6.49 MB/s) - ‘ml-1m.zip’ saved [5917549/5917549]

Archive:  ml-1m.zip
   creating: ml-1m/
  inflating: ml-1m/movies.dat        
  inflating: ml-1m/ratings.dat       
  inflating: ml-1m/README            
  inflating: ml-1m/users.dat         


In [10]:
import pandas as pd

# Load ratings
ratings = pd.read_csv("ml-1m/ratings.dat",
                      sep="::",
                      engine="python",
                      names=["userId", "movieId", "rating", "timestamp"],
                      encoding="latin-1")

# Load movies
ml_movies = pd.read_csv("ml-1m/movies.dat",
                        sep="::",
                        engine="python",
                        names=["movieId", "title", "genres"],
                        encoding="latin-1")

print("Ratings shape:", ratings.shape)
print("Movies shape:", ml_movies.shape)

ratings.head(), ml_movies.head()


Ratings shape: (1000209, 4)
Movies shape: (3883, 3)


(   userId  movieId  rating  timestamp
 0       1     1193       5  978300760
 1       1      661       3  978302109
 2       1      914       3  978301968
 3       1     3408       4  978300275
 4       1     2355       5  978824291,
    movieId                               title                        genres
 0        1                    Toy Story (1995)   Animation|Children's|Comedy
 1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
 2        3             Grumpier Old Men (1995)                Comedy|Romance
 3        4            Waiting to Exhale (1995)                  Comedy|Drama
 4        5  Father of the Bride Part II (1995)                        Comedy)

In [11]:
# Extract year from MovieLens titles (e.g., "Toy Story (1995)")
ml_movies["year"] = ml_movies["title"].str.extract(r"\((\d{4})\)").astype(float)
ml_movies["clean_title"] = ml_movies["title"].str.replace(r"\(\d{4}\)", "").str.strip()

# Extract year from TMDB release_date
movies_df["year"] = pd.to_datetime(movies_df["release_date"], errors="coerce").dt.year
movies_df["clean_title"] = movies_df["title"].str.strip()

# Merge on title + year
merged_movies = pd.merge(movies_df, ml_movies,
                         left_on=["clean_title", "year"],
                         right_on=["clean_title", "year"],
                         how="inner")

print("Merged movies:", merged_movies.shape)
merged_movies.head()


Merged movies: (0, 10)


Unnamed: 0,movieId_x,title_x,overview,genres_x,release_date,clean_title,year,movieId_y,title_y,genres_y


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Hybrid Recommendation Function

# Make sure ratings DataFrame is available
ml_ratings = ratings.copy()

# Collaborative signal: average rating
avg_ratings = ml_ratings.groupby("movieId")["rating"].mean()

def hybrid_recommendations(title, top_n=10, content_weight=0.7, rating_weight=0.3):
    # Content-based similarity
    idx = movies_df[movies_df["title"].str.lower() == title.lower()].index
    if len(idx) == 0:
        return f"Movie '{title}' not found in dataset."

    idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_indices = [i for i, _ in sim_scores[1: top_n+50]]  # take more to filter later

    candidates = movies_df.iloc[sim_indices].copy()

    # Collaborative filtering signal
    avg_ratings = ratings.groupby("movieId")["rating"].mean()

    # Merge with ratings
    candidates = candidates.merge(
        ml_movies, left_on="title", right_on="title", how="left"
    )
    candidates = candidates.merge(
        avg_ratings, left_on="movieId", right_index=True, how="left"
    )

    candidates["rating"].fillna(candidates["rating"].mean(), inplace=True)

    #  Final Score
    candidates["final_score"] = (
        content_weight * sim_scores[1: top_n+50][0][1] +
        rating_weight * candidates["rating"]
    )

    # Pick top_n
    candidates = candidates.sort_values("final_score", ascending=False).head(top_n)

    return candidates[["title", "genres", "release_date", "final_score"]]


In [13]:
# --- Fuzzy title resolver ---
import difflib

# Build a lowercase index of titles once
_title_lookup = movies_df['title'].dropna().astype(str)
_lower_to_title = {t.lower(): t for t in _title_lookup.unique()}
_all_lower_titles = list(_lower_to_title.keys())

def resolve_title(query, n_suggestions=5, cutoff=0.55):
    """
    Return (best_title, suggestions_list).
    - If exact (case-insensitive) match exists: best_title is that title, suggestions empty.
    - Else try fuzzy matches; best_title is the top match; suggestions are the top N matches.
    - If nothing clears the cutoff: best_title=None, suggestions=[]
    """
    q = (query or "").strip().lower()
    if not q:
        return None, []

    # exact case-insensitive hit
    if q in _lower_to_title:
        return _lower_to_title[q], []

    # fuzzy candidates
    matches = difflib.get_close_matches(q, _all_lower_titles, n=n_suggestions, cutoff=cutoff)
    if matches:
        suggestions = [_lower_to_title[m] for m in matches]
        return suggestions[0], suggestions  # best + list
    return None, []


In [14]:
def hybrid_recommendations_fuzzy(query_title, top_n=10, content_weight=0.7, rating_weight=0.3):
    best_title, suggestions = resolve_title(query_title)

    if best_title is None:
        return f"Sorry, couldn’t find '{query_title}'. Try another title."

    return hybrid_recommendations(best_title, top_n=top_n,
                                  content_weight=content_weight,
                                  rating_weight=rating_weight)


In [18]:
from difflib import get_close_matches

def hybrid_recommendations(title, top_n=10, content_weight=0.7, rating_weight=0.3):
    title = title.lower().strip()

    # Try exact/partial match
    matches = movies_df[movies_df["clean_title"].str.contains(title, na=False)]

    # If still empty, use fuzzy matching
    if matches.empty:
        close = get_close_matches(title, movies_df["clean_title"], n=3, cutoff=0.6)
        if close:
            matches = movies_df[movies_df["clean_title"].isin(close)]

    if matches.empty:
        print(f"Movie '{title}' not found in dataset.")
        return

    # Take the first match
    movie_id = matches.iloc[0]["movieId"]
    movie_title = matches.iloc[0]["title"]
    print(f"\nShowing results for: {movie_title}\n")

    # Get index in similarity matrix
    idx = movies_df.index[movies_df["movieId"] == movie_id][0]

    # Content similarity
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n+20]

    # Candidate movies
    movie_indices = [i[0] for i in sim_scores]
    candidates = movies_df.iloc[movie_indices].copy()
    candidates["similarity"] = [i[1] for i in sim_scores]

    # Merge with ratings
    candidates = candidates.merge(avg_ratings, on="movieId", how="left")
    candidates["rating"].fillna(candidates["rating"].mean(), inplace=True)

    # Hybrid score
    candidates["score"] = (content_weight * candidates["similarity"] +
                           rating_weight * (candidates["rating"] / 5))

    # Sort & select
    candidates = candidates.sort_values("score", ascending=False).head(top_n)

    return candidates[["title", "genres", "overview", "year", "score"]]


In [16]:
!pip install fuzzywuzzy[speedup]


Collecting fuzzywuzzy[speedup]
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-levenshtein>=0.12 (from fuzzywuzzy[speedup])
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.1->python-levenshtein>=0.12->fuzzywuzzy[speedup])
  Downloading rapidfuzz-3.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (159 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m159.9/159.9 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Do

In [19]:
from fuzzywuzzy import process

def interactive_recommender():
    while True:
        user_input = input("Enter a movie title (or 'quit' to stop): ")
        if user_input.lower() == "quit":
            break

        # Get top 5 matches using fuzzy matching
        matches = process.extract(user_input, movies_df["clean_title"], limit=5)

        if not matches:
            print(f"Movie '{user_input}' not found in dataset.\n")
            continue

        # Show choices
        print("\nDid you mean:")
        for i, (match, score, _) in enumerate(matches, start=1):
            year = movies_df[movies_df["clean_title"] == match]["year"].values
            year = int(year[0]) if len(year) > 0 else "Unknown"
            print(f"{i}. {match} ({year}) [Score: {score}]")


        try:
            choice = int(input("\nEnter the number of the correct movie (0 to cancel): "))
        except ValueError:
            print("Invalid input, try again.\n")
            continue

        if choice == 0:
            print("Cancelled.\n")
            continue
        elif 1 <= choice <= len(matches):
            selected_title = matches[choice - 1][0]
            print(f"\nShowing recommendations for: {selected_title}\n")

            #get recommendations
            results = hybrid_recommendations(selected_title)

            if "id" in results.columns:
                results = results.drop_duplicates(subset="id")
            else:
                results = results.drop_duplicates(subset="title")

            print(results)
        else:
            print("Invalid choice, try again.\n")
interactive_recommender()

Enter a movie title (or 'quit' to stop): avatar

Did you mean:
1. Avatar (2009) [Score: 100]
2. Avatar: The Way of Water (2022) [Score: 90]
3. Norma: Antara Mertua dan Menantu (2025) [Score: 60]
4. Norma: Antara Mertua dan Menantu (2025) [Score: 60]
5. A Star Is Born (2018) [Score: 60]

Enter the number of the correct movie (0 to cancel): 1


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  candidates["rating"].fillna(candidates["rating"].mean(), inplace=True)



Showing recommendations for: Avatar


Showing results for: Avatar

                              title  \
21                         Predator   
1                        The Matrix   
2    Guardians of the Galaxy Vol. 3   
3                        Iron Man 3   
6                    Thor: Ragnarok   
8                       Pacific Rim   
9   Godzilla x Kong: The New Empire   

                                               genres  \
21  ['Science Fiction', 'Action', 'Adventure', 'Th...   
1                       ['Action', 'Science Fiction']   
2   ['Science Fiction', 'Adventure', 'Action', 'Co...   
3          ['Action', 'Adventure', 'Science Fiction']   
6          ['Action', 'Adventure', 'Science Fiction']   
8          ['Action', 'Science Fiction', 'Adventure']   
9          ['Action', 'Adventure', 'Science Fiction']   

                                             overview    year     score  
21  A team of elite commandos on a secret mission ...  1987.0  0.278101  
1   Set in the