# Movie Recommendation System 



## 1. Import libraries

In [None]:
#system handling
import ast, re, os, pickle
import time

#data handling
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from collections import Counter

#machine learning handling 
import sklearn.preprocessing 
from sklearn.metrics.pairwise import linear_kernel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import normalize

# Print the full path of every file below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# Suppress every Python warning for the remainder of the session.
import warnings
warnings.filterwarnings('ignore')
print("Libraries imported successfully.")

Libraries imported ✅


## 2. Load dataset
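We load the TMDB movies dataset from the CSV file referenced by `csv_path` (adjust the path to match your environment, e.g. the Kaggle input folder or Colab's `/content`).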

In [None]:
# Location of the TMDB CSV export.
csv_path = '/content/TMDB_movie_dataset_v11.csv'

# Malformed lines are skipped instead of aborting the whole load.
df_big = pd.read_csv(
    csv_path,
    engine='python',
    on_bad_lines='skip',
)
print("Dataset loaded with shape:", df_big.shape)

Dataset loaded with shape: (1307623, 24)


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,Inception,"Cobb, a skilled thief who commits corporate es...",83.952,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc..."
1,157336,Interstellar,8.417,32571,Released,2014-11-05,701729206,169,False,/pbrkL804c8yAv3zBZR4QPEafpAR.jpg,...,Interstellar,The adventures of a group of explorers who mak...,140.241,/gEU2QniE6E77NI6lCU6MxlNBvIx.jpg,Mankind was born on Earth. It was never meant ...,"Adventure, Drama, Science Fiction","Legendary Pictures, Syncopy, Lynda Obst Produc...","United Kingdom, United States of America",English,"rescue, future, spacecraft, race against time,..."


## 3. Data Cleaning
- Drop duplicates
- Select important columns
- Fill missing values

In [None]:
# Remove fully identical rows and renumber the index from zero.
df_big = df_big.drop_duplicates().reset_index(drop=True)

# Columns worth keeping; any that this export does not contain are skipped.
keep_cols = ['id','title','original_title','overview','genres','keywords','cast','crew',
             'release_date','runtime','vote_average','vote_count','popularity']
df_big = df_big[[col for col in keep_cols if col in df_big.columns]]

# Missing text becomes an empty string so later string handling never sees NaN.
text_cols = ['title','overview','genres','keywords','cast','crew']
df_big = df_big.fillna({col: '' for col in text_cols if col in df_big.columns})

print("Remaining columns:", df_big.columns.tolist())

# Continue with a reproducible 2% random sample, re-indexed from zero.
df = df_big.sample(frac=0.02, random_state=42).reset_index(drop=True)
print(df.shape)

df.head(2)

Remaining columns: ['id', 'title', 'original_title', 'overview', 'genres', 'keywords', 'release_date', 'runtime', 'vote_average', 'vote_count', 'popularity']


Unnamed: 0,id,title,original_title,overview,genres,keywords,release_date,runtime,vote_average,vote_count,popularity
0,27205,Inception,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",2010-07-15,148,8.364,34495,83.952
1,157336,Interstellar,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction","rescue, future, spacecraft, race against time,...",2014-11-05,169,8.417,32571,140.241


## Inspect the important columns

In [6]:
# Distinct raw values of the genres column.
df.loc[:, 'genres'].unique()


array(['Action, Science Fiction, Adventure',
       'Adventure, Drama, Science Fiction',
       'Drama, Action, Crime, Thriller', ...,
       'Crime, Comedy, Fantasy, Horror',
       'Animation, Drama, Thriller, War', 'History, Drama, TV Movie, War'],
      dtype=object)

In [7]:
# First five overview texts.
df['overview'].iloc[:5]


0    Cobb, a skilled thief who commits corporate es...
1    The adventures of a group of explorers who mak...
2    Batman raises the stakes in his war on crime. ...
3    In the 22nd century, a paraplegic Marine is di...
4    When an unexpected enemy emerges and threatens...
Name: overview, dtype: object

In [8]:
# First five keyword strings.
df['keywords'].iloc[:5]

0    rescue, mission, dream, airplane, paris, franc...
1    rescue, future, spacecraft, race against time,...
2    joker, sadism, chaos, secret identity, crime f...
3    future, society, culture clash, space travel, ...
4    new york city, superhero, shield, based on com...
Name: keywords, dtype: object

## 4. Feature Engineering
Convert `genres`, `keywords`, `cast`, and `crew` (whichever of these columns are present in the dataset)
into plain text and combine them with the overview.

In [None]:
def extract_names(x):
    """Flatten a metadata field into a plain, space-separated string.

    Accepted inputs:
      * a string holding a Python-literal list, e.g. "[{'id': 1, 'name': 'Action'}]"
        or "['rescue', 'dream']" -> the names / strings joined by spaces
      * a string holding a Python-literal dict -> its str/int values joined by spaces
      * an already-parsed list or dict -> handled like the parsed forms above
      * any other value (e.g. "Action, Drama") -> str(x) with brackets, braces
        and quote characters replaced by spaces
      * None, NaN or '' -> ''

    Returns:
      str
    """
    if isinstance(x, (list, dict)):
        # Already parsed. pd.isna() on a multi-element list returns an array
        # whose truth value is ambiguous, so containers bypass the scalar check.
        parsed = x
    else:
        if pd.isna(x) or x == '':
            return ''
        try:
            parsed = ast.literal_eval(x)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (the usual case for comma-separated text).
            parsed = None

    if isinstance(parsed, list):
        names = []
        for item in parsed:
            if isinstance(item, dict):
                name = item.get('name')
            elif isinstance(item, str):
                # Lists of plain strings used to be dropped entirely.
                name = item
            else:
                continue
            if name:
                names.append(str(name))
        return ' '.join(names)

    if isinstance(parsed, dict):
        return ' '.join(str(v) for v in parsed.values() if isinstance(v, (str, int)))

    # Fallback: keep the raw text, stripping only list/dict punctuation.
    return re.sub(r'[\[\]\{\}\"\']', ' ', str(x))

# Add a "<column>_clean" text version of each metadata column that exists.
for col in ('genres', 'keywords', 'cast', 'crew'):
    if col not in df.columns:
        continue
    df[col + '_clean'] = df[col].apply(extract_names)

# Text fields, in order, that make up the per-movie "soup".
components = [
    field
    for field in ['overview', 'genres_clean', 'keywords_clean', 'cast_clean', 'crew_clean']
    if field in df.columns
]

def create_soup(row):
    """Join the row's values for every field in `components` with single spaces."""
    parts = []
    for field in components:
        parts.append(str(row.get(field, '')))
    return ' '.join(parts)

df['soup'] = df.apply(create_soup, axis=1)

df[['title','soup']].head(3)



df['soup'] = df.apply(create_soup, axis=1)

Unnamed: 0,title,soup
0,Inception,"Cobb, a skilled thief who commits corporate es..."
1,Interstellar,The adventures of a group of explorers who mak...
2,The Dark Knight,Batman raises the stakes in his war on crime. ...


## 5. TF-IDF Vectorization
Transform text `soup` into numerical vectors using **TF-IDF**.

In [10]:
# Vectorise every soup: English stop words removed, vocabulary capped at 50,000 terms.
tfidf = TfidfVectorizer(
    max_features=50000,
    stop_words='english',
)
soup_text = df['soup']
tfidf_matrix = tfidf.fit_transform(soup_text)
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (1307246, 50000)


## 6. Similarity search with `linear_kernel`


In [11]:
def recommend(movie_index, tfidf_matrix, top_n=10):
    """Return the row positions of the `top_n` movies most similar to one movie.

    Similarity is the dot product between the query row and every row of
    `tfidf_matrix` (for L2-normalised TF-IDF rows this is cosine similarity).

    Parameters:
      movie_index: integer row position of the query movie in `tfidf_matrix`
      tfidf_matrix: TF-IDF matrix (n_samples x n_features)
      top_n: maximum number of neighbours to return

    Returns:
      1-D numpy array of row positions, most similar first. The query row is
      never included, and at most n_samples - 1 positions are returned.
    """
    sim = linear_kernel(tfidf_matrix[movie_index], tfidf_matrix).flatten()

    # Normalise a negative position so it can be compared with argsort output.
    query_pos = int(movie_index) % sim.shape[0]

    # Stable descending order: ties are resolved by ascending row position.
    order = np.argsort(-sim, kind="stable")

    # Drop the query explicitly instead of assuming it is the single top score;
    # duplicates or an all-zero query vector break that assumption.
    order = order[order != query_pos]
    return order[:max(top_n, 0)]


In [12]:
# Query the first row and print the positions of its ten nearest neighbours.
movie_index = 0
indices = recommend(movie_index, tfidf_matrix, 10)
print("Top recommended indices:", indices)


Top recommended indices: [1227335  482619  586849  742092 1177785  427460  866626 1305783 1167741
  841625]


# K-Means and Hierarchical Clustering

In [None]:
def fit_and_save_clusters(df,
                          tfidf_matrix,
                          output_prefix="movie_cluster",
                          sample_frac=0.25,
                          n_svd=100,
                          k_for_kmeans=20,
                          n_clusters_hier=10,
                          random_state=42,
                          max_silhouette_samples=5000):
    """
    Reduce a TF-IDF matrix with TruncatedSVD, cluster the result with KMeans
    and AgglomerativeClustering, and save the models plus a labelled copy of `df`.

    Parameters:
      df: pandas DataFrame whose rows are in the same order as `tfidf_matrix`;
          the caller's frame is not modified (labels are added to a copy)
      tfidf_matrix: sparse TF-IDF matrix (n_samples x n_features)
      output_prefix: prefix for every file written (pickles and CSV)
      sample_frac: fraction of rows, 0 < sample_frac <= 1, sampled for the
          silhouette score; the sample is additionally capped by
          `max_silhouette_samples`
      n_svd: number of SVD components to keep
      k_for_kmeans: number of clusters for KMeans (a single integer)
      n_clusters_hier: number of clusters for AgglomerativeClustering
      random_state: seed for SVD, KMeans and the row sample
      max_silhouette_samples: upper bound on the rows used for silhouette_score
    Returns:
      dict with the written file paths ('kmeans_path', 'svd_path', 'agg_path',
      'df_csv', 'df_pickle') and the fitted estimators ('kmeans', 'svd', 'agg')
    Raises:
      ValueError: if `df` and `tfidf_matrix` have different row counts, or
          `sample_frac` is outside (0, 1]
    """

    n_rows = tfidf_matrix.shape[0]

    # Validate before any expensive fitting or file writes.
    if len(df) != n_rows:
        raise ValueError(
            f"df has {len(df)} rows but tfidf_matrix has {n_rows} rows; "
            "they must describe the same movies in the same order."
        )
    if not 0 < sample_frac <= 1:
        raise ValueError(f"sample_frac must be in (0, 1], got {sample_frac!r}")

    print(f"Data rows: {n_rows}. TruncatedSVD -> {n_svd} components.")

    # 1) Truncated SVD (LSA) to reduce dimensionality (works on sparse input)
    svd = TruncatedSVD(n_components=n_svd, random_state=random_state)
    start = time.time()
    X_reduced = svd.fit_transform(tfidf_matrix)           # dense (n x n_svd)
    elapsed = time.time() - start
    print(f"TruncatedSVD done in {elapsed:.1f}s. Result shape: {X_reduced.shape}")

    # Normalize each reduced row before clustering.
    X_reduced = normalize(X_reduced)

    # 2) Fit KMeans
    print(f"Fitting KMeans with k = {k_for_kmeans} ...")
    kmeans = KMeans(n_clusters=k_for_kmeans, random_state=random_state, n_init=10)
    start = time.time()
    kmeans.fit(X_reduced)
    elapsed = time.time() - start
    print(f"KMeans fitted in {elapsed:.1f}s.")

    # 3) Silhouette score on a row sample: sample_frac of the rows, capped at
    #    max_silhouette_samples (and never more than n_rows).
    n_sample = min(max_silhouette_samples, max(1, int(n_rows * sample_frac)), n_rows)
    if n_rows > n_sample:
        rng = np.random.default_rng(seed=random_state)
        sample_idx = rng.choice(n_rows, size=n_sample, replace=False)
    else:
        sample_idx = np.arange(n_rows)

    # Best effort: the score is informational only, so a failure is reported
    # and the run continues.
    try:
        sil = silhouette_score(X_reduced[sample_idx], kmeans.labels_[sample_idx])
        print(f"KMeans silhouette (sample of {len(sample_idx)}): {sil:.4f}")
    except Exception as e:
        print("Could not compute silhouette:", e)

    # 4) Agglomerative (hierarchical) clustering
    # NOTE: Agglomerative can be expensive O(n^2) in memory/time, so large
    # inputs are clustered on the sample selected above.
    run_hier_on_full = (n_rows <= 5000)  # conservative default
    if not run_hier_on_full:
        print("Dataset is large — running Agglomerative on a sample to avoid O(n^2) costs.")
        hier_idx = sample_idx  # reuse the sample selected above
        X_hier = X_reduced[hier_idx]
        hier_prefix = f"{output_prefix}_hier_sample"
    else:
        hier_idx = np.arange(n_rows)
        X_hier = X_reduced
        hier_prefix = f"{output_prefix}_hier"

    print(f"Fitting AgglomerativeClustering with n_clusters = {n_clusters_hier} on {X_hier.shape[0]} rows ...")
    agg = AgglomerativeClustering(n_clusters=n_clusters_hier, linkage='ward')  # ward needs euclidean and dense
    start = time.time()
    agg_labels = agg.fit_predict(X_hier)
    elapsed = time.time() - start
    print(f"Agglomerative fitted in {elapsed:.1f}s.")

    # Full-length label vector; rows outside the hierarchical sample stay -1.
    hier_labels_full = np.full(n_rows, -1, dtype=int)
    hier_labels_full[hier_idx] = agg_labels

    # 5) Persist models and transformer
    kmeans_path = f"{output_prefix}_kmeans.pkl"
    svd_path = f"{output_prefix}_svd.pkl"
    agg_path = f"{hier_prefix}_agg.pkl"
    df_out_path = f"{output_prefix}_with_clusters.csv"
    df_pickle = f"{output_prefix}_df.pkl"

    with open(kmeans_path, "wb") as f:
        pickle.dump(kmeans, f)
    with open(svd_path, "wb") as f:
        pickle.dump(svd, f)
    with open(agg_path, "wb") as f:
        pickle.dump({'agg_model': agg, 'hier_idx': hier_idx}, f)  # save indices if sample used

    # Attach labels to a copy so the caller's frame is left untouched.
    df = df.copy()
    df[f'kmeans_{k_for_kmeans}'] = kmeans.labels_
    df[f'agg_{n_clusters_hier}'] = hier_labels_full

    # Save the labelled frame as CSV and as a pickle.
    df.to_csv(df_out_path, index=False)
    with open(df_pickle, "wb") as f:
        pickle.dump(df, f)

    print(f"Saved: {kmeans_path}, {svd_path}, {agg_path}, {df_out_path}, {df_pickle}")
    return {
        'kmeans_path': kmeans_path,
        'svd_path': svd_path,
        'agg_path': agg_path,
        'df_csv': df_out_path,
        'df_pickle': df_pickle,
        'kmeans': kmeans,
        'svd': svd,
        'agg': agg
    }


# Example run:
# - pass a smaller n_svd (e.g. n_svd=50) if memory runs out
# - pass a smaller max_silhouette_samples to score the silhouette on fewer rows
results_kmeans = fit_and_save_clusters(
    df,
    tfidf_matrix,
    output_prefix="movie_recommender_clusters",
    sample_frac=0.25,
    n_svd=100,
    k_for_kmeans=20,
    n_clusters_hier=10,
    random_state=42,
    max_silhouette_samples=5000,
)

## Show recommendations

In [13]:
def show_recommendations(movie_index, tfidf_matrix, top_n=10):
    """Return the title and overview of the rows `recommend` selects for `movie_index`.

    Rows are looked up by position in the notebook-level `df`.
    """
    neighbour_positions = recommend(movie_index, tfidf_matrix, top_n)
    matched_rows = df.iloc[neighbour_positions]
    return matched_rows[['title', 'overview']]

show_recommendations(0, tfidf_matrix, top_n=10)


Unnamed: 0,title,overview
1227335,Slumber is Golden,A young woman named Tyler Ann travels into her...
482619,The Universe App,Enter deep into your subconscious...
586849,Inception,"Dom Cobb is a skilled thief, the best in the f..."
742092,"And For The First Time, I Was Free",A subconscious journey towards salvation.
1177785,Mystery Reel,A look into the subconscious of the 20th century.
427460,Sound of Waves,Waves break upon a young woman's subconscious.
866626,Sweet Dreams,An exploration of subconscious environments.
1305783,Domo Dreams,A dream house travels across space and time in...
1167741,Dream Lover,A man dreams of a woman who is now lost in his...
841625,Reflections,A young man finds himself amidst a battle with...


In [14]:
def get_index_from_title(title):
    """Return the row position of the first movie whose title matches `title`.

    The comparison is case-insensitive and uses the notebook-level `df`. The
    result is a position (0-based row number), not an index label, so it can
    be used directly with `tfidf_matrix[...]` and `df.iloc[...]` even when
    `df` does not have a default 0..n-1 index.

    Raises:
      IndexError: if no row has that title
    """
    is_match = (df['title'].str.lower() == title.lower()).to_numpy()
    positions = is_match.nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"Movie '{title}' not found.")
    return positions[0]

# Look the film up by title, then list its ten closest matches.
movie_index = get_index_from_title("The Dark Knight")
show_recommendations(movie_index, tfidf_matrix, 10)


Unnamed: 0,title,overview
487,Batman,Batman must face his most ruthless nemesis whe...
6933,"Batman: The Long Halloween, Part Two","As Gotham City's young vigilante, the Batman, ..."
6084,"Batman: The Long Halloween, Part One",Following a brutal series of murders taking pl...
25,The Dark Knight Rises,Following the death of District Attorney Harve...
4035,Batman: Mask of the Phantasm,"When a powerful criminal, who is connected to ..."
2867,Batman: Under the Red Hood,"One part vigilante, one part criminal kingpin,..."
1280680,The Batman - Part II,Sequel to the 2022 film The Batman.
381598,Batman Gotham Awaits,Batman realized he couldn't focus on being Got...
598059,Dying Is Easy,
434944,Joker rising,This is a 2013 fan film focused primarily on t...


## 7. Recommendation Function
We define a function to recommend similar movies given a title.

In [15]:
def recommend_movies(title, n=10):
    """Recommend up to `n` movies similar to the movie called `title`.

    The title match is case-insensitive and the first matching row is used.
    Lookups go through the notebook-level `df` and `tfidf_matrix`.

    Returns:
      DataFrame with columns title, release_date, vote_average and vote_count
      for the recommended rows, or the string "Movie '<title>' not found."
      when no row has that title.
    """
    # Work with row positions rather than index labels: `recommend` and
    # `.iloc` below are positional, so labels would only be correct while
    # `df` keeps a default 0..n-1 index.
    is_match = (df['title'].str.lower() == title.lower()).to_numpy()
    positions = is_match.nonzero()[0]
    if len(positions) == 0:
        return f"Movie '{title}' not found."

    movie_indices = recommend(positions[0], tfidf_matrix, top_n=n)
    return df[['title','release_date','vote_average','vote_count']].iloc[movie_indices]

# Example: the five closest titles to "Avatar"
recommend_movies("Avatar", n=5)


Unnamed: 0,title,release_date,vote_average,vote_count
174177,The Brother from Space,1988-01-01,2.0,3
6462,Cosmic Sin,2021-03-12,4.139,495
167740,Space Terror,2021-03-12,9.7,3
439984,The Galaxy War: A Vingança do Rei,2024-08-05,0.0,0
18415,Babylon 5: A Call to Arms,1999-05-12,6.816,98


## 8. Save Model Artifacts


In [None]:
import pickle

# Store the working frame and its TF-IDF matrix together as one pickled tuple.
artifacts = (df, tfidf_matrix)
with open("movie_recommender.pkl", "wb") as out_file:
    pickle.dump(artifacts, out_file)

print("Model and data saved successfully!")


✅ Model and data saved successfully!
