In [None]:
!pip install kagglehub --quiet


In [None]:
import kagglehub

# Fetch the TMDB movie metadata dataset via kagglehub. The returned value is the
# local location of the dataset files; later cells join file names onto it.
path = kagglehub.dataset_download(
    "tmdb/tmdb-movie-metadata",
)
print("Path to dataset files:", path)


Path to dataset files: /kaggle/input/tmdb-movie-metadata


In [None]:
import pandas as pd
import os

# Read the movie metadata table from the downloaded dataset directory.
csv_path = os.path.join(path, 'tmdb_5000_movies.csv')
df = pd.read_csv(csv_path)

# Keep only the columns the recommender needs and drop rows that are missing
# any of them. reset_index(drop=True) renumbers the surviving rows 0..n-1 so
# that index labels equal row positions; the TF-IDF and similarity matrices
# built from this frame are addressed by position, so gaps left behind by
# dropna() would otherwise make label-based lookups point at the wrong row.
df = df[['title', 'genres', 'overview']].dropna().reset_index(drop=True)


In [None]:
import ast
def extract_genres(genre_str):
    """Convert a serialized list of genre records into a string of genre names.

    Parameters
    ----------
    genre_str : str
        Literal text for a list of dicts that each carry a ``'name'`` key,
        e.g. ``'[{"id": 28, "name": "Action"}]``.

    Returns
    -------
    str
        The ``'name'`` values joined by single spaces, in their original
        order. An empty string is returned when the input cannot be parsed
        or does not have the expected list-of-dicts shape.
    """
    try:
        genres = ast.literal_eval(genre_str)
        return " ".join(genre['name'] for genre in genres)
    except (ValueError, SyntaxError, TypeError, KeyError, RecursionError, MemoryError):
        # Best-effort parsing: malformed or unexpectedly shaped input yields an
        # empty genre string. Only the errors that bad data can trigger are
        # caught, so KeyboardInterrupt / SystemExit still propagate.
        return ""

# Replace the raw serialized genre lists with plain space-separated names.
# NOTE: this overwrites the column in place. Running the cell a second time
# feeds the already-converted strings back through extract_genres, which cannot
# parse them and returns "" for every row -- reload df before re-running.
genre_names = df['genres'].apply(extract_genres)
df['genres'] = genre_names

In [None]:
# Build one text field per movie: genre names, a single space, then the overview.
df['combined'] = df['genres'].str.cat(df['overview'], sep=" ")


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize the combined genre + overview text with TF-IDF, ignoring the
# vectorizer's built-in English stop words. Row i of tfidf_matrix corresponds
# to the i-th row (by position) of df.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=df['combined'])


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity of every movie against every other movie.
# cosine_sim[i][j] is the similarity between the rows at positions i and j of
# tfidf_matrix, so the result has one row and one column per movie.
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [None]:
def recommend(movie_title, top_n=5):
    """Return the titles of the movies most similar to ``movie_title``.

    Similarity is read from the module-level ``cosine_sim`` matrix, whose rows
    and columns follow the row order of the module-level ``df``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``df['title']``. If several rows share the
        title, the first one is used.
    top_n : int, optional
        Maximum number of recommendations to return (default 5). Values below
        zero are treated as zero.

    Returns
    -------
    list of str or str
        Up to ``top_n`` titles ordered from most to least similar, never
        including the query row itself. When the title is not present the
        string ``"Movie not found."`` is returned instead of a list, so callers
        should check the type before iterating.
    """
    titles = df['title'].tolist()
    try:
        # Row *position* of the first exact match. cosine_sim is addressed by
        # position, so an index label must not be used here: labels and
        # positions differ whenever df's index has gaps (e.g. after dropna()).
        pos = titles.index(movie_title)
    except ValueError:
        return "Movie not found."

    ranked = sorted(enumerate(cosine_sim[pos]), key=lambda pair: pair[1], reverse=True)
    # Exclude the query movie by position instead of assuming it sorts first;
    # another row with identical text can tie with it at the top of the ranking.
    neighbours = [i for i, _ in ranked if i != pos]
    return [titles[i] for i in neighbours[:max(top_n, 0)]]

In [None]:
movie_name = "Men in Black 3"
recommendations = recommend(movie_name)
if isinstance(recommendations, str):
    # recommend() reports an unknown title with a message string; print it once
    # instead of iterating over it character by character.
    print(recommendations)
else:
    print(f"Movies similar to '{movie_name}':")
    for movie in recommendations:
        print("➤", movie)


Movies similar to 'Men in Black 3':
➤ Timecop
➤ The Book of Eli
➤ Meet Dave
➤ The Helix... Loaded
➤ Project Almanac
