In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load the Datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge Datasets on 'title'
movies = movies.merge(credits, on='title')

# Select Required Columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Helper Functions
def convert(obj):
    try:
        return [i['name'] for i in ast.literal_eval(obj)]
    except:
        return []

def convert_cast(obj):
    try:
        return [i['name'] for i in ast.literal_eval(obj)[:3]]
    except:
        return []

def fetch_director(obj):
    try:
        for i in ast.literal_eval(obj):
            if i['job'] == 'Director':
                return i['name']
        return ''
    except:
        return ''

# Apply functions to extract features
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert_cast)
movies['crew'] = movies['crew'].apply(fetch_director)

# Fill missing overviews
movies['overview'] = movies['overview'].fillna('')

# Combine all features into a single 'tags' column
movies['genres'] = movies['genres'].apply(lambda x: " ".join(x))
movies['keywords'] = movies['keywords'].apply(lambda x: " ".join(x))
movies['cast'] = movies['cast'].apply(lambda x: " ".join(x))
movies['crew'] = movies['crew'].apply(lambda x: str(x))

movies['tags'] = movies['overview'] + ' ' + movies['genres'] + ' ' + \
                 movies['keywords'] + ' ' + movies['cast'] + ' ' + movies['crew']

# Create new DataFrame with cleaned data
new_df = movies[['movie_id', 'title', 'tags']]
new_df.loc[:, 'tags'] = new_df['tags'].apply(lambda x: x.lower())

# Text Vectorization
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()

# Compute Cosine Similarity
similarity = cosine_similarity(vectors)

# Recommendation Function
def recommend(movie):
    movie = movie.lower()
    if movie not in new_df['title'].str.lower().values:
        print("❌ Movie not found in dataset.")
        return
    idx = new_df[new_df['title'].str.lower() == movie].index[0]
    distances = list(enumerate(similarity[idx]))
    movies_list = sorted(distances, reverse=True, key=lambda x: x[1])[1:6]
    
    print(f"\n🎬 Top 5 recommendations for '{new_df.iloc[idx].title}':")
    for i in movies_list:
        print("👉", new_df.iloc[i[0]].title)

# Try It Out
recommend("Avatar")

# Optional: Save data for future use
pickle.dump(new_df, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())



🎬 Top 5 recommendations for 'Avatar':
👉 Aliens
👉 Moonraker
👉 Alien
👉 Alien³
👉 Silent Running
