In [7]:
import pandas as pd
import ast

# Load the movie metadata and the credits tables.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Join on the numeric movie id instead of on title. Titles are not unique
# (remakes and unrelated films share names), so a title join pairs every
# same-named movie with every same-named credits row, which duplicates rows
# and attaches the wrong cast/crew to some of them.
# NOTE(review): assumes the movies file keys on `id` and the credits file on
# `movie_id`, as in the Kaggle TMDB 5000 release — confirm against the CSVs.
movies = movies.merge(
    credits.drop(columns='title'),  # keep one unsuffixed `title` column
    left_on='id',
    right_on='movie_id',
    validate='one_to_one',  # fail loudly if either key is duplicated
)

# Select relevant columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]


In [8]:
# Convert JSON-like columns to lists
def convert(text):
    """Return the ``name`` of every record in a stringified list of dicts.

    Parameters
    ----------
    text : str
        A Python/JSON-style literal such as ``'[{"id": 28, "name": "Action"}]'``.
        ``None``, NaN and blank strings are treated as "no records".

    Returns
    -------
    list
        The ``name`` values in their original order; ``[]`` for missing input.
    """
    # Missing cells reach .apply() as None or NaN (NaN is the only float that
    # is not equal to itself); literal_eval would raise on either of them.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # An empty/whitespace-only cell is not a valid literal either.
    if isinstance(text, str) and not text.strip():
        return []
    return [record['name'] for record in ast.literal_eval(text)]

def get_director(text):
    """Return the name of the first crew record whose job is ``'Director'``.

    Parameters
    ----------
    text : str
        A stringified list of crew dicts, each expected to carry ``job`` and
        ``name`` keys. ``None``, NaN and blank strings mean "no crew".

    Returns
    -------
    str
        The director's name, or ``''`` when the input is missing or no record
        has the job ``'Director'``.
    """
    # Missing cells reach .apply() as None or NaN (NaN != NaN); literal_eval
    # would raise on them, so treat them as an empty crew list.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if isinstance(text, str) and not text.strip():
        return ''
    for member in ast.literal_eval(text):
        # .get() so a record without a 'job' key is skipped, not a KeyError.
        if member.get('job') == 'Director':
            return member.get('name', '')
    return ''

def get_top_cast(text, n=3):
    """Return the ``name`` of the first ``n`` records in a stringified cast list.

    Parameters
    ----------
    text : str
        A stringified list of cast dicts, each carrying a ``name`` key.
        ``None``, NaN and blank strings mean "no cast".
    n : int, optional
        Maximum number of names to keep (default 3). Values below zero are
        treated as zero.

    Returns
    -------
    list
        Up to ``n`` names in the order they appear in ``text``.
    """
    # Missing cells reach .apply() as None or NaN (NaN != NaN); literal_eval
    # would raise on them, so treat them as an empty cast list.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if isinstance(text, str) and not text.strip():
        return []
    # Clamp so a negative n cannot turn the slice into "all but the last k".
    limit = max(n, 0)
    return [member['name'] for member in ast.literal_eval(text)[:limit]]

# Parse each stringified column with the helper written for it.
column_parsers = {
    'genres': convert,
    'keywords': convert,
    'cast': get_top_cast,
    'crew': get_director,
}
for column, parser in column_parsers.items():
    movies[column] = movies[column].apply(parser)

# Replace missing overviews with an empty string.
movies['overview'] = movies['overview'].fillna('')


In [9]:
# Build one space-separated, lower-cased "tags" document per movie from the
# overview, genres, keywords, cast and crew columns (in that order).
join_words = ' '.join
tag_parts = [
    movies['overview'],
    movies['genres'].map(join_words),
    movies['keywords'].map(join_words),
    movies['cast'].map(join_words),
    movies['crew'],
]

combined = tag_parts[0]
for part in tag_parts[1:]:
    combined = combined + ' ' + part

movies['tags'] = combined.str.lower()

# Final dataframe: identifiers plus the text used for vectorization.
new_df = movies[['movie_id', 'title', 'tags']]


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit a TF-IDF vocabulary (capped at 5000 terms, English stop words dropped)
# on the tags column, then densify the resulting matrix.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tag_matrix = tfidf.fit_transform(new_df['tags'])
vectors = tag_matrix.toarray()

# Pairwise cosine similarity between every pair of rows in `vectors`.
similarity = cosine_similarity(vectors)


In [11]:
def recommend(movie_title, top_n=5):
    """Return the titles most similar to ``movie_title``.

    The lookup is case-insensitive and uses the first row of ``new_df`` whose
    title matches. Similarity scores come from the matching row of the
    module-level ``similarity`` matrix.

    Parameters
    ----------
    movie_title : str
        Title to look up in ``new_df['title']``.
    top_n : int, optional
        Maximum number of recommendations to return (default 5). Values below
        zero are treated as zero.

    Returns
    -------
    list of str or str
        Up to ``top_n`` titles ordered from most to least similar, never
        including the queried row itself; or the string
        ``"Movie not found in dataset."`` when no title matches.
    """
    wanted = movie_title.lower()
    known_titles = new_df['title'].str.lower().tolist()

    if wanted not in known_titles:
        return "Movie not found in dataset."

    # Use the row *position*, not the index label: `similarity` is addressed
    # positionally, and the two only coincide for a default RangeIndex.
    position = known_titles.index(wanted)
    scores = similarity[position]

    # Drop the queried row explicitly. Slicing off the top-ranked entry is not
    # enough: when another row ties with it (identical tags, duplicated rows)
    # that row can sort first and the query would recommend itself.
    candidates = [pos for pos in range(len(scores)) if pos != position]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)  # stable on ties

    titles = new_df['title']
    return [titles.iloc[pos] for pos in candidates[:max(top_n, 0)]]


In [12]:
# Example query: display the titles that recommend() returns for "Avatar".
recommend("Avatar")


['Aliens', 'Alien', 'Moonraker', 'Alien³', 'Silent Running']