In [3]:

import pandas as pd
import ast

# Load the movie metadata table and the per-movie credits (cast/crew) table.
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Join on the movie id as well as the title. A title alone is not a unique
# key: two different films that share a title would each be paired with both
# credits rows, yielding duplicated rows with mismatched cast/crew.
# Because 'title' has the same name on both sides it stays a single column.
# NOTE(review): assumes the movies file stores the film id in an `id` column
# and the credits file stores it in `movie_id` (only `movie_id` is visible in
# the column selection below) — confirm against the CSV headers.
movies = movies.merge(
    credits,
    left_on=['id', 'title'],
    right_on=['movie_id', 'title'],
)

# Keep only the columns that feed the tag text.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Helper functions to process JSON-like columns
def convert(text):
    """Extract the ``name`` of every entry in a stringified list of dicts.

    Parameters
    ----------
    text : str
        Literal such as ``'[{"id": 28, "name": "Action"}]'``. A missing cell
        (``None``, NaN, or a blank string) is treated as an empty list.

    Returns
    -------
    list
        The ``name`` values in their original order; ``[]`` for missing input.
    """
    # pandas represents an empty CSV cell as float NaN, which literal_eval
    # rejects. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # literal_eval raises SyntaxError on an empty/whitespace-only string.
    if isinstance(text, str) and not text.strip():
        return []
    return [entry['name'] for entry in ast.literal_eval(text)]

def get_director(text):
    """Return the name of the first entry whose ``job`` is ``'Director'``.

    Parameters
    ----------
    text : str
        Literal list of dicts, each carrying ``job`` and ``name`` keys.
        A missing cell (``None``, NaN, or a blank string) is treated as
        having no entries.

    Returns
    -------
    str
        The first matching ``name``, or ``""`` when there is no director
        entry or the input is missing.
    """
    # pandas represents an empty CSV cell as float NaN, which literal_eval
    # rejects. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    # literal_eval raises SyntaxError on an empty/whitespace-only string.
    if isinstance(text, str) and not text.strip():
        return ""
    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            return member['name']
    return ""

def get_top_3_cast(text):
    """Return the ``name`` of the first three entries in a stringified list.

    Parameters
    ----------
    text : str
        Literal list of dicts, each carrying a ``name`` key. A missing cell
        (``None``, NaN, or a blank string) is treated as an empty list.

    Returns
    -------
    list
        Up to three ``name`` values, in the order they appear in the input;
        ``[]`` for missing input.
    """
    # pandas represents an empty CSV cell as float NaN, which literal_eval
    # rejects. NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # literal_eval raises SyntaxError on an empty/whitespace-only string.
    if isinstance(text, str) and not text.strip():
        return []
    return [entry['name'] for entry in ast.literal_eval(text)[:3]]

# Parse the stringified list columns and blank out missing overviews in one
# pass; every replacement is computed from the frame as it was loaded.
movies = movies.assign(
    genres=movies['genres'].apply(convert),
    keywords=movies['keywords'].apply(convert),
    cast=movies['cast'].apply(get_top_3_cast),
    crew=movies['crew'].apply(get_director),
    overview=movies['overview'].fillna(''),
)

# Build one lower-cased, space-separated text blob per movie:
# overview, then genres, keywords, top cast and finally the director.
movies['tags'] = (
    movies['overview']
    + ' ' + movies['genres'].apply(' '.join)
    + ' ' + movies['keywords'].apply(' '.join)
    + ' ' + movies['cast'].apply(' '.join)
    + ' ' + movies['crew']
).str.lower()

# Slim frame that the vectorizer and the saved artefacts work from
new_df = movies.loc[:, ['movie_id', 'title', 'tags']]


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF features over the tag text: English stop words are dropped and the
# vocabulary is capped at 5000 terms. The result is materialised as a dense
# array with one row per movie.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)
vector = tfidf.fit_transform(new_df['tags']).toarray()

# Pairwise cosine similarity between every pair of rows in `vector`
similarity = cosine_similarity(X=vector)


In [5]:
import pickle

# Persist the movie table, the fitted vectorizer and the similarity matrix.
# Each file is opened in a `with` block so the handle is flushed and closed
# as soon as the dump finishes, instead of being left to garbage collection.
# Pickle files can execute code when loaded, so only load these from a
# trusted location.
with open('movies.pkl', 'wb') as fh:
    pickle.dump(new_df, fh)

with open('vectorizer.pkl', 'wb') as fh:
    pickle.dump(tfidf, fh)

with open('similarity.pkl', 'wb') as fh:
    pickle.dump(similarity, fh)
