In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pickle

In [2]:
# Load data
# NOTE: these absolute paths are specific to one machine; point them at your
# own copies of the two TMDB 5000 CSV files before running.
movies = pd.read_csv(r"C:\Users\parth\Downloads\tmdb_5000_movies.csv")
credits = pd.read_csv(r"C:\Users\parth\Downloads\tmdb_5000_credits.csv")

# Merge dataframes
# Join on the numeric movie id rather than on the title. Titles are not unique
# (remakes, unrelated films sharing a name), so a title join pairs every such
# movie with the credits of each of its namesakes and yields duplicate rows
# carrying the wrong cast/crew.
# Assumes the standard TMDB 5000 schema: `id` in the movies file and
# `movie_id` in the credits file - confirm against your copy of the data.
# The credits copy of `title` is dropped so a single `title` column survives.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]


In [3]:
def convert(text):
    """Return the ``'name'`` value of every item encoded in `text`.

    `text` is a string holding a Python-literal sequence of dicts; it is
    parsed with ``ast.literal_eval`` and the names are returned as a list,
    in the order the items appear.
    """
    names = []
    for entry in ast.literal_eval(text):
        names.append(entry['name'])
    return names

# Drop rows with any missing value, then renumber the rows 0..n-1.
# Without the reset, the index keeps gaps where rows were removed, so an index
# label no longer equals the row's position; any code that uses a label from
# `movies.index` as a row number into a positional array (such as the
# similarity matrix built from this frame) would then read the wrong row.
movies = movies.dropna().reset_index(drop=True)

# Decode the stringified lists of dicts into plain lists of names.
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
# Keep only the first three cast names listed for each movie.
movies['cast'] = movies['cast'].apply(lambda x: convert(x)[:3])
# From the crew entries keep only the names whose job is 'Director'.
movies['crew'] = movies['crew'].apply(lambda x: [i['name'] for i in ast.literal_eval(x) if i['job'] == 'Director'])

In [4]:
# Split every overview into a list of whitespace-separated tokens.
movies['overview'] = movies['overview'].map(lambda text: text.split())

# Build one tag string per movie: concatenate the overview tokens, genres,
# keywords, cast and crew lists (in that order) and join them with spaces.
movies['tags'] = [
    " ".join(overview + genres + keywords + cast + crew)
    for overview, genres, keywords, cast, crew in zip(
        movies['overview'],
        movies['genres'],
        movies['keywords'],
        movies['cast'],
        movies['crew'],
    )
]


In [5]:
# TF-IDF vectorization of the per-movie tag strings.
# The vectorizer is configured with English stop-word filtering and a
# vocabulary cap of 5000 features, then fitted and applied in one step.
tag_documents = movies['tags']
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(tag_documents)


In [6]:
# Cosine similarity between every pair of rows of the TF-IDF matrix.
# Y=None compares the rows of the matrix against themselves, and
# dense_output=True is the library default, spelled out here for clarity.
cosine_sim = cosine_similarity(tfidf_matrix, Y=None, dense_output=True)

In [7]:
def recommend(movie, n=5, movies_df=None, similarity=None):
    """Return the titles of the `n` movies most similar to `movie`.

    Parameters
    ----------
    movie : str
        Exact title to look up in the ``title`` column.
    n : int, default 5
        Maximum number of titles to return.
    movies_df : pandas.DataFrame, optional
        Frame with a ``title`` column whose row order matches `similarity`.
        Defaults to the notebook-level ``movies`` frame.
    similarity : 2-D array-like, optional
        Square similarity matrix in which row/column ``k`` corresponds to the
        ``k``-th row of `movies_df`. Defaults to the notebook-level
        ``cosine_sim``.

    Returns
    -------
    list of str
        Titles ordered from most to least similar; the queried movie itself
        is never included. If several rows share the title, the first is used.

    Raises
    ------
    IndexError
        If no row has the requested title.
    """
    if movies_df is None:
        movies_df = movies
    if similarity is None:
        similarity = cosine_sim

    # Use the row POSITION, not the index label: the similarity matrix is
    # positional, and labels stop matching positions once rows were dropped.
    matches = np.flatnonzero((movies_df['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} not found in the title column")
    position = int(matches[0])

    scores = np.asarray(similarity[position], dtype=float)
    # Stable descending sort keeps the original row order among equal scores.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly instead of assuming it sorts first;
    # another row with an equal top score could otherwise push it into the
    # results.
    ranked = ranked[ranked != position][:max(n, 0)]
    return movies_df['title'].iloc[ranked].tolist()


In [8]:
# Example usage: request five recommendations for the title 'Gandhi'
# and print the resulting list of titles.
recommended_movies = recommend(movie='Gandhi', n=5)
print(recommended_movies)

['Gandhi, My Father', 'A Passage to India', 'The Wind That Shakes the Barley', 'Water', 'Lawrence of Arabia']


In [9]:
# Save processed data and model
# Each file is opened in a `with` block so it is flushed and closed as soon as
# the dump finishes (even if pickling raises), instead of leaving the handle
# open until garbage collection.
with open('movie_list.pkl', 'wb') as movie_file:
    pickle.dump(movies, movie_file)
with open('cosine_similarity.pkl', 'wb') as similarity_file:
    pickle.dump(cosine_sim, similarity_file)
