In [244]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pickle

# Load the two TMDB CSV exports and join them into a single frame.
# NOTE(review): joining on 'title' pairs every same-titled row with every
# other, so any title that repeats in either file is duplicated here. If the
# files share a unique id column, merging on that would be safer -- confirm
# against the CSV schemas before changing.
movies = pd.read_csv('/tmdb_5000_movies.csv')
credits = pd.read_csv('/tmdb_5000_credits.csv')
movies = movies.merge(credits, on='title')

# Keep only the columns used to build the recommender. `dropna()` returns a
# new frame instead of mutating a column slice in place, and resetting the
# index keeps row labels equal to row positions, so label lookups on this
# frame line up with the rows of any positional array computed from it.
movies = (
    movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .dropna()
    .reset_index(drop=True)
)

# Helper function to extract names from JSON-like fields
def convert(text):
    """Return the ``'name'`` value of every record encoded in ``text``.

    ``text`` is parsed with ``ast.literal_eval`` and must evaluate to an
    iterable of mappings that each contain a ``'name'`` key. The names are
    returned as a list, in the order they appear.
    """
    records = ast.literal_eval(text)
    return [record['name'] for record in records]

# Parse the stringified list-of-dict columns into plain lists of names
for column in ('genres', 'keywords', 'cast'):
    movies[column] = movies[column].apply(convert)

# Keep only the first three cast entries for each movie
movies['cast'] = movies['cast'].apply(lambda names: names[:3])

# Extract the name of every crew member whose job is 'Director'
def fetch_director(text):
    """Return the names of all crew entries whose ``'job'`` is ``'Director'``.

    ``text`` is parsed with ``ast.literal_eval`` and must evaluate to an
    iterable of mappings that each contain ``'job'`` and ``'name'`` keys.
    The result is a list (possibly empty) in the order the entries appear.
    """
    crew = ast.literal_eval(text)
    return [member['name'] for member in crew if member['job'] == 'Director']

# Replace each raw crew string with the list of its director names
movies['crew'] = movies['crew'].map(fetch_director)

# Remove spaces within names to create single tokens
def collapse(L):
    """Return a new list with every space character removed from each string in ``L``."""
    squeezed = []
    for name in L:
        squeezed.append(name.replace(" ", ""))
    return squeezed

# Strip spaces from every name so that each one becomes a single token
for column in ('cast', 'crew', 'genres', 'keywords'):
    movies[column] = movies[column].apply(collapse)

# Build one weighted token list per movie. Multiplying a list repeats its
# tokens, so the weights are: overview x5, genres x2, keywords x3, cast x1,
# crew (directors) x3.
movies['overview'] = movies['overview'].apply(lambda text: text.split())
movies['tags'] = (
    movies['overview'] * 5
    + movies['genres'] * 2
    + movies['keywords'] * 3
    + movies['cast']
    + movies['crew'] * 3
)

# `new` keeps only the identifying columns plus the tags, with each token
# list flattened into a single space-separated string.
new = movies.drop(columns=['overview', 'genres', 'keywords', 'cast', 'crew'])
new['tags'] = new['tags'].apply(" ".join)

# Vectorize the tag strings with TF-IDF (rather than raw counts), capping the
# vocabulary at 10,000 terms and dropping English stop words
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
tag_matrix = tfidf.fit_transform(new['tags'])
vector = tag_matrix.toarray()

# Pairwise cosine similarity between every pair of movie vectors
similarity = cosine_similarity(vector)

# Save the processed data and similarity matrix for later use.
# Context managers guarantee each file is flushed and closed even if
# pickling raises, rather than relying on garbage collection.
with open('movie_list.pkl', 'wb') as movie_list_file:
    pickle.dump(new, movie_list_file)
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

# Recommendation function with additional details
def recommend(movie):
    """Print the five movies most similar to ``movie``, with their genres.

    Reads the module-level ``new`` (titles), ``movies`` (genres) and
    ``similarity`` (square similarity array) objects. If ``movie`` does not
    exactly match any value in ``new['title']``, a "not found" message is
    printed instead. When several rows share the title, the first is used.

    Parameters
    ----------
    movie : str
        Exact title to look up.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Find the row by *position*, not by index label: `similarity` is a plain
    # array addressed 0..n-1, while the labels in `new.index` can differ from
    # positions once rows have been dropped from the frame.
    matches = np.flatnonzero(new['title'].values == movie)
    if matches.size == 0:
        print(f"Movie '{movie}' not found in the dataset.")
        return

    position = matches[0]
    scores = similarity[position]

    # Highest similarity first; a stable sort keeps tied rows in frame order.
    ranked = np.argsort(-scores, kind='stable')
    # Drop the queried row explicitly rather than assuming it sorts first --
    # another row with an identical tag vector would tie with it at the top.
    top_matches = ranked[ranked != position][:5]

    print(f"Movies recommended for '{movie}':\n")
    for row in top_matches:
        title = new.iloc[row]['title']
        genres = ", ".join(movies.iloc[row]['genres'])
        print(f"{title} (Genres: {genres})")

In [246]:
# Demo: print the recommendations for a single title
query_title = "Harry Potter and the Philosopher's Stone"
recommend(query_title)

Movies recommended for 'Harry Potter and the Philosopher's Stone':

Harry Potter and the Goblet of Fire (Genres: Adventure, Fantasy, Family)
Harry Potter and the Prisoner of Azkaban (Genres: Adventure, Fantasy, Family)
Harry Potter and the Chamber of Secrets (Genres: Adventure, Fantasy, Family)
Harry Potter and the Order of the Phoenix (Genres: Adventure, Fantasy, Family, Mystery)
Harry Potter and the Half-Blood Prince (Genres: Adventure, Fantasy, Family)
