In [2]:
# Data handling
import pandas as pd
import numpy as np

# NLP & similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Optional: silence library warnings so they do not clutter cell output
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the movie catalogue from movies.csv (path is relative to the working directory)
movies = pd.read_csv(filepath_or_buffer='movies.csv')

# Display the first five rows as a quick sanity check
movies.head(n=5)

Unnamed: 0,title,genre,director,cast,overview
0,Movie 1,Action,Christopher Nolan,Leonardo DiCaprio; Brad Pitt,An epic story of love and betrayal.
1,Movie 2,Comedy,Steven Spielberg,Tom Hanks; Scarlett Johansson,A comedic series of unfortunate events.
2,Movie 3,Drama,Quentin Tarantino,Brad Pitt; Morgan Freeman,A drama exploring human emotions and relations...
3,Movie 4,Romance,Martin Scorsese,Angelina Jolie; Johnny Depp,A story about friendship and overcoming challe...
4,Movie 5,Sci-Fi,James Cameron,Chris Hemsworth; Anne Hathaway; Christian Bale,An intense journey through time and space.


In [5]:
# Replace missing values in each text column with '' so the columns
# can be concatenated as strings later on
for feature in ('genre', 'director', 'cast', 'overview'):
    movies[feature] = movies[feature].fillna(value='')

# Combine features into a single string for content-based recommendation
def combine_features(row):
    """Join a movie's genre, director, cast and overview into one text blob.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must support ``row[key]`` for 'genre', 'director', 'cast' and
        'overview'.

    Returns
    -------
    str
        The four values, in that order, separated by single spaces.
        Missing values (None / NaN / pd.NA) contribute an empty string.

    Raises
    ------
    KeyError
        If one of the four keys is absent from ``row``.
    """
    parts = []
    for column in ('genre', 'director', 'cast', 'overview'):
        value = row[column]
        # Do not rely on an earlier fillna('') having run: a NaN here would
        # otherwise break string concatenation with a TypeError.
        parts.append('' if pd.isna(value) else str(value))
    return " ".join(parts)

# Build one combined text value per movie by applying combine_features to each row
movies['combined_features'] = movies.apply(func=combine_features, axis='columns')

In [6]:
# Vectorize each movie's combined text with TF-IDF,
# dropping common English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

# Pairwise cosine similarity between all movies; called with a single
# argument, cosine_similarity compares the matrix against itself
cosine_sim = cosine_similarity(tfidf_matrix)

In [7]:
# Helper function to get movie index
def get_index(title):
    """Return the zero-based row position of the first movie named ``title``.

    The row *position* (not the index label) is returned because the value
    is used positionally: to select a row of ``cosine_sim`` and with
    ``movies.iloc``. The two only coincide when ``movies`` has a default
    0..n-1 index.

    Parameters
    ----------
    title : str
        Exact title to look up in ``movies['title']``.

    Returns
    -------
    int
        Position of the first matching row in ``movies``.

    Raises
    ------
    IndexError
        If no row of ``movies`` has exactly this title.
    """
    positions = np.flatnonzero((movies['title'] == title).to_numpy())
    if positions.size == 0:
        # Same exception type the bare ``.index[0]`` lookup produced,
        # but with a message that names the missing title.
        raise IndexError(f"No movie titled {title!r} in the dataset")
    return int(positions[0])

# Recommendation function
def recommend_movies(movie_title, num_recommendations=5):
    """Return the titles of the movies most similar to ``movie_title``.

    Similarity scores are read from the row of ``cosine_sim`` selected by
    ``get_index(movie_title)``; the queried movie itself is never returned.

    Parameters
    ----------
    movie_title : str
        Title of the movie to find recommendations for.
    num_recommendations : int, default 5
        Maximum number of titles to return. Values below 1 yield an
        empty list.

    Returns
    -------
    list
        Titles ordered from most to least similar. Shorter than
        ``num_recommendations`` when the dataset has too few other movies.

    Raises
    ------
    IndexError
        Propagated from ``get_index`` when the title is not found.
    """
    movie_index = get_index(movie_title)

    # Exclude the query movie by its own index instead of assuming it sorts
    # first: a movie with identical features ties at the top score, and the
    # stable sort would then keep whichever of the two has the lower index,
    # so slicing off element 0 could drop the wrong one.
    candidates = [
        (idx, score)
        for idx, score in enumerate(cosine_sim[movie_index])
        if idx != movie_index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # Clamp so a negative request cannot turn into a from-the-end slice
    limit = max(num_recommendations, 0)
    return [movies['title'].iloc[idx] for idx, _ in candidates[:limit]]

In [9]:
# Example: the five titles most similar to 'Movie 1'
recommend_movies(movie_title='Movie 1', num_recommendations=5)

['Movie 41', 'Movie 31', 'Movie 21', 'Movie 11', 'Movie 49']

In [12]:
# Show the first 20 rows; leaving the frame as the cell's last expression
# lets Jupyter render it as a table instead of wrapped plain text
movies.head(20)

       title      genre           director  \
0    Movie 1     Action  Christopher Nolan   
1    Movie 2     Comedy   Steven Spielberg   
2    Movie 3      Drama  Quentin Tarantino   
3    Movie 4    Romance    Martin Scorsese   
4    Movie 5     Sci-Fi      James Cameron   
5    Movie 6     Horror       Ridley Scott   
6    Movie 7   Thriller      Peter Jackson   
7    Movie 8      Crime      David Fincher   
8    Movie 9    Fantasy         Tim Burton   
9   Movie 10  Adventure   Alfred Hitchcock   
10  Movie 11     Action  Christopher Nolan   
11  Movie 12     Comedy   Steven Spielberg   
12  Movie 13      Drama  Quentin Tarantino   
13  Movie 14    Romance    Martin Scorsese   
14  Movie 15     Sci-Fi      James Cameron   
15  Movie 16     Horror       Ridley Scott   
16  Movie 17   Thriller      Peter Jackson   
17  Movie 18      Crime      David Fincher   
18  Movie 19    Fantasy         Tim Burton   
19  Movie 20  Adventure   Alfred Hitchcock   

                                 