In [2]:
import pandas as pd

# Read the Netflix catalogue from the CSV sitting next to the notebook
df = pd.read_csv(filepath_or_buffer='netflix_titles.csv')

# Report the (rows, columns) shape, then preview the first five records
print(df.shape)
df.head(n=5)

(8807, 12)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# Build the text "document" used for content-based filtering.
#
# 1. Drop rows missing any critical field. Reassigning the result (rather than
#    inplace=True) keeps the whole step a single expression.
#    NOTE: dropna keeps the original row labels, so df.index is no longer
#    contiguous after this step; later cells must address rows by position.
# 2. Concatenate the descriptive fields into one lower-cased string per title.
#    'listed_in' is not part of the dropna subset, so a missing value is
#    replaced with '' first; otherwise NaN would propagate through the string
#    concatenation and turn the whole 'combined' entry into NaN.
df = (
    df.dropna(subset=['title', 'type', 'director', 'cast', 'description'])
      .assign(
          combined=lambda frame: (
              frame['type'] + ' ' + frame['director'] + ' ' + frame['cast'] + ' '
              + frame['description'] + ' ' + frame['listed_in'].fillna('')
          ).str.lower()
      )
)

df[['title', 'combined']].head()


Unnamed: 0,title,combined
2,Ganglands,"tv show julien leclercq sami bouajila, tracy g..."
5,Midnight Mass,"tv show mike flanagan kate siegel, zach gilfor..."
6,My Little Pony: A New Generation,"movie robert cullen, josé luis ucha vanessa hu..."
7,Sankofa,"movie haile gerima kofi ghanaba, oyafunmike og..."
8,The Great British Baking Show,"tv show andy devonshire mel giedroyc, sue perk..."


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn every 'combined' string into a TF-IDF vector, ignoring English stop words
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined'])

# Pairwise cosine similarity of every title against every other title;
# with a single argument the matrix is compared against itself
cosine_sim = cosine_similarity(tfidf_matrix)


In [5]:
# Map each lower-cased title to its row POSITION in df (0 .. len(df) - 1).
# cosine_sim is indexed by position and the lookup below uses .iloc, whereas
# df.index still carries the original labels left over after dropna, so the
# labels must not be used here.
indices = pd.Series(range(len(df)), index=df['title'].str.lower())
# Keep only the first occurrence of a repeated title so that a lookup always
# yields a single scalar position rather than a Series.
indices = indices[~indices.index.duplicated(keep='first')]

def recommend(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up; matching is case-insensitive.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix whose rows/columns follow the row order
        of ``df``.
    top_n : int, default 5
        Maximum number of recommendations to return; values below zero are
        treated as zero.

    Returns
    -------
    pandas.Series or str
        The recommended titles ordered from most to least similar, or the
        string "Movie not found in the database." when ``title`` is unknown.
    """
    title = title.lower()
    if title not in indices:
        return "Movie not found in the database."

    idx = int(indices[title])
    # Exclude the query row by position instead of assuming it sorts first:
    # another row with an identical score could otherwise take its place and
    # the query title would be recommended to itself.
    sim_scores = [
        (pos, score) for pos, score in enumerate(cosine_sim[idx]) if pos != idx
    ]
    # list.sort is stable, so equally similar titles keep their row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [pos for pos, _ in sim_scores[:max(top_n, 0)]]
    return df['title'].iloc[movie_indices]
