In [1]:

import pandas as pd
# for turning text into TF-IDF feature vectors (tokenization + weighting)
from sklearn.feature_extraction.text import TfidfVectorizer
# for similarity calculation
from sklearn.metrics.pairwise import linear_kernel

## Read the data

In [3]:
# Load the catalogue and blank out missing descriptions in one chain, so no
# NaN value reaches the TF-IDF step further down.
df = (
    pd.read_csv('netflix_titles.csv')
    .assign(description=lambda frame: frame['description'].fillna(''))
)

df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Vectorize text data (TF-IDF)

In [4]:
# Vectorizer configured to ignore scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the descriptions and encode them in a single pass;
# the result has one row per description and one column per vocabulary term.
tfidf_matrix = tfidf.fit_transform(raw_documents=df['description'])

tfidf_matrix.shape

(8807, 18895)

## Calculate Similarity

In [7]:
# Pairwise dot products of every row with every other row. With the
# vectorizer's default norm='l2' each row has unit length, so the dot product
# is the cosine similarity. Omitting the second argument compares the matrix
# with itself.
cosine_sim = linear_kernel(tfidf_matrix)

In [9]:
# Reverse lookup: title -> row label of df.
indices = pd.Series(df.index, index=df['title'])
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed a repeated title. Deduplicate on the
# index instead, keeping the first row for each title, so that indices[title]
# always yields a single scalar.
indices = indices[~indices.index.duplicated(keep='first')]
indices

title
Dick Johnson Is Dead        0
Blood & Water               1
Ganglands                   2
Jailbirds New Orleans       3
Kota Factory                4
                         ... 
Zodiac                   8802
Zombie Dumb              8803
Zombieland               8804
Zoom                     8805
Zubaan                   8806
Length: 8807, dtype: int64

In [10]:
def get_recommendations(title, cosine_sim=cosine_sim, num_recommend=10):
    """Return the titles most similar to ``title``.

    Args:
        title (string): Movie or TV show title. It is looked up in the
            module-level ``indices`` Series.
        cosine_sim (float[m][n], optional): Pairwise similarity matrix indexed
            by row position. Defaults to the module-level ``cosine_sim`` as it
            exists when this definition is executed.
        num_recommend (int, optional): How many recommendations to return.
            Values below 1 yield an empty result. Defaults to 10.

    Returns:
        pandas.Series: Entries of ``df['title']`` for the selected rows,
        ordered from most to least similar. The queried row is never included.

    Raises:
        KeyError: If ``title`` is not a label of ``indices``.
    """
    idx = indices[title]
    # A title that occurs more than once makes the lookup return a Series of
    # rows; use the first one so that cosine_sim[idx] stays one-dimensional.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pair every *other* row with its score. The queried row is excluded by
    # position instead of assuming it sorts first: a row whose description is
    # empty scores 0 against itself, and rows with identical descriptions tie
    # at the top, so slicing off element 0 could drop a real match and keep
    # the queried title in the output.
    sim_scores = [
        (row, score) for row, score in enumerate(cosine_sim[idx]) if row != idx
    ]

    # highest similarity first (stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # keep the requested number of matches; non-positive requests give none
    top_similar = sim_scores[:max(num_recommend, 0)]

    # row positions of the selected matches
    movie_indices = [row for row, _ in top_similar]

    return df['title'].iloc[movie_indices]
    

In [18]:
# Example query using the default similarity matrix and ten recommendations
get_recommendations(title='Inception')

1939                       Dedh Ishqiya
6304           Big Dreams, Small Spaces
6726                    Eyyvah Eyyvah 2
5202                    Strange Weather
3796    Hyper HardBoiled Gourmet Report
1536                          Incarnate
1359                       The Bank Job
1621                 Quigley Down Under
7855                           Rememory
4131                  Violet Evergarden
Name: title, dtype: object