In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [2]:
# Importing the dataset
df = pd.read_csv('netflix_titles.csv', header=0)
df.head(10)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, H...",,"September 24, 2021",2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",The arrival of a charismatic young priest brin...
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, ...",,"September 24, 2021",2021,PG,91 min,Children & Family Movies,Equestria's divided. But a bright-eyed hero be...
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies","On a photo shoot in Ghana, an American model s..."
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,"September 24, 2021",2021,TV-14,9 Seasons,"British TV Shows, Reality TV",A talented batch of amateur bakers face off in...
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,"September 24, 2021",2021,PG-13,104 min,"Comedies, Dramas",A woman adjusting to life after a loss contend...


# Data Preprocessing

In [3]:
df['director'].isnull().sum()

2634

In [4]:
df.shape

(8807, 12)

In [5]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [6]:
df_non_null = df.fillna(' ')
df_non_null

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [7]:
# Taking the necessary columns as features for machine learing
df_cols = df_non_null[['show_id','type', 'director', 'cast', 'country', 'listed_in']]
df_cols

Unnamed: 0,show_id,type,director,cast,country,listed_in
0,s1,Movie,Kirsten Johnson,,United States,Documentaries
1,s2,TV Show,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"International TV Shows, TV Dramas, TV Mysteries"
2,s3,TV Show,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"Crime TV Shows, International TV Shows, TV Act..."
3,s4,TV Show,,,,"Docuseries, Reality TV"
4,s5,TV Show,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"International TV Shows, Romantic TV Shows, TV ..."
...,...,...,...,...,...,...
8802,s8803,Movie,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"Cult Movies, Dramas, Thrillers"
8803,s8804,TV Show,,,,"Kids' TV, Korean TV Shows, TV Comedies"
8804,s8805,Movie,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"Comedies, Horror Movies"
8805,s8806,Movie,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"Children & Family Movies, Comedies"


In [8]:
# This function converts all upercases into lowercases and removes the space in between words
for i in df_cols:
    df_cols[i] = df_cols[i].apply(lambda x: x.replace(' ',''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cols[i] = df_cols[i].apply(lambda x: x.replace(' ',''))


In [9]:
df_cols

Unnamed: 0,show_id,type,director,cast,country,listed_in
0,s1,Movie,KirstenJohnson,,UnitedStates,Documentaries
1,s2,TVShow,,"AmaQamata,KhosiNgema,GailMabalane,ThabangMolab...",SouthAfrica,"InternationalTVShows,TVDramas,TVMysteries"
2,s3,TVShow,JulienLeclercq,"SamiBouajila,TracyGotoas,SamuelJouy,NabihaAkka...",,"CrimeTVShows,InternationalTVShows,TVAction&Adv..."
3,s4,TVShow,,,,"Docuseries,RealityTV"
4,s5,TVShow,,"MayurMore,JitendraKumar,RanjanRaj,AlamKhan,Ahs...",India,"InternationalTVShows,RomanticTVShows,TVComedies"
...,...,...,...,...,...,...
8802,s8803,Movie,DavidFincher,"MarkRuffalo,JakeGyllenhaal,RobertDowneyJr.,Ant...",UnitedStates,"CultMovies,Dramas,Thrillers"
8803,s8804,TVShow,,,,"Kids'TV,KoreanTVShows,TVComedies"
8804,s8805,Movie,RubenFleischer,"JesseEisenberg,WoodyHarrelson,EmmaStone,Abigai...",UnitedStates,"Comedies,HorrorMovies"
8805,s8806,Movie,PeterHewitt,"TimAllen,CourteneyCox,ChevyChase,KateMara,Ryan...",UnitedStates,"Children&FamilyMovies,Comedies"


In [10]:
# Creating a tags columns with all the keywords
df_cols['tags'] = df_cols['type'] + ' ' + df_cols['director'] + ' ' + df_cols['cast'] + ' ' + df_cols['country'] + ' ' + df_cols['listed_in'] + ' ' + df_non_null['rating'] + ' ' + df_non_null['description']
df_cols['tags']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cols['tags'] = df_cols['type'] + ' ' + df_cols['director'] + ' ' + df_cols['cast'] + ' ' + df_cols['country'] + ' ' + df_cols['listed_in'] + ' ' + df_non_null['rating'] + ' ' + df_non_null['description']


0       Movie KirstenJohnson  UnitedStates Documentari...
1       TVShow  AmaQamata,KhosiNgema,GailMabalane,Thab...
2       TVShow JulienLeclercq SamiBouajila,TracyGotoas...
3       TVShow    Docuseries,RealityTV TV-MA Feuds, fl...
4       TVShow  MayurMore,JitendraKumar,RanjanRaj,Alam...
                              ...                        
8802    Movie DavidFincher MarkRuffalo,JakeGyllenhaal,...
8803    TVShow    Kids'TV,KoreanTVShows,TVComedies TV-...
8804    Movie RubenFleischer JesseEisenberg,WoodyHarre...
8805    Movie PeterHewitt TimAllen,CourteneyCox,ChevyC...
8806    Movie MozezSingh VickyKaushal,Sarah-JaneDias,R...
Name: tags, Length: 8807, dtype: object

In [11]:
df_cols = df_cols.merge(df_non_null[['show_id', 'title']])
df_cols

Unnamed: 0,show_id,type,director,cast,country,listed_in,tags,title
0,s1,Movie,KirstenJohnson,,UnitedStates,Documentaries,Movie KirstenJohnson UnitedStates Documentari...,Dick Johnson Is Dead
1,s2,TVShow,,"AmaQamata,KhosiNgema,GailMabalane,ThabangMolab...",SouthAfrica,"InternationalTVShows,TVDramas,TVMysteries","TVShow AmaQamata,KhosiNgema,GailMabalane,Thab...",Blood & Water
2,s3,TVShow,JulienLeclercq,"SamiBouajila,TracyGotoas,SamuelJouy,NabihaAkka...",,"CrimeTVShows,InternationalTVShows,TVAction&Adv...","TVShow JulienLeclercq SamiBouajila,TracyGotoas...",Ganglands
3,s4,TVShow,,,,"Docuseries,RealityTV","TVShow Docuseries,RealityTV TV-MA Feuds, fl...",Jailbirds New Orleans
4,s5,TVShow,,"MayurMore,JitendraKumar,RanjanRaj,AlamKhan,Ahs...",India,"InternationalTVShows,RomanticTVShows,TVComedies","TVShow MayurMore,JitendraKumar,RanjanRaj,Alam...",Kota Factory
...,...,...,...,...,...,...,...,...
8802,s8803,Movie,DavidFincher,"MarkRuffalo,JakeGyllenhaal,RobertDowneyJr.,Ant...",UnitedStates,"CultMovies,Dramas,Thrillers","Movie DavidFincher MarkRuffalo,JakeGyllenhaal,...",Zodiac
8803,s8804,TVShow,,,,"Kids'TV,KoreanTVShows,TVComedies","TVShow Kids'TV,KoreanTVShows,TVComedies TV-...",Zombie Dumb
8804,s8805,Movie,RubenFleischer,"JesseEisenberg,WoodyHarrelson,EmmaStone,Abigai...",UnitedStates,"Comedies,HorrorMovies","Movie RubenFleischer JesseEisenberg,WoodyHarre...",Zombieland
8805,s8806,Movie,PeterHewitt,"TimAllen,CourteneyCox,ChevyChase,KateMara,Ryan...",UnitedStates,"Children&FamilyMovies,Comedies","Movie PeterHewitt TimAllen,CourteneyCox,ChevyC...",Zoom


In [12]:
# Converting all the tags to lower case
df_cols['tags'] = df_cols['tags'].apply(lambda x: str.lower(x))
df_cols

Unnamed: 0,show_id,type,director,cast,country,listed_in,tags,title
0,s1,Movie,KirstenJohnson,,UnitedStates,Documentaries,movie kirstenjohnson unitedstates documentari...,Dick Johnson Is Dead
1,s2,TVShow,,"AmaQamata,KhosiNgema,GailMabalane,ThabangMolab...",SouthAfrica,"InternationalTVShows,TVDramas,TVMysteries","tvshow amaqamata,khosingema,gailmabalane,thab...",Blood & Water
2,s3,TVShow,JulienLeclercq,"SamiBouajila,TracyGotoas,SamuelJouy,NabihaAkka...",,"CrimeTVShows,InternationalTVShows,TVAction&Adv...","tvshow julienleclercq samibouajila,tracygotoas...",Ganglands
3,s4,TVShow,,,,"Docuseries,RealityTV","tvshow docuseries,realitytv tv-ma feuds, fl...",Jailbirds New Orleans
4,s5,TVShow,,"MayurMore,JitendraKumar,RanjanRaj,AlamKhan,Ahs...",India,"InternationalTVShows,RomanticTVShows,TVComedies","tvshow mayurmore,jitendrakumar,ranjanraj,alam...",Kota Factory
...,...,...,...,...,...,...,...,...
8802,s8803,Movie,DavidFincher,"MarkRuffalo,JakeGyllenhaal,RobertDowneyJr.,Ant...",UnitedStates,"CultMovies,Dramas,Thrillers","movie davidfincher markruffalo,jakegyllenhaal,...",Zodiac
8803,s8804,TVShow,,,,"Kids'TV,KoreanTVShows,TVComedies","tvshow kids'tv,koreantvshows,tvcomedies tv-...",Zombie Dumb
8804,s8805,Movie,RubenFleischer,"JesseEisenberg,WoodyHarrelson,EmmaStone,Abigai...",UnitedStates,"Comedies,HorrorMovies","movie rubenfleischer jesseeisenberg,woodyharre...",Zombieland
8805,s8806,Movie,PeterHewitt,"TimAllen,CourteneyCox,ChevyChase,KateMara,Ryan...",UnitedStates,"Children&FamilyMovies,Comedies","movie peterhewitt timallen,courteneycox,chevyc...",Zoom


In [13]:
# Final dataset
df_clean = df_cols[['title', 'tags']]
df_clean

Unnamed: 0,title,tags
0,Dick Johnson Is Dead,movie kirstenjohnson unitedstates documentari...
1,Blood & Water,"tvshow amaqamata,khosingema,gailmabalane,thab..."
2,Ganglands,"tvshow julienleclercq samibouajila,tracygotoas..."
3,Jailbirds New Orleans,"tvshow docuseries,realitytv tv-ma feuds, fl..."
4,Kota Factory,"tvshow mayurmore,jitendrakumar,ranjanraj,alam..."
...,...,...
8802,Zodiac,"movie davidfincher markruffalo,jakegyllenhaal,..."
8803,Zombie Dumb,"tvshow kids'tv,koreantvshows,tvcomedies tv-..."
8804,Zombieland,"movie rubenfleischer jesseeisenberg,woodyharre..."
8805,Zoom,"movie peterhewitt timallen,courteneycox,chevyc..."


In [14]:
# Applying stemming on tags
ps = PorterStemmer()
def stem(text):
    y = []
    for i in text.split(): 
        y.append(ps.stem(i))
    return ' '.join(y)

In [15]:
df_clean['tags'] = df_clean['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['tags'] = df_clean['tags'].apply(stem)


In [16]:
df_clean['tags']

0       movi kirstenjohnson unitedst documentari pg-13...
1       tvshow amaqamata,khosingema,gailmabalane,thaba...
2       tvshow julienleclercq samibouajila,tracygotoas...
3       tvshow docuseries,realitytv tv-ma feuds, flirt...
4       tvshow mayurmore,jitendrakumar,ranjanraj,alamk...
                              ...                        
8802    movi davidfinch markruffalo,jakegyllenhaal,rob...
8803    tvshow kids'tv,koreantvshows,tvcomedi tv-y7 wh...
8804    movi rubenfleisch jesseeisenberg,woodyharrelso...
8805    movi peterhewitt timallen,courteneycox,chevych...
8806    movi mozezsingh vickykaushal,sarah-janedias,ra...
Name: tags, Length: 8807, dtype: object

In [17]:
# Performing vectorization to convert the tags into vector values
cv = CountVectorizer(max_features=10000, stop_words='english')
vectorized = cv.fit_transform(df_clean['tags']).toarray()

In [18]:
vectorized.shape

(8807, 10000)

In [19]:
# Using cosine similarity to detect the similarity in movies using distance
# Distance and similarity are inveresely proportional
cs = cosine_similarity(vectorized)
cs.shape

(8807, 8807)

In [20]:
cs[0]

array([1.        , 0.        , 0.16116459, ..., 0.09534626, 0.17407766,
       0.31980107])

In [21]:
# Extracting the index of the title in the dataset
df_clean[df_clean['title'] == 'Dick Johnson Is Dead'].index[0]

0

In [22]:
# Sorting the values based on the top 5 highest similarity without loosing the index
list(sorted(enumerate(cs[0]), reverse=True, key = lambda x: x[1]))[1:6]

[(2988, 0.4599331055039),
 (7854, 0.4212117695871159),
 (1102, 0.41812100500354543),
 (8693, 0.414578098794425),
 (8691, 0.41030496993110904)]

In [23]:
def recommend(movie):
    idx = df_clean[df_clean['title'] == movie].index[0]
    movie_list = list(sorted(enumerate(cs[idx]), reverse=True, key = lambda x: x[1]))[1:6]

    for i in movie_list:
        print(df_clean.iloc[i[0]].title)

In [24]:
recommend('Zubaan')

Ek Jaan Hain Hum
The Music of Silence
Bilu Rakkhosh
Kaminey
Once In A Lifetime Sessions with Moby


In [25]:
# pickle.dump(df_clean,open('movies.pkl' ,'wb'))

In [26]:
df_clean['title'].values

array(['Dick Johnson Is Dead', 'Blood & Water', 'Ganglands', ...,
       'Zombieland', 'Zoom', 'Zubaan'], dtype=object)

In [27]:
pickle.dump(cs, open('similarity.pkl', 'wb'))

In [28]:
df_dict = df_clean.to_dict()

In [29]:
pickle.dump(df_dict,open('movies_dict.pkl' ,'wb'))