In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the raw Netflix catalogue; the CSV is expected next to the notebook
df_netflix = pd.read_csv(filepath_or_buffer="netflix_titles.csv")
df_netflix

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...
...,...,...,...,...,...,...,...,...,...,...,...,...
7782,s7783,Movie,Zozo,Josef Fares,"Imad Creidi, Antoinette Turk, Elias Gergi, Car...","Sweden, Czech Republic, United Kingdom, Denmar...","October 19, 2020",2005,TV-MA,99 min,"Dramas, International Movies",When Lebanon's Civil War deprives Zozo of his ...
7783,s7784,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,"March 2, 2019",2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...
7784,s7785,Movie,Zulu Man in Japan,,Nasty C,,"September 25, 2020",2019,TV-MA,44 min,"Documentaries, International Movies, Music & M...","In this documentary, South African rapper Nast..."
7785,s7786,TV Show,Zumbo's Just Desserts,,"Adriano Zumbo, Rachel Khoo",Australia,"October 31, 2020",2019,TV-PG,1 Season,"International TV Shows, Reality TV",Dessert wizard Adriano Zumbo looks for the nex...


In [3]:
# Drop the columns the recommender does not use and replace missing
# director/cast values with a placeholder string.
#
# Both steps reassign `df_netflix` instead of mutating it in place:
#   * `df["col"].fillna(..., inplace=True)` is chained assignment; it emits a
#     FutureWarning on pandas 2.x and leaves the frame unchanged once
#     Copy-on-Write is enabled, so the placeholder would never be written.
#   * `errors='ignore'` lets the cell be re-run after the columns are already
#     gone instead of raising KeyError.
df_netflix = df_netflix.drop(columns=['show_id', 'date_added'], errors='ignore')
df_netflix = df_netflix.fillna({"director": "Not available", "cast": "Not available"})
df_netflix

Unnamed: 0,type,title,director,cast,country,release_year,rating,duration,listed_in,description
0,TV Show,3%,Not available,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...
...,...,...,...,...,...,...,...,...,...,...
7782,Movie,Zozo,Josef Fares,"Imad Creidi, Antoinette Turk, Elias Gergi, Car...","Sweden, Czech Republic, United Kingdom, Denmar...",2005,TV-MA,99 min,"Dramas, International Movies",When Lebanon's Civil War deprives Zozo of his ...
7783,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...
7784,Movie,Zulu Man in Japan,Not available,Nasty C,,2019,TV-MA,44 min,"Documentaries, International Movies, Music & M...","In this documentary, South African rapper Nast..."
7785,TV Show,Zumbo's Just Desserts,Not available,"Adriano Zumbo, Rachel Khoo",Australia,2019,TV-PG,1 Season,"International TV Shows, Reality TV",Dessert wizard Adriano Zumbo looks for the nex...


In [4]:
# Keep only the TV-show rows, renumber them from 0, and discard the columns
# that are not needed for the recommendations
df_shows = (
    df_netflix.loc[df_netflix['type'] == 'TV Show']
    .reset_index(drop=True)
    .drop(columns=['type', 'rating', 'duration', 'release_year'])
)

df_shows

Unnamed: 0,title,director,cast,country,listed_in,description
0,3%,Not available,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...
2,1983,Not available,"Robert Więckiewicz, Maciej Musiał, Michalina O...","Poland, United States","Crime TV Shows, International TV Shows, TV Dramas","In this dark alt-history thriller, a naïve law..."
3,1994,Diego Enrique Osorno,Not available,Mexico,"Crime TV Shows, Docuseries, International TV S...",Archival video and new interviews examine Mexi...
4,Feb-09,Not available,"Shahd El Yaseen, Shaila Sabt, Hala, Hanadi Al-...",,"International TV Shows, TV Dramas","As a psychology professor faces Alzheimer's, h..."
...,...,...,...,...,...,...
2405,Zindagi Gulzar Hai,Not available,"Sanam Saeed, Fawad Khan, Ayesha Omer, Mehreen ...",Pakistan,"International TV Shows, Romantic TV Shows, TV ...","Strong-willed, middle-class Kashaf and carefre..."
2406,Zoids Wild,Not available,"Kensho Ono, Takahiro Sakurai, Mikako Komatsu, ...",Japan,"Anime Series, Kids' TV",A quest for freedom and legendary treasure beg...
2407,Zombie Dumb,Not available,Not available,,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
2408,Zona Rosa,Not available,"Manu NNa, Ana Julia Yeyé, Ray Contreras, Pablo...",Mexico,"International TV Shows, Spanish-Language TV Sh...",An assortment of talent takes the stage for a ...


In [5]:
# List every TV show whose director value contains the "Not available" placeholder
director_missing = df_shows['director'].str.contains('Not available')
df_shows.loc[director_missing]

Unnamed: 0,title,director,cast,country,listed_in,description
0,3%,Not available,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
2,1983,Not available,"Robert Więckiewicz, Maciej Musiał, Michalina O...","Poland, United States","Crime TV Shows, International TV Shows, TV Dramas","In this dark alt-history thriller, a naïve law..."
4,Feb-09,Not available,"Shahd El Yaseen, Shaila Sabt, Hala, Hanadi Al-...",,"International TV Shows, TV Dramas","As a psychology professor faces Alzheimer's, h..."
5,​SAINT SEIYA: Knights of the Zodiac,Not available,"Bryson Baugus, Emily Neves, Blake Shepard, Pat...",Japan,"Anime Series, International TV Shows",Seiya and the Knights of the Zodiac rise again...
6,(Un)Well,Not available,Not available,United States,Reality TV,This docuseries takes a deep dive into the luc...
...,...,...,...,...,...,...
2405,Zindagi Gulzar Hai,Not available,"Sanam Saeed, Fawad Khan, Ayesha Omer, Mehreen ...",Pakistan,"International TV Shows, Romantic TV Shows, TV ...","Strong-willed, middle-class Kashaf and carefre..."
2406,Zoids Wild,Not available,"Kensho Ono, Takahiro Sakurai, Mikako Komatsu, ...",Japan,"Anime Series, Kids' TV",A quest for freedom and legendary treasure beg...
2407,Zombie Dumb,Not available,Not available,,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
2408,Zona Rosa,Not available,"Manu NNa, Ana Julia Yeyé, Ray Contreras, Pablo...",Mexico,"International TV Shows, Spanish-Language TV Sh...",An assortment of talent takes the stage for a ...


In [6]:
# Remove the director column before building the count vectors: a shared
# director value would register as similarity even between shows that are
# otherwise unrelated
df_shows = df_shows.drop(columns='director')
df_shows

Unnamed: 0,title,cast,country,listed_in,description
0,3%,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,46,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...
2,1983,"Robert Więckiewicz, Maciej Musiał, Michalina O...","Poland, United States","Crime TV Shows, International TV Shows, TV Dramas","In this dark alt-history thriller, a naïve law..."
3,1994,Not available,Mexico,"Crime TV Shows, Docuseries, International TV S...",Archival video and new interviews examine Mexi...
4,Feb-09,"Shahd El Yaseen, Shaila Sabt, Hala, Hanadi Al-...",,"International TV Shows, TV Dramas","As a psychology professor faces Alzheimer's, h..."
...,...,...,...,...,...
2405,Zindagi Gulzar Hai,"Sanam Saeed, Fawad Khan, Ayesha Omer, Mehreen ...",Pakistan,"International TV Shows, Romantic TV Shows, TV ...","Strong-willed, middle-class Kashaf and carefre..."
2406,Zoids Wild,"Kensho Ono, Takahiro Sakurai, Mikako Komatsu, ...",Japan,"Anime Series, Kids' TV",A quest for freedom and legendary treasure beg...
2407,Zombie Dumb,Not available,,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
2408,Zona Rosa,"Manu NNa, Ana Julia Yeyé, Ray Contreras, Pablo...",Mexico,"International TV Shows, Spanish-Language TV Sh...",An assortment of talent takes the stage for a ...


In [7]:
# Remove the spaces inside the cast column so each full name reads as one word
cast_without_spaces = df_shows['cast'].str.replace(" ", "")
df_shows['cast'] = cast_without_spaces
df_shows

Unnamed: 0,title,cast,country,listed_in,description
0,3%,"JoãoMiguel,BiancaComparato,MichelGomes,Rodolfo...",Brazil,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,46,"ErdalBeşikçioğlu,YaseminAllen,MelisBirkan,Sayg...",Turkey,"International TV Shows, TV Dramas, TV Mysteries",A genetics professor experiments with a treatm...
2,1983,"RobertWięckiewicz,MaciejMusiał,MichalinaOlszań...","Poland, United States","Crime TV Shows, International TV Shows, TV Dramas","In this dark alt-history thriller, a naïve law..."
3,1994,Notavailable,Mexico,"Crime TV Shows, Docuseries, International TV S...",Archival video and new interviews examine Mexi...
4,Feb-09,"ShahdElYaseen,ShailaSabt,Hala,HanadiAl-Kandari...",,"International TV Shows, TV Dramas","As a psychology professor faces Alzheimer's, h..."
...,...,...,...,...,...
2405,Zindagi Gulzar Hai,"SanamSaeed,FawadKhan,AyeshaOmer,MehreenRaheel,...",Pakistan,"International TV Shows, Romantic TV Shows, TV ...","Strong-willed, middle-class Kashaf and carefre..."
2406,Zoids Wild,"KenshoOno,TakahiroSakurai,MikakoKomatsu,Etsuko...",Japan,"Anime Series, Kids' TV",A quest for freedom and legendary treasure beg...
2407,Zombie Dumb,Notavailable,,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
2408,Zona Rosa,"ManuNNa,AnaJuliaYeyé,RayContreras,PabloMorán",Mexico,"International TV Shows, Spanish-Language TV Sh...",An assortment of talent takes the stage for a ...


In [None]:
# Build the keyword text that gets vectorized: every feature column of a row,
# lower-cased and joined with ", " (missing values are skipped).
#
# The feature columns are chosen by name instead of `columns[1:]` so that
# re-running this cell does not fold the previously generated 'text' column
# back into itself, which would duplicate every keyword in the counts.
show_feature_cols = [col for col in df_shows.columns if col not in ('title', 'text')]
df_shows['text'] = df_shows[show_feature_cols].apply(
    lambda row: ', '.join(row.dropna().astype(str).str.lower()), axis=1
)
df_shows

In [None]:
# Word-count matrix over the shows' 'text' column (one row per show)
vect_count = CountVectorizer()
count_matrix = vect_count.fit_transform(df_shows['text'])

# Pairwise cosine similarity between all shows; with a single argument the
# matrix is compared against itself
cosine_sim = cosine_similarity(count_matrix)

def show_recommendations(title, cosine_sim = cosine_sim, top_n = 50, verbose = False):
    """Return the titles of the TV shows most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``df_shows['title']``. The comparison is an exact,
        case-sensitive equality test; if several rows share the title the
        first one is used.
    cosine_sim : array-like
        Square similarity matrix whose row/column order follows ``df_shows``.
        The default is bound when this cell is executed, so a later
        reassignment of the global ``cosine_sim`` does not affect this function.
    top_n : int, default 50
        Maximum number of recommendations to return.
    verbose : bool, default False
        When True, print the matched row index and the full sorted score
        Series (the debugging output this function used to print on every call).

    Returns
    -------
    list of str
        Up to ``top_n`` titles, ordered from most to least similar; the
        queried show itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df_shows`` has the given title.
    """
    matches = df_shows.index[df_shows['title'] == title]
    if len(matches) == 0:
        # IndexError is what the bare `.index[0]` lookup raised before; keep the
        # type but explain what went wrong.
        raise IndexError(
            f"No TV show titled {title!r} found (the match is exact and case-sensitive)"
        )
    idx = matches[0]

    # similarity of the selected show to every show, highest first
    scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    if verbose:
        print('idx is ', idx)
        print(scores)

    # Remove the queried show by its own position instead of assuming it sorts
    # first: another row with an equal score could otherwise take slot 0, which
    # would leak the queried title into the results and drop a real match.
    top_index = list(scores.drop(idx).iloc[:top_n].index)

    # positions in `scores` line up with the row positions of df_shows
    return df_shows['title'].iloc[top_index].tolist()

In [None]:
# Keep only the movie rows, renumber them from 0, and discard the columns
# that are not needed for the recommendations
df_movies = (
    df_netflix.loc[df_netflix['type'] == 'Movie']
    .reset_index(drop=True)
    .drop(columns=['type', 'rating', 'duration', 'release_year'])
)
df_movies

In [None]:
# Remove the spaces inside the director and cast columns so each full name
# reads as one word
for name_col in ['director', 'cast']:
    df_movies[name_col] = df_movies[name_col].str.replace(" ", "")
df_movies

In [None]:
# Build the keyword text that gets vectorized: every feature column of a row,
# lower-cased and joined with ", " (missing values are skipped).
#
# The feature columns are chosen by name instead of `columns[1:]` so that
# re-running this cell does not fold the previously generated 'text' column
# back into itself, which would duplicate every keyword in the counts.
movie_feature_cols = [col for col in df_movies.columns if col not in ('title', 'text')]
df_movies['text'] = df_movies[movie_feature_cols].apply(
    lambda row: ', '.join(row.dropna().astype(str).str.lower()), axis=1
)
df_movies

In [None]:
# Word-count matrix over the movies' 'text' column (one row per movie)
vect_count = CountVectorizer()
count_matrix = vect_count.fit_transform(df_movies['text'])

# Pairwise cosine similarity between all movies; with a single argument the
# matrix is compared against itself
cosine_sim = cosine_similarity(count_matrix)

def movie_recommendations(title, cosine_sim = cosine_sim, top_n = 50, verbose = False):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Title to look up in ``df_movies['title']``. The comparison is an exact,
        case-sensitive equality test; if several rows share the title the
        first one is used.
    cosine_sim : array-like
        Square similarity matrix whose row/column order follows ``df_movies``.
        The default is bound when this cell is executed, so a later
        reassignment of the global ``cosine_sim`` does not affect this function.
    top_n : int, default 50
        Maximum number of recommendations to return.
    verbose : bool, default False
        When True, print the matched row index and the full sorted score
        Series (the debugging output this function used to print on every call).

    Returns
    -------
    list of str
        Up to ``top_n`` titles, ordered from most to least similar; the
        queried movie itself is never included.

    Raises
    ------
    IndexError
        If no row of ``df_movies`` has the given title.
    """
    matches = df_movies.index[df_movies['title'] == title]
    if len(matches) == 0:
        # IndexError is what the bare `.index[0]` lookup raised before; keep the
        # type but explain what went wrong.
        raise IndexError(
            f"No movie titled {title!r} found (the match is exact and case-sensitive)"
        )
    idx = matches[0]

    # similarity of the selected movie to every movie, highest first
    scores = pd.Series(cosine_sim[idx]).sort_values(ascending = False)
    if verbose:
        print('idx is ', idx)
        print(scores)

    # Remove the queried movie by its own position instead of assuming it sorts
    # first: another row with an equal score could otherwise take slot 0, which
    # would leak the queried title into the results and drop a real match.
    top_index = list(scores.drop(idx).iloc[:top_n].index)

    # positions in `scores` line up with the row positions of df_movies
    return df_movies['title'].iloc[top_index].tolist()

In [None]:
m = input("Enter a movie title: ")
# Collect the recommendations into a single-column dataframe
recommended_movies = movie_recommendations(m)
df_movie_rec = pd.DataFrame(recommended_movies, columns=['Movie Titles'])
# Number the rows from 1 instead of 0
df_movie_rec.index = df_movie_rec.index + 1
df_movie_rec

In [None]:
y = input("Enter a show title: ")
# Collect the recommendations into a single-column dataframe
recommended_shows = show_recommendations(y)
df_show_rec = pd.DataFrame(recommended_shows, columns=['TV Show Titles'])
# Number the rows from 1 instead of 0
df_show_rec.index = df_show_rec.index + 1
df_show_rec

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Share of movies vs. TV shows, based on the number of non-null titles in each frame
x = ['Movie', 'Show']
movie_count = df_movies['title'].count()
show_count = df_shows['title'].count()
y = [movie_count, show_count]

plt.pie(y, labels=x, colors=['orange', 'blue'], autopct='%.2f%%')
plt.title("Count of Shows vs Movies")
plt.show()

In [None]:
# Top 10 words in the consolidated 'text' column for movies.
#
# Each row is split on whitespace on its own: joining all rows with ',' (no
# space) first would glue the last word of one row to the first word of the
# next. Surrounding punctuation is stripped so that e.g. "dramas," (produced
# by the ", " join of the feature columns) and "dramas" count as one word.
movie_words = (
    df_movies['text']
    .str.split()
    .explode()
    .str.strip(",.;:!?\"'()")
)
# tokens that were pure punctuation are now empty strings; leave them out
movie_textcount = movie_words[movie_words.str.len() > 0].value_counts().head(10)
movie_textlist = movie_textcount.index.tolist()
movie_list = str(movie_textlist).replace(',','')

plt.bar(movie_textlist, movie_textcount, color=(0.2, 0.4, 0.6, 0.6))
plt.xlabel("Word")
plt.ylabel("Occurrences")
plt.title("Top 10 Words in Movie Text")
plt.show()


In [None]:
#Top 15 genres in both Movies and Shows
df_genre = pd.DataFrame(df_netflix['listed_in'])
# one entry per (title, genre) pair, then count how often each genre occurs
genre_count = df_genre.listed_in.str.split(', ',expand=True).stack().value_counts()
# `to_frame(name=...)` sets the column label regardless of the Series' own name.
# `pd.DataFrame(series, columns=[label])` instead *selects* a column called
# `label`; since pandas 2.0 names the value_counts() result 'count', that
# selection finds nothing and the plot below would have no numeric data.
final_genre = genre_count.to_frame(name='# of genre')
top_15_genre = genre_count.head(15).to_frame(name='# of movies/shows')
top_15_genre.plot(kind='bar')
plt.xlabel("Genre name")
plt.ylabel("Count of genre")
plt.title("Top 15 Genres in Netflix Dataset")
plt.show()


In [None]:
#Top 10 Movie Genres
df_genre_movies = pd.DataFrame(df_movies['listed_in'])
# one entry per (movie, genre) pair, then count how often each genre occurs
genre_count = df_genre_movies.listed_in.str.split(', ',expand=True).stack().value_counts()
# `to_frame(name=...)` sets the column label regardless of the Series' own name.
# `pd.DataFrame(series, columns=[label])` instead *selects* a column called
# `label`; since pandas 2.0 names the value_counts() result 'count', that
# selection finds nothing and the plot below would have no numeric data.
final_genre = genre_count.to_frame(name='# of genre')
top_10_genre = genre_count.head(10).to_frame(name='# of movies')
top_10_genre.plot(kind='bar')
plt.xlabel("Genre name")
plt.ylabel("Count of genre")
plt.title("Top 10 Movie Genres")
plt.show()


In [None]:
#Top 10 TV Show Genres
df_genre_show = pd.DataFrame(df_shows['listed_in'])
# one entry per (show, genre) pair, then count how often each genre occurs
genre_count = df_genre_show.listed_in.str.split(', ',expand=True).stack().value_counts()
# `to_frame(name=...)` sets the column label regardless of the Series' own name.
# `pd.DataFrame(series, columns=[label])` instead *selects* a column called
# `label`; since pandas 2.0 names the value_counts() result 'count', that
# selection finds nothing and the plot below would have no numeric data.
final_genre = genre_count.to_frame(name='# of genre')
top_10_genre = genre_count.head(10).to_frame(name='# of tv shows')
top_10_genre.plot(kind='bar')
plt.xlabel("Genre name")
plt.ylabel("Count of genre")
plt.title("Top 10 TV Show Genres")
plt.show()


In [None]:
import seaborn as sns
sns.set(style="darkgrid")
# Number of titles (movies and TV shows together) for the ten most frequent ratings
top_ratings = df_netflix['rating'].value_counts().index[0:10]
ax = sns.countplot(x="rating", data=df_netflix, order=top_ratings)

In [None]:
# Ratings broken down by country, for the ten most frequent ratings.
# Using the raw 'country' column as hue creates one bar group and one legend
# entry per distinct country string, which makes the chart unreadable, so the
# plot is limited to the five most frequent country values.
# NOTE(review): 'country' holds comma-separated lists for co-productions; each
# distinct string is treated as its own value here - confirm that is intended.
top_ratings = df_netflix['rating'].value_counts().index[0:10]
top_countries = df_netflix['country'].value_counts().index[0:5].tolist()
ax = sns.countplot(
    x="rating",
    hue="country",
    data=df_netflix[df_netflix['country'].isin(top_countries)],
    order=top_ratings,
    hue_order=top_countries,
)
ax.set_title("Titles per Rating for the 5 Most Frequent Countries")
