In [61]:
import numpy as np
import pandas as pd
import plotly.express as px
from textblob import TextBlob

In [18]:
df=pd.read_csv('netflix_titles.csv')
df.shape

(8807, 12)

In [19]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [20]:
df['director']

0       Kirsten Johnson
1                   NaN
2       Julien Leclercq
3                   NaN
4                   NaN
             ...       
8802      David Fincher
8803                NaN
8804    Ruben Fleischer
8805       Peter Hewitt
8806        Mozez Singh
Name: director, Length: 8807, dtype: object

**RATINGS ON NETFLIX**

In [7]:
df['rating']

0       PG-13
1       TV-MA
2       TV-MA
3       TV-MA
4       TV-MA
        ...  
8802        R
8803    TV-Y7
8804        R
8805       PG
8806    TV-14
Name: rating, Length: 8807, dtype: object

In [9]:
rating_group = df.groupby(['rating']).size().reset_index(name='counts')
rating_group

Unnamed: 0,rating,counts
0,66 min,1
1,74 min,1
2,84 min,1
3,G,41
4,NC-17,3
5,NR,80
6,PG,287
7,PG-13,490
8,R,799
9,TV-14,2160


In [10]:
pieChart = px.pie(rating_group, values='counts', names='rating',
                  title='Distribution of Content Ratings on Netflix',
                  color_discrete_sequence=px.colors.qualitative.Set3)
pieChart.show()

**TOP 5 DIRECTORS ON NETFLIX**

In [23]:
df['director']=df['director'].fillna('No Director Mentioned')
df['director']

0             Kirsten Johnson
1       No Director Specified
2             Julien Leclercq
3       No Director Specified
4       No Director Specified
                ...          
8802            David Fincher
8803    No Director Specified
8804          Ruben Fleischer
8805             Peter Hewitt
8806              Mozez Singh
Name: director, Length: 8807, dtype: object

In [24]:
director_df=df['director'].str.split(',',expand=True).stack()
director_df

0     0          Kirsten Johnson
1     0    No Director Specified
2     0          Julien Leclercq
3     0    No Director Specified
4     0    No Director Specified
                   ...          
8802  0            David Fincher
8803  0    No Director Specified
8804  0          Ruben Fleischer
8805  0             Peter Hewitt
8806  0              Mozez Singh
Length: 9612, dtype: object

In [25]:
director_df=director_df.to_frame()
director_df

Unnamed: 0,Unnamed: 1,0
0,0,Kirsten Johnson
1,0,No Director Specified
2,0,Julien Leclercq
3,0,No Director Specified
4,0,No Director Specified
...,...,...
8802,0,David Fincher
8803,0,No Director Specified
8804,0,Ruben Fleischer
8805,0,Peter Hewitt


In [27]:
director_df.columns=['Director']
director_df

Unnamed: 0,Unnamed: 1,Director
0,0,Kirsten Johnson
1,0,No Director Specified
2,0,Julien Leclercq
3,0,No Director Specified
4,0,No Director Specified
...,...,...
8802,0,David Fincher
8803,0,No Director Specified
8804,0,Ruben Fleischer
8805,0,Peter Hewitt


In [28]:
directors=director_df.groupby(['Director']).size().reset_index(name='Total Content')
directors

Unnamed: 0,Director,Total Content
0,Aaron Moorhead,2
1,Aaron Woolf,1
2,Abbas Alibhai Burmawalla,1
3,Abdullah Al Noor,1
4,Abhinav Shiv Tiwari,1
...,...,...
5116,Çagan Irmak,1
5117,Ísold Uggadóttir,1
5118,Óskar Thór Axelsson,1
5119,Ömer Faruk Sorak,2


In [32]:
directors=directors[directors.Director !='No Director Specified']
directors=directors.sort_values(by=['Total Content'],ascending=False)
top_directors=directors.head()
top_directors

Unnamed: 0,Director,Total Content
4021,Rajiv Chilaka,22
4068,Raúl Campos,18
261,Jan Suter,18
4652,Suhas Kadav,16
3235,Marcus Raboy,16


In [34]:
top_directors=top_directors.sort_values(by=['Total Content'])
top_directors

Unnamed: 0,Director,Total Content
4652,Suhas Kadav,16
3235,Marcus Raboy,16
4068,Raúl Campos,18
261,Jan Suter,18
4021,Rajiv Chilaka,22


In [36]:
director_plot=px.bar(top_directors,x='Director',y='Total Content',title='Top 5 Directors on Netflix')
director_plot.show()

From above, Rajiv Chilaka is top Director on Netflix.

**TOP 5 ACTORS ON NETFLIX**

In [37]:
df['cast']=df['cast'].fillna('No Cast Specified')
df['cast']

0                                       No Cast Specified
1       Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...
2       Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...
3                                       No Cast Specified
4       Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...
                              ...                        
8802    Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...
8803                                    No Cast Specified
8804    Jesse Eisenberg, Woody Harrelson, Emma Stone, ...
8805    Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...
8806    Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...
Name: cast, Length: 8807, dtype: object

In [40]:
cast_df=df['cast'].str.split(',',expand=True).stack()
cast_df=cast_df.to_frame()
cast_df.columns=['Actor']
cast_df

Unnamed: 0,Unnamed: 1,Actor
0,0,No Cast Specified
1,0,Ama Qamata
1,1,Khosi Ngema
1,2,Gail Mabalane
1,3,Thabang Molaba
...,...,...
8806,3,Manish Chaudhary
8806,4,Meghna Malik
8806,5,Malkeet Rauni
8806,6,Anita Shabdish


In [42]:
actors_df=cast_df.groupby(['Actor']).size().reset_index(name='Total Content')
actors_df=actors_df[actors_df.Actor !='No Cast Specified']
actors_df

Unnamed: 0,Actor,Total Content
0,Jr.,2
1,"""Riley"" Lakdhar Dridi",1
2,'Najite Dede,1
3,2 Chainz,1
4,2Mex,1
...,...,...
39292,İbrahim Büyükak,1
39293,İbrahim Çelikkol,1
39294,Şahin Irmak,1
39295,Şükrü Özyıldız,1


In [45]:
actors_df=actors_df.sort_values(by=['Total Content'],ascending=False)
top_actors=actors_df.head()
top_actors=top_actors.sort_values(by=['Total Content'])
top_actors

Unnamed: 0,Actor,Total Content
23624,Om Puri,27
15541,Julie Tejwani,28
30303,Takahiro Sakurai,30
26941,Rupa Bhimani,31
2612,Anupam Kher,39


In [47]:
actor_plot=px.bar(top_actors,x='Actor',y='Total Content', title='Top 5 Actors on Netflix')
actor_plot.show()

From above plot, Anupam Kher is the top Actor on Netflix.

**CONTENT PRODUCED OVER YEARS**

In [48]:
release_df=df[['type','release_year']]
release_df

Unnamed: 0,type,release_year
0,Movie,2020
1,TV Show,2021
2,TV Show,2021
3,TV Show,2021
4,TV Show,2021
...,...,...
8802,Movie,2007
8803,TV Show,2018
8804,Movie,2009
8805,Movie,2006


In [49]:
release_df=release_df.rename(columns={"release_year": "Release Year"})
release_df

Unnamed: 0,type,Release Year
0,Movie,2020
1,TV Show,2021
2,TV Show,2021
3,TV Show,2021
4,TV Show,2021
...,...,...
8802,Movie,2007
8803,TV Show,2018
8804,Movie,2009
8805,Movie,2006


In [50]:
release_df=release_df.groupby(['Release Year','type']).size().reset_index(name='Total Content')
release_df

Unnamed: 0,Release Year,type,Total Content
0,1925,TV Show,1
1,1942,Movie,2
2,1943,Movie,3
3,1944,Movie,3
4,1945,Movie,3
...,...,...,...
114,2019,TV Show,397
115,2020,Movie,517
116,2020,TV Show,436
117,2021,Movie,277


In [51]:
release_df=release_df[release_df['Release Year']>=2010]
release_df

Unnamed: 0,Release Year,type,Total Content
95,2010,Movie,154
96,2010,TV Show,40
97,2011,Movie,145
98,2011,TV Show,40
99,2012,Movie,173
100,2012,TV Show,64
101,2013,Movie,225
102,2013,TV Show,63
103,2014,Movie,264
104,2014,TV Show,88


In [53]:
release_plot = px.line(release_df, x="Release Year", y="Total Content", color='type',title='Content produced over the years on Netflix')
release_plot.show()

 Both movies and TV shows have experienced decrease since 2018.







**SENTIMENT ANALYSIS ON NETFLIX**

In [55]:
final_df=df[['release_year','description']]
final_df=final_df.rename(columns={'release_year':'Release Year'})
final_df

Unnamed: 0,Release Year,description
0,2020,"As her father nears the end of his life, filmm..."
1,2021,"After crossing paths at a party, a Cape Town t..."
2,2021,To protect his family from a powerful drug lor...
3,2021,"Feuds, flirtations and toilet talk go down amo..."
4,2021,In a city of coaching centers known to train I...
...,...,...
8802,2007,"A political cartoonist, a crime reporter and a..."
8803,2018,"While living alone in a spooky town, a young g..."
8804,2009,Looking to survive in a world taken over by zo...
8805,2006,"Dragged from civilian life, a former superhero..."


In [57]:
for index,row in final_df.iterrows():
    z=row['description']
    testimonial=TextBlob(z)
    p=testimonial.sentiment.polarity
    if p==0:
        sent='Neutral'
    elif p>0:
        sent='Positive'
    else:
        sent='Negative'
    final_df.loc[[index,2],'Sentiment']=sent

In [58]:
final_df

Unnamed: 0,Release Year,description,Sentiment
0,2020,"As her father nears the end of his life, filmm...",Positive
1,2021,"After crossing paths at a party, a Cape Town t...",Neutral
2,2021,To protect his family from a powerful drug lor...,Negative
3,2021,"Feuds, flirtations and toilet talk go down amo...",Negative
4,2021,In a city of coaching centers known to train I...,Neutral
...,...,...,...
8802,2007,"A political cartoonist, a crime reporter and a...",Negative
8803,2018,"While living alone in a spooky town, a young g...",Positive
8804,2009,Looking to survive in a world taken over by zo...,Neutral
8805,2006,"Dragged from civilian life, a former superhero...",Positive


In [59]:
final_df=final_df.groupby(['Release Year','Sentiment']).size().reset_index(name='Total Content')
final_df

Unnamed: 0,Release Year,Sentiment,Total Content
0,1925,Neutral,1
1,1942,Neutral,2
2,1943,Negative,1
3,1943,Neutral,2
4,1944,Negative,1
...,...,...,...
180,2020,Neutral,161
181,2020,Positive,519
182,2021,Negative,164
183,2021,Neutral,85


In [60]:
final_df=final_df[final_df['Release Year']>=2010]
sentiment_plot = px.bar(final_df, x="Release Year", y="Total Content", color="Sentiment", title="Sentiment of content on Netflix")
sentiment_plot.show()


The graph illustrates that the combined quantity of positive content consistently surpasses the sum of neutral and negative content.