The aim of this notebook is to walk through:
Importing libraries -- Data preprocessing -- Handling missing data -- Data visualization
1) Types of content available.
2) Top five rating categories.
3) Top five directors.
4) Top five actors.
5) Trend of focus on TV shows and movies in
recent years.
6) Top ten countries with the most content.

# Importing libraries

In [None]:
!pip install cutecharts

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cutecharts
  Downloading cutecharts-1.2.0-py3-none-any.whl (17 kB)
Installing collected packages: cutecharts
Successfully installed cutecharts-1.2.0


In [None]:
import pandas as pd
import numpy as np
import cutecharts.charts as ctc
from cutecharts.charts import Line
from cutecharts.faker import Faker
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

This dataset contains information about Netflix Movies and TV Shows.

In [None]:
# Colab-specific: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the Netflix titles CSV from the mounted Drive. The path is anchored at
# the absolute mount point (/content/drive) so the read does not depend on the
# kernel's current working directory.
data = pd.read_csv('/content/drive/MyDrive/DS/netflix_titles.csv')

In [None]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,TV Show,3%,,"João Miguel, Bianca Comparato, Michel Gomes, R...",Brazil,"August 14, 2020",2020,TV-MA,4 Seasons,"International TV Shows, TV Dramas, TV Sci-Fi &...",In a future where the elite inhabit an island ...
1,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,"December 23, 2016",2016,TV-MA,93 min,"Dramas, International Movies",After a devastating earthquake hits Mexico Cit...
2,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,"December 20, 2018",2011,R,78 min,"Horror Movies, International Movies","When an army recruit is found dead, his fellow..."
3,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,"November 16, 2017",2009,PG-13,80 min,"Action & Adventure, Independent Movies, Sci-Fi...","In a postapocalyptic world, rag-doll robots hi..."
4,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,"January 1, 2020",2008,PG-13,123 min,Dramas,A brilliant group of students become card-coun...


In [None]:
# Report the (rows, columns) shape of the loaded frame between two rule lines.
banner = '-' * 50
size_message = '\nSize of Netflix data is {}\n'.format(data.shape)
for line in (banner, size_message, banner):
    print(line)


--------------------------------------------------

Size of Netflix data is (7787, 12)

--------------------------------------------------


# Data preprocessing

In [None]:
# Print a small header, then show summary statistics for the numeric columns
# (the frame is the cell's last expression so it renders as a rich table).
separator = '-' * 50
print(separator, '\nStatistical data of given data', separator, sep='\n')
data.describe()

--------------------------------------------------

Statistical data of given data
--------------------------------------------------


Unnamed: 0,release_year
count,7787.0
mean,2013.93258
std,8.757395
min,1925.0
25%,2013.0
50%,2017.0
75%,2018.0
max,2021.0


# Handling missing data
1. Replace missing directors with 'No Director'
2. Replace missing cast with 'No Cast'
3. Replace missing countries with 'Not Specify'

In [None]:
# Fill missing director / cast / country values with explicit placeholders.
# This is done with a single DataFrame-level fillna that returns a new frame:
# calling `data[col].replace(..., inplace=True)` mutates a temporary column
# object, which does not update `data` when pandas Copy-on-Write is active.
data = data.fillna({
    'director': 'No Director',
    'cast': 'No Cast',
    'country': 'Not Specify',
})
# Remaining null counts per column (only columns not filled above can be non-zero).
data.isnull().sum()

show_id          0
type             0
title            0
director         0
cast             0
country          0
date_added      10
release_year     0
rating           7
duration         0
listed_in        0
description      0
dtype: int64

In [None]:
# Discard every row that still contains at least one missing value,
# then confirm that no nulls remain in any column.
data = data.dropna(axis=0, how='any')
data.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [None]:
# Count fully duplicated rows and print the result inside a ruled banner.
divider = '-' * 50
duplicate_rows = data.duplicated().sum()
print(divider)
print("Check Duplicates")
print(divider)
print('Total Duplicates values: ', duplicate_rows)
print(divider)

--------------------------------------------------
Check Duplicates
--------------------------------------------------
Total Duplicates values:  0
--------------------------------------------------


# Data visualization
1) Types of content available

In [None]:
# Count how many titles fall under each value of the `type` column.
data['type'].value_counts()

Movie      5372
TV Show    2398
Name: type, dtype: int64

Pie chart of content

In [None]:

# Distinct values of the `type` column; reused as the labels of the pie chart.
t_labels = data['type'].unique()
t_labels

array(['TV Show', 'Movie'], dtype=object)

In [None]:
# Pie chart of how many titles belong to each content type.
# Count titles per type and order the counts to match `t_labels`, so that each
# label (and its colour) lines up with the right slice.
type_counts = data['type'].value_counts().reindex(t_labels)

pie = ctc.Pie('Type of content',  # title
              width='600px', height='300px')
# set the chart options
pie.set_options(labels=list(t_labels),  # names as labels
                inner_radius=0,  # inner radius set to 0
                colors=['Red', 'blue'])
# The series must hold the numeric slice sizes; passing the label strings
# again (as before) gives the chart no data to size the slices with.
pie.add_series(type_counts.tolist())
# display the chart
pie.render_notebook()

# Top Five Rating Categories

In [None]:
# Tally titles per rating, then keep the five largest tallies.
# Sorting ascending and taking the tail leaves the most frequent rating last.
newdata = (
    data.groupby('rating')
    .size()
    .rename_axis('Rating')
    .reset_index(name='Count')
)
nd = newdata.sort_values('Count').tail(5)

In [None]:
# Bar chart of the five most frequent ratings held in `nd`.
chart = ctc.Bar('Top Five Rating Category', width='600px', height='300px')
chart.set_options(
    labels=list(nd.Rating),
    x_label='Category',
    y_label='Count',
    colors=Faker.colors,
)
# The bars count titles per rating, so label the dataset accordingly
# (it was previously the misspelled and unrelated 'Geners').
chart.add_series('Titles', list(nd['Count']))
chart.render_notebook()

# Top Five Directors

In [None]:
# Build one row per credited director. Multi-director titles are stored as a
# comma-separated string, and every name after the first keeps the space that
# follows the comma; strip it so "Jan Suter" and " Jan Suter" are counted as
# the same person instead of two.
fil_directors = (
    data['director']
    .str.split(',', expand=True)
    .stack()
    .str.strip()
    .to_frame(name='director')
)
directors = fil_directors.groupby(['director']).size().reset_index(name='counts')
# Exclude the placeholder assigned to titles without a listed director.
directors = directors[directors['director'] != 'No Director']
# Keep the five directors with the most titles, largest first.
directors = directors.sort_values(by='counts', ascending=False).head(5)
directors

Unnamed: 0,director,counts
3283,No director,2376
3638,Raúl Campos,18
236,Jan Suter,18
2892,Marcus Raboy,16
2200,Jay Karas,15


In [None]:
# Bar chart of the five directors with the most titles (from `directors`).
chart = ctc.Bar('Top Five Directors', width='500px', height='100px')
chart.set_options(
    labels=list(directors.director),
    x_label='Director',
    # The counts cover both movies and TV shows, so label them as titles.
    y_label='Number of Titles',
    colors=Faker.colors,
)
# Dataset label fixed from the misspelled, unrelated 'Geners'.
chart.add_series('Titles', list(directors.counts))
chart.render_notebook()


# Top Five Actors

In [None]:
# Build one row per credited cast member. The cast column is a ", "-separated
# string, so every name after the first carries a leading space once split on
# ','; strip it so the same actor is not tallied under two different spellings.
fil_actors = (
    data['cast']
    .str.split(',', expand=True)
    .stack()
    .str.strip()
    .to_frame(name='cast')
)
actors = fil_actors.groupby(['cast']).size().reset_index(name='counts')
# Exclude the placeholder assigned to titles without a listed cast.
actors = actors[actors['cast'] != 'No Cast']
# Keep the five actors with the most titles, largest first.
actors = actors.sort_values(by='counts', ascending=False).head(5)
actors

Unnamed: 0,cast,counts
2314,Anupam Kher,38
27200,Takahiro Sakurai,28
34547,Shah Rukh Khan,27
21165,Om Puri,27
3750,Boman Irani,25


In [None]:
# Bar chart of the five actors appearing in the most titles (from `actors`).
chart = ctc.Bar('Top Five Actors', width='500px', height='100px')
chart.set_options(
    labels=list(actors.cast),
    x_label='Actor',
    # The counts cover both movies and TV shows, so label them as titles.
    y_label='Number of Titles',
    colors=Faker.colors,
)
# Dataset label fixed from the misspelled, unrelated 'Geners'.
chart.add_series('Titles', list(actors.counts))
chart.render_notebook()


# Trend of focus on TV shows and movies in recent years

In [None]:
# Count titles per (release year, type) for release years from 2011 onwards.
dff = data[['type', 'release_year']]
dff = dff.rename(columns={'release_year': 'Release Year'})
dff2 = dff.groupby(['Release Year', 'type']).size().reset_index(name='Total Content')
dff2 = dff2[dff2['Release Year'] >= 2011]

# Put both types on one shared year index, filling years in which a type has
# no titles with 0. Filtering each type separately can yield frames covering
# different years, which misaligns the series when they share one set of
# x-axis labels.
per_year = (
    dff2.pivot(index='Release Year', columns='type', values='Total Content')
    .reindex(columns=['Movie', 'TV Show'])
    .fillna(0)
    .astype(int)
)
# Per-type frames with the same columns as `dff2`, one row per year.
dff3 = (
    per_year['Movie']
    .rename('Total Content')
    .reset_index()
    .assign(type='Movie')[['Release Year', 'type', 'Total Content']]
)
dff4 = (
    per_year['TV Show']
    .rename('Total Content')
    .reset_index()
    .assign(type='TV Show')[['Release Year', 'type', 'Total Content']]
)

In [None]:
# Line chart with one series per content type; the x-axis labels are the
# release years found in the movie frame (`dff3`).
year_labels = dff3['Release Year'].tolist()
movie_totals = dff3['Total Content'].tolist()
show_totals = dff4['Total Content'].tolist()

chart = Line('Last 10 Years of trends', width='500px', height='100px')
chart.set_options(labels=year_labels, x_label='Year', y_label='Count')
chart.add_series('Movie', movie_totals)
chart.add_series('TV Show', show_totals)
chart.render_notebook()

# Top Ten countries with most content

In [None]:
# Ten most frequent values of the `country` column.
# 'Not Specify' is the placeholder for titles with no recorded country, so it
# is excluded here, in the same way the director and cast rankings drop their
# placeholders. Otherwise it takes one of the ten slots.
top_countries = (
    data.loc[data['country'] != 'Not Specify', 'country']
    .value_counts()
    .head(10)
    .to_frame(name='count')
)
top_countries

Unnamed: 0,count
United States,2546
India,923
Not Specify,505
United Kingdom,396
Japan,224
South Korea,183
Canada,177
Spain,134
France,115
Egypt,101


In [None]:
# Pie chart of the entries in `top_countries`: index values become the slice
# labels and the `count` column supplies the slice sizes.
country_names = top_countries.index.tolist()
country_totals = top_countries['count'].tolist()

pie = ctc.Pie('Countries with most content', width='600px', height='300px')
pie.set_options(labels=country_names, inner_radius=0)
pie.add_series(country_totals)
pie.render_notebook()
