### Importing Libraries

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import boxcox

# Let pandas render up to 200 rows before truncating displayed output.
pd.set_option('display.max_rows', 200)

### Loading Data Set

In [15]:
# Read the raw catalogue from the CSV (path is relative to the notebook's
# working directory) and show the resulting frame as the cell output.
netflix = pd.read_csv(filepath_or_buffer='netflix_titles.csv')
netflix

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


### Calculating percentage of missing values

In [16]:
# Per-column share of null entries, expressed as a percentage of all rows.
percent_missing = netflix.isnull().sum().mul(100).div(len(netflix))

# Put the percentages next to the column names in a small summary table.
netflix_missing_values = pd.DataFrame(
    {
        'column_name': netflix.columns,
        'percent_missing': percent_missing,
    }
)
netflix_missing_values

Unnamed: 0,column_name,percent_missing
show_id,show_id,0.0
type,type,0.0
title,title,0.0
director,director,29.908028
cast,cast,9.367549
country,country,9.435676
date_added,date_added,0.113546
release_year,release_year,0.0
rating,rating,0.045418
duration,duration,0.034064


## Data Cleaning
### - Remove "International" genres from `listed_in` and keep the first remaining genre
This step lets us classify each row of the `listed_in` column into a single genre. Every genre label that contains the string "International" is removed and the first remaining label is kept; when the only label in a row is one that contains "International", the row is assigned "Others" instead.
This makes the visualisations clearer and easier to interpret.

In [17]:
# Turn each comma-separated genre string into a list of trimmed labels,
# leaving out every label that contains the text 'International'.
# NOTE: this overwrites the column with lists, so the cell expects the column
# to still hold the original strings when it runs.
netflix['listed_in'] = (
    netflix['listed_in']
    .str.split(',')
    .apply(
        lambda labels: [
            label.strip() for label in labels if 'International' not in label
        ]
    )
)

In [18]:
# Keep only the first label of each list; rows whose list is empty
# get the placeholder genre 'Others'.
netflix['listed_in'] = netflix['listed_in'].apply(
    lambda labels: 'Others' if not labels else labels[0]
)

In [19]:
# Inspect the single-genre column produced by the previous steps.
netflix.listed_in

0                  Documentaries
1                      TV Dramas
2                 Crime TV Shows
3                     Docuseries
4              Romantic TV Shows
                  ...           
8802                 Cult Movies
8803                    Kids' TV
8804                    Comedies
8805    Children & Family Movies
8806                      Dramas
Name: listed_in, Length: 8807, dtype: object

#### Number of titles per unique value in the `listed_in` column

In [20]:
# Frequency of each genre label before any grouping is applied.
netflix['listed_in'].value_counts()

Dramas                          1600
Comedies                        1210
Action & Adventure               859
Documentaries                    829
Children & Family Movies         605
Crime TV Shows                   399
Kids' TV                         389
Stand-Up Comedy                  334
Horror Movies                    275
TV Dramas                        262
British TV Shows                 253
Romantic TV Shows                250
Docuseries                       221
TV Comedies                      198
Reality TV                       180
Anime Series                     176
Thrillers                        132
Korean TV Shows                  108
Classic Movies                    80
TV Action & Adventure             74
Movies                            57
Spanish-Language TV Shows         53
Stand-Up Comedy & Talk Shows      46
Music & Musicals                  38
Sci-Fi & Fantasy                  26
Romantic Movies                   24
TV Horror                         24
C

In [21]:
# Unique values before grouping
netflix['listed_in'].unique()

array(['Documentaries', 'TV Dramas', 'Crime TV Shows', 'Docuseries',
       'Romantic TV Shows', 'Children & Family Movies', 'Dramas',
       'British TV Shows', 'Comedies', 'TV Comedies', 'Thrillers',
       'Spanish-Language TV Shows', 'TV Action & Adventure',
       'Horror Movies', "Kids' TV", 'Action & Adventure', 'Reality TV',
       'Anime Series', 'Sci-Fi & Fantasy', 'Classic Movies', 'TV Shows',
       'Stand-Up Comedy', 'Music & Musicals', 'Movies',
       'Stand-Up Comedy & Talk Shows', 'Classic & Cult TV',
       'Anime Features', 'Romantic Movies', 'Korean TV Shows',
       'Cult Movies', 'TV Horror', 'Independent Movies', 'TV Mysteries',
       'Others', 'LGBTQ Movies', 'TV Sci-Fi & Fantasy', 'Sports Movies'],
      dtype=object)

### Grouping the values of the `listed_in` column

In [22]:
# Collapse the fine-grained genre labels into broader groups. Labels that are
# not keys of this mapping are left as they are.
# NOTE(review): 'Other Movies' is not among the unique values listed before
# grouping, so that entry looks like a no-op -- confirm before removing it.
netflix['listed_in'] = netflix['listed_in'].replace({
    # -> Documentary
    'Docuseries': 'Documentary',
    'Documentaries': 'Documentary',
    # -> Dramas
    'TV Dramas': 'Dramas',
    'Reality TV': 'Dramas',
    # -> Thrillers
    'Crime TV Shows': 'Thrillers',
    'Action & Adventure': 'Thrillers',
    'TV Action & Adventure': 'Thrillers',
    'TV Mysteries': 'Thrillers',
    # -> Romantic
    'Romantic TV Shows': 'Romantic',
    'Romantic Movies': 'Romantic',
    # -> Comedy
    'Comedies': 'Comedy',
    'TV Comedies': 'Comedy',
    'Stand-Up Comedy': 'Comedy',
    'Stand-Up Comedy & Talk Shows': 'Comedy',
    # -> Children & Family
    'Children & Family Movies': 'Children & Family',
    "Kids' TV": 'Children & Family',
    # -> Horror
    'Horror Movies': 'Horror',
    'TV Horror': 'Horror',
    # -> Fantasy
    'Anime Series': 'Fantasy',
    'Sci-Fi & Fantasy': 'Fantasy',
    'Anime Features': 'Fantasy',
    'TV Sci-Fi & Fantasy': 'Fantasy',
    # -> Classics
    'Classic Movies': 'Classics',
    'Classic & Cult TV': 'Classics',
    'Cult Movies': 'Classics',
    # -> Others
    'TV Shows': 'Others',
    'Movies': 'Others',
    'Independent Movies': 'Others',
    'Other Movies': 'Others',
    'LGBTQ Movies': 'Others',
    'Sports Movies': 'Others',
})

In [23]:
# Frequency of each genre group after the relabelling.
netflix['listed_in'].value_counts()

Dramas                       2042
Comedy                       1788
Thrillers                    1466
Documentary                  1050
Children & Family             994
Horror                        299
Romantic                      274
British TV Shows              253
Fantasy                       224
Classics                      114
Korean TV Shows               108
Others                        104
Spanish-Language TV Shows      53
Music & Musicals               38
Name: listed_in, dtype: int64

#### Checking number of Unique values after grouping

In [24]:
# Number of distinct genre labels left in the column.
netflix['listed_in'].nunique()

14

### Finding the unique values of the `rating` column

In [25]:
# Distinct rating labels before they are grouped
netflix['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)

In [26]:
# Frequency of each raw rating label.
netflix['rating'].value_counts()

TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: rating, dtype: int64

In [27]:
# Raw rating label -> coarse rating bucket. Every bucket name also maps to
# itself, so applying clean_rating to already-cleaned values is a no-op.
_RATING_BUCKETS = {
    'TV-G': 'G', 'G': 'G', 'TV-Y': 'G',
    'PG': 'PG', 'TV-PG': 'PG', 'TV-Y7': 'PG',
    'PG-13': 'PG-13', 'TV-14': 'PG-13', 'TV-Y7-FV': 'PG-13',
    'R': 'R',
    'TV-MA': 'NC-17', 'NC-17': 'NC-17',
    'UR': 'UNRATED', 'NR': 'UNRATED', 'UNRATED': 'UNRATED',
}


def clean_rating(x):
    """Collapse a raw rating label into one of a few coarse buckets.

    Parameters
    ----------
    x : object
        A single value from the ``rating`` column: normally a string label,
        or NaN when the rating is missing.

    Returns
    -------
    str
        One of ``'G'``, ``'PG'``, ``'PG-13'``, ``'R'``, ``'NC-17'`` or
        ``'UNRATED'``. Anything unrecognised -- missing values, and stray
        strings such as ``'74 min'`` that appear in the rating column --
        yields the literal string ``'nan'`` (a string, not a float NaN).

    Notes
    -----
    ``'TV-Y'`` belongs to the ``'G'`` bucket only; it used to be listed a
    second time under ``'PG'``, where it could never match.
    ``'UNRATED'`` is accepted as input so that running the function on its
    own output leaves the value unchanged instead of turning it into
    ``'nan'`` (which happened when the applying cell was executed twice).
    """
    # Non-string values (e.g. float NaN) can never equal a rating label.
    if not isinstance(x, str):
        return 'nan'
    return _RATING_BUCKETS.get(x, 'nan')

In [28]:
# Replace every raw rating with its bucket (see clean_rating), then show
# the updated frame as the cell output.
netflix['rating'] = netflix.rating.apply(clean_rating)
netflix

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentary,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,NC-17,2 Seasons,Dramas,"After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,NC-17,1 Season,Thrillers,To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,NC-17,1 Season,Documentary,"Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,NC-17,2 Seasons,Romantic,In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,Classics,"A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,PG,2 Seasons,Children & Family,"While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,Comedy,Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,Children & Family,"Dragged from civilian life, a former superhero..."


In [29]:
# Frequency of each rating bucket after cleaning.
netflix['rating'].value_counts()

NC-17      3210
PG-13      2656
PG         1484
R           799
G           568
UNRATED      83
nan           7
Name: rating, dtype: int64

In [30]:
# Tally appearances per cast member: split each ', '-separated cast string,
# give every name its own row, then count how often each name occurs.
cast = (
    netflix['cast']
    .str.split(', ')
    .explode()
    .value_counts()
)
cast

Anupam Kher                43
Shah Rukh Khan             35
Julie Tejwani              33
Naseeruddin Shah           32
Takahiro Sakurai           32
                           ..
Maryam Zaree                1
Melanie Straub              1
Gabriela Maria Schmeide     1
Helena Zengel               1
Chittaranjan Tripathy       1
Name: cast, Length: 36439, dtype: int64

In [32]:
# Persist the cleaned frame. index=True is the pandas default, spelled out
# here because it means the row index is written as an extra first column.
netflix.to_csv("Netflix_cleaned_dataset.csv", index=True)

In [34]:
# Tally titles per director: split each ', '-separated director string,
# give every name its own row, then count how often each name occurs.
director = (
    netflix['director']
    .str.split(', ')
    .explode()
    .value_counts()
)
director

Rajiv Chilaka     22
Jan Suter         21
Raúl Campos       19
Suhas Kadav       16
Marcus Raboy      16
                  ..
Raymie Muzquiz     1
Stu Livingston     1
Joe Menendez       1
Eric Bross         1
Mozez Singh        1
Name: director, Length: 4993, dtype: int64