In [30]:
import pandas as pd
import matplotlib.pyplot as plt

# **1 Dataset Overview**

In [31]:
# Load the titles dataset from the CSV file (path is relative to the working directory)
df_netflix = pd.read_csv(filepath_or_buffer='netflix_titles.csv')

In [32]:
# Data type of every column, as inferred by pandas when the CSV was read
# (no explicit dtypes were passed to read_csv)
df_netflix.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [33]:
# Dimensions of the dataframe as a (rows, columns) tuple
df_netflix.shape

(8807, 12)

In [34]:
# Preview the first five rows to get a feel for the columns and their values
df_netflix.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# **2 Identifying Missing Data**

In [35]:
# Count the missing values in each column, most affected columns first
(
    df_netflix
    .isna()
    .sum()
    .sort_values(ascending=False)
)

director        2634
country          831
cast             825
date_added        10
rating             4
duration           3
show_id            0
type               0
title              0
release_year       0
listed_in          0
description        0
dtype: int64

In [36]:
# Percentage of missing rows per column, printed as "<column>: <percent>"

for column in df_netflix.columns:
    # the mean of a boolean mask is the fraction of True (i.e. missing) entries
    percentage = df_netflix[column].isna().mean()
    print(': '.join([column, str(round(100 * percentage, 2))]))

show_id: 0.0
type: 0.0
title: 0.0
director: 29.91
cast: 9.37
country: 9.44
date_added: 0.11
release_year: 0.0
rating: 0.05
duration: 0.03
listed_in: 0.0
description: 0.0


# **3 Dealing with Missing Data**

In [42]:
# Ways to discard missing data: .drop, boolean masking with .isnull, or .dropna

# Option 1 - drop the whole column:
# df_netflix.drop('director', axis=1)

# Option 2 - drop the affected rows by their index labels:
# no_director = df_netflix[df_netflix['director'].isnull()].index
# df_netflix.drop(no_director, axis=0)

# Option 3 - keep only the rows where the null mask is False (~ negates .isnull()):
# df_netflix[~(df_netflix['director'].isnull())].isnull().sum()

# Option 4 (used here) - dropna() restricted to the 'director' column,
# followed by a per-column count of the nulls that remain.
# Nothing is assigned back, so df_netflix itself is left unchanged.
(
    df_netflix
    .dropna(subset=['director'])
    .isna()
    .sum()
)

show_id           0
type              0
title             0
director          0
cast            473
country         422
date_added        0
release_year      0
rating            1
duration          3
listed_in         0
description       0
dtype: int64

In [None]:
# Replace missing values with the mean or median (numeric data) or the mode (categorical data)

# Series.mode() returns *every* value tied for most frequent. Take the first one
# instead of ''.join(...), which would glue tied labels together into a rating
# that does not exist in the data.
mode = df_netflix['rating'].mode().iloc[0]

# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on
# df_netflix['rating'] is chained assignment: deprecated in pandas 2.x and not
# guaranteed to modify df_netflix once Copy-on-Write is enabled.
df_netflix['rating'] = df_netflix['rating'].fillna(mode)

# Fraction of missing values per column after the fill ('rating' should now be 0)
df_netflix.isnull().mean()

show_id         0.000000
type            0.000000
title           0.000000
director        0.299080
cast            0.093675
country         0.094357
date_added      0.001135
release_year    0.000000
rating          0.000000
duration        0.000341
listed_in       0.000000
description     0.000000
dtype: float64

In [52]:
# Replace missing values with an arbitrary value using .fillna()
# NOTE: 'duration' is an object column holding strings such as '90 min', so
# filling with the integer 0 leaves a mixed-type column; acceptable for this
# demonstration, but a string placeholder would keep the column homogeneous.
# The result is assigned back instead of using inplace=True on a column
# selection, which is chained assignment (deprecated in pandas 2.x and not
# guaranteed to modify df_netflix under Copy-on-Write).
df_netflix['duration'] = df_netflix['duration'].fillna(0)

# Backward fill: each missing director takes the next non-missing value below it.
# Stored in a variable because only the last expression of a cell is displayed,
# so a bare .bfill() call here would be computed and thrown away.
director_bfill = df_netflix['director'].bfill()

# Forward fill: each missing director takes the last non-missing value above it.
director_ffill = df_netflix['director'].ffill()

# Neither fill is written back to df_netflix; show the forward-filled result.
director_ffill

0       Kirsten Johnson
1       Kirsten Johnson
2       Julien Leclercq
3       Julien Leclercq
4       Julien Leclercq
             ...       
8802      David Fincher
8803      David Fincher
8804    Ruben Fleischer
8805       Peter Hewitt
8806        Mozez Singh
Name: director, Length: 8807, dtype: object