In [318]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Get an overview of the data

In [319]:
# Load the Netflix titles CSV (path is relative to the working directory)
# and preview the first rows
df = pd.read_csv('netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [320]:
df.shape

(8807, 12)

In [321]:
df.columns.tolist()

['show_id',
 'type',
 'title',
 'director',
 'cast',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in',
 'description']

# Check columns with nulls

In [322]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [323]:
# From the info above, 'director', 'country' and 'cast' have the most nulls, in that order
# Confirm which column has the most nulls: sum the nulls per column and
# take the label of the column with the largest count
most_nulls_column = df.isnull().sum().idxmax()
most_nulls_column

'director'

### Dealing with null values in 'director' column

In [324]:
# 'director' has the most nulls, so inspect its distinct values to see how
# missing entries are represented; here they show up as NaN
df['director'].unique()

array(['Kirsten Johnson', nan, 'Julien Leclercq', ..., 'Majid Al Ansari',
       'Peter Hewitt', 'Mozez Singh'], dtype=object)

In [325]:
# Get the null values count
df['director'].isnull().sum()

2634

In [326]:
# Find how missing values are represented in the column: 
def missing_val_representation(column):
    """Collect the distinct values in ``column`` that denote a missing entry.

    A value counts as missing when ``pd.isna`` reports it as null or when it
    equals one of the literal strings "NaN" or "NA".

    Parameters
    ----------
    column : iterable
        Values to scan, e.g. a DataFrame column.

    Returns
    -------
    set
        The missing-value markers found (empty if there are none).
    """
    return {
        entry
        for entry in column
        if pd.isna(entry) or entry == "NaN" or entry == "NA"
    }

missing_val_representation(df['director'])

{nan}

In [327]:
# Replace null values in the 'director' column with 'Unavailable'.
# fillna is the vectorised built-in for this, so no per-row lambda is needed.
df['director'] = df['director'].fillna('Unavailable')

In [328]:
# Check how many null values in 'director' column
df['director'].isnull().sum()  # The count is now 0

0

In [329]:
# Check the info summary once more
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


### Dealing with null values in 'country' column

In [330]:
# Cross-check the 'country' null / non-null counts against the df.info() summary above
null_value_count_country_col = df['country'].isna().sum()
non_null_value_count_country_col = df['country'].notna().sum()

print(f"Non-null value count in 'country': {non_null_value_count_country_col}")
print(f"Null value count in 'country': {null_value_count_country_col}")

Non-null value count in 'country': 7976
Null value count in 'country': 831


In [331]:
# Replace null values in the 'country' column with 'Unavailable'
# (the same placeholder used for the 'director' column)
df['country'] = df['country'].fillna('Unavailable')

In [332]:
# Confirm all null values in 'country' have been replaced
null_value_count_country_col = df['country'].isnull().sum()
print(f"Null value count 'country' col after removal: {null_value_count_country_col}")

Null value count 'country' col after removal: 0


In [333]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          7982 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


### Dealing with null values in 'cast' column

In [334]:
# Count the null and non-null values in the 'cast' column
# (every row that is not null is counted as non-null)
null_vals_count_cast_col = df['cast'].isnull().sum()
non_null_vals_in_cast_col = len(df) - null_vals_count_cast_col

print(f"Non-null value count in 'cast': {non_null_vals_in_cast_col}")
print(f"Null value count in 'cast': {null_vals_count_cast_col}")

Non-null value count in 'country': 7982
Null value count in 'country': 825


In [335]:
# Replace all null values in the 'cast' column with "Unavailable".
# fillna is the vectorised built-in for this, so no per-row lambda is needed.
df['cast'] = df['cast'].fillna("Unavailable")

In [336]:
# Confirm all null values in 'cast' have been removed
df['cast'].isnull().sum()

0

In [337]:
# Confirm summary info once more
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


### Cleaning date_added column

In [338]:
# Count the null and non-null values in the 'date_added' column
# (every row that is not null is counted as non-null)
null_vals_date_added_col = df['date_added'].isnull().sum()
non_null_vals_date_added_col = len(df) - null_vals_date_added_col

print(f"Non-null value count in 'date_added': {non_null_vals_date_added_col}")
print(f"Null value count in 'date_added': {null_vals_date_added_col}")

Non-null value count in 'country': 8797
Null value count in 'country': 10


In [339]:
# Returning rows with null values
rows_with_null_date_added_col = df[df['date_added'].isnull()]
rows_with_null_date_added_col

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
6066,s6067,TV Show,A Young Doctor's Notebook and Other Stories,Unavailable,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","Set during the Russian Revolution, this comic ..."
6174,s6175,TV Show,Anthony Bourdain: Parts Unknown,Unavailable,Anthony Bourdain,United States,,2018,TV-PG,5 Seasons,Docuseries,This CNN original series has chef Anthony Bour...
6795,s6796,TV Show,Frasier,Unavailable,"Kelsey Grammer, Jane Leeves, David Hyde Pierce...",United States,,2003,TV-PG,11 Seasons,"Classic & Cult TV, TV Comedies",Frasier Crane is a snooty but lovable Seattle ...
6806,s6807,TV Show,Friends,Unavailable,"Jennifer Aniston, Courteney Cox, Lisa Kudrow, ...",United States,,2003,TV-14,10 Seasons,"Classic & Cult TV, TV Comedies",This hit sitcom follows the merry misadventure...
6901,s6902,TV Show,Gunslinger Girl,Unavailable,"Yuuka Nanri, Kanako Mitsuhashi, Eri Sendai, Am...",Japan,,2008,TV-14,2 Seasons,"Anime Series, Crime TV Shows","On the surface, the Social Welfare Agency appe..."
7196,s7197,TV Show,Kikoriki,Unavailable,Igor Dmitriev,Unavailable,,2010,TV-Y,2 Seasons,Kids' TV,A wacky rabbit and his gang of animal pals hav...
7254,s7255,TV Show,La Familia P. Luche,Unavailable,"Eugenio Derbez, Consuelo Duval, Luis Manuel Áv...",United States,,2012,TV-14,3 Seasons,"International TV Shows, Spanish-Language TV Sh...","This irreverent sitcom featues Ludovico, Feder..."
7406,s7407,TV Show,Maron,Unavailable,"Marc Maron, Judd Hirsch, Josh Brener, Nora Zeh...",United States,,2016,TV-MA,4 Seasons,TV Comedies,"Marc Maron stars as Marc Maron, who interviews..."
7847,s7848,TV Show,Red vs. Blue,Unavailable,"Burnie Burns, Jason Saldaña, Gustavo Sorola, G...",United States,,2015,NR,13 Seasons,"TV Action & Adventure, TV Comedies, TV Sci-Fi ...","This parody of first-person shooter games, mil..."
8182,s8183,TV Show,The Adventures of Figaro Pho,Unavailable,"Luke Jurevicius, Craig Behenna, Charlotte Haml...",Australia,,2015,TV-Y7,2 Seasons,"Kids' TV, TV Comedies","Imagine your worst fears, then multiply them: ..."


In [340]:
# Show each row with a missing 'date_added' together with its immediate
# neighbours, to help decide how to fill the gap (e.g. ffill vs. bfill)

null_indexes = df[df['date_added'].isnull()].index

# .iloc below is positional, so collect row *positions* rather than index
# labels; the two coincide only while the frame has its default 0..n-1 index
null_positions = np.flatnonzero(df['date_added'].isnull().to_numpy()).tolist()
last_position = len(df) - 1

rows_to_include = []

for position in null_positions:
    # Previous row, unless the null row is the very first one
    if position > 0:
        rows_to_include.append(position - 1)
    rows_to_include.append(position)
    # Next row, unless the null row is the very last one
    if position < last_position:
        rows_to_include.append(position + 1)

# iloc is used for integer-based indexing, selects rows at those positions
null_date_added_rows_plus_adjacent_ones = df.iloc[rows_to_include]
null_date_added_rows_plus_adjacent_ones

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
6065,s6066,Movie,A Wrinkle in Time,Ava DuVernay,"Storm Reid, Oprah Winfrey, Reese Witherspoon, ...",United States,"September 25, 2018",2018,PG,110 min,Children & Family Movies,"Years after their father disappears, Meg and h..."
6066,s6067,TV Show,A Young Doctor's Notebook and Other Stories,Unavailable,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","Set during the Russian Revolution, this comic ..."
6067,s6068,TV Show,A.D. Kingdom and Empire,Unavailable,"Juan Pablo Di Pace, Adam Levy, Chipo Chung, Ba...",United States,"December 15, 2017",2015,TV-14,1 Season,TV Dramas,"In the wake of Jesus Christ's crucifixion, his..."
6173,s6174,Movie,Antariksha Ke Rakhwale,Anirban Majumder,"Sonal Kaushal, Rupa Bhimani, Julie Tejwani, Sa...",Unavailable,"June 18, 2019",2018,TV-Y7,65 min,Children & Family Movies,Space villains have abducted all the superhero...
6174,s6175,TV Show,Anthony Bourdain: Parts Unknown,Unavailable,Anthony Bourdain,United States,,2018,TV-PG,5 Seasons,Docuseries,This CNN original series has chef Anthony Bour...
6175,s6176,Movie,Antidote,Ken Barbet,"Randy Couture, Chuck Zito, Wil Traval, Celeste...",United States,"April 14, 2019",2018,TV-MA,90 min,"Action & Adventure, Horror Movies",A tough-as-nails treasure hunter protects a hu...
6794,s6795,Movie,Frances Ha,Noah Baumbach,"Greta Gerwig, Mickey Sumner, Charlotte d'Amboi...","United States, Brazil","October 1, 2017",2012,R,86 min,"Comedies, Dramas, Independent Movies",Determined to make it as a modern dancer in Ne...
6795,s6796,TV Show,Frasier,Unavailable,"Kelsey Grammer, Jane Leeves, David Hyde Pierce...",United States,,2003,TV-PG,11 Seasons,"Classic & Cult TV, TV Comedies",Frasier Crane is a snooty but lovable Seattle ...
6796,s6797,Movie,Freak Show,Trudie Styler,"Alex Lawther, Abigail Breslin, AnnaSophia Robb...",United States,"August 13, 2020",2018,TV-MA,91 min,"Comedies, Dramas, Independent Movies","Forced to attend a new high school, a glamorou..."
6805,s6806,Movie,Friend Request,Simon Verhoeven,"Alycia Debnam-Carey, William Moseley, Connor P...",Germany,"April 18, 2018",2016,R,92 min,"Horror Movies, International Movies",A popular college student's love of social med...


In [341]:
# Getting the indexes of the null values
date_added_null_values_indexes = df[df['date_added'].isnull()].index
date_added_null_values_indexes

Index([6066, 6174, 6795, 6806, 6901, 7196, 7254, 7406, 7847, 8182], dtype='int64')

In [342]:
# Check date format
df['date_added'].head()

0    September 25, 2021
1    September 24, 2021
2    September 24, 2021
3    September 24, 2021
4    September 24, 2021
Name: date_added, dtype: object

In [343]:
# Check if it's type date
type(df['date_added'][0])

str

In [344]:
# The values are strings, so convert the column to datetime objects.
# format='mixed' lets pandas infer the format of each value individually
# (an earlier strict '%B %d, %Y' attempt with errors="coerce" is disabled).
# NOTE(review): dayfirst=True only matters for ambiguous numeric dates; the
# values previewed above are "Month day, year" strings — confirm it is needed.
df['date_added'] = pd.to_datetime(df['date_added'], format='mixed', dayfirst=True)

In [345]:
df['date_added'].unique()

<DatetimeArray>
['2021-09-25 00:00:00', '2021-09-24 00:00:00', '2021-09-23 00:00:00',
 '2021-09-22 00:00:00', '2021-09-21 00:00:00', '2021-09-20 00:00:00',
 '2021-09-19 00:00:00', '2021-09-17 00:00:00', '2021-09-16 00:00:00',
 '2021-09-15 00:00:00',
 ...
 '2018-09-27 00:00:00', '2017-03-23 00:00:00', '2016-12-25 00:00:00',
 '2016-11-30 00:00:00', '2017-10-23 00:00:00', '2017-11-04 00:00:00',
 '2015-08-05 00:00:00', '2018-12-06 00:00:00', '2016-03-09 00:00:00',
 '2020-01-11 00:00:00']
Length: 1715, dtype: datetime64[ns]

In [346]:
# Confirming that only the original null values indexes still exist
date_added_null_values_indexes = df[df['date_added'].isnull()].index
date_added_null_values_indexes

Index([6066, 6174, 6795, 6806, 6901, 7196, 7254, 7406, 7847, 8182], dtype='int64')

In [347]:
# Getting the count of null values
null_vals_date_added_col = df['date_added'].isnull().sum()
null_vals_date_added_col

10

In [348]:
# Checking the type of converted time column values
type(df['date_added'][0])

pandas._libs.tslibs.timestamps.Timestamp

In [349]:
# Fill the missing dates by linear interpolation between the neighbouring rows.
# NOTE(review): this treats row order as meaningful and can produce values
# with a time-of-day part (e.g. 12:00:00 when the rows are re-displayed
# later); confirm this is preferable to ffill/bfill or leaving them as NaT.
df['date_added'] = df['date_added'].interpolate(method='linear')

In [350]:
# Getting the count of null values
null_vals_date_added_col = df['date_added'].isnull().sum()
null_vals_date_added_col 

0

In [351]:
# Re-display the rows that had a missing 'date_added' plus their neighbours
# (positions collected earlier in rows_to_include) to inspect the filled values
null_date_added_rows_plus_adjacent_ones = df.iloc[rows_to_include]
null_date_added_rows_plus_adjacent_ones 

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
6065,s6066,Movie,A Wrinkle in Time,Ava DuVernay,"Storm Reid, Oprah Winfrey, Reese Witherspoon, ...",United States,2018-09-25 00:00:00,2018,PG,110 min,Children & Family Movies,"Years after their father disappears, Meg and h..."
6066,s6067,TV Show,A Young Doctor's Notebook and Other Stories,Unavailable,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,2018-05-06 00:00:00,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","Set during the Russian Revolution, this comic ..."
6067,s6068,TV Show,A.D. Kingdom and Empire,Unavailable,"Juan Pablo Di Pace, Adam Levy, Chipo Chung, Ba...",United States,2017-12-15 00:00:00,2015,TV-14,1 Season,TV Dramas,"In the wake of Jesus Christ's crucifixion, his..."
6173,s6174,Movie,Antariksha Ke Rakhwale,Anirban Majumder,"Sonal Kaushal, Rupa Bhimani, Julie Tejwani, Sa...",Unavailable,2019-06-18 00:00:00,2018,TV-Y7,65 min,Children & Family Movies,Space villains have abducted all the superhero...
6174,s6175,TV Show,Anthony Bourdain: Parts Unknown,Unavailable,Anthony Bourdain,United States,2019-05-16 12:00:00,2018,TV-PG,5 Seasons,Docuseries,This CNN original series has chef Anthony Bour...
6175,s6176,Movie,Antidote,Ken Barbet,"Randy Couture, Chuck Zito, Wil Traval, Celeste...",United States,2019-04-14 00:00:00,2018,TV-MA,90 min,"Action & Adventure, Horror Movies",A tough-as-nails treasure hunter protects a hu...
6794,s6795,Movie,Frances Ha,Noah Baumbach,"Greta Gerwig, Mickey Sumner, Charlotte d'Amboi...","United States, Brazil",2017-10-01 00:00:00,2012,R,86 min,"Comedies, Dramas, Independent Movies",Determined to make it as a modern dancer in Ne...
6795,s6796,TV Show,Frasier,Unavailable,"Kelsey Grammer, Jane Leeves, David Hyde Pierce...",United States,2019-03-08 12:00:00,2003,TV-PG,11 Seasons,"Classic & Cult TV, TV Comedies",Frasier Crane is a snooty but lovable Seattle ...
6796,s6797,Movie,Freak Show,Trudie Styler,"Alex Lawther, Abigail Breslin, AnnaSophia Robb...",United States,2020-08-13 00:00:00,2018,TV-MA,91 min,"Comedies, Dramas, Independent Movies","Forced to attend a new high school, a glamorou..."
6805,s6806,Movie,Friend Request,Simon Verhoeven,"Alycia Debnam-Carey, William Moseley, Connor P...",Germany,2018-04-18 00:00:00,2016,R,92 min,"Horror Movies, International Movies",A popular college student's love of social med...


In [352]:
# Drop the time-of-day part left by interpolation by converting each
# Timestamp to a plain datetime.date.
# NOTE(review): .dt.date yields Python date objects, so the column becomes
# object dtype and the .dt accessor will no longer work on it — confirm
# later cells do not need datetime64 (dt.normalize() would keep that dtype).
df['date_added'] = df['date_added'].dt.date

# Now checking the type of the converted column
type(df['date_added'][0])

datetime.date

In [353]:
# Viewing the converted types
df['date_added'].head()

0    2021-09-25
1    2021-09-24
2    2021-09-24
3    2021-09-24
4    2021-09-24
Name: date_added, dtype: object

In [355]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8807 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
