In [87]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Get an overview of the data

In [88]:
# Load the dataset from 'netflix_titles.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
df = pd.read_csv('netflix_titles.csv')
# Preview the first rows to see the columns and typical values.
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [89]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(8807, 12)

In [90]:
# List the column names as a plain Python list.
df.columns.tolist()

['show_id',
 'type',
 'title',
 'director',
 'cast',
 'country',
 'date_added',
 'release_year',
 'rating',
 'duration',
 'listed_in',
 'description']

# Check columns with nulls

In [91]:
# Per-column non-null counts and dtypes; any column whose non-null count
# is below the total row count contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [92]:
# From the info() summary above, 'director', 'country' and 'cast' have the
# most nulls, in that order.
# Confirm programmatically: count the nulls per column, then take the label
# of the column with the highest count.
most_nulls_column = df.isnull().sum().idxmax()
most_nulls_column

'director'

### Dealing with null values in 'director' column

In [93]:
# 'director' has the most nulls, so inspect its distinct values to see how
# a missing entry is represented. Here it shows up as NaN.
df['director'].unique()

array(['Kirsten Johnson', nan, 'Julien Leclercq', ..., 'Majid Al Ansari',
       'Peter Hewitt', 'Mozez Singh'], dtype=object)

In [94]:
# Number of missing entries in the 'director' column.
df['director'].isnull().sum()

2634

In [95]:
# Find how missing values are represented in the column:
def missing_val_representation(column, placeholders=("NaN", "NA")):
    """Collect the distinct ways missing values are spelled in ``column``.

    Two kinds of value count as "missing":

    * real null markers, i.e. anything ``pd.isna`` reports as missing
      (``nan``, ``None``, ``pd.NA``, ``pd.NaT``, ...);
    * strings that merely *look* like a null, as listed in ``placeholders``.

    Parameters
    ----------
    column : iterable
        The values to scan, e.g. a pandas Series or a list of scalars.
    placeholders : str or collection of str, default ("NaN", "NA")
        String spellings to treat as missing. A single string is accepted
        and treated as one placeholder.

    Returns
    -------
    set
        One entry per distinct representation found; empty if none.
    """
    # A bare string would otherwise be searched by substring.
    if isinstance(placeholders, str):
        placeholders = (placeholders,)

    missing_val = set()
    # NaN never compares equal to itself, so a set cannot de-duplicate
    # distinct NaN objects on its own. Track null markers by (type, repr)
    # so each representation is reported exactly once.
    seen_null_kinds = set()
    for val in column:
        if isinstance(val, str):
            if val in placeholders:
                missing_val.add(val)
        elif pd.isna(val):
            kind = (type(val), repr(val))
            if kind not in seen_null_kinds:
                seen_null_kinds.add(kind)
                missing_val.add(val)
    return missing_val

# Show which missing-value representations occur in the 'director' column.
missing_val_representation(df['director'])

{nan}

In [96]:
# Replace null values in the 'director' column with 'Unavailable'.
# Series.fillna does this in one vectorised call, replacing the
# per-element `apply(lambda ...)` null check.
df['director'] = df['director'].fillna('Unavailable')

In [97]:
# Re-count the nulls in 'director' to verify the replacement worked.
df['director'].isnull().sum()  # Expected to be 0 after the fill

0

In [98]:
# Re-run the summary: 'director' should now report a non-null count equal
# to the total number of rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


### Dealing with null values in 'country' column

In [99]:
# Cross-check the 'country' figures against the info() summary above:
# derive the non-null count from the row total minus the null count.
null_value_count_country_col = df['country'].isnull().sum()
non_null_value_count_country_col = df.shape[0] - null_value_count_country_col

print(
    f"Non-null value count in 'country': {non_null_value_count_country_col}",
    f"Null value count in 'country': {null_value_count_country_col}",
    sep="\n",
)

Non-null value count in 'country': 7976
Null value count in 'country': 831


In [100]:
# Replace null values in the 'country' column with 'Unavailable'
# (the same placeholder used for 'director').
# Series.fillna does this in one vectorised call, replacing the
# per-element `apply(lambda ...)` null check.
df['country'] = df['country'].fillna('Unavailable')

In [101]:
# Confirm all null values in 'country' have been replaced: recount the
# nulls (overwriting the earlier count) and print the result.
null_value_count_country_col = df['country'].isnull().sum()
print(f"Null value count 'country' col after removal: {null_value_count_country_col}")

Null value count 'country' col after removal: 0


In [102]:
# Re-run the summary: 'country' should now report a non-null count equal
# to the total number of rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          7982 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


### Dealing with null values in 'cast' column

In [103]:
# Count the null values in the 'cast' column; every remaining row is
# non-null, so the non-null count is the row total minus the null count.
null_vals_count_cast_col = df['cast'].isnull().sum()
non_null_vals_in_cast_col = len(df) - null_vals_count_cast_col

# Label the figures with the column they actually describe ('cast').
print(f"Non-null value count in 'cast': {non_null_vals_in_cast_col}")
print(f"Null value count in 'cast': {null_vals_count_cast_col}")

Non-null value count in 'country': 7982
Null value count in 'country': 825


In [104]:
# Replace all null values in the 'cast' column with "Unavailable".
# Series.fillna does this in one vectorised call, replacing the
# per-element `apply(lambda ...)` null check.
df['cast'] = df['cast'].fillna("Unavailable")

In [105]:
# Confirm all null values in 'cast' have been replaced: the null count
# should now be 0.
df['cast'].isnull().sum()

0

In [106]:
# Final summary check: 'director', 'cast' and 'country' should all report
# a non-null count equal to the total number of rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
