# **1. Import libraries**

In [2]:
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter('ignore')
import time

colors = ["#581845", "#900C3F", "#C70039", "#8E44AD", "#6C3483", "#943126", "#4A235A", "#7D3C98", "#A93226", "#76448A", "#512E5F","#BB8FCE", "#A569BD", "#922B21", "#5D6D7E"]

  from pandas.core import (


# **2. Dataset loading & first look**

In [3]:
df = pd.read_csv('netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [4]:
df.shape

(8807, 12)

In [5]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [7]:
df.describe(include = 'object')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,rating,duration,listed_in,description
count,8807,8807,8807,6173,7982,7976,8797,8803,8804,8807,8807
unique,8807,2,8807,4528,7692,748,1767,17,220,514,8775
top,s1,Movie,Dick Johnson Is Dead,Rajiv Chilaka,David Attenborough,United States,"January 1, 2020",TV-MA,1 Season,"Dramas, International Movies","Paranormal activity at a lush, abandoned prope..."
freq,1,6131,1,19,19,2818,109,3207,1793,362,4


In [8]:
df.duplicated().sum()

0

Conclusion:
- The data set has 8807 rows and 12 columns.
- show_id and description columns can be dropped due to low data value.
- Null values appear in director, cast, country, date_added, rating and duration columns.
- There is no full duplicate in the dataset.

# **3. Data cleaning**

### **- Clean country column**

In [9]:
df['country'].isna().sum()

831

In [10]:
df['country'] = df['country'].fillna(df['country'].mode()[0])
df['country'] = df['country'].astype(str)
df['country'] = df['country'].apply(lambda x : x.split(', ')[0])

In [11]:
df['country'].value_counts()

country
United States     4041
India             1008
United Kingdom     626
Canada             271
Japan              259
                  ... 
Namibia              1
Senegal              1
Cameroon             1
Syria                1
Somalia              1
Name: count, Length: 89, dtype: int64

### **- Clean rating column**

In [12]:
df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
       'TV-Y7-FV', 'UR'], dtype=object)

In [13]:
df['rating'] = df['rating'].replace({'74 min': np.nan, '84 min': np.nan, '66 min': np.nan, 'TV-Y7-FV': 'TV-Y7'})
df['rating'].unique()

array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
       'TV-G', 'G', 'NC-17', nan, 'NR', 'UR'], dtype=object)

### **- Clean director and cast columns**

In [14]:
df['director'].isnull().sum()

2634

In [15]:
df['director'].value_counts()

director
Rajiv Chilaka                     19
Raúl Campos, Jan Suter            18
Marcus Raboy                      16
Suhas Kadav                       16
Jay Karas                         14
                                  ..
Raymie Muzquiz, Stu Livingston     1
Joe Menendez                       1
Eric Bross                         1
Will Eisenberg                     1
Mozez Singh                        1
Name: count, Length: 4528, dtype: int64

In [16]:
null_percent = df.isnull().sum() * 100 / df.shape[0]
null_percent.round(2).sort_values(ascending = False)

director       29.91
cast            9.37
date_added      0.11
rating          0.08
duration        0.03
show_id         0.00
type            0.00
title           0.00
country         0.00
release_year    0.00
listed_in       0.00
description     0.00
dtype: float64

### **- Fill null values in all columns**

In [17]:
#Fill null values with a new category

df['director'].fillna('Unknown', inplace = True)
df['cast'].fillna('Unknown', inplace = True)

In [19]:
#Fill null values with mode

mode_im = ['date_added', 'rating', 'duration']
for i in mode_im:
    df[i] = df[i].fillna(df[i].mode()[0])

In [20]:
df['month'] = df['date_added'].apply(lambda x : x.lstrip().split(' ')[0])
df['year'] = df['date_added'].apply(lambda x : x.split(', ')[-1])