# Netflix data cleaning

In [1]:
import numpy as np 
import pandas as pd

import plotly.express as px

## Downloaded dataset

In [77]:
data = pd.read_csv('./../dataset/netflix_titles.csv', encoding= 'unicode_escape')
data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,9/25/2021,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,9/24/2021,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,9/24/2021,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,9/24/2021,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


## Information

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8530 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


# Drop cols

In [78]:
# Drop Columns 
data.drop(columns=['cast', 'director', 'description'], inplace=True)

# Extract year_added

In [33]:
data.dtypes

show_id         object
type            object
title           object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
dtype: object

In [79]:
data['date_added'] = pd.to_datetime(data['date_added'])

In [80]:
data.dtypes

show_id                 object
type                    object
title                   object
country                 object
date_added      datetime64[ns]
release_year             int64
rating                  object
duration                object
listed_in               object
dtype: object

In [81]:
data['year_added'] = data['date_added'].dt.year

In [82]:
data['year_added'] = data['year_added'].fillna(0).astype(int)

In [84]:
data['year_added'] = data['year_added'].replace(0, np.nan)
data['year_added'].unique()

array([2021., 2020., 2019., 2018., 2017., 2016., 2015., 2014., 2013.,
       2012., 2011., 2009., 2008.,   nan, 2010.])

## Rating ages

In [85]:
ratings_ages = {
    'TV-PG': 'Kids',
    'TV-MA': 'Adults',
    'TV-Y7-FV': 'Kids',
    'TV-Y7': 'Kids',
    'TV-14': 'Teens',
    'R': 'Adults',
    'TV-Y': 'Kids',
    'NR': 'Adults',
    'PG-13': 'Teens',
    'TV-G': 'Kids',
    'PG': 'Kids',
    'G': 'Kids',
    'UR': 'Adults',
    'NC-17': 'Adults'
}

# Replace Rating values with age targets, they are based on
data['ages'] = data['rating'].replace(ratings_ages)
data['ages'].unique()

array(['Teens', 'Adults', 'Kids', nan], dtype=object)

## Principal genre

In [86]:
# Let's leave only first genre in a new col
data['genre'] = data['listed_in'].apply(lambda x: x.split(",")[0])

## Select some cols and download first dataset

In [87]:
data1 = data[['show_id','type', 'title', 'year_added', 'release_year', 'genre', 'ages']]
data1.head()

Unnamed: 0,show_id,type,title,year_added,release_year,genre,ages
0,s1,Movie,Dick Johnson Is Dead,2021.0,2020,Documentaries,Teens
1,s2,TV Show,Blood & Water,2021.0,2021,International TV Shows,Adults
2,s3,TV Show,Ganglands,2021.0,2021,Crime TV Shows,Adults
3,s4,TV Show,Jailbirds New Orleans,2021.0,2021,Docuseries,Adults
4,s5,TV Show,Kota Factory,2021.0,2021,International TV Shows,Adults


In [88]:
data1.to_csv('./../dataset/netflix_titles_1.csv', index = False)

## Split and explode country col

In [26]:
data = data.assign(country=data['country'].str.split(', ')).explode('country')

## Select some cols and download second dataset

In [27]:
data2 = data[['show_id', 'country', 'type', 'title', 'release_year', 'genre', 'ages']]
data2.head(10)

Unnamed: 0,show_id,country,type,title,release_year,genre,ages
0,s1,United States,Movie,Dick Johnson Is Dead,2020,Documentaries,Teens
1,s2,South Africa,TV Show,Blood & Water,2021,International TV Shows,Adults
2,s3,,TV Show,Ganglands,2021,Crime TV Shows,Adults
3,s4,,TV Show,Jailbirds New Orleans,2021,Docuseries,Adults
4,s5,India,TV Show,Kota Factory,2021,International TV Shows,Adults


In [29]:
data2.to_csv('./../dataset/netflix_titles_2.csv', index = False)