<a href="https://colab.research.google.com/github/mbewustanley/Netflix_Titles_Data_Cleaning/blob/main/Netflix_titles_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1 Dataset Overview

In [None]:
#read csv file
df_netflix = pd.read_csv('/content/drive/MyDrive/YT_datasets/netflix_titles.csv')

In [6]:
df_netflix.dtypes

Unnamed: 0,0
show_id,int64
type,object
title,object
director,object
cast,object
country,object
date_added,object
release_year,int64
rating,object
duration,object


In [7]:
#note the date_added column has an 'object' data type

In [8]:
#check the number of rows and columns with 'shape'
df_netflix.shape

(6234, 12)

In [9]:
df_netflix

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...
...,...,...,...,...,...,...,...,...,...,...,...,...
6229,80000063,TV Show,Red vs. Blue,,"Burnie Burns, Jason Saldaña, Gustavo Sorola, G...",United States,,2015,NR,13 Seasons,"TV Action & Adventure, TV Comedies, TV Sci-Fi ...","This parody of first-person shooter games, mil..."
6230,70286564,TV Show,Maron,,"Marc Maron, Judd Hirsch, Josh Brener, Nora Zeh...",United States,,2016,TV-MA,4 Seasons,TV Comedies,"Marc Maron stars as Marc Maron, who interviews..."
6231,80116008,Movie,Little Baby Bum: Nursery Rhyme Friends,,,,,2016,,60 min,Movies,Nursery rhymes and original music for children...
6232,70281022,TV Show,A Young Doctor's Notebook and Other Stories,,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","Set during the Russian Revolution, this comic ..."


# 2 Identifying Missing Data

In [14]:
#number of rows missing in each column
df_netflix.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
director,1969
cast,570
country,476
date_added,11
rating,10
title,0
show_id,0
type,0
release_year,0
duration,0


In [17]:
#the director, cast, and country columns have the most missing values in the data set
#considering dropping them but we should first check the precentage of rows they consistute in the full dataset

df_netflix['director'].isnull().mean() * 100

np.float64(31.584857234520374)

In [18]:
#this means the null values in the 'dircctor' column constitute 30% of the dataset
#to check the rest we can use a for loop

for column in df_netflix.columns:
  percentage_null = df_netflix[column].isnull().mean()
  print(f'{column}: {round(percentage_null*100, 2)}% null values')

show_id: 0.0% null values
type: 0.0% null values
title: 0.0% null values
director: 31.58% null values
cast: 9.14% null values
country: 7.64% null values
date_added: 0.18% null values
release_year: 0.0% null values
rating: 0.16% null values
duration: 0.0% null values
listed_in: 0.0% null values
description: 0.0% null values


# 3. Dealing with missing data

In [21]:
# A few options
# Remove a column or row with .drop, .dropna, or .isnull

#drop column
"""df_netflix.drop('director', axis=1, inplace=True)

not the most ideal because there might be useful data in the director column we'll miss out on
"""

#drop row
"""no_director = df_netflix[df_netflix['director'].isnull()].index
df_netflix.drop(no_director, axis=0, inplace=True)

this drops the rows where the director column is null
"""

# ~ + .isnull()
df_netflix[~(df_netflix['director'].isnull())]

#dropna()
df_netflix.dropna(subset=['director'], axis=0, inplace=True)


In [27]:
df_netflix.isnull().sum().sort_values(ascending=False)

Unnamed: 0,0
cast,356
country,171
rating,6
show_id,0
director,0
title,0
type,0
date_added,0
release_year,0
duration,0


In [32]:
#fillna() to replace NaN with the mean, median or mode

#lets use the rating column
df_netflix['rating'].mode() #most common value in the rating column
#fillna takes an int, so we need to convert the mode
mode = ''.join(df_netflix['rating'].mode())
df_netflix['rating'].fillna(mode, inplace=True)


In [35]:
df_netflix.isnull().mean().sort_values(ascending=False)

Unnamed: 0,0
cast,0.08347
country,0.040094
show_id,0.0
type,0.0
director,0.0
title,0.0
date_added,0.0
release_year,0.0
rating,0.0
duration,0.0


In [41]:
#Replace with an arbitrary number with fillna()
#we'll use the duration column for examole sake. it has no null values

#df_netflix['duration'].fillna(0, inplace=True)

#backward, foward
#df_netflix.fillna(method='bfill')
#df_netflix.fillna(method='ffill')

In [49]:
#Extracting data from 'duration' column with split() and extract() method

#the 'type' column has both netflix movies and series, we can separate them by
df_movie = df_netflix[df_netflix['type'] == 'Movie']
df_movie['duration'] #this gives us the movies duration but theres an issue

#to remove the text from the 'duration' column leaving only int
df_movie['minutes'] = df_movie['duration'].str.split(' ').str.get(0).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movie['minutes'] = df_movie['duration'].str.split(' ').str.get(0).astype('int')


In [50]:
df_movie

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,minutes
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...,90
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...,99
6,70304989,Movie,Automata,Gabe Ibáñez,"Antonio Banderas, Dylan McDermott, Melanie Gri...","Bulgaria, United States, Spain, Canada","September 8, 2017",2014,R,110 min,"International Movies, Sci-Fi & Fantasy, Thrillers","In a dystopian future, an insurance adjuster f...",110
7,80164077,Movie,Fabrizio Copano: Solo pienso en mi,"Rodrigo Toro, Francisco Schultz",Fabrizio Copano,Chile,"September 8, 2017",2017,TV-MA,60 min,Stand-Up Comedy,Fabrizio Copano takes audience participation t...,60
9,70304990,Movie,Good People,Henrik Ruben Genz,"James Franco, Kate Hudson, Tom Wilkinson, Omar...","United States, United Kingdom, Denmark, Sweden","September 8, 2017",2014,R,90 min,"Action & Adventure, Thrillers",A struggling couple can't believe their luck w...,90
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5575,80093107,Movie,Toro,Kike Maíllo,"Mario Casas, Luis Tosar, José Sacristán, Claud...",Spain,"April 1, 2017",2016,NR,106 min,"Dramas, International Movies, Thrillers",Ex-con Toro's brother and former partner in cr...,106
5577,80085438,Movie,Frank and Cindy,G.J. Echternkamp,,United States,"April 1, 2016",2007,TV-MA,70 min,Documentaries,Frank was a rising pop star when he married Ci...,70
5578,80085439,Movie,Frank and Cindy,G.J. Echternkamp,"Rene Russo, Oliver Platt, Johnny Simmons, Jane...",United States,"April 1, 2016",2015,R,102 min,"Comedies, Dramas, Independent Movies",A student filmmaker vengefully turns his camer...,102
5579,80011846,Movie,Iverson,Zatella Beatty,Allen Iverson,United States,"April 1, 2016",2014,NR,88 min,"Documentaries, Sports Movies",This unfiltered documentary follows the rocky ...,88


In [55]:
df_movie.dtypes

Unnamed: 0,0
show_id,int64
type,object
title,object
director,object
cast,object
country,object
date_added,object
release_year,int64
rating,object
duration,object


In [57]:
#we can do the same for the 'date_added' column

#df_movie['date_added'].str.split(',').str.get(0)
#df_movie['date_added'].str.extract('(\d{4})')