In [1]:
#Dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
pd.set_option('max_colwidth', 400)

## DATA EXTRACTION
--
- Define the path to the netflix_titles_csv
- Read the file and store it as a pandas dataframe

In [4]:
#Define the path to the file
netflix_data = Path("/Users/francoiseelismbazoaokala/Documents/DATA-ENGINEERING-PROJECT/netflix_titles.csv")

#read the file and store into a pandas dataframe
netflix_data_df = pd.read_csv(netflix_data )

#Take a glimpse at the dataset
netflix_data_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,25-Sep-21,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",South Africa,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",,24-Sep-21,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
3,s4,TV Show,Jailbirds New Orleans,,,,24-Sep-21,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life."


## DATA CLEANING
--
**We'll the perform the following task.**
- Get the summary of the the dataset to ensure no lost data encurred during the extraction
- Drop unwanted columns('description' and 'release_year','country','duration','listed_in','description)
- remove all the null values
- format the 'date_added'columns
- Rename and reorder all the columns
- save the cleaned dataframe as netflix_cleaned_df.csv CSV file.

In [7]:
#Get the summary of the dataset
netflix_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [9]:
#Drop unwanted columns and remove all the null values
reduced_df = netflix_data_df[['show_id','type','title','director','cast','date_added','rating']]
reduced_df = reduced_df.dropna()
reduced_df.head()

Unnamed: 0,show_id,type,title,director,cast,date_added,rating
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",24-Sep-21,TV-MA
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",24-Sep-21,TV-MA
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",24-Sep-21,PG
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri",24-Sep-21,TV-MA
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood",24-Sep-21,TV-14


In [11]:
#check the data count
reduced_df.count()

show_id       5699
type          5699
title         5699
director      5699
cast          5699
date_added    5699
rating        5699
dtype: int64

In [13]:
#Format the date_added column and get it in the format(dd-mm-yyy)
reduced_df['date_added'] = pd.to_datetime(reduced_df['date_added'], format='mixed')
reduced_df['date_added'] = reduced_df['date_added'].dt.strftime('%Y-%m-%d')
reduced_df.head()

Unnamed: 0,show_id,type,title,director,cast,date_added,rating
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",2021-09-24,TV-MA
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",2021-09-24,TV-MA
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",2021-09-24,PG
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri",2021-09-24,TV-MA
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood",2021-09-24,TV-14


In [15]:
#Capitalize and rename the columns for a more appealing display
netflix_df = reduced_df.rename(columns = {'show_id':'Show_ID','type':'Category','title':'Title','director':'Director','cast':'Actors',
                                          'date_added':'Date_Added','rating':'Rating'})

netflix_df.head()                               

Unnamed: 0,Show_ID,Category,Title,Director,Actors,Date_Added,Rating
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",2021-09-24,TV-MA
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",2021-09-24,TV-MA
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",2021-09-24,PG
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri",2021-09-24,TV-MA
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood",2021-09-24,TV-14


In [17]:
#Reorder the columns
netflix_df = netflix_df[['Show_ID','Title','Category','Date_Added','Rating','Director','Actors']]
netflix_df.head()

Unnamed: 0,Show_ID,Title,Category,Date_Added,Rating,Director,Actors
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera"
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver"
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr"
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri"
8,s9,The Great British Baking Show,TV Show,2021-09-24,TV-14,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood"


In [21]:
# save the cleaned dataframe as netflix_df.csv CSV file
netflix_df.to_csv("/Users/francoiseelismbazoaokala/Documents/DATA-ENGINEERING-PROJECT/output1", index=False)

## CREATE THE MOVIE TABLE
--
** Create a movie_df dataframe that has the following colums
- Title_ID
- Title
- Category
- Date_Added
- Rating

In [24]:
# Create a copy of the netflix_cleaned_df DataFrame and name it movie_data_df
movie_data_df = netflix_df.copy()
movie_data_df.head()

Unnamed: 0,Show_ID,Title,Category,Date_Added,Rating,Director,Actors
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera"
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver"
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr"
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri"
8,s9,The Great British Baking Show,TV Show,2021-09-24,TV-14,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood"


In [26]:
# Extract the column of interest('Title','Category','Date_Added','Rating')
movie_data_df  = movie_data_df[['Title','Category','Date_Added','Rating']]
movie_data_df.head()

Unnamed: 0,Title,Category,Date_Added,Rating
2,Ganglands,TV Show,2021-09-24,TV-MA
5,Midnight Mass,TV Show,2021-09-24,TV-MA
6,My Little Pony: A New Generation,Movie,2021-09-24,PG
7,Sankofa,Movie,2021-09-24,TV-MA
8,The Great British Baking Show,TV Show,2021-09-24,TV-14


In [28]:
# Get the unique genres in a list
titles = movie_data_df['Title'].tolist()
# Get the number of distinct values in the genre list.
title_count = len(titles)
# Create a numpy array of the length of the title_count list.
np_title = np.arange(1,5700)
# Use a list comprehension to add "tid" to each title.
title_id = [f'tid{x}' for x in range(1,5700)]
#create a dataframe with the title and title_id columns
title_df = pd.DataFrame({'Titles_ID':title_id , 'Title':titles})
title_df.head()

Unnamed: 0,Titles_ID,Title
0,tid1,Ganglands
1,tid2,Midnight Mass
2,tid3,My Little Pony: A New Generation
3,tid4,Sankofa
4,tid5,The Great British Baking Show


In [34]:
#merge title-df and the movie_data_df into a new dataframe named movie_df
movie_df = pd.merge(title_df,movie_data_df,on='Title')
movie_df.head()

Unnamed: 0,Titles_ID,Title,Category,Date_Added,Rating
0,tid1,Ganglands,TV Show,2021-09-24,TV-MA
1,tid2,Midnight Mass,TV Show,2021-09-24,TV-MA
2,tid3,My Little Pony: A New Generation,Movie,2021-09-24,PG
3,tid4,Sankofa,Movie,2021-09-24,TV-MA
4,tid5,The Great British Baking Show,TV Show,2021-09-24,TV-14


In [36]:
#Export the new dataframe as genre_df.csv to a csv file.
movie_df.to_csv("/Users/francoiseelismbazoaokala/Documents/DATA-ENGINEERING-PROJECT/output2", index=False)

##CREATE THE CAST TABLE
--
** Create the cast_data_df with the following columns
- Show_ID
- Title
- First_Name
- Last_Name
  

In [39]:
 #Create a copy of the  DataFrame and name it cast_data_df
director_data_df = netflix_df.copy()
director_data_df.head()

Unnamed: 0,Show_ID,Title,Category,Date_Added,Rating,Director,Actors
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera"
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver"
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr"
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri"
8,s9,The Great British Baking Show,TV Show,2021-09-24,TV-14,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood"


In [52]:
#Grab the following columns['Show_ID','Title','Actors']
cast_data_df = netflix_df[['Show_ID','Title','Actors']]
cast_data_df.head()

Unnamed: 0,Show_ID,Title,Actors
2,s3,Ganglands,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera"
5,s6,Midnight Mass,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver"
6,s7,My Little Pony: A New Generation,"Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr"
7,s8,Sankofa,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri"
8,s9,The Great British Baking Show,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood"


In [81]:
# Split the cast coulumn into 2 colums and name each First_Name and 'Last_Name' respectively
#store the actors column in to a list
cast_list = cast_data_df['Actors'].tolist()
actors = []
for actor_list in cast_list:
    temp = actor_list.split(',')
    for i in temp:
        actors.append(i.strip())
        
#Add the newly created columns to the dataframe and name it cast_df
first_name = []
last_name = []

for name in actors:
     full_name = name.split(' ')
#    print(full_name)
    
     first_name.append(full_name[0])
     last_name.append(full_name[-1])
     print(first_name)
     print(f'-----------------------')
     print(last_name)
#add the names columns to the cast_data_df by a merge
#pd = pd.DataFrame({'First_name':first_name})
#pd

#split the names column into 2 columns named First_name and Last_name

#re_organize the columns  

#Display the dataframe


IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [82]:

     print(first_name)
     print(f'-----------------------')
     print(last_name)

['Sami', 'Tracy', 'Samuel', 'Nabiha', 'Sofia', 'Salim', 'Noureddine', 'Geert', 'Bakary', 'Kate', 'Zach', 'Hamish', 'Henry', 'Kristin', 'Samantha', 'Igby', 'Rahul', 'Annarah', 'Annabeth', 'Alex', 'Rahul', 'Matt', 'Michael', 'Crystal', 'Louis', 'Vanessa', 'Kimiko', 'James', 'Sofia', 'Liza', 'Ken', 'Elizabeth', 'Jane', 'Michael', 'Phil', 'Kofi', 'Oyafunmike', 'Alexandra', 'Nick', 'Mutabaruka', 'Afemo', 'Reggie', 'Mzuri', 'Mel', 'Sue', 'Mary', 'Paul', 'Melissa', 'Chris', 'Kevin', 'Timothy', 'Daveed', 'Skyler', 'Laura', 'Rosalind', 'Kimberly', 'Loretta', 'Ravi', 'Sukollawat', 'Sushar', 'Pavarit', 'Sahajak', 'Suthipongse', 'Bhasaworn', 'Daweerit', 'Waratthaya', 'Kittiphoom', 'Abhicha', 'Nophand', 'Kittipong', 'Arisara', 'Jaytiya', 'Pantipa', 'Panupan', 'Kungtap', 'Phumphat', 'Issara', 'Keerati', 'Panjai', 'Supranee', 'Suda', 'Visaka', 'Pitchatorn', 'Luna', 'Jannis', 'Milan', 'Edin', 'Anna', 'Marlon', 'Victor', 'Fleur', 'Aziz', 'Mélanie', 'Elizaveta', 'Klara', 'Lucca', 'Júlia', 'Marcus', 'Kir

In [None]:
#Export the new dataframe as cast_df.csv to a csv file.