In [127]:
#Dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
pd.set_option('max_colwidth', 400)

## DATA EXTRACTION
---
- Define the path to the netflix_titles_csv
- Read the file and store it as a pandas dataframe

In [130]:
#Define the path to the file
netflix_data = Path("/Users/francoiseelismbazoaokala/Documents/DATA-ENGINEERING-PROJECT/netflix_titles.csv")

#read the file and store into a pandas dataframe
netflix_data_df = pd.read_csv(netflix_data )

#Take a glimpse at the dataset
netflix_data_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,25-Sep-21,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",South Africa,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",,24-Sep-21,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
3,s4,TV Show,Jailbirds New Orleans,,,,24-Sep-21,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life."


## DATA CLEANING
---
**In this section , we'll perform the following task.**
- Get the summary of the the dataset to ensure no lost data encurred during the extraction
- Drop unwanted columns('description' and 'release_year','country','duration','listed_in','description)
- remove all the null values
- format the 'date_added'columns
- Rename and reorder all the columns
- save the cleaned dataframe as netflix_df.csv CSV file.

In [133]:
#Get the summary of the dataset
netflix_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [135]:
#Drop unwanted columns and remove all the null values
reduced_df = netflix_data_df[['show_id','type','title','director','cast','date_added','rating']]
reduced_df = reduced_df.dropna()
reduced_df.head()

Unnamed: 0,show_id,type,title,director,cast,date_added,rating
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",24-Sep-21,TV-MA
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",24-Sep-21,TV-MA
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",24-Sep-21,PG
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri",24-Sep-21,TV-MA
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood",24-Sep-21,TV-14


In [137]:
#check the data count to ensure that all the noises were dropped.
reduced_df.count()

show_id       5699
type          5699
title         5699
director      5699
cast          5699
date_added    5699
rating        5699
dtype: int64

In [139]:
#Format the date_added column and get it in the format(dd-mm-yyy)
reduced_df['date_added'] = pd.to_datetime(reduced_df['date_added'], format='mixed')
reduced_df['date_added'] = reduced_df['date_added'].dt.strftime('%Y-%m-%d')
reduced_df.head()

Unnamed: 0,show_id,type,title,director,cast,date_added,rating
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",2021-09-24,TV-MA
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",2021-09-24,TV-MA
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",2021-09-24,PG
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri",2021-09-24,TV-MA
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood",2021-09-24,TV-14


In [141]:
#Capitalize and rename the columns for a more appealing display
netflix_df = reduced_df.rename(columns = {'show_id':'Show_ID','type':'Categories','title':'Titles','director':'Directors','cast':'Actors',
                                          'date_added':'Date_Added','rating':'Ratings'})

netflix_df.head()                               

Unnamed: 0,Show_ID,Categories,Titles,Directors,Actors,Date_Added,Ratings
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",2021-09-24,TV-MA
5,s6,TV Show,Midnight Mass,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",2021-09-24,TV-MA
6,s7,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",2021-09-24,PG
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri",2021-09-24,TV-MA
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood",2021-09-24,TV-14


In [143]:
#Reorder the columns
netflix_df = netflix_df[['Show_ID','Titles','Categories','Date_Added','Ratings','Directors','Actors']]
netflix_df.head()

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Ratings,Directors,Actors
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera"
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver"
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr"
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri"
8,s9,The Great British Baking Show,TV Show,2021-09-24,TV-14,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood"


In [145]:
# save the cleaned dataframe as netflix_df.csv CSV file
netflix_df.to_csv("/Users/francoiseelismbazoaokala/Documents/DATA-ENGINEERING-PROJECT/output1", index=False)

## CREATE THE MOVIE DATAFRAME
---
**Create a movie_df dataframe that has the following colums.**
- Title_ID
- Titles
- Categories
- Date_Added
- Ratings

In [148]:
# Create a copy of the netflix_df DataFrame and name it movie_data_df
movie_data_df = netflix_df.copy()
movie_data_df.head()

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Ratings,Directors,Actors
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera"
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver"
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr"
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri"
8,s9,The Great British Baking Show,TV Show,2021-09-24,TV-14,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood"


In [150]:
# Extract the column of interest('Title','Category','Date_Added','Rating')
movie_data_df  = movie_data_df[['Titles','Categories','Date_Added','Ratings']]
movie_data_df.head()

Unnamed: 0,Titles,Categories,Date_Added,Ratings
2,Ganglands,TV Show,2021-09-24,TV-MA
5,Midnight Mass,TV Show,2021-09-24,TV-MA
6,My Little Pony: A New Generation,Movie,2021-09-24,PG
7,Sankofa,Movie,2021-09-24,TV-MA
8,The Great British Baking Show,TV Show,2021-09-24,TV-14


In [152]:
# Get the unique titles in a list
titles = movie_data_df['Titles'].tolist()
# Get the number of distinct values in the genre list.
title_count = len(titles)
# Create a numpy array of the length of the title_count list.
np_title = np.arange(1,5700)
# Use a list comprehension to add "tid" to each title.
title_id = [f'tid{x}' for x in range(1,5700)]
#create a dataframe with the title and title_id columns
title_df = pd.DataFrame({'Title_ID':title_id , 'Titles':titles})
title_df.head()

Unnamed: 0,Title_ID,Titles
0,tid1,Ganglands
1,tid2,Midnight Mass
2,tid3,My Little Pony: A New Generation
3,tid4,Sankofa
4,tid5,The Great British Baking Show


In [154]:
#merge title-df and the movie_data_df into a new dataframe named movie_df
movie_df = pd.merge(title_df,movie_data_df,on='Titles')
movie_df.head()

Unnamed: 0,Title_ID,Titles,Categories,Date_Added,Ratings
0,tid1,Ganglands,TV Show,2021-09-24,TV-MA
1,tid2,Midnight Mass,TV Show,2021-09-24,TV-MA
2,tid3,My Little Pony: A New Generation,Movie,2021-09-24,PG
3,tid4,Sankofa,Movie,2021-09-24,TV-MA
4,tid5,The Great British Baking Show,TV Show,2021-09-24,TV-14


In [156]:
#export the movie_df to a csv file as movie_df.csv
movie_df.to_csv("/Users/francoiseelismbazoaokala/Documents/DATA-ENGINEERING-PROJECT/output2", index=False)

## CREATE THE TITLES DATAFRAME
---
**Create the titles_df dataframe that has the following columns.**
- Title_ID
- Show_ID
- Titles	 
- Category


In [159]:
#import a copy of the movie_df and netflix_df
df1 = movie_df.copy()
df2 = netflix_df.copy()

In [161]:
#merge both dataframe on titles column and display the output
titles_data_df = pd.merge(df2,df1)
titles_data_df.head()

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Ratings,Directors,Actors,Title_ID
0,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",tid1
1,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver",tid2
2,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr",tid3
3,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri",tid4
4,s9,The Great British Baking Show,TV Show,2021-09-24,TV-14,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood",tid5


In [163]:
#extract the columns 'Show_ID','Titles','Categories','Title_ID'
titles_df = titles_data_df[['Show_ID','Title_ID','Categories','Titles']]
titles_df.head()

Unnamed: 0,Show_ID,Title_ID,Categories,Titles
0,s3,tid1,TV Show,Ganglands
1,s6,tid2,TV Show,Midnight Mass
2,s7,tid3,Movie,My Little Pony: A New Generation
3,s8,tid4,Movie,Sankofa
4,s9,tid5,TV Show,The Great British Baking Show


In [165]:
#export the titles_df to a csv file as titles_df.csv
titles_df.to_csv("/Users/francoiseelismbazoaokala/Documents/DATA-ENGINEERING-PROJECT/output3", index=False)

## CREATE THE DIRECTOR DATAFRAME
---
**Create the director table with the following columns**
- Show_ID
- Titles
- Categories
- Date_Added
- First_Name
- Last_Name
  The first and last name columns will derive from the Director column

In [168]:
 #Create a copy of the  DataFrame and name it cast_data_df
director_data_df = netflix_df.copy()
director_data_df.head()

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Ratings,Directors,Actors
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera"
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"Kate Siegel, Zach Gilford, Hamish Linklater, Henry Thomas, Kristin Lehman, Samantha Sloyan, Igby Rigney, Rahul Kohli, Annarah Cymone, Annabeth Gish, Alex Essoe, Rahul Abburi, Matt Biedel, Michael Trucco, Crystal Balint, Louis Oliver"
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Vanessa Hudgens, Kimiko Glenn, James Marsden, Sofia Carson, Liza Koshy, Ken Jeong, Elizabeth Perkins, Jane Krakowski, Michael McKean, Phil LaMarr"
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra Duah, Nick Medley, Mutabaruka, Afemo Omilami, Reggie Carter, Mzuri"
8,s9,The Great British Baking Show,TV Show,2021-09-24,TV-14,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Hollywood"


In [170]:
#Extract the columns of interest
director_data_df = director_data_df[['Show_ID','Titles','Categories','Date_Added','Directors']]
director_data_df.head()

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Directors
2,s3,Ganglands,TV Show,2021-09-24,Julien Leclercq
5,s6,Midnight Mass,TV Show,2021-09-24,Mike Flanagan
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,"Robert Cullen, José Luis Ucha"
7,s8,Sankofa,Movie,2021-09-24,Haile Gerima
8,s9,The Great British Baking Show,TV Show,2021-09-24,Andy Devonshire


In [172]:
director_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5699 entries, 2 to 8806
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Show_ID     5699 non-null   object
 1   Titles      5699 non-null   object
 2   Categories  5699 non-null   object
 3   Date_Added  5699 non-null   object
 4   Directors   5699 non-null   object
dtypes: object(5)
memory usage: 267.1+ KB


In [174]:
# Split the cast coulumn into 2 colums and name each First_Name and 'Last_Name' respectively
#store the actors column in a list
cast_list = director_data_df['Directors'].tolist()

#extract individual names from the director_list
name_list = []
for value in cast_list:
      full_name = value.split(',')
      name_list.append(full_name[-1])
      
#create a variable to store a dictionanry.
actors_dict={}
actors_dict['Directors']=name_list

#create a dataframe
data_df = pd.DataFrame(actors_dict)


#split the names column into 2 columns named First_name and Last_name
data_df[['First_Name','Last_Name']] =data_df['Directors'].str.split(' ',n=1,expand=True)

#Display the dataframe
data_df.head()

Unnamed: 0,Directors,First_Name,Last_Name
0,Julien Leclercq,Julien,Leclercq
1,Mike Flanagan,Mike,Flanagan
2,José Luis Ucha,,José Luis Ucha
3,Haile Gerima,Haile,Gerima
4,Andy Devonshire,Andy,Devonshire


In [176]:
data_df['Last_Name'].fillna(0).count()

5699

In [178]:
#merge the data_df and director_data_df dataframes
director_df = pd.merge(data_df,director_data_df).drop_duplicates()
director_df.head()

Unnamed: 0,Directors,First_Name,Last_Name,Show_ID,Titles,Categories,Date_Added
0,Julien Leclercq,Julien,Leclercq,s3,Ganglands,TV Show,2021-09-24
1,Julien Leclercq,Julien,Leclercq,s1237,Sentinelle,Movie,2021-03-05
2,Julien Leclercq,Julien,Leclercq,s2669,Earth and Blood,Movie,2020-04-17
9,Mike Flanagan,Mike,Flanagan,s6,Midnight Mass,TV Show,2021-09-24
10,Mike Flanagan,Mike,Flanagan,s5092,Before I Wake,Movie,2018-01-05


In [180]:
#drop the director column
director_df = director_df[['First_Name','Last_Name','Show_ID','Titles','Categories','Date_Added']]
director_df.head()

Unnamed: 0,First_Name,Last_Name,Show_ID,Titles,Categories,Date_Added
0,Julien,Leclercq,s3,Ganglands,TV Show,2021-09-24
1,Julien,Leclercq,s1237,Sentinelle,Movie,2021-03-05
2,Julien,Leclercq,s2669,Earth and Blood,Movie,2020-04-17
9,Mike,Flanagan,s6,Midnight Mass,TV Show,2021-09-24
10,Mike,Flanagan,s5092,Before I Wake,Movie,2018-01-05


In [182]:
#create a unique id column
titles = director_df['Titles']
#get the count of the titles
titles_count = director_df['Titles'].count()
#create a numpy array of the length of titles
#np_tid = np.arange(1,5195)
np_titles = np.arange(1,5195)
#set the id 
ID = [f'100{x}'for x in range(1,5195)]
#add the id  column to the director_df
id_df= pd.DataFrame({'IDs':ID,'Titles':titles})
# concatenating id_df and director_df along columns
director_df = pd.merge(id_df,director_df,on='Titles')
#display the first5 row
director_df.head()


Unnamed: 0,IDs,Titles,First_Name,Last_Name,Show_ID,Categories,Date_Added
0,1001,Ganglands,Julien,Leclercq,s3,TV Show,2021-09-24
1,1002,Sentinelle,Julien,Leclercq,s1237,Movie,2021-03-05
2,1003,Earth and Blood,Julien,Leclercq,s2669,Movie,2020-04-17
3,1004,Midnight Mass,Mike,Flanagan,s6,TV Show,2021-09-24
4,1005,Before I Wake,Mike,Flanagan,s5092,Movie,2018-01-05


In [184]:
#Export the new dataframe as cast_df.csv to a csv file.
director_df.to_csv("/Users/francoiseelismbazoaokala/Documents/DATA-ENGINEERING-PROJECT/output4", index=False)