In [489]:
#Dependencies
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
pd.set_option('max_colwidth', 400)


# DATA EXTRACTION
---
- Define the path to the netflix_titles_csv
- Read the file and store it as a pandas dataframe

In [610]:
#Define the path to the file
netflix_data = Path("/Users/francoiseelismbazoaokala/Documents/projects/DATA-ENGINEERING-PROJECT/Resources/netflix_titles.csv")

#read the file and store into a pandas dataframe
netflix_data_df = pd.read_csv(netflix_data )

#Take a glimpse at the dataset
netflix_data_df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,25-Sep-21,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Molaba, Dillon Windvogel, Natasha Thahane, Arno Greeff, Xolile Tshabalala, Getmore Sithole, Cindy Mahlangu, Ryle De Morny, Greteli Fincham, Sello Maake Ka-Ncube, Odwa Gwanya, Mekaila Mathys, Sandi Schultz, Duane Williams, Shamilla Miller, Patrick Mofokeng",South Africa,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town teen sets out to prove whether a private-school swimming star is her sister who was abducted at birth."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabiha Akkari, Sofia Lesaffre, Salim Kechiouche, Noureddine Farihi, Geert Van Rampelberg, Bakary Diombera",,24-Sep-21,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Action & Adventure","To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
3,s4,TV Show,Jailbirds New Orleans,,,,24-Sep-21,2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down among the incarcerated women at the Orleans Justice Center in New Orleans on this gritty reality series."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam Khan, Ahsaas Channa, Revathi Pillai, Urvi Singh, Arun Kumar",India,24-Sep-21,2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV Comedies","In a city of coaching centers known to train India’s finest collegiate minds, an earnest but unexceptional student and his friends navigate campus life."


In [612]:
#Get the summary of the dataset
netflix_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


# DATA CLEANING
---
**In this section , we'll perform the following task.**
- Get the summary of the the dataset to ensure no lost data encurred during the extraction
- Drop unwanted columns('description' and 'release_year','country','duration','listed_in','cast','description)
- remove all the null values
- remove duplicated values on the show_id column
- format the 'date_added'columns
- Rename and reorder all the columns
- save the cleaned dataframe as netflix_df.csv CSV file.

In [615]:
# Remove all the null values ,Drop unwanted columns and re_order them.
reduced_df = netflix_data_df[['show_id','title','type','date_added','rating','director','description']]
reduced_df = reduced_df.dropna()
reduced_df.head()

Unnamed: 0,show_id,title,type,date_added,rating,director,description
0,s1,Dick Johnson Is Dead,Movie,25-Sep-21,PG-13,Kirsten Johnson,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
2,s3,Ganglands,TV Show,24-Sep-21,TV-MA,Julien Leclercq,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
5,s6,Midnight Mass,TV Show,24-Sep-21,TV-MA,Mike Flanagan,"The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe."
6,s7,My Little Pony: A New Generation,Movie,24-Sep-21,PG,"Robert Cullen, José Luis Ucha","Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
7,s8,Sankofa,Movie,24-Sep-21,TV-MA,Haile Gerima,"On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past."


In [617]:
#check the data count to ensure that all the noises were dropped.
reduced_df.count()

show_id        6172
title          6172
type           6172
date_added     6172
rating         6172
director       6172
description    6172
dtype: int64

In [619]:
#Format the date_added column and get it in the format(dd-mm-yyy)
reduced_df['date_added'] = pd.to_datetime(reduced_df['date_added'], format='mixed')
reduced_df['date_added'] = reduced_df['date_added'].dt.strftime('%Y-%m-%d')
reduced_df.head()

Unnamed: 0,show_id,title,type,date_added,rating,director,description
0,s1,Dick Johnson Is Dead,Movie,2021-09-25,PG-13,Kirsten Johnson,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe."
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past."


In [621]:
#Capitalize and rename the columns for a more appealing display
netflix_df = reduced_df.rename(columns = {'show_id':'Show_ID','title':'Titles','type':'Categories',
                                'date_added':'Date_Added','rating':'Ratings','director':'Directors','description':'Description'})

netflix_df.head()                               

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Ratings,Directors,Description
0,s1,Dick Johnson Is Dead,Movie,2021-09-25,PG-13,Kirsten Johnson,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe."
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past."


In [623]:
# save the cleaned dataframe as netflix_df.csv CSV file
netflix_df.to_csv("/Users/francoiseelismbazoaokala/Documents/projects/DATA-ENGINEERING-PROJECT/output-files/netflix.csv", index=False)

# DATA TRANSFORMATION
---

### CREATE THE MOVIE and DATAFRAME
---

**Create a movie_df and titles_df ataframes that has the following colums.**
- Title_ID
- Titles
- Categories
- Date_Added
- Ratings

In [628]:
# Create a copy of the netflix_df DataFrame and name it movie_data_df
movie_data_df = netflix_df.copy()
movie_data_df.head()

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Ratings,Directors,Description
0,s1,Dick Johnson Is Dead,Movie,2021-09-25,PG-13,Kirsten Johnson,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe."
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past."


In [630]:
# Extract the columns of interest('Title','Category','Date_Added','Rating')
movie_data_df  = movie_data_df[['Titles','Categories','Date_Added','Ratings','Description']]
movie_data_df.head()

Unnamed: 0,Titles,Categories,Date_Added,Ratings,Description
0,Dick Johnson Is Dead,Movie,2021-09-25,PG-13,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
2,Ganglands,TV Show,2021-09-24,TV-MA,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
5,Midnight Mass,TV Show,2021-09-24,TV-MA,"The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe."
6,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
7,Sankofa,Movie,2021-09-24,TV-MA,"On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past."


In [632]:
# Get the unique titles in a list
titles = movie_data_df['Titles'].tolist()
# Get the number of distinct values in the titles list.
title_count = len(titles)
# Create a numpy array of the length of the title_count list.
np_title = np.arange(1,6173)
# Use a list comprehension to add "tid" to each title.
title_id = [f'tid{x}' for x in range(1,6173)]
#create a dataframe with the title and title_id columns
title_df = pd.DataFrame({'Title_ID':title_id , 'Titles':titles})
title_df.head()

Unnamed: 0,Title_ID,Titles
0,tid1,Dick Johnson Is Dead
1,tid2,Ganglands
2,tid3,Midnight Mass
3,tid4,My Little Pony: A New Generation
4,tid5,Sankofa


In [634]:
#merge title-df and the movie_data_df into a new dataframe named movie_df
movie_df = pd.merge(title_df,movie_data_df,on='Titles',how = 'right')
movie_df.head()

Unnamed: 0,Title_ID,Titles,Categories,Date_Added,Ratings,Description
0,tid1,Dick Johnson Is Dead,Movie,2021-09-25,PG-13,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
1,tid2,Ganglands,TV Show,2021-09-24,TV-MA,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
2,tid3,Midnight Mass,TV Show,2021-09-24,TV-MA,"The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe."
3,tid4,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
4,tid5,Sankofa,Movie,2021-09-24,TV-MA,"On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past."


In [636]:
#export the movie_df to a csv file as movie_df.csv
movie_df.to_csv("/Users/francoiseelismbazoaokala/Documents/projects/DATA-ENGINEERING-PROJECT/output-files/movie.csv", index=False)

### CREATE THE TITLES DATAFRAME
---
**Create the titles_df dataframe that has the following columns.**
- Title_ID
- Show_ID
- Titles	 
- Categories


In [639]:
#import a copy of the movie_df and netflix_df
df1 = movie_df.copy()
df2 = netflix_df.copy()

In [641]:
#merge both dataframe on titles column and display the output
titles_data_df = pd.merge(df2,df1)
titles_data_df.head()

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Ratings,Directors,Description,Title_ID
0,s1,Dick Johnson Is Dead,Movie,2021-09-25,PG-13,Kirsten Johnson,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable.",tid1
1,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war.",tid2
2,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe.",tid3
3,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it.",tid4
4,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past.",tid5


In [643]:
#extract and reorganize the columns 'Show_ID','Titles','Categories','Title_ID'
titles_data_df = titles_data_df[['Show_ID','Title_ID','Titles','Categories']]
titles_data_df.head()

Unnamed: 0,Show_ID,Title_ID,Titles,Categories
0,s1,tid1,Dick Johnson Is Dead,Movie
1,s3,tid2,Ganglands,TV Show
2,s6,tid3,Midnight Mass,TV Show
3,s7,tid4,My Little Pony: A New Generation,Movie
4,s8,tid5,Sankofa,Movie


In [645]:
#check the data count
titles_data_df.count()

Show_ID       6184
Title_ID      6184
Titles        6184
Categories    6184
dtype: int64

In [647]:
#check for duplicate values
duplicate_titles = titles_data_df.loc[titles_data_df.duplicated(subset=['Show_ID','Title_ID']),'Title_ID'].unique()
duplicate_titles


array(['tid2572', 'tid3910', 'tid2932', 'tid3909'], dtype=object)

In [649]:
#remove duplicated Title_ID
titles_df = titles_data_df[titles_data_df.isin(duplicate_titles)==False]
#drop all null values
titles_df = titles_df.dropna()
#Check the new data count
titles_df.count()

Show_ID       6168
Title_ID      6168
Titles        6168
Categories    6168
dtype: int64

#the title_id column has less values than the other columns signaling some null values

In [652]:
#Display the final titles_df dataframe
titles_df.head()

Unnamed: 0,Show_ID,Title_ID,Titles,Categories
0,s1,tid1,Dick Johnson Is Dead,Movie
1,s3,tid2,Ganglands,TV Show
2,s6,tid3,Midnight Mass,TV Show
3,s7,tid4,My Little Pony: A New Generation,Movie
4,s8,tid5,Sankofa,Movie


In [654]:
#export the titles_df to a csv file as titles_df.csv
titles_df.to_csv("/Users/francoiseelismbazoaokala/Documents/projects/DATA-ENGINEERING-PROJECT/output-files/titles.csv", index=False)

### CREATE THE DIRECTOR DATAFRAME
---
**Create the director table with the following columns**
- Show_ID
- Titles
- Categories
- Date_Added
- First_Name
- Last_Name
- Ratings
  The first and last name columns will derive from the Director column

In [657]:
 #Create a copy of the  DataFrame and name it cast_data_df
director_data_df = netflix_df.copy()
director_data_df.head()

Unnamed: 0,Show_ID,Titles,Categories,Date_Added,Ratings,Directors,Description
0,s1,Dick Johnson Is Dead,Movie,2021-09-25,PG-13,Kirsten Johnson,"As her father nears the end of his life, filmmaker Kirsten Johnson stages his death in inventive and comical ways to help them both face the inevitable."
2,s3,Ganglands,TV Show,2021-09-24,TV-MA,Julien Leclercq,"To protect his family from a powerful drug lord, skilled thief Mehdi and his expert team of robbers are pulled into a violent and deadly turf war."
5,s6,Midnight Mass,TV Show,2021-09-24,TV-MA,Mike Flanagan,"The arrival of a charismatic young priest brings glorious miracles, ominous mysteries and renewed religious fervor to a dying town desperate to believe."
6,s7,My Little Pony: A New Generation,Movie,2021-09-24,PG,"Robert Cullen, José Luis Ucha","Equestria's divided. But a bright-eyed hero believes Earth Ponies, Pegasi and Unicorns should be pals — and, hoof to heart, she’s determined to prove it."
7,s8,Sankofa,Movie,2021-09-24,TV-MA,Haile Gerima,"On a photo shoot in Ghana, an American model slips back in time, becomes enslaved on a plantation and bears witness to the agony of her ancestral past."


In [659]:
#Extract the columns of interest
director_data_df = director_data_df[['Show_ID','Titles','Ratings','Categories','Date_Added','Directors']]
director_data_df.head()

Unnamed: 0,Show_ID,Titles,Ratings,Categories,Date_Added,Directors
0,s1,Dick Johnson Is Dead,PG-13,Movie,2021-09-25,Kirsten Johnson
2,s3,Ganglands,TV-MA,TV Show,2021-09-24,Julien Leclercq
5,s6,Midnight Mass,TV-MA,TV Show,2021-09-24,Mike Flanagan
6,s7,My Little Pony: A New Generation,PG,Movie,2021-09-24,"Robert Cullen, José Luis Ucha"
7,s8,Sankofa,TV-MA,Movie,2021-09-24,Haile Gerima


In [661]:
director_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6172 entries, 0 to 8806
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Show_ID     6172 non-null   object
 1   Titles      6172 non-null   object
 2   Ratings     6172 non-null   object
 3   Categories  6172 non-null   object
 4   Date_Added  6172 non-null   object
 5   Directors   6172 non-null   object
dtypes: object(6)
memory usage: 337.5+ KB


In [663]:
# Split the director coulumn into 2 colums and name each First_Name and 'Last_Name' respectively
#store the directors column in a list
cast_list = director_data_df['Directors'].tolist()

#extract individual names from the director_list
name_list = []
for value in cast_list:
      full_name = value.split(',')
      name_list.append(full_name[-1])
      
#create a variable to store a dictionanry.
actors_dict={}
actors_dict['Directors']=name_list

#create a dataframe
data_df = pd.DataFrame(actors_dict)


#split the names column into 2 columns named First_name and Last_name
data_df[['First_Name','Last_Name']] =data_df['Directors'].str.split(' ',n=1,expand=True)

#Display the dataframe
data_df.head()

Unnamed: 0,Directors,First_Name,Last_Name
0,Kirsten Johnson,Kirsten,Johnson
1,Julien Leclercq,Julien,Leclercq
2,Mike Flanagan,Mike,Flanagan
3,José Luis Ucha,,José Luis Ucha
4,Haile Gerima,Haile,Gerima


In [665]:
data_df['Last_Name'].fillna(0).count()

6172

In [667]:
#merge the data_df and director_data_df dataframes
director_df = pd.merge(data_df,director_data_df).drop_duplicates()
director_df.head()

Unnamed: 0,Directors,First_Name,Last_Name,Show_ID,Titles,Ratings,Categories,Date_Added
0,Kirsten Johnson,Kirsten,Johnson,s1,Dick Johnson Is Dead,PG-13,Movie,2021-09-25
1,Julien Leclercq,Julien,Leclercq,s3,Ganglands,TV-MA,TV Show,2021-09-24
2,Julien Leclercq,Julien,Leclercq,s1237,Sentinelle,TV-MA,Movie,2021-03-05
3,Julien Leclercq,Julien,Leclercq,s2669,Earth and Blood,TV-MA,Movie,2020-04-17
10,Mike Flanagan,Mike,Flanagan,s6,Midnight Mass,TV-MA,TV Show,2021-09-24


In [669]:
#drop the director column
director_df = director_df[['First_Name','Last_Name','Show_ID','Titles','Ratings','Categories','Date_Added']].dropna()
director_df.head()

Unnamed: 0,First_Name,Last_Name,Show_ID,Titles,Ratings,Categories,Date_Added
0,Kirsten,Johnson,s1,Dick Johnson Is Dead,PG-13,Movie,2021-09-25
1,Julien,Leclercq,s3,Ganglands,TV-MA,TV Show,2021-09-24
2,Julien,Leclercq,s1237,Sentinelle,TV-MA,Movie,2021-03-05
3,Julien,Leclercq,s2669,Earth and Blood,TV-MA,Movie,2020-04-17
10,Mike,Flanagan,s6,Midnight Mass,TV-MA,TV Show,2021-09-24


In [671]:
#create a unique id column
titles = director_df['Titles']
#get the count of the titles
titles_count = director_df['Titles'].count()
#create a numpy array of the length of titles
np_titles = np.arange(1,5494)
#set the id format
ID = [f'100{x}'for x in range(1,5494)]
#add the id  column to the director_df
id_df= pd.DataFrame({'IDs':ID,'Titles':titles})
# concatenating id_df and director_df along columns
director_df = pd.merge(id_df,director_df)
#display the first5 row
director_df.head()


Unnamed: 0,IDs,Titles,First_Name,Last_Name,Show_ID,Ratings,Categories,Date_Added
0,1001,Dick Johnson Is Dead,Kirsten,Johnson,s1,PG-13,Movie,2021-09-25
1,1002,Ganglands,Julien,Leclercq,s3,TV-MA,TV Show,2021-09-24
2,1003,Sentinelle,Julien,Leclercq,s1237,TV-MA,Movie,2021-03-05
3,1004,Earth and Blood,Julien,Leclercq,s2669,TV-MA,Movie,2020-04-17
4,1005,Midnight Mass,Mike,Flanagan,s6,TV-MA,TV Show,2021-09-24


In [673]:
#Export the new dataframe as cast_df.csv to a csv file.
director_df.to_csv("/Users/francoiseelismbazoaokala/Documents/projects/DATA-ENGINEERING-PROJECT/output-files/director.csv", index=False)

---
# DEFINING THE ENTITY RELATIONSHIP DIAGRAM (ERD)
---

In [681]:
import pandas as pd
from pandaserd import ERD

df1 = netflix_df
df1.columns = ['Show_ID','Titles','Categories','Date_Added','Ratings','Directors','Description']
df2 = movie_df
df2.columns = ['Title_ID','Titles','Categories','Date_Added','Ratings','Description']
df3 = titles_df
df3.columns = ['Show_ID','Title_ID','Categories','Titles']
df4 = director_df
df4.columns = ['IDs','Titles','First_Name','Last_Name','Show_ID','Ratings','Categories','Date_Added']



erd = ERD()
t1 = erd.add_table(df1, 'netflix', bg_color='gold')
t2 = erd.add_table(df2, 'movie', bg_color='skyblue')
t3 = erd.add_table(df3, 'titles', bg_color='lightblue')
t4 = erd.add_table(df4, 'director', bg_color='lightyellow')
erd.create_rel('netflix', 'director', on='Show_ID', right_cardinality='*',left_cardinality='+')
erd.create_rel('netflix', 'titles', on='Show_ID',  left_cardinality='+', right_cardinality='+')
erd.create_rel('movie','titles' ,on = 'Title_ID', left_cardinality='+', right_cardinality='+')
erd.write_to_file('output.txt')

written to output.txt; visit https://edotor.net/ to render ERD


---
# LODING OUR TABLES INTO A SQLITE DATABASE #
---

In [684]:
# Import SQL Alchemy
from sqlalchemy import create_engine

# Import and establish Base for which classes will be constructed
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()

# Import modules to declare columns and column data types
from sqlalchemy import Column, Integer, String, Date

  Base = declarative_base()


In [686]:
# Create the  class
class Netflix(Base):
     __tablename__ = 'netflix'
     Show_ID = Column(String, primary_key=True)
     Titles = Column(String(255))
     Categories = Column(String(10))
     Date_Added = Column(Date)
     Ratings = Column(String(10))
     Directors = Column(String(255))
     Description = Column(String(255))
    

In [688]:
#Create the Movie class
class Movie(Base):
    __tablename__='movie'
    Title_ID = Column(String, primary_key=True)
    Titles = Column(String(255))
    Categories = Column(String(10))
    Date_Added = Column(Date)
    Ratings = Column(String(10))
    Description = Column(String(255))

In [690]:
#Create the Title class
class Title(Base):
    __tablename__='Title'
    Show_ID = Column(String, primary_key=True)
    Title_ID = Column(String(20))
    Titles = Column(String(255))
    Categories = Column(String(10))

In [692]:
#Create the Director class
class Director(Base):
    __tablename__='Director'
    IDs = Column(Integer, primary_key=True)
    Titles = Column(String(255))
    First_Name = Column(String(255))
    Last_Name = Column(String(255))
    Show_ID = Column(String)
    Ratings = Column(String(10))
    Categories = Column(String(10))
    Date_Added = Column(Date)

In [700]:
# Call the Entertaiment Constructors to create the Netflix , Movie ,Titles and Director's objects
# Create a Specific Instance of each class.
title = Title(Show_ID = 's1',Title_ID = 'tid1',Titles = 'Dick Johnson Is Dead',Categories = 'Movie')
netflix = Netflix(Show_ID = 's1',Titles = 'Dick Johnson Is Dead',Categories = 'Movies',Date_Added = '2021-09-25',Ratings = 'PG-13',
                 Directors = 'Kirsten Johnson',Description = 'As her father nears the end of his life, filmmaker Kirsten Johnson \
                 stages his death in inventive and comical ways to help them both face the inevitable.')
movie = Movie(Title_ID = 'tid1',Titles = 'Dick Johnson Is Dead',Categories = 'Movie',Date_Added = '2021-09-25',Ratings = 'PG-13',
                  Description = 'As her father nears the end of his life, filmmaker Kirsten Johnson \
                  stages his death in inventive and comical ways to help them both face the inevitable.')
director = Director(IDs = '1001',Titles = 'Dick Johnson Is Dead',First_Name = 'Kirsten',Last_Name = 'Johnson',
                    Show_ID = 's1',Ratings = 'PG-13',Categories = 'Movie',Date_Added = '2021-09-25')

In [702]:
# Create a connection to a SQLite database
engine = create_engine('sqlite:///Entertainment.sqlite')

In [704]:
# Create the tables within the database
Base.metadata.create_all(engine)

In [706]:
# Push the objects made and query the server using the Session object
from sqlalchemy.orm import Session
session = Session(bind = engine)

In [708]:
# Add Records to the Entertainment database
# ----------------------------------
# Use the SQL ALchemy methods to run simple "INSERT" statements using the classes and objects  
session.add(netflix)
session.add(movie)
session.add(title)
session.add(director)
session.commit()

StatementError: (builtins.TypeError) SQLite Date type only accepts Python date objects as input.
[SQL: INSERT INTO "Director" ("IDs", "Titles", "First_Name", "Last_Name", "Show_ID", "Ratings", "Categories", "Date_Added") VALUES (?, ?, ?, ?, ?, ?, ?, ?)]
[parameters: [{'Show_ID': 's1', 'Titles': 'Dick Johnson Is Dead', 'Categories': 'Movie', 'First_Name': 'Kirsten', 'IDs': '1001', 'Ratings': 'PG-13', 'Date_Added': '2021-09-25', 'Last_Name': 'Johnson'}]]

In [710]:
# close the session
session.close()