# ETL Project:  Video Streaming Services
Our project walks through the ETL process for the top video streaming services. We aim to create a database with four (4) tables, each containing the television shows of one popular platform: Amazon Prime, Disney Plus, Hulu, and Netflix. Since each provider carries different titles with little to no overlap, our objective is to build a relational database that lets viewers who subscribe to all of them run queries with specified parameters to find something to watch and where to watch it.

We plan to unite them by Title, Genre, and User Rating.

In [None]:
# Dependencies and setup
import pandas as pd
import numpy as np
from sqlalchemy import create_engine

### Start of Amazon Prime

In [None]:
# Load the raw Amazon Prime shows CSV. The file is decoded as ISO-8859-1
# rather than with the default encoding.
amazonPrime_file = "Resources/amazon_prime_shows.csv"
amazonPrimeRaw_df = pd.read_csv(
    filepath_or_buffer=amazonPrime_file,
    encoding="iso-8859-1",
)
# Preview the first rows of the raw data
amazonPrimeRaw_df.head()

In [None]:
# Keep only the show name, genre and IMDb rating columns. The .copy() gives
# an independent frame, so later edits do not touch amazonPrimeRaw_df.
amazonPrime_df = amazonPrimeRaw_df.loc[
    :, ["Name of the show", "Genre", "IMDb rating"]
].copy()
amazonPrime_df

In [None]:
# Rename the source headers to Title / User_Rating ("Genre" is already named as needed)
amazonPrime_df = amazonPrime_df.rename(
    {"Name of the show": "Title", "IMDb rating": "User_Rating"},
    axis="columns",
)
amazonPrime_df

In [None]:
# Cut each Genre string at its first space. Column 0 of the expanded result
# holds the leading token and becomes Genre_1; column 1 (the remainder) is
# not carried over.
a_df = amazonPrime_df["Genre"].str.split(pat=" ", n=1, expand=True)

# Add Genre_1 and remove the original Genre column in one chained step
amazonPrime_df = (
    amazonPrime_df
    .assign(Genre_1=a_df[0])
    .drop(columns="Genre")
)
amazonPrime_df

In [None]:
# Remove every comma from Genre_1 (presumably a separator left attached to
# the leading token by the space-based split -- confirm against the raw data)
amazonPrime_df = amazonPrime_df.assign(
    Genre_1=lambda df: df["Genre_1"].str.replace(",", "")
)
amazonPrime_df

In [None]:
# Use the "Sci-Fi" spelling so the label matches the other datasets
amazonPrime_df = amazonPrime_df.assign(
    Genre_1=lambda df: df["Genre_1"].str.replace("Sci-fi", "Sci-Fi")
)
amazonPrime_df

In [None]:
# Drop every row that has a missing value in any column.
# NOTE: `how` and `thresh` are mutually exclusive in DataFrame.dropna; recent
# pandas versions raise TypeError when both are passed explicitly (even as
# thresh=None), so only `how` is given. subset=None is the default and is
# omitted as well.
amazonPrime_df.dropna(axis=0, how="any", inplace=True)
amazonPrime_df

In [None]:
# Display the dtype of each column in the cleaned Amazon Prime frame
amazonPrime_df.dtypes

In [None]:
# Write the cleaned Amazon Prime frame to CSV, with a header row and without
# the index column
amazonPrime_df.to_csv(path_or_buf="amazonPrime_df.csv", header=True, index=False)

### Start of Disney Plus

In [None]:
# Load the raw Disney Plus shows CSV using pandas' default encoding
disneyPlus_file = "Resources/disney_plus_shows.csv"
disneyPlusRaw_df = pd.read_csv(filepath_or_buffer=disneyPlus_file)
# Preview the first rows of the raw data
disneyPlusRaw_df.head()

In [None]:
# Keep only the title, genre and IMDb rating columns. The .copy() gives an
# independent frame, so later edits do not touch disneyPlusRaw_df.
disneyPlus_df = disneyPlusRaw_df.loc[
    :, ["title", "genre", "imdb_rating"]
].copy()
disneyPlus_df

In [None]:
# Rename the source headers to Title / Genre / User_Rating
disneyPlus_df = disneyPlus_df.rename(
    {"title": "Title", "genre": "Genre", "imdb_rating": "User_Rating"},
    axis="columns",
)
disneyPlus_df.head()

In [None]:
# Cut each Genre string at its first space. Column 0 of the expanded result
# holds the leading token and becomes Genre_1; column 1 (the remainder) is
# not carried over.
d_df = disneyPlus_df["Genre"].str.split(pat=" ", n=1, expand=True)

# Add Genre_1 and remove the original Genre column in one chained step
disneyPlus_df = (
    disneyPlus_df
    .assign(Genre_1=d_df[0])
    .drop(columns="Genre")
)
disneyPlus_df

In [None]:
# # Split strings in Genre_2 column
# d2_df = disneyPlus_df["Genre_2"].str.split(" ", n = 1, expand = True)
# disneyPlus_df["Genre_3"] = d2_df[0]
# disneyPlus_df["Genre_4"] = d2_df[1]

# # Drop old Genre_2 and Genre_4 column
# disneyPlus_df.drop(columns = ["Genre_2", "Genre_4"], inplace = True)
# disneyPlus_df

In [None]:
# # Rename columns
# disneyPlus_df = disneyPlus_df.rename(columns = {"Genre_3":"Genre_2"})
# disneyPlus_df.head()

In [None]:
# Remove every comma from Genre_1. No Genre_2 column is produced by the
# active code, so only Genre_1 is cleaned here.
disneyPlus_df = disneyPlus_df.assign(
    Genre_1=lambda df: df["Genre_1"].str.replace(",", "")
)
disneyPlus_df

In [None]:
# Shorten "Reality-TV" to "Reality" so the label matches the other datasets
disneyPlus_df = disneyPlus_df.assign(
    Genre_1=lambda df: df["Genre_1"].str.replace("Reality-TV", "Reality")
)
disneyPlus_df

In [None]:
# Drop every row that has a missing value in any column.
# NOTE: `how` and `thresh` are mutually exclusive in DataFrame.dropna; recent
# pandas versions raise TypeError when both are passed explicitly (even as
# thresh=None), so only `how` is given. subset=None is the default and is
# omitted as well.
disneyPlus_df.dropna(axis=0, how="any", inplace=True)
disneyPlus_df

In [None]:
# Display the dtype of each column in the cleaned Disney Plus frame
disneyPlus_df.dtypes

In [None]:
# Write the cleaned Disney Plus frame to CSV, with a header row and without
# the index column
disneyPlus_df.to_csv(path_or_buf="disneyPlus_df.csv", header=True, index=False)

### Start of Hulu

In [None]:
# Load the raw Hulu shows CSV using pandas' default encoding
hulu_file = "Resources/HuluRaw.csv"
huluRaw_df = pd.read_csv(filepath_or_buffer=hulu_file)
# Preview the first rows of the raw data
huluRaw_df.head()

In [None]:
# Keep only the show name, genre and rating columns. The .copy() gives an
# independent frame, so later edits do not touch huluRaw_df.
hulu_df = huluRaw_df.loc[
    :, ["show/canonical_name", "show/genre", "show/rating"]
].copy()
hulu_df

In [None]:
# Double show/rating for consistency with the User_Rating values of the other
# DataFrames (presumably Hulu uses a 5-point scale -- confirm against the data).
# Scale first, then round: rounding before doubling would double the rounding
# error and leave only even tenths (x.0, x.2, x.4, ...) in the result.
hulu_df["show/rating"] = (2 * hulu_df["show/rating"]).round(decimals=1)
hulu_df

In [None]:
# Rename the source headers to Title / Genre / User_Rating
hulu_df = hulu_df.rename(
    {
        "show/canonical_name": "Title",
        "show/genre": "Genre",
        "show/rating": "User_Rating",
    },
    axis="columns",
)
hulu_df.head()

In [None]:
# Format strings in Title column?

In [None]:
# Cut each Genre string at its first space. Column 0 of the expanded result
# holds the leading token and becomes Genre_1; column 1 (the remainder) is
# not carried over.
h_df = hulu_df["Genre"].str.split(pat=" ", n=1, expand=True)

# Add Genre_1 and remove the original Genre column in one chained step
hulu_df = (
    hulu_df
    .assign(Genre_1=h_df[0])
    .drop(columns="Genre")
)
hulu_df

In [None]:
# Drop Genre_2 column
# hulu_df.drop(columns = ["Genre_2"], inplace = True)
# hulu_df

In [None]:
# Relabel "Science" as "Sci-Fi" to match the other datasets (presumably the
# leading word of a "Science Fiction" genre -- confirm against the raw data)
hulu_df = hulu_df.assign(
    Genre_1=lambda df: df["Genre_1"].str.replace("Science", "Sci-Fi")
)

In [None]:
# Drop every row that has a missing value in any column.
# NOTE: `how` and `thresh` are mutually exclusive in DataFrame.dropna; recent
# pandas versions raise TypeError when both are passed explicitly (even as
# thresh=None), so only `how` is given. subset=None is the default and is
# omitted as well.
hulu_df.dropna(axis=0, how="any", inplace=True)
hulu_df

In [None]:
# Display the dtype of each column in the cleaned Hulu frame
hulu_df.dtypes

In [None]:
# Write the cleaned Hulu frame to CSV, with a header row and without the
# index column
hulu_df.to_csv(path_or_buf="hulu_df.csv", header=True, index=False)

### Start of Netflix

In [None]:
# Load the raw Netflix titles CSV using pandas' default encoding
netflix_file = "Resources/netflix_titles.csv"
netflixRaw_df = pd.read_csv(filepath_or_buffer=netflix_file)
# Preview the first rows of the raw data
netflixRaw_df.head()

In [None]:
# Keep only the title and genre-listing columns (no rating column is selected
# for Netflix). The .copy() gives an independent frame, so later edits do not
# touch netflixRaw_df.
netflix_df = netflixRaw_df.loc[:, ["title", "listed_in"]].copy()
netflix_df

In [None]:
# Rename the source headers to Title / Genre
netflix_df = netflix_df.rename(
    {"title": "Title", "listed_in": "Genre"},
    axis="columns",
)
netflix_df.head()

In [None]:
# Cut each Genre string at its first space. Column 0 of the expanded result
# holds the leading token and becomes Genre_1; column 1 (the remainder) is
# not carried over.
n_df = netflix_df["Genre"].str.split(pat=" ", n=1, expand=True)

# Add Genre_1 and remove the original Genre column in one chained step
netflix_df = (
    netflix_df
    .assign(Genre_1=n_df[0])
    .drop(columns="Genre")
)
netflix_df

In [None]:
# # Split strings in Genre_2 column
# n2_df = netflix_df["Genre_2"].str.split(" ", n = 1, expand = True)
# netflix_df["Genre_3"] = n2_df[0]
# netflix_df["Genre_4"] = n2_df[1]

# # Drop old Genre_2 and Genre_4 column
# netflix_df.drop(columns = ["Genre_2", "Genre_4"], inplace = True)
# netflix_df

In [None]:
# # Rename column
# netflix_df = netflix_df.rename(columns = {"Genre_3":"Genre_2"})
# netflix_df.head()

In [None]:
# Remove every comma from Genre_1. No Genre_2 column is produced by the
# active code, so only Genre_1 is cleaned here.
netflix_df = netflix_df.assign(
    Genre_1=lambda df: df["Genre_1"].str.replace(",", "")
)
netflix_df

In [None]:
# Convert Netflix's plural genre labels to the singular forms used by the
# other datasets. The replacements run in the same order as before.
netflix_df = netflix_df.assign(
    Genre_1=lambda df: (
        df["Genre_1"]
        .str.replace("Dramas", "Drama")
        .str.replace("Comedies", "Comedy")
        .str.replace("Documentaries", "Documentary")
    )
)
netflix_df

In [None]:
# Drop every row that has a missing value in any column.
# NOTE: `how` and `thresh` are mutually exclusive in DataFrame.dropna; recent
# pandas versions raise TypeError when both are passed explicitly (even as
# thresh=None), so only `how` is given. subset=None is the default and is
# omitted as well.
netflix_df.dropna(axis=0, how="any", inplace=True)
netflix_df

In [None]:
# Display the dtype of each column in the cleaned Netflix frame
netflix_df.dtypes

In [None]:
# Write the cleaned Netflix frame to CSV, with a header row and without the
# index column
netflix_df.to_csv(path_or_buf="netflix_df.csv", header=True, index=False)

### Start of Database Connection

In [None]:
# Connect to local database
# rds_connection_string = "<insert user name>:<insert password>@localhost:5432/customer_db"
# engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# Check for tables
# engine.table_names()

In [None]:
# Use pandas to load Amazon Prime Movies DataFrame into database
# new_customer_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=False)

In [None]:
# Use pandas to load Amazon Prime Shows DataFrame into database
# new_customer_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=False)

In [None]:
# Use pandas to load Disney Plus Shows DataFrame into database
# new_customer_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=False)

In [None]:
# Use pandas to load merged Netflix DataFrame into database
# new_customer_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=False)

In [None]:
# Confirm data has been added by querying the Amazon Prime Movies table
# pd.read_sql_query('select * from customer_name', con=engine).head()

In [None]:
# Confirm data has been added by querying the Amazon Prime Shows table
# pd.read_sql_query('select * from customer_name', con=engine).head()

In [None]:
# Confirm data has been added by querying the Disney Plus Shows table
# pd.read_sql_query('select * from customer_name', con=engine).head()

In [None]:
# Confirm data has been added by querying the Netflix table
# pd.read_sql_query('select * from customer_name', con=engine).head()