# Import Dependencies

In [4]:
import pandas as pd
from sqlalchemy import create_engine
import os

In [5]:
# Folder names (relative to the working directory) used for exported
# images and resource files.
outputImagePath, outputResourcePath = (
    os.path.join(folder) for folder in ("Images", "Resources")
)

# Extract the Resource Files

In [14]:
# Load the raw movie catalogue from the Resources folder.
movies_csv_path = os.path.join("Resources", "Movies.csv")
movie_df = pd.read_csv(movies_csv_path, sep=",")

# Preview the first rows to confirm the columns parsed as expected.
movie_df.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [8]:
# Load the raw TV show catalogue from the Resources folder.
tv_shows_csv_path = os.path.join("Resources", "tv_shows.csv")
tvShow_df = pd.read_csv(tv_shows_csv_path, sep=",")

# Preview the first rows to confirm the columns parsed as expected.
tvShow_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,1
1,1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,1
2,2,Money Heist,2017,18+,8.4,91%,1,0,0,0,1
3,3,Sherlock,2010,16+,9.1,78%,1,0,0,0,1
4,4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,1


In [22]:
# Row counts of the two raw frames (one row per catalogue entry).
num_movies = movie_df.shape[0]
num_tvShows = tvShow_df.shape[0]

print(f"Total\nMovies: {num_movies}\nTV Shows: {num_tvShows}")

Total
Movies: 16744
TV Shows: 5611


In [21]:
# Count distinct titles in each frame so they can be compared with the raw
# row totals; a lower number here means some titles appear more than once.
unique_movies = len(pd.unique(movie_df['Title']))
unique_tvShows = len(pd.unique(tvShow_df['Title']))

# Report the unique counts computed above. The earlier version printed
# num_movies / num_tvShows here, which merely repeated the row totals.
print(f"Unique\nMovies: {unique_movies}\nTV Shows: {unique_tvShows}")

Unique
Movies: 16744
TV Shows: 5611


# Clean Up the Data

## Movie Data

In [12]:
# Swap the numeric Type flag for a readable label (0 -> movie, 1 -> tv show).
# Re-running is harmless: once relabelled, no 0/1 values remain to replace.
movie_type_labels = {0: 'movie', 1: 'tv show'}
movie_df['Type'] = movie_df['Type'].replace(movie_type_labels)

movie_df.head()

Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,1,Inception,2010,13+,8.8,87%,1,0,0,0,movie,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,2,The Matrix,1999,18+,8.7,87%,1,0,0,0,movie,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,3,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,movie,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,4,Back to the Future,1985,7+,8.5,96%,1,0,0,0,movie,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,5,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,movie,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


In [24]:
# Drop the ID column for pairing; the result is a new frame and movie_df
# itself is left untouched.
# NOTE(review): this cell reads movie_df as it currently stands, so run it
# after the Type relabel cell above; otherwise Type stays numeric here.
clean_movie_df = movie_df.drop(labels=['ID'], axis='columns')

clean_movie_df.head()

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type,Directors,Genres,Country,Language,Runtime
0,Inception,2010,13+,8.8,87%,1,0,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0
1,The Matrix,1999,18+,8.7,87%,1,0,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0
2,Avengers: Infinity War,2018,13+,8.5,84%,1,0,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0
3,Back to the Future,1985,7+,8.5,96%,1,0,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0
4,"The Good, the Bad and the Ugly",1966,18+,8.8,97%,1,0,1,0,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0


## TV Show Data

In [9]:
# Swap the numeric type flag for a readable label (0 -> movie, 1 -> tv show).
# Re-running is harmless: once relabelled, no 0/1 values remain to replace.
tv_type_labels = {0: 'movie', 1: 'tv show'}
tvShow_df['type'] = tvShow_df['type'].replace(tv_type_labels)

tvShow_df.head()

Unnamed: 0.1,Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,tv show
1,1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,tv show
2,2,Money Heist,2017,18+,8.4,91%,1,0,0,0,tv show
3,3,Sherlock,2010,16+,9.1,78%,1,0,0,0,tv show
4,4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,tv show


In [33]:
# Remove the 'Unnamed: 0' id column; the result is a new frame and
# tvShow_df itself is left untouched.
clean_tvShow_df = tvShow_df.drop(labels=['Unnamed: 0'], axis='columns')

clean_tvShow_df.head()

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,type
0,Breaking Bad,2008,18+,9.5,96%,1,0,0,0,tv show
1,Stranger Things,2016,16+,8.8,93%,1,0,0,0,tv show
2,Money Heist,2017,18+,8.4,91%,1,0,0,0,tv show
3,Sherlock,2010,16+,9.1,78%,1,0,0,0,tv show
4,Better Call Saul,2015,18+,8.7,97%,1,0,0,0,tv show


# Combine the DataFrames

# Transform DataFrames