In [2]:
# Import dependencies for pandas (read in the CSV) and create engine from sqlalchemy to set up our database
import pandas as pd
from sqlalchemy import create_engine
import codecs
import requests
import json
from pprint import pprint
from config import api_key

In [3]:
# Import episodes csv using the pandas read_csv function, display dataframe head to get a quick look at the data
episodes_file = "episodes.csv"
episodes_df = pd.read_csv(episodes_file)
episodes_df.head()

Unnamed: 0,title,audio,audio_length,description,pub_date,uuid,podcast_uuid
0,Piątek - 01 grudnia,https://cdneu.modlitwawdrodze.pl/prayers/MWD_2...,490,"święci męczennicy jezuiccy Edmund Campion SJ, ...",2017-12-01 00:00:00+00,fd5d891411174c7ca953c1f54657c3eb,811c18cf575841b3bef4601978f17ca9
1,Sobota - 02 grudnia,https://cdneu.modlitwawdrodze.pl/prayers/MWD_2...,481,"bł. Rafał Chyliński, prezbiter, Łk 21, 34-36",2017-12-02 00:00:00+00,5c28fa0a27b342cd92ff03c16a8019c2,811c18cf575841b3bef4601978f17ca9
2,Niedziela - 03 grudnia,https://cdneu.modlitwawdrodze.pl/prayers/MWD_2...,667,"Pierwsza Niedziela Adwentu, Mk 13, 33-37",2017-12-03 00:00:00+00,efdc9f4f07fa4c4883f8848256066cec,811c18cf575841b3bef4601978f17ca9
3,Introduction to Luke,http://www.wgcr.net/images/TimelessTruths/TTT-...,1691,Luke 1:1-4 -,2017-12-03 11:30:05+00,cc2860165fa84d1092f6b45f19255a87,36ed4e62dcd94412a5211cc9bd76ba7c
4,"Dear Science: Lightning, Dead Cats and Hand Sa...",http://95bfm.com/sites/default/files/291117_De...,1152,<p>Today on Dear Science with AUT's Allan Blac...,2017-12-27 11:00:00+00,69bd409e0469433581ccc76cf7b664ad,fa36a26a1879453f95da1379c737cd6d


In [None]:
# Import episodes csv using the pandas read_csv function, display dataframe head to get a quick look at the data
# podcast_file = "podcasts.csv"
# podcast_df = pd.read_csv(podcast_file)
# podcast_df.head()

# Work on episodes.csv

In [5]:
# Create a filtered episodes dataframe from specific columns 
# Here we are only really interested the eventual primary key (podcast_uuid) and sortable, descriptive columns 
# Filtering out extraneous fields
episodes_cols = ["title", "description", "podcast_uuid", "uuid"]
episodes_transformed = episodes_df[episodes_cols].copy()

# Rename the column headers
episodes_transformed = episodes_transformed.rename(columns={"title": "title", 
                                                            "description": "description",
                                                            "uuid": "episode_id",
                                                          "podcast_uuid": "podcast_id"})

# Clean the data by dropping duplicates and setting the index
# Display the head of our new dataframe
episodes_transformed.drop_duplicates("episode_id", inplace=True)
episodes_transformed.set_index("episode_id", inplace=True)

episodes_transformed.head()

Unnamed: 0_level_0,title,description,podcast_id
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fd5d891411174c7ca953c1f54657c3eb,Piątek - 01 grudnia,"święci męczennicy jezuiccy Edmund Campion SJ, ...",811c18cf575841b3bef4601978f17ca9
5c28fa0a27b342cd92ff03c16a8019c2,Sobota - 02 grudnia,"bł. Rafał Chyliński, prezbiter, Łk 21, 34-36",811c18cf575841b3bef4601978f17ca9
efdc9f4f07fa4c4883f8848256066cec,Niedziela - 03 grudnia,"Pierwsza Niedziela Adwentu, Mk 13, 33-37",811c18cf575841b3bef4601978f17ca9
cc2860165fa84d1092f6b45f19255a87,Introduction to Luke,Luke 1:1-4 -,36ed4e62dcd94412a5211cc9bd76ba7c
69bd409e0469433581ccc76cf7b664ad,"Dear Science: Lightning, Dead Cats and Hand Sa...",<p>Today on Dear Science with AUT's Allan Blac...,fa36a26a1879453f95da1379c737cd6d


In [6]:
# Drop NA values
episodes_final = episodes_transformed.dropna()

episodes_final.head()

Unnamed: 0_level_0,title,description,podcast_id
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
fd5d891411174c7ca953c1f54657c3eb,Piątek - 01 grudnia,"święci męczennicy jezuiccy Edmund Campion SJ, ...",811c18cf575841b3bef4601978f17ca9
5c28fa0a27b342cd92ff03c16a8019c2,Sobota - 02 grudnia,"bł. Rafał Chyliński, prezbiter, Łk 21, 34-36",811c18cf575841b3bef4601978f17ca9
efdc9f4f07fa4c4883f8848256066cec,Niedziela - 03 grudnia,"Pierwsza Niedziela Adwentu, Mk 13, 33-37",811c18cf575841b3bef4601978f17ca9
cc2860165fa84d1092f6b45f19255a87,Introduction to Luke,Luke 1:1-4 -,36ed4e62dcd94412a5211cc9bd76ba7c
69bd409e0469433581ccc76cf7b664ad,"Dear Science: Lightning, Dead Cats and Hand Sa...",<p>Today on Dear Science with AUT's Allan Blac...,fa36a26a1879453f95da1379c737cd6d


In [7]:
# Sort the data by titles
episodes_final_sorted = episodes_final.sort_values(by=['title'], ascending = True)

episodes_final_sorted.head()

Unnamed: 0_level_0,title,description,podcast_id
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
07e2cd89a7244383a6c8fb8520676f5b,! Episode 1: Too-Good-For-Human-Form Tobias,<p>This week our hosts dive hawk-first into th...,584c7d1e2208418d9ec0efea10197ad6
9a5405870f8e4a21a39f8c55a5b098c3,! GRACIAS A VOSOTROS ! FELIZ 2018,GRACIAS A TODOS.,fc31d46e3e464e9ca043ac1a61e0747d
f1b75bb5e12b49e394cae03d398ff2a0,! GRACIAS A VOSOTROS ! FELIZ 2018,GRACIAS A TODOS.,1bd453973e764a19933fcfa62d3c1678
380313c52ca046aab62177a06fc7615e,!!!!!!!!FREE DOWNLOAD SIMPLE !!!!!!!!!!!!!!!!!,!!!!!!!!FREE DOWNLOAD SIMPLE !!!!!!!!!!!!!!!!!...,64101a4a5eb54f47afa3a7120a4d32db
fe4c70f40fd64fefba3c2764ebc59b14,!!Special Announcement!! & Short Interview Mix,"<p><img class=""size-medium wp-image-723 alignl...",d86b2ff8c4084bb4857398e5856a9d25


In [37]:
# Testing for case sensitivity
episode_find = episodes_final_sorted.loc[episodes_final_sorted['title'].str.contains('Stranger Things')]

episode_find.head()

Unnamed: 0_level_0,title,description,podcast_id
episode_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
e33741cfbfec4b3194c8087bd9446b1b,#019 Más Stranger Things,<p><strong>SerialMe! 019.</strong> Más Strange...,c31e82449da64db8b3afdc2d7ba484a9
081864775e2c4cf1bce56ecc17cc9f0f,#019 Más Stranger Things,<p><strong>SerialMe! 019.</strong> Más Strange...,30d4ca6df1884a66874bf764d8332678
f6e768d5c9a248829a6766d072d459a8,#039 - Especial “Stranger Things 2”,Promessa é dívida! Se toda dívida fosse tão le...,5bc0a4b912704322bf72cdb52dfa996e
cbaa96fd37794cb486406071aecd2586,#10 - Crisis on Earth-X and Stranger Things 2,<br />\nWelcome to Culture Pop Remix #10!<br /...,769d7995433b483abb290371df034447
be909ceaf8604e1380b0293933cafcf4,#142 - Stranger Things,"<img class=""thumb-image"" alt=""142 Stranger Thi...",85c2596c25894cbbb2a5b96e75feeb72


# Work on podcasts.csv

In [None]:
# # Create a filtered dataframe from specific columns 
# # Here we are only really interested the eventual primary key (uuid) and sortable, descriptive columns 
# # Filtering out extraneous fields
podcast_cols = ["uuid", "title", "language", "categories"]
podcast_clean= podcast_df[podcast_cols].copy()

# # Rename the column headers
podcast_clean = podcast_clean.rename(columns={"uuid": "id"})

# # Clean the data by dropping duplicates and setting the index
# # Display the head of our new dataframe
podcast_clean.drop_duplicates("id", inplace=True)
podcast_clean.set_index("id", inplace=True)

podcast_clean.head()

In [None]:
# # Our project will focus on the intersection of podcasts and streaming (Netfilx)
# # Display unique values for the categories column and locate what is relevant: "TV & Film"
podcast_clean.categories.unique()

In [None]:
# # Filter the dataframe to display only results mathing "TV & Film" in the "categories" column
podcast_tvfilm = podcast_clean.loc[podcast_clean['categories'] == 'TV & Film']

# # Sort the data by title
podcast_tvfilm_a = podcast_tvfilm.sort_values(by=['title'], ascending = False)
podcast_tvfilm_a.head()


In [None]:
# # Filtered the dataframe by English only podcasts, attempting to clean up any errors in loading to mysql
podcast_english = podcast_tvfilm_a.loc[podcast_tvfilm_a['language'] == 'English']

podcast_english.head()

In [None]:
# # Testing for case sensitivity
podcast_find = podcast_tvfilm_a.loc[podcast_tvfilm_a['title'].str.contains('Stranger Things')]
podcast_find

# Connect OMDb API

In [9]:
# Import Netflix_Shows.csv 
file = "netflix_shows_.csv"
netflix_data = pd.read_csv(file)
netflix_data.head()

Unnamed: 0,title,rating,ratingLevel,ratingDescription,release year,user rating score,user rating size
0,White Chicks,PG-13,"crude and sexual humor, language and some drug...",80,2004,82.0,80
1,Lucky Number Slevin,R,"strong violence, sexual content and adult lang...",100,2006,,82
2,Grey's Anatomy,TV-14,Parents strongly cautioned. May be unsuitable ...,90,2016,98.0,80
3,Prison Break,TV-14,Parents strongly cautioned. May be unsuitable ...,90,2008,98.0,80
4,How I Met Your Mother,TV-PG,Parental guidance suggested. May not be suitab...,70,2014,94.0,80


In [10]:
# Select only the 'title' of Netflix tv shows
# Convert series into a list
netflix_titles = list(netflix_data['title'])
netflix_titles[0:5]

['White Chicks',
 'Lucky Number Slevin',
 "Grey's Anatomy",
 'Prison Break',
 'How I Met Your Mother']

In [11]:
# Create empty list of OMDb description responses of netflix titles
responses = []
base_url = f"http://www.omdbapi.com/?apikey={api_key}&t="

# Loop through netflix_titles
# Exclude title '3%' because creates an error with title in API url call
for title in [x for x in netflix_titles if x != '3%']:
    
#   Create request for each title
    omdb_response = requests.get(base_url + title).json()

#   Append omdb_response to main list of responses
    responses.append(omdb_response)

In [12]:
# Show length of list responses
# Does not include show '3%'
len(responses)

999

In [13]:
# Check first 6 responses from OMDb for Netflix Titles
responses[0]

{'Title': 'White Chicks',
 'Year': '2004',
 'Rated': 'PG-13',
 'Released': '23 Jun 2004',
 'Runtime': '109 min',
 'Genre': 'Comedy, Crime',
 'Director': 'Keenen Ivory Wayans',
 'Writer': 'Keenen Ivory Wayans (screenplay), Shawn Wayans (screenplay), Marlon Wayans (screenplay), Andrew McElfresh (screenplay), Michael Anthony Snowden (screenplay), Xavier Cook (screenplay), Keenen Ivory Wayans (story), Shawn Wayans (story), Marlon Wayans (story)',
 'Actors': 'Shawn Wayans, Marlon Wayans, Jaime King, Frankie Faison',
 'Plot': 'Two disgraced FBI agents go way undercover in an effort to protect hotel heiresses the Wilson Sisters from a kidnapping plot.',
 'Language': 'English',
 'Country': 'USA',
 'Awards': '3 wins & 13 nominations.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTY3OTg2OTM3OV5BMl5BanBnXkFtZTYwNzY5OTA3._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '5.5/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '15%'},
  {'Source': 'Metacritic', 'Value'

In [14]:
# Saved 'responses' to a json file
with open('responses.json', 'w') as outfile:
    json.dump(responses, outfile)

In [15]:
# Use method read_json to convert responses.json to pd DF
all_omdb_categories = pd.read_json("responses.json")

print(f"The length of all_omdb_dategories DF is {len(all_omdb_categories)}")
all_omdb_categories.head()

The length of all_omdb_dategories DF is 999


Unnamed: 0,Actors,Awards,BoxOffice,Country,DVD,Director,Error,Genre,Language,Metascore,...,Runtime,Title,Type,Website,Writer,Year,imdbID,imdbRating,imdbVotes,totalSeasons
0,"Shawn Wayans, Marlon Wayans, Jaime King, Frank...",3 wins & 13 nominations.,"$69,148,997",USA,26 Oct 2004,Keenen Ivory Wayans,,"Comedy, Crime",English,41.0,...,109 min,White Chicks,movie,http://www.sonypictures.com/movies/whitechicks,"Keenen Ivory Wayans (screenplay), Shawn Wayans...",2004,tt0381707,5.5,115904,
1,"Josh Hartnett, Bruce Willis, Lucy Liu, Morgan ...",5 wins & 4 nominations.,,"Germany, UK, USA, Canada",26 Sep 2006,Paul McGuigan,,"Crime, Drama, Mystery, Thriller",English,53.0,...,110 min,Lucky Number Slevin,movie,,Jason Smilovic,2006,tt0425210,7.8,285907,
2,"Ellen Pompeo, Justin Chambers, Chandra Wilson,...",Won 2 Golden Globes. Another 67 wins & 209 nom...,,USA,,,,"Drama, Romance",English,,...,41 min,Grey's Anatomy,series,,Shonda Rhimes,2005–,tt0413573,7.6,205650,15.0
3,"Dominic Purcell, Wentworth Miller, Robert Knep...",Nominated for 2 Golden Globes. Another 4 wins ...,,"UK, USA",,,,"Action, Crime, Drama, Mystery, Thriller","Spanish, English",,...,44 min,Prison Break,series,,Paul Scheuring,2005–2017,tt0455275,8.4,421553,5.0
4,"Josh Radnor, Jason Segel, Cobie Smulders, Neil...",Nominated for 2 Golden Globes. Another 25 wins...,,USA,,,,"Comedy, Romance","English, Persian, Chinese",,...,22 min,How I Met Your Mother,series,,"Carter Bays, Craig Thomas",2005–2014,tt0460649,8.3,527514,9.0


In [28]:
# Edit all_omdb_dategories to choose select categories
netflix_omdb_key_categories = all_omdb_categories.filter(['Title','Type',
                                                  'Genre', 'imdbRating',
                                                  'imdbID','Language',
                                                  'Country','totalSeasons'], axis=1)
netflix_omdb_key_categories.head()

nf = netflix_omdb_key_categories.sort_values(by=['imdbRating'], ascending = False)
nf.head(100)

df = nf[nf.imdbRating != "N/A"]


top_five_shows = df.drop_duplicates(subset='Title', keep="last").copy()[0:5]
top_five_shows

Unnamed: 0,Title,Type,Genre,imdbRating,imdbID,Language,Country,totalSeasons
6,Breaking Bad,series,"Crime, Drama, Thriller",9.5,tt0903747,"English, Spanish",USA,5
418,Sherlock,series,"Crime, Drama, Mystery, Thriller",9.2,tt1475582,English,"UK, USA",4
12,Death Note,series,"Animation, Crime, Drama, Fantasy, Mystery, Thr...",9.0,tt0877057,Japanese,Japan,1
83,Stranger Things,series,"Drama, Fantasy, Horror, Mystery, Sci-Fi, Thriller",8.9,tt4574334,English,USA,3
557,Justice League Unlimited,series,"Animation, Action, Adventure",8.9,tt6025022,English,USA,2


In [36]:
#Loop

matched_netflix_titles = []

for ep_desc in episodes_final['description']:
    for title in netflix_titles:
        match = str(ep_desc).find(title)
        if match == -1:
            matched_netflix_titles.append(title)
matched_netflix_titles

['White Chicks',
 'Lucky Number Slevin',
 "Grey's Anatomy",
 'Prison Break',
 'How I Met Your Mother',
 'Supernatural',
 'Breaking Bad',
 'The Vampire Diaries',
 'The Walking Dead',
 'Pretty Little Liars',
 'Once Upon a Time',
 'Sherlock',
 'Death Note',
 'Naruto',
 'The Hunter',
 'Lottie Dottie Chicken',
 'Arrow',
 'Black Mirror',
 'The Originals',
 'The 100',
 'Masha and the Bear',
 'ÌÒ Pai, ÌÒ - Look at This',
 'Nymphomaniac: Volume 1',
 'Hunter X Hunter (2011)',
 "Marvel's Luke Cage",
 "Marvel's Iron Fist",
 'Narcos',
 'The Flash',
 'The Seven Deadly Sins',
 'Scream',
 'The Do-Over',
 'OperaÌ_Ìµes Especiais',
 '3%',
 'Trollhunters',
 'Fearless',
 'Hyena Road',
 'Santa Clarita Diet',
 'Sex, Love and Therapy',
 'White Girl',
 'Girlboss',
 'City of God: 10 Years Later',
 '13 Reasons Why',
 'Sand Castle',
 'Salvador Martinha - Tip of the Tongue',
 'Sandy Wexler',
 'O Amor no DivÌ£',
 'Stronger Than the World',
 'Back and Forth',
 'Happily Married',
 'Pandora',
 'Lucky Number Slevin',
 

In [38]:
len(matched_netflix_titles)

792521319

# Prepare engine and send data to mysql

In [None]:
# Setting up connection to mysql workbench
connection_string = "root:<enter your password>@localhost/podcasts_db?charset=utf8"
engine = create_engine(f'mysql://{connection_string}')

In [None]:
# Display table names in data base, testing connection
engine.table_names()

In [None]:
# Send data frames into mysql
episodes_final_sorted.to_sql(name='episodes', con=engine, if_exists='append', index=True)

In [None]:
# Send data frames into mysql
podcast_tvfilm_a.to_sql(name='podcasts', con=engine, if_exists='append', index=True)