In [None]:
# Import dependencies: pandas (to read in the CSV) and create_engine from sqlalchemy (to set up our database connection)
import codecs
import getpass
import os
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, inspect

In [None]:
# Load the raw podcast CSV into a DataFrame with pandas, then preview the
# first rows to get a quick look at the data.
podcast_file = "podcasts.csv"
podcast_df = pd.read_csv(filepath_or_buffer=podcast_file)
podcast_df.head(n=5)

In [None]:
# Build a trimmed-down DataFrame: keep only the eventual primary key (uuid)
# plus the sortable, descriptive columns, and leave every other field behind.
podcast_cols = ["uuid", "title", "language", "categories"]

# One chained pipeline, each step returning a new frame:
#   1. select the columns of interest
#   2. rename "uuid" to "id"
#   3. drop rows that repeat an "id" (the first occurrence is kept)
#   4. use "id" as the index
podcast_clean = (
    podcast_df.loc[:, podcast_cols]
    .rename(columns={"uuid": "id"})
    .drop_duplicates(subset="id")
    .set_index("id")
)

# Preview the cleaned frame
podcast_clean.head()

In [None]:
# This project focuses on the intersection of podcasts and streaming (Netflix).
# List the distinct values of the "categories" column to locate the relevant one: "TV & Film".
podcast_clean["categories"].unique()

In [None]:
# Keep only the rows whose "categories" value is exactly "TV & Film"
podcast_tvfilm = podcast_clean[podcast_clean["categories"].eq("TV & Film")]

# Order the result by title (ascending=False, i.e. descending order)
# and preview the first 100 rows
podcast_tvfilm_a = podcast_tvfilm.sort_values("title", ascending=False)
podcast_tvfilm_a.head(100)

In [None]:
# Narrow the TV & Film podcasts down to those whose language is "English" --
# an attempt to clean up the errors seen when loading into MySQL.
podcast_english = podcast_tvfilm_a[podcast_tvfilm_a["language"].eq("English")]

podcast_english

In [None]:
# Testing for case sensitivity: str.contains matches case-sensitively by default,
# so only titles containing exactly "Stranger Things" are returned.
# na=False makes rows with a missing title count as "no match"; without it the
# mask would contain NaN for those rows and .loc would raise on the non-boolean mask.
podcast_find = podcast_tvfilm_a.loc[
    podcast_tvfilm_a["title"].str.contains("Stranger Things", na=False)
]
podcast_find

In [None]:
# Set up the SQLAlchemy engine for the local MySQL database "podcasts_db" (user: root).
# The password is taken from the MYSQL_PASSWORD environment variable, falling back to an
# interactive prompt, so it never has to be typed into (and saved with) the notebook.
mysql_password = os.environ.get("MYSQL_PASSWORD") or getpass.getpass("MySQL password for root: ")

# Percent-encode the password before placing it in the URL: characters such as
# "@", "/" or ":" would otherwise be parsed as URL delimiters.
connection_string = f"root:{quote(mysql_password, safe='')}@localhost/podcasts_db"

# NOTE(review): if titles with non-ASCII characters fail to load, appending
# "?charset=utf8mb4" to the URL may help -- confirm against the server configuration.
engine = create_engine(f"mysql://{connection_string}")

In [None]:
# Display the table names in the database as a quick connection test.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list and works across versions.
inspect(engine).get_table_names()

In [None]:
# Isolate the "title" column in its own DataFrame so it can be loaded
# by itself when testing for errors.
titles = ["title"]
titles_df = podcast_english.loc[:, titles].copy()

titles_df

In [None]:
# Diagnostic subset: only the "language" and "categories" columns, used to
# narrow down where the earlier MySQL load errors came from.
# These columns load into MySQL successfully, which makes it clear the issue
# is with the title column.
test_podcast_cols = ["language", "categories"]
test_podcast_clean = podcast_tvfilm_a.loc[:, test_podcast_cols].copy()

test_podcast_clean

In [None]:
# Send the DataFrame to MySQL: rows are appended to the named table
# and the DataFrame index is written out as a column.
titles_df.to_sql(
    "TABLE NAME HERE",
    con=engine,
    if_exists="append",
    index=True,
)