# Scraping iTunes podcast information

In [None]:
from bs4 import BeautifulSoup
import requests
from string import ascii_uppercase
import pickle
import pandas as pd
import numpy as np
import pymongo
import json

## Scrape podcast genres from apple
We first get the list of genres and their links from the iTunes homepage.

In [None]:
# Download the Apple Podcasts genre overview page, parse its HTML and pick out
# the elements carrying the `top-level-genre` class (the main genres).
genre_overview_url = 'https://podcasts.apple.com/us/genre/podcasts/id26'
page = requests.get(genre_overview_url)
soup = BeautifulSoup(page.content, features='html.parser')
main_category = soup.select('.top-level-genre')

In [None]:
# Build a lookup from genre name to genre link. The name is the part of the
# element's `title` attribute in front of the first " - ".
main_cat_dict = {
    anchor['title'].split(" - ")[0]: anchor['href']
    for anchor in main_category
}

In [None]:
# show the genre name -> link dictionary
main_cat_dict

## Scraping the genres for podcast links and IDs
Cool, we now have all the genres of the podcasts that are listed on the podcast page. We now want to traverse the pages and collect the podcast links and their IDs, so that we can later use the links to gather additional information about every podcast. There are some duplicates among the IDs; we will investigate and resolve this later. For now we simply go through all the pages of a genre and a letter. If there are no more pages, the page will not show a 'next' link. We use this to break our loop and move on to the next letter or genre.

In [None]:
# Walk every genre, every first-letter listing (A-Z plus "*") and every result
# page of that listing, collecting the link and the id of each podcast found.
# Result: podcast_genre_dict[genre name] = {'links': [...], 'ids': [...]}
podcast_genre_dict = {}


for key, genre in main_cat_dict.items():

    podcast_link_list = []
    podcast_id_list = []

    for letter in ascii_uppercase + "*":

        page = 1

        print("genre: ", key, "letter :", letter)

        # keep requesting pages until one no longer offers a 'next' link
        while True:

            page_string = genre + "?letter=" + letter + "&page=" + str(page)
            subpage = requests.get(page_string)

            soup_subpage = BeautifulSoup(subpage.content, 'html.parser')

            podcasts = soup_subpage.select('#selectedcontent ul>li a')

            for podcast in podcasts:

                podcast_link = podcast['href']
                link_parts = podcast_link.split("/")
                # there are podcasts with an empty title string; for those the
                # link has no 7th "/"-separated part, so the 6th one is used
                try:
                    podcast_id = link_parts[6]
                except IndexError:
                    podcast_id = link_parts[5]

                podcast_id_list.append(podcast_id)
                podcast_link_list.append(podcast_link)

            # check if there is a next link
            next_link = soup_subpage.select('.paginate-more')

            # there is a next link on top and at the bottom of every page,
            # so fewer than two matches means this was the last page
            if len(next_link) < 2:
                print("letter: ", letter, "page :", page)
                break
            page = page + 1

    # store results for one genre in dictionary
    podcast_genre_dict[key] = {'links': podcast_link_list, 'ids': podcast_id_list}

In [None]:
# count the scraped podcast links per genre, then total them over all genres
genre_numbers = [len(scraped['links']) for scraped in podcast_genre_dict.values()]
sum(genre_numbers)

## Deal with duplicates
There are multiple duplicates in the data, which we do not want in the final data set.

In [None]:
# For every genre, put ids and links side by side in a two-column frame
# ('id', 'link') and keep one copy of each distinct row.
genres_dedup_frames = {}
for key, scraped in podcast_genre_dict.items():

    id_column = pd.Series(scraped['ids'])
    link_column = pd.Series(scraped['links'])

    # the two series are stacked as rows first, so transpose to get columns
    frame = pd.DataFrame([id_column, link_column]).T
    frame.columns = ['id', 'link']

    genres_dedup_frames[key] = frame.drop_duplicates()

In [None]:
# rows left per genre after de-duplication, and how many entries were dropped in total
dedup_genre_numbers = [len(deduped) for deduped in genres_dedup_frames.values()]
sum(genre_numbers) - sum(dedup_genre_numbers)

In [None]:
# what is left? -> total number of rows over all de-duplicated genre frames
sum(dedup_genre_numbers)

In [None]:
# show the per-genre row counts after de-duplication
dedup_genre_numbers

In [None]:
# persist the dictionary of de-duplicated genre frames to disk as a pickle
with open('dedup_podcasts.pickle', 'wb') as pickle_file:
    pickle.dump(genres_dedup_frames, pickle_file)

## Get more information from iTunes
So we have a list of unique identifiers/links of podcasts. We will now crawl through every podcast page and extract additional info. We are only interested in "active" podcasts. A podcast is defined as active (by me) if at least one episode has been published since January 1, 2019. We use the fact that every podcast page shows the latest episode on top. Scraping the podcast information takes really long, so here we focus on a very small subset of the data in the True Crime genre. The rest of the scraping will be done in a simple Python script that can then run for several days in the background.

In [None]:
# work on the de-duplicated id/link frame of the 'True Crime' genre only
genre_frame = genres_dedup_frames['True Crime']

In [None]:
# (rows, columns) of the selected genre frame
genre_frame.shape

In [None]:
# Visit the page of every podcast in `genre_frame` and keep the "active" ones,
# i.e. those whose latest episode is dated 2019 or later.
#
# Results:
#   casts        podcast id -> array [id, title, description, link, artist, genre]
#   artist_names artist id  -> artist name (only for artists that have a link)
#   artist_links artist id  -> artist link
#   failed       podcast id -> link of every page that could not be parsed
casts = {}
pc_artist_names = {}  # not written to in this cell; kept so the name stays defined
# These three must exist before the loop: writing to an undefined name raises
# NameError, which would either be swallowed (artists) or escape (failed).
artist_names = {}
artist_links = {}
failed = {}

for _, row in genre_frame.iterrows():
    link = row['link']
    pc_id = row['id']

    # download and parse the podcast page
    pc_page = requests.get(link)
    soup_pc = BeautifulSoup(pc_page.content, 'html.parser')

    # first check if an episode was published in 2019 or later
    # information is hidden in the third "l-row" div (refers to latest episode)
    # there are some discrepancies between episodes listed in the overview page
    # and entries that actually have pages, hence the catch-all at the bottom
    try:
        latest_episode_div = soup_pc.find_all('div', class_="l-row")[2]

        latest_year = int(latest_episode_div.div.time['datetime'].split("-")[0])

        if latest_year < 2019:
            # inactive podcast: neither stored nor counted as a failure
            continue

        # find genre, artist and artist id
        div_class_of_interest = soup_pc.find_all('div', class_="l-column small-7 medium-12 small-valign-top")
        header_div = div_class_of_interest[0]

        # not all artists have an id and/or an artist page
        try:
            artist_string = header_div.h1.a.contents
            artist_string = ' '.join(artist_string[0].split())
            artist_link = header_div.h1.a['href']

            link_parts = artist_link.split("/")
            try:
                artist_id = link_parts[6]
            except IndexError:
                artist_id = link_parts[5]

            # add it to artist dicts
            artist_names[artist_id] = artist_string
            artist_links[artist_id] = artist_link

        except (AttributeError, IndexError, KeyError, TypeError):
            # no usable artist link: only get the artist name for the podcast entry
            artist_string = header_div.h1.contents[1].contents
            artist_string = ' '.join(artist_string[0].split())

        # get genre
        genre_string = header_div.li.li.contents
        genre_string = ' '.join(genre_string[0].split())

        # get title, description out of soup object and cut out the
        # left-to-right mark (U+200E)
        title = soup_pc.find("meta", {"name": "apple:title"})['content'].replace('\u200E', '')
        description = soup_pc.find("meta", {"name": "apple:description"})['content'].replace('\u200E', '')

        casts[pc_id] = np.array([pc_id, title, description, link, artist_string, genre_string])

    except Exception:
        # best effort: report the page and remember it, then move on.
        # `Exception` (not a bare except) so that Ctrl-C still stops the loop.
        print(link)
        failed[pc_id] = link
    
    

In [None]:
# One row per stored podcast. The column names are passed to the constructor
# so that an empty `casts` still yields an (empty) frame with all six columns
# instead of failing on a column-count mismatch.
genre_data_frame = pd.DataFrame(
    list(casts.values()),
    columns=['itunes_id', 'name', 'description', 'itunes_link', 'artist_name', 'genre'],
)

In [None]:
# add a new column rss_feed filled entirely with NaN; the feeds are to be retrieved later
genre_data_frame["rss_feed"] = np.nan

In [None]:
# preview the first rows of the assembled frame
genre_data_frame.head()

In [None]:
# save as csv file (no `index` argument is passed, so pandas' default index handling applies)
genre_data_frame.to_csv("true_crime.csv")

## Adding the data into MongoDB Atlas database
In the next step we upload the data into a MongoDB database hosted by MongoDB Atlas. At this point the free-tier cluster is sufficient.

In [None]:
# load login credentials from different place for obvious reasons
# NOTE: unpickling can execute arbitrary code - only load a file you created yourself.
# The `with` block makes sure the file handle is closed again after loading.
with open('mongo_db_login_string.pickle', "rb") as credentials_file:
    mongo_db_login_string = pickle.load(credentials_file)

In [None]:
# create the MongoDB client from the loaded login string
client = pymongo.MongoClient(mongo_db_login_string)

In [None]:
# create new database named podcasts (selected via attribute access on the client)
db = client.podcasts

In [None]:
# create new collection named podcast (inside the podcasts database) to insert our table data into
podcasts = db.podcast

In [None]:
# Serialise the transposed frame to JSON, parse it back into plain Python
# objects and keep only the values of the resulting top-level object; these
# are the documents handed to the database.
transposed_frame = genre_data_frame.T
parsed_rows = json.loads(transposed_frame.to_json())
podcasts_json = parsed_rows.values()

In [None]:
# insert all podcast documents into the collection in a single call
podcasts.insert_many(podcasts_json)

Yay!