# Popular podcasts on iTunes
In this notebook, I collect the iTunes IDs of all the podcasts listed on the popular-podcasts pages of the iTunes website. There are 19 main categories and 102 sub-categories. In total, I find about 22,000 unique podcasts and save them in a pandas DataFrame.

In [1]:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
import numpy as np
import subprocess
from dateutil.parser import parse
import datetime
import re

In [2]:
# Get the URL of each popular category: the Arts genre page links to every
# top-level genre.
# NOTE: the HTTP response is deliberately not called `re`; that name would
# shadow the standard-library `re` module imported at the top of the notebook.
response = requests.get(
    url='https://podcasts.apple.com/us/genre/podcasts-arts/id1301',
    timeout=30,  # do not hang forever if the server never answers
)
response.raise_for_status()  # fail fast on an HTTP error instead of parsing an error page

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(response.text, "html.parser")
classes = soup.find_all('ul', attrs={"class": "list column first"})[0].\
    find_all('a', attrs={"class": "top-level-genre"})

# Mapping: genre name -> genre page URL.
genre_links = {pod_class.text: pod_class['href'] for pod_class in classes}

In [3]:
# Display the mapping from genre name to genre page URL.
genre_links

{'Arts': 'https://podcasts.apple.com/us/genre/podcasts-arts/id1301',
 'Business': 'https://podcasts.apple.com/us/genre/podcasts-business/id1321',
 'Comedy': 'https://podcasts.apple.com/us/genre/podcasts-comedy/id1303',
 'Education': 'https://podcasts.apple.com/us/genre/podcasts-education/id1304',
 'Fiction': 'https://podcasts.apple.com/us/genre/podcasts-fiction/id1483',
 'Government': 'https://podcasts.apple.com/us/genre/podcasts-government/id1511',
 'Health & Fitness': 'https://podcasts.apple.com/us/genre/podcasts-health-fitness/id1512',
 'History': 'https://podcasts.apple.com/us/genre/podcasts-history/id1487',
 'Kids & Family': 'https://podcasts.apple.com/us/genre/podcasts-kids-family/id1305',
 'Leisure': 'https://podcasts.apple.com/us/genre/podcasts-leisure/id1502',
 'Music': 'https://podcasts.apple.com/us/genre/podcasts-music/id1310',
 'News': 'https://podcasts.apple.com/us/genre/podcasts-news/id1489',
 'Religion & Spirituality': 'https://podcasts.apple.com/us/genre/podcasts-religi

In [30]:
# For every top-level genre, collect the links to its sub-genre pages.
# Result: all_links[genre] -> {sub-genre name: sub-genre page URL}.
all_links = dict()
for genre, genre_link in genre_links.items():
    # The response is not named `re`, so the standard-library `re` module
    # imported at the top of the notebook is not shadowed.
    response = requests.get(genre_link, timeout=30)
    # Explicit parser: avoids depending on whichever optional parser is installed.
    soup = BeautifulSoup(response.text, "html.parser")
    try:
        sub_genres = soup.find_all('ul', attrs={"class": "top-level-subgenres"})[0].find_all('a')
        subgenre_links = {subs.text: subs['href'] for subs in sub_genres}
    except (IndexError, KeyError):
        # IndexError: the page has no sub-genre list.
        # KeyError: a sub-genre anchor has no href.
        # Either way, treat the genre as its own single sub-genre. Anything
        # else (e.g. KeyboardInterrupt) is no longer swallowed.
        subgenre_links = {genre: genre_link}
    all_links[genre] = subgenre_links

In [58]:
def extract_itunes_id(href):
    """Return the numeric iTunes id that ends a podcast link.

    The id is whatever follows the last occurrence of ``"id"`` in ``href``.

    Parameters
    ----------
    href : str
        Link target of a podcast anchor.

    Returns
    -------
    int
        The id parsed from the text after the last ``"id"``.

    Raises
    ------
    ValueError
        If ``href`` contains no ``"id"`` marker, or the text after it is not
        an integer.
    """
    _, marker, tail = href.rpartition("id")
    if not marker:
        # Without this check a link lacking "id" would be parsed from a
        # mangled string instead of being reported.
        raise ValueError(f"no 'id' marker found in podcast link: {href!r}")
    return int(tail)


def scrape_podcast_ids(url):
    """Return the iTunes ids of every link in the first ``grid3-column`` div of a page.

    Parameters
    ----------
    url : str
        URL of a genre or sub-genre page.

    Returns
    -------
    list of int
        One id per anchor, in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page has no ``div`` with class ``grid3-column``.
    ValueError
        If an anchor's link does not end in an id (see ``extract_itunes_id``).
    """
    # Not named `re`, so the standard-library `re` module is not shadowed.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    # Explicit parser: avoids depending on whichever optional parser is installed.
    soup = BeautifulSoup(response.text, "html.parser")
    pods = soup.find_all('div', attrs={"class": "grid3-column"})[0].find_all("a")
    return [extract_itunes_id(pod['href']) for pod in pods]


# Three parallel lists: one entry per scraped podcast link.
itunes_ids = []
genres = []
subgenres = []

for genre in all_links.keys():
    print(genre, end=', ')  # progress indicator

    # Scrape the genre's own page first (labelled with the genre name as its
    # sub-genre), then each sub-genre page. A genre without sub-genres maps
    # to its own page in all_links; skip that entry so the same page is not
    # downloaded twice.
    genre_page = (genre, genre_links[genre])
    pages = [genre_page]
    pages += [page for page in all_links[genre].items() if page != genre_page]

    for subgenre, url in pages:
        page_ids = scrape_podcast_ids(url)
        itunes_ids.extend(page_ids)
        genres.extend([genre] * len(page_ids))
        subgenres.extend([subgenre] * len(page_ids))

Arts, Business, Comedy, Education, Fiction, Government, Health & Fitness, History, Kids & Family, Leisure, Music, News, Religion & Spirituality, Science, Society & Culture, Sports, TV & Film, Technology, True Crime, 

In [59]:
# Assemble the three parallel lists into one table, one row per scraped link.
df = pd.DataFrame(
    {
        "itunes_id": itunes_ids,
        "genre": genres,
        "subgenre": subgenres,
    }
)

In [63]:
# Keep one row per podcast id; when an id occurs several times, the last
# occurrence wins. Then renumber the rows from 0.
popular_pods_list = (
    df.drop_duplicates(subset="itunes_id", keep="last")
    .reset_index(drop=True)
)
popular_pods_list

Unnamed: 0,itunes_id,genre,subgenre
0,1310458364,Arts,Arts
1,76069540,Arts,Arts
2,1455169228,Arts,Arts
3,1113585468,Arts,Arts
4,1133320064,Arts,Arts
...,...,...,...
22085,998568017,True Crime,True Crime
22086,1244309070,True Crime,True Crime
22087,1480263708,True Crime,True Crime
22088,1145089790,True Crime,True Crime


In [64]:
# Save the de-duplicated podcast table as a pickle file (path is relative to
# the current working directory).
popular_pods_list.to_pickle("popular_pods_list.pkl")