# Scraping episodes data from iTunes

In this notebook, I collect episode data for each podcast that was found in the previous notebook. For each podcast, all the episodes that are available on the iTunes website are retrieved (up to 300 episodes per podcast). For each episode, I collect these attributes:
 - Name
 - Description
 - Release date
 - Duration
 
For each podcast, iTunes provides a list of podcasts that its listeners also subscribed to (up to 18 items). I collect these as well and add them to the popular podcasts dataframe.

In [1]:
import pandas as pd
import requests
import json
from bs4 import BeautifulSoup
import numpy as np
import subprocess
from dateutil.parser import parse
import datetime
import re

In [2]:
# Shell command templates (run through curl) for the "listeners also subscribed"
# endpoint. Each template has two `{}` placeholders; both are filled with the
# podcast id (once in the URL path, once in the Referer header).
# The trailing backslashes sit inside a regular (non-raw) string literal, so Python
# drops each backslash+newline pair and the resulting command is a single line.
# NOTE(review): the Bearer token is hard-coded and repeated in every template of this
# notebook. Its JWT payload appears to carry an `exp` claim, so it will presumably
# stop working at some point — confirm, and consider loading it from configuration.

# First page of results (no offset).
alsoSubbedCmd_0 = """curl 'https://amp-api.podcasts.apple.com/v1/catalog/us/podcasts/{}/listeners-also-subscribed' \
  -H 'Accept: application/json' \
  -H 'Referer: https://podcasts.apple.com/podcast/id{}' \
  -H 'Authorization: Bearer eyJhbGciOiJFUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6IldlYlBsYXlLaWQifQ.eyJpc3MiOiJBTVBXZWJQbGF5IiwiaWF0IjoxNTk4NjQzODE5LCJleHAiOjE2MTQxOTU4MTl9.D83tGI4HTOgGUgvKbnyjYtTGxqb7HcrUFixvcMHXvQmSqNy7TOgxO1WIqeBpnql7ibqMdUEImKbNgzfC671qjg' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36' \
  -H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' \
  --compressed"""

# Second page of results: same request with `offset=15` appended to the URL.
alsoSubbedCmd_1 = """curl 'https://amp-api.podcasts.apple.com/v1/catalog/us/podcasts/{}/listeners-also-subscribed?offset=15' \
  -H 'Accept: application/json' \
  -H 'Referer: https://podcasts.apple.com/podcast/id{}' \
  -H 'Authorization: Bearer eyJhbGciOiJFUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6IldlYlBsYXlLaWQifQ.eyJpc3MiOiJBTVBXZWJQbGF5IiwiaWF0IjoxNTk4NjQzODE5LCJleHAiOjE2MTQxOTU4MTl9.D83tGI4HTOgGUgvKbnyjYtTGxqb7HcrUFixvcMHXvQmSqNy7TOgxO1WIqeBpnql7ibqMdUEImKbNgzfC671qjg' \
  -H 'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36' \
  -H 'Content-Type: application/x-www-form-urlencoded; charset=UTF-8' \
  --compressed"""

In [27]:
def get_also_subbed_data(_id):
    """Return the ids of podcasts that listeners of ``_id`` also subscribed to.

    Two curl requests are issued (``alsoSubbedCmd_0`` for the first page and
    ``alsoSubbedCmd_1`` for the page at offset 15) and the ``id`` of every entry
    under each response's ``data`` key is collected, first page first.

    Parameters
    ----------
    _id : int or str
        Podcast id. It is interpolated into a shell command line, so it must
        come from a trusted source.

    Returns
    -------
    list or None
        The collected ids, or ``None`` when either page cannot be decoded as
        JSON or does not have the expected ``data`` / ``id`` structure.
    """
    related_ids = []
    for template in (alsoSubbedCmd_0, alsoSubbedCmd_1):
        raw = subprocess.run(
            template.format(_id, _id), stdout=subprocess.PIPE, shell=True
        ).stdout
        try:
            related_ids.extend(entry['id'] for entry in json.loads(raw)['data'])
        except Exception:
            # Best-effort lookup: any malformed page invalidates the whole result.
            # `Exception` (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit propagate, so a long `.apply` run can still be stopped.
            return None
    return related_ids

In [37]:
# Spot-check the helper on a single podcast id (the cell displays the returned list, or None).
get_also_subbed_data(1530405817)

In [5]:
# Shell command template (run through curl) for the episodes endpoint.
# The four `{}` placeholders are filled, in order, with: podcast id, offset,
# limit, and the podcast id again (for the referer header).
# The trailing backslashes sit inside a regular (non-raw) string literal, so Python
# drops each backslash+newline pair and the resulting command is a single line.
# NOTE(review): the Bearer token is hard-coded here; its JWT payload appears to carry
# an `exp` claim, so it will presumably expire — confirm and refresh before re-running.
episodes_data = """curl 'https://amp-api.podcasts.apple.com/v1/catalog/us/podcasts/{}/episodes?offset={}&limit={}' \
  -H 'authority: amp-api.podcasts.apple.com' \
  -H 'accept: application/json' \
  -H 'authorization: Bearer eyJhbGciOiJFUzI1NiIsInR5cCI6IkpXVCIsImtpZCI6IldlYlBsYXlLaWQifQ.eyJpc3MiOiJBTVBXZWJQbGF5IiwiaWF0IjoxNTk4NjQzODE5LCJleHAiOjE2MTQxOTU4MTl9.D83tGI4HTOgGUgvKbnyjYtTGxqb7HcrUFixvcMHXvQmSqNy7TOgxO1WIqeBpnql7ibqMdUEImKbNgzfC671qjg' \
  -H 'user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36' \
  -H 'content-type: application/x-www-form-urlencoded; charset=UTF-8' \
  -H 'origin: https://podcasts.apple.com' \
  -H 'sec-fetch-site: same-site' \
  -H 'sec-fetch-mode: cors' \
  -H 'sec-fetch-dest: empty' \
  -H 'referer: https://podcasts.apple.com/podcast/id{}' \
  -H 'accept-language: en-US,en;q=0.9,fa;q=0.8' \
  --compressed"""

In [6]:
def _next_offset_from_url(next_url):
    """Return the ``offset`` query value of a pagination URL as a string, or None."""
    if not isinstance(next_url, str):
        return None
    found = re.search(r"[?&]offset=(\d+)", next_url)
    return found.group(1) if found else None


def _episode_field(episode, *keys):
    """Walk ``keys`` below ``episode['attributes']``; None if any level is missing."""
    node = episode
    try:
        for key in ('attributes',) + keys:
            node = node[key]
    except (KeyError, TypeError, IndexError):
        return None
    return node


def get_episode_data(_id, offset, lim):
    """Fetch one page of episodes for a podcast and split it into parallel lists.

    The ``episodes_data`` curl template is filled with the podcast id, the
    offset and the page size, executed through the shell, and its output is
    decoded as JSON.

    Parameters
    ----------
    _id : int or str
        Podcast id. It is interpolated into a shell command line, so it must
        come from a trusted source.
    offset : int or str
        Offset of the first episode to request.
    lim : int
        Maximum number of episodes to request.

    Returns
    -------
    tuple
        ``(titles, descriptions, durations, dates, next_offset)``. The four
        lists hold one entry per episode under the response's ``data`` key;
        an attribute that is missing (or a release date that cannot be parsed)
        is stored as ``None``. ``next_offset`` is the ``offset`` value found in
        the response's ``next`` URL (as a string), or ``None`` when there is no
        further page.

    Raises
    ------
    ValueError
        (``json.JSONDecodeError``) when the curl output is not valid JSON.
    """
    command = episodes_data.format(_id, offset, lim, _id)
    out = subprocess.run(command, stdout=subprocess.PIPE, shell=True).stdout
    page = json.loads(out)
    if not isinstance(page, dict):
        page = {}

    # Read the offset from the query string instead of slicing after the first
    # "=", which breaks as soon as the URL carries more than one parameter.
    next_offset = _next_offset_from_url(page.get('next'))

    titles, descriptions, durations, dates = [], [], [], []
    # `or []` also covers a response whose "data" value is null.
    for episode in page.get('data') or []:
        titles.append(_episode_field(episode, 'name'))
        descriptions.append(_episode_field(episode, 'description', 'standard'))
        durations.append(_episode_field(episode, 'durationInMilliseconds'))
        try:
            dates.append(parse(_episode_field(episode, 'releaseDateTime')))
        except Exception:
            # Missing or unparseable timestamp; KeyboardInterrupt still propagates.
            dates.append(None)
    return titles, descriptions, durations, dates, next_offset



In [7]:
def get_all_episodes(_id):
    """Collect every page of episodes for podcast ``_id``.

    Pages of 100 episodes are requested through ``get_episode_data``, starting
    at offset 0 and following the returned next offset until it is ``None``.

    Returns
    -------
    tuple of list
        ``(titles, descriptions, durations, dates)`` accumulated over all pages.
    """
    # Accumulators, in the same order as the leading values returned per page.
    all_titles, all_descriptions, all_durations, all_dates = [], [], [], []
    accumulators = (all_titles, all_descriptions, all_durations, all_dates)

    cursor = 0
    while cursor is not None:
        *page_lists, cursor = get_episode_data(_id, cursor, 100)
        for accumulator, page_values in zip(accumulators, page_lists):
            accumulator.extend(page_values)
    return all_titles, all_descriptions, all_durations, all_dates

In [8]:
# Trial run on a single podcast id. These four names are reassigned for every
# podcast inside the scraping loop further down.
titles, descriptions, durations, dates = get_all_episodes(1200361736)

In [9]:
# Load the podcast list (presumably written by the previous notebook — confirm).
# NOTE(review): unpickling can execute arbitrary code; only load a file you produced yourself.
pod_ids = pd.read_pickle("popular_pods_list.pkl")

In [10]:
# Mapping: subgenre -> index labels of the `pod_ids` rows in that subgenre.
pod_groups = pod_ids.groupby(by = "subgenre").groups

In [11]:
# The subgenre names, in the order the scraping loop will process them.
subgenres = list(pod_groups.keys())

In [18]:
# Scrape every podcast of every subgenre and write one pickle per subgenre,
# named "episodes_subgenre_<subgenre in lower case, whitespace -> underscore>.pkl".
for current in subgenres:
    slug = "_".join(current.lower().split())
    print(slug)
    current_pods = pod_ids.loc[pod_groups[current]]
    total = current_pods.shape[0]

    # Collect one frame per podcast and concatenate once at the end:
    # DataFrame.append was removed in pandas 2.0, and growing a frame inside
    # the loop re-copies all previously collected rows on every iteration.
    frames = []
    for count, _id in enumerate(current_pods["itunes_id"], start=1):
        if count % 10 == 0:
            print("{}/{}".format(count, total), end = ' | ')
        titles, descriptions, durations, dates = get_all_episodes(_id)
        frames.append(pd.DataFrame({"itunes_id":_id, "title":titles, "description":descriptions,
                      "duration":durations, "date":dates}))

    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        # Keep the expected columns even if a subgenre has no podcasts.
        df = pd.DataFrame(columns = ["itunes_id", "title", "description", "duration", "date"])
    file_name = "episodes_subgenre_" + slug + '.pkl'
    df.to_pickle(file_name)
    print()

after_shows
10/240 | 20/240 | 30/240 | 40/240 | 50/240 | 60/240 | 70/240 | 80/240 | 90/240 | 100/240 | 110/240 | 120/240 | 130/240 | 140/240 | 150/240 | 160/240 | 170/240 | 180/240 | 190/240 | 200/240 | 210/240 | 220/240 | 230/240 | 240/240 | 
alternative_health
10/240 | 20/240 | 30/240 | 40/240 | 50/240 | 60/240 | 70/240 | 80/240 | 90/240 | 100/240 | 110/240 | 120/240 | 130/240 | 140/240 | 150/240 | 160/240 | 170/240 | 180/240 | 190/240 | 200/240 | 210/240 | 220/240 | 230/240 | 240/240 | 
animation_&_manga
10/240 | 20/240 | 30/240 | 40/240 | 50/240 | 60/240 | 70/240 | 80/240 | 90/240 | 100/240 | 110/240 | 120/240 | 130/240 | 140/240 | 150/240 | 160/240 | 170/240 | 180/240 | 190/240 | 200/240 | 210/240 | 220/240 | 230/240 | 240/240 | 
arts
10/54 | 20/54 | 30/54 | 40/54 | 50/54 | 
astronomy
10/84 | 20/84 | 30/84 | 40/84 | 50/84 | 60/84 | 70/84 | 80/84 | 
automotive
10/240 | 20/240 | 30/240 | 40/240 | 50/240 | 60/240 | 70/240 | 80/240 | 90/240 | 100/240 | 110/240 | 120/240 | 130/240 | 14

10/39 | 20/39 | 30/39 | 
places_&_travel
10/240 | 20/240 | 30/240 | 40/240 | 50/240 | 60/240 | 70/240 | 80/240 | 90/240 | 100/240 | 110/240 | 120/240 | 130/240 | 140/240 | 150/240 | 160/240 | 170/240 | 180/240 | 190/240 | 200/240 | 210/240 | 220/240 | 230/240 | 240/240 | 
politics
10/240 | 20/240 | 30/240 | 40/240 | 50/240 | 60/240 | 70/240 | 80/240 | 90/240 | 100/240 | 110/240 | 120/240 | 130/240 | 140/240 | 150/240 | 160/240 | 170/240 | 180/240 | 190/240 | 200/240 | 210/240 | 220/240 | 230/240 | 240/240 | 
relationships
10/240 | 20/240 | 30/240 | 40/240 | 50/240 | 60/240 | 70/240 | 80/240 | 90/240 | 100/240 | 110/240 | 120/240 | 130/240 | 140/240 | 150/240 | 160/240 | 170/240 | 180/240 | 190/240 | 200/240 | 210/240 | 220/240 | 230/240 | 240/240 | 
religion
10/240 | 20/240 | 30/240 | 40/240 | 50/240 | 60/240 | 70/240 | 80/240 | 90/240 | 100/240 | 110/240 | 120/240 | 130/240 | 140/240 | 150/240 | 160/240 | 170/240 | 180/240 | 190/240 | 200/240 | 210/240 | 220/240 | 230/240 | 240/240 | 

In [40]:
# Look up the also-subscribed ids for every podcast (two curl calls per row);
# rows whose lookup fails hold None.
also_subbed = pod_ids['itunes_id'].apply(get_also_subbed_data)

In [41]:
# Attach the result as a new column (modifies `pod_ids` in place).
pod_ids["also_subbed"] = also_subbed

In [42]:
# Persist the enriched podcast list under a new file name, leaving the input pickle untouched.
pod_ids.to_pickle("popular_pods_list_with_also_subbed.pkl")

In [53]:
# Display the frame to eyeball the new `also_subbed` column.
pod_ids

Unnamed: 0,itunes_id,genre,subgenre,also_subbed
0,1310458364,Arts,Arts,"[1453911226, 1493574102, 1451228314, 149089415..."
1,76069540,Arts,Arts,"[76030848, 107541824, 211872343, 138767891, 73..."
2,1455169228,Arts,Arts,"[1342003491, 1472401495, 1502728938, 146232460..."
3,1113585468,Arts,Arts,"[1097417804, 507135865, 1163047880, 390071758,..."
4,1133320064,Arts,Arts,"[1092800054, 1365312021, 1066154319, 834515877..."
...,...,...,...,...
22085,998568017,True Crime,True Crime,"[790487079, 1062418176, 1089216339, 541481026,..."
22086,1244309070,True Crime,True Crime,"[1289005078, 1278924392, 1420191569, 132518038..."
22087,1480263708,True Crime,True Crime,"[1494167201, 1489482036, 1493193473, 147813899..."
22088,1145089790,True Crime,True Crime,"[1048123246, 1191380648, 977283328, 1166399817..."
