# Code snippets

Scraping the raw HTML content of a site

In [2]:
import requests
URL = "https://podcasts.apple.com/us/podcast/1377247052"
# Fetch the show page. requests applies no timeout by default, so set one
# explicitly to keep the cell from hanging forever on a stalled connection.
r = requests.get(URL, timeout=30)
# Fail loudly on a 4xx/5xx answer instead of printing an error page as if it
# were the show page.
r.raise_for_status()
print(r.content)  # raw response body, as bytes



Parsing HTML content with html5lib

In [5]:
import requests
from bs4 import BeautifulSoup

URL = "https://podcasts.apple.com/us/podcast/1377247052"
# requests applies no timeout by default; set one so a stalled connection
# cannot block the notebook indefinitely.
r = requests.get(URL, timeout=30)
# Stop here on a 4xx/5xx answer rather than parsing an error page.
r.raise_for_status()

# Build the parse tree with the html5lib parser and print it re-indented.
soup = BeautifulSoup(r.content, 'html5lib')
print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en-US" prefix="og: http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="width=device-width, initial-scale=1, viewport-fit=cover" name="viewport"/>
  <meta content="pc,mobile" name="applicable-device"/>
  <script id="perfkit">
   window.initialPageRequestTime = +new Date();
  </script>
  <link crossorigin="" href="https://amp-api.podcasts.apple.com" rel="preconnect"/>
  <link crossorigin="" href="https://is1-ssl.mzstatic.com" rel="preconnect"/>
  <link crossorigin="" href="https://is2-ssl.mzstatic.com" rel="preconnect"/>
  <link crossorigin="" href="https://is3-ssl.mzstatic.com" rel="preconnect"/>
  <link crossorigin="" href="https://is4-ssl.mzstatic.com" rel="preconnect"/>
  <link crossorigin="" href="https://is5-ssl.mzstatic.com" rel="preconnect"/>
  <link crossorigin="" href="https://xp.apple.com" rel="preconnect"/>
  <link crossorigin="" href="https://js-cdn.music.apple.c

Parsing HTML content with html.parser

In [7]:
import requests
from bs4 import BeautifulSoup

URL = "https://podcasts.apple.com/us/podcast/1377247052"
# requests applies no timeout by default; set one so a stalled connection
# cannot block the notebook indefinitely.
page = requests.get(URL, timeout=30)
# Stop here on a 4xx/5xx answer rather than parsing an error page.
page.raise_for_status()

# Build the parse tree with Python's built-in html.parser and print it as-is.
soup = BeautifulSoup(page.content, "html.parser")
print(soup)

<!DOCTYPE html>
<html dir="ltr" lang="en-US" prefix="og: http://ogp.me/ns#"><head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, viewport-fit=cover" name="viewport"/>
<meta content="pc,mobile" name="applicable-device"/>
<script id="perfkit">window.initialPageRequestTime = +new Date();</script>
<link crossorigin="" href="https://amp-api.podcasts.apple.com" rel="preconnect"/>
<link crossorigin="" href="https://is1-ssl.mzstatic.com" rel="preconnect"/>
<link crossorigin="" href="https://is2-ssl.mzstatic.com" rel="preconnect"/>
<link crossorigin="" href="https://is3-ssl.mzstatic.com" rel="preconnect"/>
<link crossorigin="" href="https://is4-ssl.mzstatic.com" rel="preconnect"/>
<link crossorigin="" href="https://is5-ssl.mzstatic.com" rel="preconnect"/>
<link crossorigin="" href="https://xp.apple.com" rel="preconnect"/>
<link crossorigin="" href="https://js-cdn.music.apple.com" rel="preconnect"/>
<link crosso

Scrape the full HTML element containing the average rating for one podcast show

In [17]:
# Collect every <span> carrying the average-rating class and show the matching
# tags themselves (markup included), not just their text.
rating_class = "we-customer-ratings__averages__display"
podcast_rating = soup.find_all("span", attrs={"class": rating_class})
print(podcast_rating)

[<span class="we-customer-ratings__averages__display">4.6</span>]


Scrape average rating for one podcast show

In [30]:
# Take the first <span> carrying the average-rating class and show only its
# text content.
podcast_rating = soup.find(
    "span",
    attrs={"class": "we-customer-ratings__averages__display"},
)
print(podcast_rating.get_text())

4.6


Scrape number of ratings for one podcast show

In [31]:
# Take the first <div> whose class attribute is exactly this string and show
# its text content (the ratings count label).
nr_ratings = soup.find(
    "div",
    attrs={"class": "we-customer-ratings__count small-hide medium-show"},
)
print(nr_ratings.get_text())

88 Ratings


Scrape the full HTML elements containing the star-rating percentage distribution for one podcast show

In [32]:
# Collect every star-bar <div>; the percentage for each star level sits in the
# tag's inline `style="width: N%;"` attribute, so print the tags themselves.
star_bar_class = "we-star-bar-graph__bar__foreground-bar"
stars_distr = soup.find_all("div", attrs={"class": star_bar_class})
print(stars_distr)

[<div class="we-star-bar-graph__bar__foreground-bar" style="width: 78%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 13%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 3%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 3%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 2%;"></div>]


Scrape all above variables for one podcast show, based on itunes_id

In [42]:
# Scrape average rating, number of ratings and star distribution for a single
# show, addressed by its iTunes id.
itunes_id = "1377247052"
base_URL = "https://podcasts.apple.com/us/podcast/"
full_URL = base_URL + itunes_id

# requests applies no timeout by default; set one so a stalled connection
# cannot block the notebook indefinitely.
page = requests.get(full_URL, timeout=30)
# Stop here on a 4xx/5xx answer rather than parsing an error page.
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

apple_rating = soup.find("span", class_="we-customer-ratings__averages__display")
apple_nr_ratings = soup.find("div", class_="we-customer-ratings__count small-hide medium-show")
apple_stars_distr = soup.find_all("div", class_="we-star-bar-graph__bar__foreground-bar")

# find() returns None when the element is absent; report None in that case
# instead of failing with an AttributeError on `.text`.
print({'apple_rating': apple_rating.text if apple_rating is not None else None,
       'apple_nr_ratings': apple_nr_ratings.text if apple_nr_ratings is not None else None,
       'apple_stars_distr': apple_stars_distr})

{'apple_rating': '4.6', 'apple_nr_ratings': '88 Ratings', 'apple_stars_distr': [<div class="we-star-bar-graph__bar__foreground-bar" style="width: 78%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 13%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 3%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 3%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 2%;"></div>]}


Import CSV table containing itunes_id for each podcast show in the sample

In [45]:
import csv

podcasts_apple_ratings = r'C:\...\...\...\...\...\podcasts_apple_ratings.csv' # File location

# The csv module expects files to be opened with newline='' so that it can do
# its own newline handling (otherwise newlines embedded in quoted fields are
# misread).
with open(podcasts_apple_ratings, 'r', newline='') as csvfile:
    datareader = csv.reader(csvfile)
    # Each row is a list of strings; the first row holds the column names.
    for row in datareader:
        print(row)

['id', 'title', 'publisher', 'itunes_id']
['d32e0e002c604089a14155955fed3ffc', 'Pekingology', 'Center for Strategic and International Studies', '1525445350']
['d3aab8c7641741debb2cff6d1024b9d5', 'Talk Is Jericho', 'Chris Jericho', '767016946']
['99081c767fb041bc80120a06ff20e884', 'Gun Talk', 'Tom Gresham', '190516844']
['1beb09a1427e43cd8dfca1f5268e4e22', 'Challenge Mania', 'Challenge Mania', '1204429149']
['eb28dc70d26c40deb41afd78e1c9d3d8', 'Tales to Terrify', 'Drew Sebesteny', '492711030']
['170c03d32d3a4fc3a1dd81407ea33704', 'The Instance: Deep Dives for Gamers', 'Scott Johnson', '115594899']
['4685e82279e84054a608c6364912ad73', 'The Run-Up', 'The New York Times', '1142083165']
['b586d7de2a8e49b687fa5ab8506f713c', 'In Machines We Trust', 'MIT Technology Review', '1523584878']
['d8dd100adea94898b104460238fa9e47', 'Blazing Trails', 'Salesforce', '1259579050']
['b5d8837218764e098531cdb498383e54', 'MIMI', 'Mimi Bouchard', '1434308661']
['220098bac49c4228a404f1151fef313c', 'Katherine Ry

Scrape all three variables for first three podcast shows

In [3]:
import csv

podcasts_apple_ratings = r'C:\...\...\...\...\...\podcasts_apple_ratings.csv' # File location

n_shows = 3  # number of podcast shows to scrape in this trial run
base_URL = "https://podcasts.apple.com/us/podcast/"

# The csv module expects files to be opened with newline=''.
with open(podcasts_apple_ratings, 'r', newline='') as csvfile:
    datareader = csv.reader(csvfile)
    next(datareader, None)  # skip the header row (id, title, publisher, itunes_id)
    for i, row in enumerate(datareader, start=1):
        # Stop as soon as the requested number of shows has been scraped
        # instead of walking (and printing) the rest of the file.
        if i > n_shows:
            break

        itunes_id = row[3]  # fourth column holds the iTunes id
        print(itunes_id)
        full_URL = base_URL + itunes_id

        # requests applies no timeout by default; set one so a stalled
        # connection cannot block the loop indefinitely.
        page = requests.get(full_URL, timeout=30)

        soup = BeautifulSoup(page.content, "html.parser")

        apple_rating = soup.find("span", class_="we-customer-ratings__averages__display")
        apple_nr_ratings = soup.find("div", class_="we-customer-ratings__count small-hide medium-show")
        apple_stars_distr = soup.find_all("div", class_="we-star-bar-graph__bar__foreground-bar")

        # find() returns None when the element is absent; report None in that
        # case instead of failing with an AttributeError on `.text`.
        print({'itunes_id': itunes_id,
               'apple_rating': apple_rating.text if apple_rating is not None else None,
               'apple_nr_ratings': apple_nr_ratings.text if apple_nr_ratings is not None else None,
               'apple_stars_distr': apple_stars_distr})

itunes_id
1525445350
{'itunes_id': '1525445350', 'apple_rating': '4.7', 'apple_nr_ratings': '67 Ratings', 'apple_stars_distr': [<div class="we-star-bar-graph__bar__foreground-bar" style="width: 90%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 3%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 0%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 1%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 6%;"></div>]}
767016946
{'itunes_id': '767016946', 'apple_rating': '4.7', 'apple_nr_ratings': '7.9K Ratings', 'apple_stars_distr': [<div class="we-star-bar-graph__bar__foreground-bar" style="width: 87%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 6%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 2%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 1%;"></div>, <div class="we-star-bar-graph__bar__fore

# Final scraper

Import necessary packages

In [26]:
import requests
import csv
from bs4 import BeautifulSoup
from time import sleep

Scraping all three variables from each podcast show's individual Apple Podcasts page, and creating a CSV file in a relational database format

In [29]:
podcasts_apple_ratings = r'C:\...\...\...\...\...\podcasts_apple_ratings_itunes.csv' # File location

# Batch settings -- change these for each run. 12 runs overall, at most 50
# shows per run, resulting in 12 CSV files (merged later in R).
output_file = 'podcasts_apple_ratings_12.csv'
first_row = 551  # index of the first row to scrape (row 0 is the CSV header)
last_row = 600   # index of the last row to scrape (inclusive)

base_URL = "https://podcasts.apple.com/us/podcast/"

# Load the whole sample table into memory. The `with` block closes the file,
# so no explicit close() is needed. The csv module expects newline=''.
with open(podcasts_apple_ratings, 'r', newline='') as csvfile:
    datacsv = list(csv.reader(csvfile))

# newline='' is required when writing with the csv module; without it, on
# platforms that use \r\n line endings, each row is followed by a blank line.
with open(output_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["itunes_id", "apple_rating", "apple_nr_ratings", "apple_stars_distr"])
    for row in datacsv[first_row:last_row + 1]:
        itunes_id = row[3]  # fourth column holds the iTunes id
        full_URL = base_URL + itunes_id

        # requests applies no timeout by default; set one so a single stalled
        # connection cannot block the whole batch indefinitely.
        page = requests.get(full_URL, timeout=30)

        soup = BeautifulSoup(page.content, "html.parser")

        apple_rating = soup.find("span", class_="we-customer-ratings__averages__display")
        apple_nr_ratings = soup.find("div", class_="we-customer-ratings__count small-hide medium-show")
        apple_stars_distr = soup.find_all("div", class_="we-star-bar-graph__bar__foreground-bar")

        # find() returns None when the element is absent. Record empty fields
        # and say so, instead of aborting the batch with an AttributeError.
        rating_text = apple_rating.text if apple_rating is not None else ''
        nr_ratings_text = apple_nr_ratings.text if apple_nr_ratings is not None else ''
        if apple_rating is None or apple_nr_ratings is None:
            print("No rating data found for itunes_id", itunes_id,
                  "- HTTP status", page.status_code)

        print(rating_text, nr_ratings_text, apple_stars_distr)
        # csv.writer stringifies non-string values with str(), so the star
        # distribution is stored as the text form of the list of <div> tags.
        writer.writerow([itunes_id, rating_text, nr_ratings_text, apple_stars_distr])

        sleep(2) # 2 second delay for each request

4.6 3.4K Ratings [<div class="we-star-bar-graph__bar__foreground-bar" style="width: 80%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 8%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 4%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 3%;"></div>, <div class="we-star-bar-graph__bar__foreground-bar" style="width: 4%;"></div>]
