In [1]:
# 1. import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# 2. the page to scrape: Billboard's Hot 100 chart
url = "https://www.billboard.com/charts/hot-100"

In [3]:
# 3. download html with a get request
# Without a timeout requests waits indefinitely for the server; with one, an
# unresponsive server raises an exception instead of blocking the kernel.
response = requests.get(url, timeout=10)
response.status_code # 200 status code means OK!

200

In [4]:
# 4.1. build the 'soup': parse the downloaded bytes with Python's built-in HTML parser
soup = BeautifulSoup(markup=response.content, features="html.parser")
# 4.2. eyeball the parsed document to confirm we received the expected page
soup

<!DOCTYPE html>

<html class="" lang="">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
<title>The Hot 100 Chart | Billboard</title>
<meta content="The Hot 100 Chart" name="title" property="title">
<meta content="@billboard" name="twitter:site"/>
<meta content="Billboard" property="og:site_name">
<meta content="article" property="og:type">
<link href="/manifest.json" rel="manifest"/>
<style>
        .chart-pro-access {
            background-image: url('https://www.billboard.com/assets/1607101016/images/piano/chart-pro-access-mb.png?cd6337a9a9a6c3a2749a');
        }

        @media (min-width: 769px) {
            .chart-pro-access {
                background-image: url('https://www.billboard.com/assets/1607101016/images/piano/chart-pro-access-dk.png?cd6337a9a9a6c3a2749a');
            }
        }
    </style>
<script async="async" data-cfasync="false" src="ht

In [5]:
# Collect every <button> tag whose class attribute matches this class string
song_list = soup.find_all(
    name='button',
    attrs={'class': 'chart-element__wrapper display--flex flex--grow sort--default'},
)

In [6]:
# Display the <button> tags matched by find_all in the previous cell
song_list

[<button class="chart-element__wrapper display--flex flex--grow sort--default">
 <span class="chart-element__rank flex--column flex--xy-center flex--no-shrink">
 <span class="chart-element__rank__number">1</span>
 <span class="chart-element__trend chart-element__trend--new color--accent">New</span>
 </span>
 <span class="chart-element__information">
 <span class="chart-element__information__song text--truncate color--primary">Life Goes On</span>
 <span class="chart-element__information__artist text--truncate color--secondary">BTS</span>
 <span class="chart-element__information__delta color--secondary">
 <span class="chart-element__information__delta__text text--default">-</span>
 <span class="chart-element__information__delta__text text--last">- Last Week</span>
 <span class="chart-element__information__delta__text text--peak">1 Peak Rank</span>
 <span class="chart-element__information__delta__text text--week">1 Weeks on Chart</span>
 </span>
 </span>
 <span class="chart-element__metas

In [7]:
# Build one [rank, song title, artist] row per chart entry; each field is the
# text of the <span> carrying the corresponding class string.
songs = [
    [
        entry.find('span', class_='chart-element__rank__number').text,
        entry.find('span', class_='chart-element__information__song text--truncate color--primary').text,
        entry.find('span', class_='chart-element__information__artist text--truncate color--secondary').text,
    ]
    for entry in song_list
]

In [8]:
# Display the [rank, song, artist] rows collected in the previous cell
songs

[['1', 'Life Goes On', 'BTS'],
 ['2', 'Mood', '24kGoldn Featuring iann dior'],
 ['3', 'Dynamite', 'BTS'],
 ['4', 'Positions', 'Ariana Grande'],
 ['5', 'I Hope', 'Gabby Barrett Featuring Charlie Puth'],
 ['6', 'Holy', 'Justin Bieber Featuring Chance The Rapper'],
 ['7', 'Laugh Now Cry Later', 'Drake Featuring Lil Durk'],
 ['8', 'Monster', 'Shawn Mendes & Justin Bieber'],
 ['9', 'Blinding Lights', 'The Weeknd'],
 ['10', 'Lemonade', 'Internet Money & Gunna Featuring Don Toliver & NAV'],
 ['11', 'Therefore I Am', 'Billie Eilish'],
 ['12', 'Body', 'Megan Thee Stallion'],
 ['13', 'Blue & Grey', 'BTS'],
 ['14', 'All I Want For Christmas Is You', 'Mariah Carey'],
 ['15', 'Dakiti', 'Bad Bunny & Jhay Cortez'],
 ['16', 'For The Night', 'Pop Smoke Featuring Lil Baby & DaBaby'],
 ['17', 'Go Crazy', 'Chris Brown & Young Thug'],
 ['18', 'Kings & Queens', 'Ava Max'],
 ['19', 'Savage Love (Laxed - Siren Beat)', 'Jawsh 685 x Jason Derulo'],
 ['20', 'More Than My Hometown', 'Morgan Wallen'],
 ['21', "Roc

In [9]:
# Turn the list of [rank, song, artist] rows into a table, one column per field

column_names = ['Ranking', 'Song', 'Artist']
hits = pd.DataFrame(data=songs, columns=column_names)

hits

Unnamed: 0,Ranking,Song,Artist
0,1,Life Goes On,BTS
1,2,Mood,24kGoldn Featuring iann dior
2,3,Dynamite,BTS
3,4,Positions,Ariana Grande
4,5,I Hope,Gabby Barrett Featuring Charlie Puth
...,...,...,...
95,96,Popstar,DJ Khaled Featuring Drake
96,97,Bichota,Karol G
97,98,Happy Does,Kenny Chesney
98,99,Cover Me Up,Morgan Wallen


In [10]:
# Alternative: collect one list per column and build the table from a dictionary

# text of the rank, song-title and artist <span> of every chart entry
rank_num = [
    entry.find('span', class_='chart-element__rank__number').text
    for entry in song_list
]
song_name = [
    entry.find('span', class_='chart-element__information__song text--truncate color--primary').text
    for entry in song_list
]
artist_name = [
    entry.find('span', class_='chart-element__information__artist text--truncate color--secondary').text
    for entry in song_list
]

# dictionary keys become the column names, the lists become the column values
columns_by_name = {
    'Ranking': rank_num,
    'Song Name': song_name,
    'Artist': artist_name,
}
song_ranking = pd.DataFrame(columns_by_name)

song_ranking

Unnamed: 0,Ranking,Song Name,Artist
0,1,Life Goes On,BTS
1,2,Mood,24kGoldn Featuring iann dior
2,3,Dynamite,BTS
3,4,Positions,Ariana Grande
4,5,I Hope,Gabby Barrett Featuring Charlie Puth
...,...,...,...
95,96,Popstar,DJ Khaled Featuring Drake
96,97,Bichota,Karol G
97,98,Happy Does,Kenny Chesney
98,99,Cover Me Up,Morgan Wallen


## Lab | Web Scraping Multiple Pages

**Expand the project**

- Find other lists of hot songs on the internet and scrape them too: having a bigger pool of songs will be awesome!
- Apply the same logic to other "groups" of songs: the best songs from a decade or from a country / culture / language / genre.
- Wikipedia maintains a large collection of lists of songs: https://en.wikipedia.org/wiki/Lists_of_songs

Rolling Stone's 500 Greatest Songs of All Time (MusicBrainz series)

In [11]:
# 2. first page of the MusicBrainz series we want to scrape
url2 = "https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page=1"

In [12]:
# 3. download html with a get request
# Without a timeout requests waits indefinitely for the server; with one, an
# unresponsive server raises an exception instead of blocking the kernel.
response2 = requests.get(url2, timeout=10)
response2.status_code # 200 status code means OK!

200

In [13]:
# 4.1. build the 'soup': parse the downloaded bytes with Python's built-in HTML parser
soup2 = BeautifulSoup(markup=response2.content, features="html.parser")
# 4.2. eyeball the parsed document to confirm we received the expected page
soup2

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="width=device-width, initial-scale=1" name="viewport"/><title>The Rolling Stone Magazine’s 500 Greatest Songs of All Time - MusicBrainz</title><link href="https://staticbrainz.org/MB/common-e885eca.css" rel="stylesheet" type="text/css"/><link href="https://staticbrainz.org/MB/icons-07cbeda.css" rel="stylesheet" type="text/css"/><link href="/static/search_plugins/opensearch/musicbrainz_artist.xml" rel="search" title="MusicBrainz: Artist" type="application/opensearchdescription+xml"/><link href="/static/search_plugins/opensearch/musicbrainz_label.xml" rel="search" title="MusicBrainz: Label" type="application/opensearchdescription+xml"/><link href="/static/search_plugins/opensearch/musicbrainz_release.xml" rel="search" title="MusicBrainz: Release" type="application/opensearchdescription+xml"/><link href="/static/search_plugins/opensearch/musicbrainz_track.xml" r

In [14]:
# The series is split over 5 pages, numbered 1 to 5 in the `page` query parameter

iterations = range(1, 6)
series_url = "https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page="

for i in iterations:
    # page number as text, appended to the common part of the url
    start_at = f"{i}"
    url2 = f"{series_url}{start_at}"
    print(url2)

https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page=1
https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page=2
https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page=3
https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page=4
https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page=5


In [15]:
# We need a few more tools for the multi-page download below
from random import randint  # used to pick a random pause length between requests
import time  # time.sleep() pauses execution for a number of seconds
# NOTE(review): this 2-second pause has no visible effect on later cells
# (presumably a quick demo of time.sleep) — confirm before removing
time.sleep(2)

In [20]:
#Assembling the script to send and store multiple requests

# one requests.Response per page, in page order
pages = []

iterations = range(1, 6)

for i in iterations:
    # assemble the url; loop-specific names are used so the notebook-level
    # `url2` and `response` (the Billboard download) are not overwritten
    start_at = str(i)
    page_url = "https://musicbrainz.org/series/b3484a66-a4de-444d-93d3-c99a73656905?page=" + start_at

    # download html with a get request; the timeout turns an unresponsive
    # server into an exception instead of blocking the loop indefinitely
    page_response = requests.get(page_url, timeout=10)

    # monitor the process by printing the status code
    print("Status code: " + str(page_response.status_code))

    # store response into "pages" list
    pages.append(page_response)

    # respectful nap, the sleeptime is random between 1 and 4 seconds:
    wait_time = randint(1, 4)
    print("I will sleep for " + str(wait_time) + " second/s.")
    time.sleep(wait_time)

Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 1 second/s.


In [21]:
# Displaying `pages` itself only shows the response objects with their status
# codes. The html of each page is still available in `.content` and can be
# parsed exactly as before:

BeautifulSoup(markup=pages[0].content, features="html.parser")

<!DOCTYPE html>
<html lang="en"><head><meta charset="utf-8"/><meta content="IE=edge" http-equiv="X-UA-Compatible"/><meta content="width=device-width, initial-scale=1" name="viewport"/><title>The Rolling Stone Magazine’s 500 Greatest Songs of All Time - MusicBrainz</title><link href="https://staticbrainz.org/MB/common-e885eca.css" rel="stylesheet" type="text/css"/><link href="https://staticbrainz.org/MB/icons-07cbeda.css" rel="stylesheet" type="text/css"/><link href="/static/search_plugins/opensearch/musicbrainz_artist.xml" rel="search" title="MusicBrainz: Artist" type="application/opensearchdescription+xml"/><link href="/static/search_plugins/opensearch/musicbrainz_label.xml" rel="search" title="MusicBrainz: Label" type="application/opensearchdescription+xml"/><link href="/static/search_plugins/opensearch/musicbrainz_release.xml" rel="search" title="MusicBrainz: Release" type="application/opensearchdescription+xml"/><link href="/static/search_plugins/opensearch/musicbrainz_track.xml" r

In [27]:
# Work on the first page only while figuring out the selectors
soup2 = BeautifulSoup(markup=pages[0].content, features="html.parser")

# Selector of the first song title, as copied from Chrome Dev Tools
# (not the last expression of the cell, so its result is not displayed)
soup2.select("#content > div.recording-list > table > tbody > tr:nth-child(1) > td:nth-child(2) > a > bdi")

# Trimmed selector: no longer tied to the first row, so it returns every title
soup2.select("td:nth-child(2) > a > bdi")

[<bdi>Like a Rolling Stone</bdi>,
 <bdi>(I Can’t Get No) Satisfaction</bdi>,
 <bdi>Imagine</bdi>,
 <bdi>What’s Going On</bdi>,
 <bdi>Respect</bdi>,
 <bdi>Good Vibrations</bdi>,
 <bdi>Johnny B. Goode</bdi>,
 <bdi>Hey Jude</bdi>,
 <bdi>Smells Like Teen Spirit</bdi>,
 <bdi>What’d I Say</bdi>,
 <bdi>My Generation</bdi>,
 <bdi>A Change Is Gonna Come</bdi>,
 <bdi>Yesterday</bdi>,
 <bdi>Blowin' in the Wind</bdi>,
 <bdi>London Calling</bdi>,
 <bdi>I Want to Hold Your Hand</bdi>,
 <bdi>Purple Haze</bdi>,
 <bdi>Maybellene</bdi>,
 <bdi>Hound Dog</bdi>,
 <bdi>Let It Be</bdi>,
 <bdi>Born to Run</bdi>,
 <bdi>Be My Baby</bdi>,
 <bdi>In My Life</bdi>,
 <bdi>People Get Ready</bdi>,
 <bdi>God Only Knows</bdi>,
 <bdi>(Sittin’ on) The Dock of the Bay</bdi>,
 <bdi>Layla</bdi>,
 <bdi>A Day in the Life</bdi>,
 <bdi>Help!</bdi>,
 <bdi>I Walk the Line</bdi>,
 <bdi>Stairway to Heaven</bdi>,
 <bdi>Sympathy for the Devil</bdi>,
 <bdi>River Deep—Mountain High</bdi>,
 <bdi>You’ve Lost That Lovin’ Feelin’</bdi>,
 <b

In [35]:
# Selector of the first artist, as copied from Chrome Dev Tools
# (not the last expression of the cell, so its result is not displayed)
soup2.select("#content > div.recording-list > table > tbody > tr:nth-child(1) > td:nth-child(3) > a > bdi")

# Trimmed selector: no longer tied to the first row, so it grabs all the artists
soup2.select("td:nth-child(3) > a > bdi")

[<bdi>Bob Dylan</bdi>,
 <bdi>The Rolling Stones</bdi>,
 <bdi>John Lennon</bdi>,
 <bdi>Marvin Gaye</bdi>,
 <bdi>Aretha Franklin</bdi>,
 <bdi>The Beach Boys</bdi>,
 <bdi>Chuck Berry</bdi>,
 <bdi>The Beatles</bdi>,
 <bdi>Nirvana</bdi>,
 <bdi>Ray Charles</bdi>,
 <bdi>The Who</bdi>,
 <bdi>Sam Cooke</bdi>,
 <bdi>The Beatles</bdi>,
 <bdi>Bob Dylan</bdi>,
 <bdi>The Clash</bdi>,
 <bdi>The Beatles</bdi>,
 <bdi>Jimi Hendrix</bdi>,
 <bdi>Chuck Berry</bdi>,
 <bdi>Elvis Presley</bdi>,
 <bdi>The Beatles</bdi>,
 <bdi>Bruce Springsteen</bdi>,
 <bdi>The Ronettes</bdi>,
 <bdi>The Beatles</bdi>,
 <bdi>The Impressions</bdi>,
 <bdi>The Beach Boys</bdi>,
 <bdi>Otis Redding</bdi>,
 <bdi>Derek and the Dominos</bdi>,
 <bdi>The Beatles</bdi>,
 <bdi>The Beatles</bdi>,
 <bdi>Johnny Cash</bdi>,
 <bdi>Led Zeppelin</bdi>,
 <bdi>The Rolling Stones</bdi>,
 <bdi>Ike &amp; Tina Turner</bdi>,
 <bdi>The Righteous Brothers</bdi>,
 <bdi>The Doors</bdi>,
 <bdi>U2</bdi>,
 <bdi>Bob Marley &amp; The Wailers</bdi>,
 <bdi>The Roll

In [None]:
# Selectors tried out on the first page in the cells above: inside a table row,
# the 2nd cell links to the song title and the 3rd cell links to the artist(s).
TITLE_SELECTOR = "td:nth-child(2) > a > bdi"
ARTIST_SELECTOR = "td:nth-child(3) > a > bdi"

pages_parsed = []  # one BeautifulSoup object per downloaded page
songs2 = []        # song titles, in page order
artists2 = []      # artist(s) of the song at the same position in songs2

for page in pages:
    # parse the stored response of each page
    page_soup = BeautifulSoup(page.content, "html.parser")
    pages_parsed.append(page_soup)

    # go through the table row by row so that every title stays paired with
    # the artist(s) of its own row and both lists keep the same length
    for row in page_soup.select("tr"):
        title_tag = row.select_one(TITLE_SELECTOR)
        artist_tags = row.select(ARTIST_SELECTOR)
        if title_tag is None or not artist_tags:
            # rows without a linked title or artist (e.g. header rows) hold no song
            continue
        songs2.append(title_tag.get_text())
        # a row may link several artists; keep them together in a single entry
        artists2.append(", ".join(tag.get_text() for tag in artist_tags))