# WEB SCRAPING LAB

# Activity 1: MVP

## Importing libraries

In [1]:
from bs4 import BeautifulSoup
import requests # Allows us to access information on any webpage
import pandas as pd

## Storing and reading webpage

In [2]:
# Using the iTunes Top 100 Songs Chart

url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [3]:
# Checking the url response

response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [4]:
soup = BeautifulSoup(response.content, "html.parser")

In [20]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   iTunes Top 100 Songs Chart 2022
  </title>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <meta content="iTunes top 100 songs chart list. The most popular hit music and trending songs of 2022. Chart of today's current iTunes top 100 songs is updated daily." name="description"/>
  <meta content="iTunes Top 100 Songs Chart 2022" property="og:title">
   <meta content="Chart of the top 100 songs on iTunes. Chart list of the top 100 song downloads of 2022 is updated daily." property="og:description">
    <meta content="article" property="og:type">
     <meta content="https://www.popvortex.com/images/logo-facebook.png" property="og:image"/>
     <meta content="PopVortex" property="og:site_name"/>
     <meta content="https://www.popvortex.com/music/charts/top-100-songs.php" property="og:url"/>
     <meta content="100000239962942" property="fb:admins"/>
     <meta content="178831188827052

## Extracting information

In [57]:
# Getting the artist list
artist = []

for i in soup.select('.artist'):
    artist.append(i.get_text())

In [58]:
# Getting the song list
title = []

for i in soup.select('.title'):
    title.append(i.get_text())

In [59]:
# Merging both lists onto a DataFrame
artist_song = pd.DataFrame({"artist":artist,
                       "title":title
                      })
artist_song

Unnamed: 0,artist,title
0,Sam Smith & Kim Petras,Unholy
1,Transformation Worship,Eagle (feat. KB)
2,Fleetwood Mac,Everywhere
3,David Guetta & Bebe Rexha,I'm Good (Blue)
4,HARDY & Lainey Wilson,wait in the truck
...,...,...
95,"Carolina Gaitán - La Gaita, Mauro Castillo, Ad...",We Don't Talk About Bruno
96,Bad Bunny & Chencho Corleone,Me Porto Bonito
97,Morgan Wallen,Whiskey Glasses
98,Simple Minds,Don't You (Forget About Me)


In [211]:
import random
var = input("Enter the name of a song:")
random_name = random.choice(artist_song['title'])

if var in artist_song['title'].values:
    print("Here's a good song you might also like!", random_name)
else:
    print("Sorry, we don't have any suggestions!")

Enter the name of a song:Unholy
Here's a good song you might also like! I Ain't Worried


# Activity 2: expanding the project

Pulling out all rock songs between 2010 and 2019 from playback.fm.

In [202]:
url1 = 'https://playback.fm/charts/rock/2010'

In [203]:
# Checking the url response

response = requests.get(url1)
response.status_code # 200 status code means OK!

200

In [204]:
soup1 = BeautifulSoup(response.content, "html.parser")

In [205]:
print(soup1.prettify())

<!DOCTYPE html>
<html class="birthday" lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="upgrade-insecure-requests" http-equiv="Content-Security-Policy">
   <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
   <meta content="width=device-width, initial-scale=1.0,minimum-scale=1.0,maximum-scale=5.0" name="viewport">
    <meta content="always" name="referrer">
     <title>
      Top 100 Rock &amp; Roll Song Chart for 2010
     </title>
     <meta content="Find the top 100 Rock &amp; Roll songs for the year of 2010 and listen to them all! Can you guess the number one Rock &amp; Roll song in 2010? Find out now!" name="description"/>
     <meta content="" name="keywords"/>
     <meta content="7934465" property="fb:admins"/>
     <meta content="983502645001411" property="fb:app_id"/>
     <meta content="website" property="og:type"/>
     <meta content="Top 100 Rock &amp; Roll Song Chart for 20

In [184]:
iterations = range(2010, 2020, 1)
#[i for i in iterations]
for i in iterations:
    start_at= str(i)
    url1 = 'https://playback.fm/charts/rock/' + start_at
    print(url1)

https://playback.fm/charts/rock/2010
https://playback.fm/charts/rock/2011
https://playback.fm/charts/rock/2012
https://playback.fm/charts/rock/2013
https://playback.fm/charts/rock/2014
https://playback.fm/charts/rock/2015
https://playback.fm/charts/rock/2016
https://playback.fm/charts/rock/2017
https://playback.fm/charts/rock/2018
https://playback.fm/charts/rock/2019


In [185]:
import random
from time import sleep
from random import randint

In [186]:
pages = []

for i in iterations:
    # assemble the url:
    start_at= str(i)
    url1 = 'https://playback.fm/charts/rock/' + start_at

    # download html with a get request:
    response = requests.get(url1)
    #response = requests.get(url, headers = {"Accept-Language": "en-US"})

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # store response into "pages" list
    pages.append(response)

    # respectful nap:
    wait_time = randint(1,4)
    print("I will sleep for " + str(wait_time) + " second/s.")
    sleep(wait_time)

Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 3 second/s.


In [239]:
artist1 = []

for i in range(len(pages)):
    parsed = BeautifulSoup(pages[i].content, "html.parser")
    artist_html = parsed.select('td:nth-child(2) > a')
    for i in soup1.select('td:nth-child(2) > a'):
        artist1.append(i.get_text())

In [240]:
title1 = []

for i in range(len(pages)):
    parsed = BeautifulSoup(pages[i].content, "html.parser")
    title_html = parsed.select('td.mobile-hide > a > span.song')
    for i in soup1.select('td.mobile-hide > a > span.song'):
        title1.append(i.get_text())

In [241]:
# Merging both lists onto a DataFrame
artist_song1 = pd.DataFrame({"artist":artist1,
                       "title":title1
                      })
artist_song1

Unnamed: 0,artist,title
0,\nSaving Abel\n,The Sex Is Good
1,\nDisturbed\n,Another Way to Die
2,\nMuse\n,Resistance
3,\nLinkin Park\n,Waiting for the End
4,\nThe Dirty Heads\n,Lay Me Down
...,...,...
995,\nTheory Of A Deadman\n,Little Smirk
996,\nMetric\n,Gold Guns Girls
997,\nTrapt\n,Sound Off
998,\nAgainst Me!\n,I Was A Teenage Anarchist


In [237]:
# Removing the '\n' from column 'artist'
artist_song1['artist'] = artist_song1['artist'].str.replace('\n','')
artist_song1

Unnamed: 0,artist,title
0,Saving Abel,The Sex Is Good
1,Disturbed,Another Way to Die
2,Muse,Resistance
3,Linkin Park,Waiting for the End
4,The Dirty Heads,Lay Me Down
...,...,...
995,Theory Of A Deadman,Little Smirk
996,Metric,Gold Guns Girls
997,Trapt,Sound Off
998,Against Me!,I Was A Teenage Anarchist
