# WEB SCRAPING LAB

# Activity 1: MVP

## Importing libraries

In [60]:
from bs4 import BeautifulSoup
import requests # Allows us to access information on any webpage
import pandas as pd

## Storing and reading webpage

In [61]:
# Using the iTunes Top 100 Songs Chart

url = "https://www.popvortex.com/music/charts/top-100-songs.php"

In [62]:
# Checking the url response

response = requests.get(url)
response.status_code # 200 status code means OK!

200

In [63]:
soup = BeautifulSoup(response.content, "html.parser")

In [8]:
#print(soup.prettify())

## Extracting information

In [64]:
# Getting the artist list
artist = []

for i in soup.select('.artist'):
    artist.append(i.get_text())

In [65]:
# Getting the song list
title = []

for i in soup.select('.title'):
    title.append(i.get_text())

In [66]:
# Merging both lists onto a DataFrame
artist_song = pd.DataFrame({"artist":artist,
                       "title":title
                      })
artist_song

Unnamed: 0,artist,title
0,Sam Smith & Kim Petras,Unholy
1,Transformation Worship,Eagle (feat. KB)
2,David Guetta & Bebe Rexha,I'm Good (Blue)
3,Fleetwood Mac,Everywhere
4,HARDY & Lainey Wilson,wait in the truck
...,...,...
95,Auli'i Cravalho,How Far I'll Go
96,"Bette Midler, Sarah Jessica Parker & Kathy Najimy",One Way or Another (Hocus Pocus 2 Version)
97,Blasterjaxx & Timmy Trumpet,Narco
98,Thomas Rhett,Half Of Me (feat. Riley Green)


# Activity 2: expanding the project

Pulling out all rock songs between 2010 and 2019 from playback.fm.

In [67]:
url1 = 'https://playback.fm/charts/rock/2010'

In [68]:
# Checking the url response

response = requests.get(url1)
response.status_code # 200 status code means OK!

200

In [70]:
soup1 = BeautifulSoup(response.content, "html.parser")

In [71]:
#print(soup1.prettify())

In [72]:
iterations = range(2010, 2020, 1)
#[i for i in iterations]
for i in iterations:
    start_at= str(i)
    url1 = 'https://playback.fm/charts/rock/' + start_at
    print(url1)

https://playback.fm/charts/rock/2010
https://playback.fm/charts/rock/2011
https://playback.fm/charts/rock/2012
https://playback.fm/charts/rock/2013
https://playback.fm/charts/rock/2014
https://playback.fm/charts/rock/2015
https://playback.fm/charts/rock/2016
https://playback.fm/charts/rock/2017
https://playback.fm/charts/rock/2018
https://playback.fm/charts/rock/2019


In [73]:
import random
from time import sleep
from random import randint

In [74]:
pages = []

for i in iterations:
    # assemble the url:
    start_at= str(i)
    url1 = 'https://playback.fm/charts/rock/' + start_at

    # download html with a get request:
    response = requests.get(url1)
    #response = requests.get(url, headers = {"Accept-Language": "en-US"})

    # monitor the process by printing the status code
    print("Status code: " + str(response.status_code))

    # store response into "pages" list
    pages.append(response)

    # respectful nap:
    wait_time = randint(1,4)
    print("I will sleep for " + str(wait_time) + " second/s.")
    sleep(wait_time)

Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 3 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 4 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 2 second/s.
Status code: 200
I will sleep for 1 second/s.
Status code: 200
I will sleep for 4 second/s.


In [75]:
artist1 = []

for i in range(len(pages)):
    parsed = BeautifulSoup(pages[i].content, "html.parser")
    artist_html = parsed.select('td:nth-child(2) > a')
    for i in soup1.select('td:nth-child(2) > a'):
        artist1.append(i.get_text())

In [76]:
title1 = []

for i in range(len(pages)):
    parsed = BeautifulSoup(pages[i].content, "html.parser")
    title_html = parsed.select('td.mobile-hide > a > span.song')
    for i in soup1.select('td.mobile-hide > a > span.song'):
        title1.append(i.get_text())

In [77]:
# Merging both lists onto a DataFrame
artist_song1 = pd.DataFrame({"artist":artist1,
                       "title":title1
                      })
artist_song1

Unnamed: 0,artist,title
0,\nSaving Abel\n,The Sex Is Good
1,\nDisturbed\n,Another Way to Die
2,\nMuse\n,Resistance
3,\nLinkin Park\n,Waiting for the End
4,\nThe Dirty Heads\n,Lay Me Down
...,...,...
995,\nTheory Of A Deadman\n,Little Smirk
996,\nMetric\n,Gold Guns Girls
997,\nTrapt\n,Sound Off
998,\nAgainst Me!\n,I Was A Teenage Anarchist


In [78]:
# Removing the '\n' from column 'artist'
artist_song1['artist'] = artist_song1['artist'].str.replace('\n','')
artist_song1

Unnamed: 0,artist,title
0,Saving Abel,The Sex Is Good
1,Disturbed,Another Way to Die
2,Muse,Resistance
3,Linkin Park,Waiting for the End
4,The Dirty Heads,Lay Me Down
...,...,...
995,Theory Of A Deadman,Little Smirk
996,Metric,Gold Guns Girls
997,Trapt,Sound Off
998,Against Me!,I Was A Teenage Anarchist


In [79]:
# Concatenating both lists
artist_song = pd.concat([artist_song, artist_song1], axis = 0)
artist_song.shape

(1100, 2)

In [82]:
import random
var = input("Enter the name of a song:")
random_name = random.choice(artist_song['title'])

if var in artist_song['title'].values:
    print("Here's a good song you might also like!", random_name)
else:
    print("Sorry, we don't have any suggestions!")

Enter the name of a song:I Was A Teenage Anarchist
Here's a good song you might also like! Maybe
