This is a web-scraping utility that fetches [Pitchfork](https://pitchfork.com/) reviews and processes them into a custom dataset. It was created for use in NLP applications such as the chatbot demonstrated in this PyTorch [tutorial](https://pytorch.org/tutorials/beginner/chatbot_tutorial.html). Credit to Rishav Agarwal's [article](https://towardsdatascience.com/byod-build-your-own-dataset-for-free-67133840dc85) for helpful tips on website parsing.

In [35]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import time
import random

In [36]:
'''
Data needed: url, album, artist, score, content, reviewer, bool genres
'''

def album_review(url, genre_names=None, timeout=30):
    '''
    Scrape one Pitchfork album review page into a flat dictionary.

    Parameters
    ----------
    url : str
        Address of the review page to download.
    genre_names : iterable of str, optional
        Genre keys to initialise to False. When omitted, the notebook-level
        `genres` list is used (it must exist by the time this is called).
    timeout : float, optional
        Seconds to wait on the server before `requests` raises instead of
        blocking indefinitely.

    Returns
    -------
    dict
        Keys 'Artist', 'Album', 'Score', 'Content', 'URL', 'Author', plus one
        boolean entry per genre. 'Author' is None when the page has no
        author link.

    Raises
    ------
    requests.exceptions.RequestException
        If the download fails or times out.
    AttributeError, ValueError
        If the page has no <title>, or the title text does not contain the
        'Artist: Album' separator.
    '''
    if genre_names is None:
        genre_names = genres   # fall back to the list built from the sitemap

    review = {}

    page = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(page.text, 'html5lib')

    # Title text is split on ' Album ' and then on the first ':' into artist / album
    artist, album = soup.find('title').find(text=True).split(' Album ')[0].split(':', 1)
    review['Artist'] = artist.strip(' ')
    review['Album'] = album.strip(' ')

    try:
        review['Score'] = float(soup.find('span', attrs={'score'}).find(text=True))
    except (AttributeError, TypeError, ValueError):
        # AttributeError: no score <span>; TypeError: span has no text;
        # ValueError: text is not a number. Anything else should propagate.
        print('Note: {} does not have a score associated, reverting to mean'.format(url))
        review['Score'] = 7.0   # average pitchfork album score

    # NOTE(review): only the FIRST text node of each <p> is kept, so text that
    # follows an inline tag (link, emphasis, ...) inside a paragraph is dropped.
    # Consider p.get_text() if full paragraphs are wanted.
    first_texts = (paragraph.find(text=True) for paragraph in soup.find_all('p'))
    review['Content'] = ' '.join(text for text in first_texts if text)  # filter out None / empty values

    # str(...) detaches values from the parse tree: a NavigableString keeps a
    # reference to its whole document, which would otherwise stay in memory
    # for every scraped page.
    review['URL'] = str(url)

    author_tag = soup.find('a', attrs={'authors-detail__display-name'})
    author_text = author_tag.find(text=True) if author_tag is not None else None
    review['Author'] = str(author_text) if author_text is not None else None

    for genre in genre_names:
        review[genre] = False

    for genre_link in soup.find_all('a', attrs={'genre-list__link'}):
        label = genre_link.find(text=True)
        if label is not None:   # skip empty links instead of creating a None key
            review[str(label)] = True

    return review


def review_df(review_list):
    '''
    Build a pandas DataFrame from a list of review dictionaries.

    Each dictionary becomes one row; dictionary keys become the columns.
    '''
    return pd.DataFrame.from_dict(review_list, orient='columns')


def ld_to_dl(LD):
    '''
    Convert a list of dictionaries to a dictionary of lists ~ not used

    The keys are taken from the first dictionary; every dictionary in LD must
    contain those keys (a missing key raises KeyError). An empty list yields
    an empty dictionary.
    '''
    if not LD:
        # Nothing to transpose; avoids IndexError on LD[0]
        return {}
    return {key: [dic[key] for dic in LD] for key in LD[0]}



In [37]:
'''
Parse sitemap generated via https://www.xml-sitemaps.com/ on root https://pitchfork.com/reviews/albums/
'''

sitemap = 'album_sitemap.xml'
# sitemap = 'track_sitemap.xml'

GENRE_QUERY = '?genre='   # query string that marks a genre listing page

# Read the whole file, then release the handle before parsing.
# NOTE(review): assumes the sitemap file is UTF-8 encoded — confirm for your export.
with open(sitemap, 'r', encoding='utf-8') as sitemap_file:
    content = sitemap_file.read()

soup = BeautifulSoup(content, 'lxml')

urls = []
genres = []
for loc in soup.find_all('loc'):
    loc_text = loc.find(text=True)
    if loc_text is None:            # empty <loc> element: nothing to record
        continue
    url = str(loc_text)             # plain str, detached from the parse tree
    if GENRE_QUERY in url:          # extract genre list
        genres.append(url.split(GENRE_QUERY)[1].capitalize())
    elif '?' not in url:
        urls.append(url)
    # any other query-string URL is neither a genre nor a review page: skipped

urls.pop(0)  # remove root (https://pitchfork.com/reviews/albums/)

genres.extend(['Folk/Country','Pop/R&B'])   # not present as distinct URL (rarely used)

print('Sample URL:')
print('\t' + urls[0])
print('Genres:')
for genre in genres:
    print('\t' + genre)

Sample URL:
	https://pitchfork.com/reviews/albums/boldy-james-real-bad-man-real-bad-boldy/
Genres:
	Rap
	Rock
	Folk
	Metal
	Jazz
	Pop
	Electronic
	Experimental
	Folk/Country
	Pop/R&B


In [38]:
'''
Parse album reviews and save to csv
'''

reviews = []
start = time.time()
prev = start          # timestamp of the previous progress report
n_albums = len(urls)

print('Dragging a Pitchfork through {} reviews...'.format(n_albums))

for i, url in enumerate(urls, start=1):
    time.sleep(random.random() / 5) # uniform random delay to prevent deny-list 
    review = album_review(url)
    reviews.append(review)

    # Report only AFTER the i-th review has been fetched, so the printed count
    # matches the number of reviews actually parsed. The time shown is the
    # duration of the last batch of 20, not the total since the start.
    if i % 20 == 0:
        print('{}/{} reviews parsed \t>>>\ttime elapsed: {:.4f}'.format(i, n_albums, time.time()-prev))
        prev = time.time()

print('Web scraping complete. {} reviews parsed in {:.4f} seconds'.format(n_albums, time.time()-start))

# One row per review dict; columns come from the dict keys
df = pd.DataFrame(reviews)
df.head()

Dragging a Pitchfork through 310 reviews...
20/310 reviews parsed 	>>>	time elapsed: 6.9443
40/310 reviews parsed 	>>>	time elapsed: 8.9177
60/310 reviews parsed 	>>>	time elapsed: 11.4629
80/310 reviews parsed 	>>>	time elapsed: 10.1037
100/310 reviews parsed 	>>>	time elapsed: 11.6277
120/310 reviews parsed 	>>>	time elapsed: 13.8774
140/310 reviews parsed 	>>>	time elapsed: 14.1374
Note: https://pitchfork.com/reviews/albums/5911-the-complete-studio-recordings/ does not have a score associated, reverting to mean
160/310 reviews parsed 	>>>	time elapsed: 13.0076
180/310 reviews parsed 	>>>	time elapsed: 11.3089
200/310 reviews parsed 	>>>	time elapsed: 11.7394
220/310 reviews parsed 	>>>	time elapsed: 13.4727
240/310 reviews parsed 	>>>	time elapsed: 12.8516
260/310 reviews parsed 	>>>	time elapsed: 11.0172
280/310 reviews parsed 	>>>	time elapsed: 11.8285
300/310 reviews parsed 	>>>	time elapsed: 11.4433
Web scraping complete. 310 reviews parsed in 180.3166 seconds


Unnamed: 0,Artist,Album,Score,Content,URL,Author,Rap,Rock,Folk,Metal,Jazz,Pop,Electronic,Experimental,Folk/Country,Pop/R&B
0,Boldy James / Real Bad Man,Real Bad Boldy,7.6,The Detroit rapper’s fourth album of 2020 is y...,https://pitchfork.com/reviews/albums/boldy-jam...,Pete Tosiello,True,False,False,False,False,False,False,False,False,False
1,I Love Your Lifestyle,No Driver,7.7,"They’re still cooped up and bummed out, but th...",https://pitchfork.com/reviews/albums/i-love-yo...,Ian Cohen,False,True,False,False,False,False,False,False,False,False
2,Nathan Salsburg,Landwerk/Landwerk No. 2,7.5,1 / 2 Albums Inspired by The Caretaker’s corro...,https://pitchfork.com/reviews/albums/nathan-sa...,Grayson Haver Currin,False,False,False,False,False,False,False,False,True,False
3,Joan of Arc,Tim Melina Theo Bobby,7.4,After more than 20 albums in the past two deca...,https://pitchfork.com/reviews/albums/joan-of-a...,Anna Gaca,False,True,False,True,False,False,False,False,False,False
4,Roland Haynes,Second Wave,7.4,"Long a collectors’ holy grail, this 1975 sessi...",https://pitchfork.com/reviews/albums/roland-ha...,Marty Sartini Garner,False,False,False,False,True,False,False,False,False,False


In [39]:
'''
Erroneous value count (unfilled)
'''
# Number of missing entries in each column of the scraped frame
df.isnull().sum(axis=0)

Artist          0
Album           0
Score           0
Content         0
URL             0
Author          0
Rap             0
Rock            0
Folk            0
Metal           0
Jazz            0
Pop             0
Electronic      0
Experimental    0
Folk/Country    0
Pop/R&B         0
dtype: int64

In [40]:
'''
Save reviews as csv
'''
# Write the scraped reviews to disk, as the cell label says; index=False keeps
# the positional row index out of the file.
# NOTE(review): the output filename is a placeholder choice — adjust as needed.
OUTPUT_CSV = 'pitchfork_reviews.csv'
df.to_csv(OUTPUT_CSV, index=False)
df.head()

Unnamed: 0,Artist,Album,Score,Content,URL,Author,Rap,Rock,Folk,Metal,Jazz,Pop,Electronic,Experimental,Folk/Country,Pop/R&B
0,Boldy James / Real Bad Man,Real Bad Boldy,7.6,The Detroit rapper’s fourth album of 2020 is y...,https://pitchfork.com/reviews/albums/boldy-jam...,Pete Tosiello,True,False,False,False,False,False,False,False,False,False
1,I Love Your Lifestyle,No Driver,7.7,"They’re still cooped up and bummed out, but th...",https://pitchfork.com/reviews/albums/i-love-yo...,Ian Cohen,False,True,False,False,False,False,False,False,False,False
2,Nathan Salsburg,Landwerk/Landwerk No. 2,7.5,1 / 2 Albums Inspired by The Caretaker’s corro...,https://pitchfork.com/reviews/albums/nathan-sa...,Grayson Haver Currin,False,False,False,False,False,False,False,False,True,False
3,Joan of Arc,Tim Melina Theo Bobby,7.4,After more than 20 albums in the past two deca...,https://pitchfork.com/reviews/albums/joan-of-a...,Anna Gaca,False,True,False,True,False,False,False,False,False,False
4,Roland Haynes,Second Wave,7.4,"Long a collectors’ holy grail, this 1975 sessi...",https://pitchfork.com/reviews/albums/roland-ha...,Marty Sartini Garner,False,False,False,False,True,False,False,False,False,False
