This is a web-scraping utility to fetch and process [Pitchfork](https://pitchfork.com/) reviews into a custom dataset. Created for use in NLP applications such as the chatbot demonstrated in this PyTorch [tutorial](https://pytorch.org/tutorials/beginner/chatbot_tutorial.html). Credit to Rishav Agarwal's [article](https://towardsdatascience.com/byod-build-your-own-dataset-for-free-67133840dc85) for helpful tips on website parsing.

In [12]:
import requests
from bs4 import BeautifulSoup
import os
# import csv
# import pandas as pd

In [142]:
'''
Data needed: url, album, artist, score, text, reviewer

Parse sitemap generated via https://www.xml-sitemaps.com/ on root https://pitchfork.com/reviews/albums/
'''

def review_fetch(url):
    review = {}
    
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html5lib')
    
    review['url'] = url
    review['genre'] = [x.find(text=True) for x in soup.find_all('a', attrs={'genre-list__link'})]
    review['author'] = soup.find('a', attrs={'authors-detail__display-name'}).find(text=True)
    review['score'] = float(soup.find('span', attrs={'score'}).find(text=True))
    review['content'] = ' '.join([x.find(text=True) for x in soup.find_all('p')])
    artist, album = (''.join(soup.find('title').find(text=True))).split(' Album ')[0].split(':')
    review['artist'] = artist.strip(' ')
    review['album'] = album.strip(' ')

    return review


sitemap = 'sitemap.xml'

with open(sitemap, 'r') as file:
    content = file.readlines()
    content = ''.join(content)
    soup = BeautifulSoup(content, 'lxml')

    urls = []
    for url in soup.find_all('loc'):
        urls.append(''.join(url.find_all(text=True)))
    
    urls.pop() # remove root itself

In [144]:
# url = 'https://pitchfork.com/reviews/albums/the-postal-service-everything-will-change/'

# page = requests.get(url)
# soup = BeautifulSoup(page.text, 'html5lib')

# # print(soup.prettify())

# score = float(soup.find('span', attrs={'score'}).find(text=True))
# text = ' '.join([p.find(text=True) for p in soup.find_all('p')])

# artist, album = (''.join(soup.find('title').find(text=True))).split(' Album ')[0].split(':')
# artist, album = artist.strip(' '), album.strip(' ')

# genre = [gen.find(text=True) for gen in soup.find_all('a', attrs={'genre-list__link'})]
# author = soup.find('a', attrs={'authors-detail__display-name'}).find(text=True)

# print(album)
# print(artist)
# print(score)
# print(text)
# print(genre)
# print(author)

url = 'https://pitchfork.com/reviews/albums/the-postal-service-everything-will-change/'

review = review_fetch(url)

print(review)

{'url': 'https://pitchfork.com/reviews/albums/the-postal-service-everything-will-change/', 'genre': ['Electronic', 'Rock'], 'author': 'Marc Hogan', 'score': 7.8, 'content': 'On this live album, based from a 2014 concert film, you can hear the Postal Service transform from an idea to a band. They might  When Sub Pop  Everything Will Change The Postal Service’s most joyful song, “Such Great Heights,” is in a sense about the impossibility of capturing the moment, and then trying through technology to defy those limits. Gibbard attempts “to leave this all on your machine” and ends up being unsatisfied with the low fidelity; at the same time, computer wizardry allows Gibbard to sing the song’s breathlessly overlapping verses without pausing for air. Live shows, like live albums, also rely on technical sleight-of-hand, but the presumed presence of an audience, gathered in real time, is transformative. When the Postal Service do “Such Great Heights” on  What  These accomplished performances g