In [1]:
#Data storage
import numpy as np
import pandas as pd

#Scraping
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

#General
import re
import time
import random
import pickle

In [2]:
#Return the text of the first element carrying a CSS class, or a default
def _first_text(bs, css_class, default):
    """Return the text of the first element with class ``css_class``.

    Parameters
    ----------
    bs : parsed page object exposing ``find_all(class_=...)``.
    css_class : str, the CSS class to look up.
    default : value returned when no element carries that class.
    """
    matches = bs.find_all(class_ = css_class)
    return matches[0].get_text() if matches else default


#Scrape an individual album page when passed from a link
def p_review_scrape(url):
    """Scrape a single album review page into a flat dictionary.

    Parameters
    ----------
    url : str
        Absolute URL of the review page.

    Returns
    -------
    dict or str
        On success, a dictionary with the keys ``artist``, ``album``,
        ``score``, ``labels``, ``genres``, ``year``, ``reviewer``, ``title``,
        ``pub_date``, ``abstract``, ``review`` and ``art`` (always all twelve,
        in that order). Fields missing from the page fall back to
        ``'Unknown'``, ``'None'``, ``np.nan`` or ``[]`` depending on the field.
        If the page cannot be fetched, the string sentinel ``'None'`` is
        returned after a pause of 4-6 seconds.
    """

    #Try to connect; the with-block closes the connection once parsing is done
    try:
        with urlopen(url) as html:
            bs = BeautifulSoup(html, 'html.parser')

    #If the webpage cannot be connected to, then leave.
    #HTTPError and URLError are both OSError subclasses, so this also covers
    #DNS failures, refused connections and socket-level errors.
    except OSError as e:
        print(e)
        print("Cannot reach {}".format(url))
        time.sleep(4 + 2*random.random())
        return 'None'

    #Collect data from the page, filling in a placeholder if info isn't available.
    #Keys are inserted in a fixed order so every row has the same layout.
    temp_dict = dict()

    #Get artist
    temp_dict['artist'] = _first_text(bs, 'single-album-tombstone__artist-links', 'Unknown')

    #Get album
    temp_dict['album'] = _first_text(bs, 'single-album-tombstone__review-title', 'Unknown')

    #Get score
    temp_dict['score'] = _first_text(bs, 'score', np.nan)

    #Get labels (an empty list when the page lists none)
    temp_dict['labels'] = [entry.get_text() for entry in bs.find_all(class_ = 'labels-list__item')]

    #Get genres (an empty list when the page lists none)
    temp_dict['genres'] = [entry.get_text() for entry in bs.find_all(class_ = 'genre-list__item')]

    #Get year: the last whitespace-separated token of the meta-year element
    year_tokens = _first_text(bs, 'single-album-tombstone__meta-year', '').split()
    temp_dict['year'] = year_tokens[-1] if year_tokens else np.nan

    #Get author name
    temp_dict['reviewer'] = _first_text(bs, 'authors-detail__display-name', 'Unknown')

    #Get author title
    temp_dict['title'] = _first_text(bs, 'authors-detail__title', 'None')

    #Get published date
    temp_dict['pub_date'] = _first_text(bs, 'pub-date', np.nan)

    #Get abstract
    temp_dict['abstract'] = _first_text(bs, 'review-detail__abstract', 'None')

    #Get review
    temp_dict['review'] = _first_text(bs, 'review-detail__text', 'None')

    #Get artwork link
    #NOTE(review): the pattern keeps the leading 'src="' in the stored value;
    #left unchanged here because downstream code may rely on that format.
    art_matches = re.findall(r'src=.*\.jpg', str(bs.find_all(class_ = 'single-album-tombstone__art')))
    temp_dict['art'] = art_matches[0] if art_matches else 'Unknown'

    return temp_dict

In [3]:
#Smoke test: scrape one known review page and display the resulting dictionary
#(running this cell performs a live request to pitchfork.com)
test = p_review_scrape('https://pitchfork.com/reviews/albums/cut-copy-freeze-melt/')
test

{'artist': 'Cut Copy',
 'album': 'Freeze, Melt',
 'score': '6.5',
 'labels': ['Cutters', 'The Orchard'],
 'genres': ['Rock'],
 'year': '2020',
 'reviewer': 'Jesse Dorris',
 'title': 'Contributor',
 'pub_date': 'August 25 2020',
 'abstract': 'The Australian electro-pop veterans return with a climate-change album both as chilled as an upscale boutique and as politically engaged as your Instagram feed.\n',
 'review': 'Featured Tracks:Play TrackLove Is All We Share — Cut CopyVia\xa0Bandcamp\xa0/\xa0BuyCut Copy have been at it for almost 20 years, which seems impossible. They seem to exist outside of time, perhaps because they’ve been looking back from the start—immediately appealing and with a toothsome competence, but more about articulating taste than defining a singular point of view.\nMaybe bandleader Dan Whitford felt stale after 2017’s slick bummer, Haiku from Zero. Maybe the anxiety of influence had settled into ennui. For whatever reason, Whitford left his native Australia and deca

In [4]:
#Grab relevant links on a pitchfork album page
def p_url_grab(url):
    """Collect the review links found on one album-listing page.

    Parameters
    ----------
    url : str
        Absolute URL of the listing page.

    Returns
    -------
    list of str or str
        The ``href`` value of every ``<a class="review__link">`` element, in
        page order (anchors without an ``href`` are skipped). If the page
        cannot be fetched, the string sentinel ``'None'`` is returned.
    """

    #Load page; the with-block closes the connection once parsing is done
    try:
        with urlopen(url) as html:
            bs = BeautifulSoup(html, 'html.parser')

    #HTTPError and URLError are both OSError subclasses, so this also covers
    #DNS failures, refused connections and socket-level errors.
    except OSError as e:
        print(e)
        print('Error on {}'.format(url))
        return 'None'

    #Create a list of review urls
    return [
        entry.attrs['href']
        for entry in bs.find_all('a', {'class': 'review__link'})
        if 'href' in entry.attrs
    ]

In [5]:
#Smoke test: list the review links found on the first album-listing page
#(running this cell performs a live request to pitchfork.com)
test_list = p_url_grab('https://pitchfork.com/reviews/albums/?page=1')
test_list

['/reviews/albums/harry-nilsson-pussy-cats/',
 '/reviews/albums/maluma-papi-juancho/',
 '/reviews/albums/silvia-tarozzi-mi-specchio-e-rifletto/',
 '/reviews/albums/katy-perry-smile/',
 '/reviews/albums/nines-crabs-in-a-bucket/',
 '/reviews/albums/sevdaliza-shabrang/',
 '/reviews/albums/shackleton-zimpel-primal-forms/',
 '/reviews/albums/nas-kings-disease/',
 '/reviews/albums/no-joy-motherhood/',
 '/reviews/albums/dan-deacon-well-groomed-original-score/',
 '/reviews/albums/03-greedo-ron-rontheproducer-load-it-up-vol-01/',
 '/reviews/albums/fireboy-dml-apollo/']

In [6]:
#Pitchfork web crawler
def p_crawler(start_page = 1, end_page = 2):
    """Walk the album-listing pages and scrape every review they link to.

    Parameters
    ----------
    start_page : int
        First listing page to visit.
    end_page : int
        Last listing page to visit (inclusive).

    Returns
    -------
    list
        One entry per review that ``p_review_scrape`` managed to fetch, in the
        order the reviews were visited.

    A progress line (page, end page, elapsed seconds) is printed after each
    listing page, and the crawler pauses 2-3 seconds after every request.
    """

    began = time.time()

    #Site root that the relative review paths get appended to
    site_root = 'https://pitchfork.com'

    #Scraped rows accumulate here
    rows = []

    page_number = start_page
    while page_number <= end_page:

        #Fetch the review paths listed on this page, then pause for a sec
        listing_url = 'https://pitchfork.com/reviews/albums/?page={}'.format(page_number)
        review_paths = p_url_grab(listing_url)
        time.sleep(2 + 1*random.random())

        #A failed listing page is signalled by the 'None' sentinel; treat it as empty
        if review_paths == 'None':
            review_paths = ()

        for path in review_paths:

            #Build the absolute url, scrape it, keep the row unless the scrape failed
            row = p_review_scrape(site_root + path)
            if row != 'None':
                rows.append(row)

            #Pause after every review request, successful or not
            time.sleep(2 + 1*random.random())

        #Report progress for the page just finished, then move on
        print(page_number, '/', end_page, time.time() - began)
        page_number += 1

    return rows

In [7]:
#Crawl listing pages 1 through 2 and show the collected rows as a DataFrame
#(slow because of the pauses between requests; the recorded run took roughly 45 s per page)
crawl_test = p_crawler(start_page = 1, end_page = 2)
pd.DataFrame(crawl_test)

1 / 2 45.33940243721008
2 / 2 91.05144619941711


Unnamed: 0,artist,album,score,labels,genres,year,reviewer,title,pub_date,abstract,review,art
0,Harry Nilsson,Pussy Cats,7.8,[RCA Victor],[Rock],1974,Jayson Greene,Contributing Editor,16 hrs ago,"Each Sunday, Pitchfork takes an in-depth look ...","If people know of Pussy Cats, they know at lea...","src=""https://media.pitchfork.com/photos/5f4680..."
1,Maluma,Papi Juancho,6.6,[Sony Music Latin],"[Global, Pop/R&B, Rap]",2020,Matthew Ismael Ruiz,"Associate Staff Writer, News",August 29 2020,"The Colombian pop star’s latest is low stakes,...",Can you really be macho if your best work come...,"src=""https://media.pitchfork.com/photos/5f4524..."
2,Silvia Tarozzi,Mi specchio e rifletto,7.8,[Unseen Worlds],"[Experimental, Folk/Country]",2020,Allison Hussey,"Associate Staff Writer, News",August 29 2020,"Inspired by poet Alda Merini, the Italian viol...",Featured Tracks:Play Track“La forza del canto”...,"src=""https://media.pitchfork.com/photos/5f4534..."
3,Katy Perry,Smile,5.7,[Capitol],[Pop/R&B],2020,Dani Blum,Contributor,August 28 2020,"Katy Perry’s bubbly, cliché-ridden pop feels e...","Between rows of Spandex at Forever 21, in the ...","src=""https://media.pitchfork.com/photos/5f4538..."
4,Nines,Crabs in a Bucket,7.0,[Warner],[Rap],2020,Will Pritchard,Contributor,August 28 2020,"On the UK road rapper’s heavy third album, Nin...","On Christmas Eve 2011, UK road rapper Nines up...","src=""https://media.pitchfork.com/photos/5f4695..."
5,Sevdaliza,Shabrang,7.8,[Twisted Elegance],[Pop/R&B],2020,Colin Lodewick,Contributor,August 28 2020,"On her sophomore album, the singer examines go...",Sevdaliza approaches her sophomore album Shabr...,"src=""https://media.pitchfork.com/photos/5f4674..."
6,ShackletonZimpel,Primal Forms,7.7,[Cosmo Rhythmatic],"[Electronic, Experimental]",2020,Ray Philp,,August 28 2020,The Polish clarinetist Wacław Zimpel blows awa...,Featured Tracks:Play TrackPrimal Forms — Shack...,"src=""https://media.pitchfork.com/photos/5f3168..."
7,Nas,King’s Disease,6.3,[Mass Appeal],[Rap],2020,Pete Tosiello,Contributor,August 27 2020,The Queensbridge legend’s 13th album marks a r...,"By all measures, the rollout of 2018’s NASIR, ...","src=""https://media.pitchfork.com/photos/5f451c..."
8,No Joy,Motherhood,8.0,[Joyful Noise],[Rock],2020,Evan Rytlewski,Contributor,August 27 2020,"On their fearlessly creative, beat-heavy new r...",Perhaps it was inevitable that shoegaze would ...,"src=""https://media.pitchfork.com/photos/5f43dd..."
9,Dan Deacon,Well Groomed (Original Score),7.4,[Domino],[Electronic],2020,Marty Sartini Garner,Contributor,August 27 2020,The rainbow-colored pups of Rebecca Stern’s do...,If you were making a documentary about people ...,"src=""https://media.pitchfork.com/photos/5f451a..."
