In [34]:
import time
import grequests
import multiprocessing
import itertools

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
from bs4 import element
from tqdm import tqdm

In [35]:
# CSV path for the review table.
csv_fname = "data/p4k_reviews.csv"

# Column layout of the review table; the frame is created with no rows.
cols = [
    "artist",
    "album",
    "genre",
    "review",
    "lyrics",
    "date",
    "score",
]
review_df = pd.DataFrame(columns=cols)

In [36]:
# Site root; the longer URLs below are built from it so the host is spelled once.
p4k_base = "https://pitchfork.com"
# Album-review listing endpoint.
review_base = f"{p4k_base}/reviews/albums/"
# Listing-page template; fill the `{}` placeholder with a page number via .format().
page_base = review_base + "?page={}"

In [37]:
# Highest listing-page number to sample from, and how many pages to sample.
max_pages = 1813
desired_pages = 5

# Draw distinct page numbers from 1..max_pages inclusive.
# np.random.randint(1, max_pages) excludes its upper bound (the last page could
# never be picked) and samples with replacement (the same page could be fetched
# twice, duplicating its reviews). choice(..., replace=False) avoids both.
# size is capped so asking for more pages than exist does not raise.
page_nums = np.random.choice(
    np.arange(1, max_pages + 1),
    size=min(desired_pages, max_pages),
    replace=False,
).tolist()
page_urls = [page_base.format(page_num) for page_num in page_nums]

In [38]:
def g_scrape_review(review_response):
    """Parse one review-page response into a flat record.

    The extracted fields are returned instead of being dropped, so the caller
    can collect the records (this also works when the function runs in a
    ``multiprocessing`` worker, where mutating a module-level frame would not
    reach the parent process).

    Parameters
    ----------
    review_response
        Response object for a single review page (its ``.content`` is parsed
        as HTML), or ``None`` for a request that failed.

    Returns
    -------
    dict or None
        Record with the keys ``artist``, ``album``, ``genre``, ``review``,
        ``date`` and ``score``. Text fields are stripped strings, ``review`` is
        the paragraph texts joined by newlines, and any element that is not
        found yields ``None`` (or an empty string for ``review``). No
        ``lyrics`` key is produced because nothing on the page is parsed for
        it. Returns ``None`` when the response is ``None`` or the page has no
        ``single-album-tombstone`` block.
    """
    # grequests.map() puts None in its result list for requests that failed.
    if review_response is None:
        return None

    review_soup = BeautifulSoup(review_response.content, "html.parser")

    tombstone = review_soup.find("div", {"class": "single-album-tombstone"})
    if tombstone is None:
        # Not a review page in the expected layout; nothing to extract.
        return None

    def _text(node):
        # Stripped text of a tag, tolerating elements that were not found.
        return node.get_text(strip=True) if node is not None else None

    artist_item = tombstone.find("li")
    artist = artist_item.find("a") if artist_item is not None else None
    album = tombstone.find("h1", {"class": "single-album-tombstone__review-title"})
    score = review_soup.find("span", {"class": "score"})
    genre = review_soup.find("a", {"class": "genre-list__link"})

    time_tag = review_soup.find("time", {"class": "pub-date"})
    publish_date = time_tag.get("datetime") if time_tag is not None else None

    body = review_soup.find("div", {"class": "contents dropcap"})
    review_p = body.find_all("p") if body is not None else []

    return {
        "artist": _text(artist),
        "album": _text(album),
        "genre": _text(genre),
        "review": "\n".join(p.get_text() for p in review_p),
        "date": publish_date,
        "score": _text(score),
    }

In [39]:
def g_get_review_links(page_response):
    """Collect absolute review URLs from one listing-page response.

    Parameters
    ----------
    page_response
        Response object for a reviews listing page (its ``.content`` is parsed
        as HTML), or ``None`` for a request that failed.

    Returns
    -------
    list of str
        ``p4k_base`` concatenated with the ``href`` of the ``a.review__link``
        found inside each ``div.review``. Empty when the response is ``None``
        or no links are found.
    """
    # grequests.map() puts None in its result list for requests that failed;
    # treat those as pages with no links instead of raising AttributeError.
    if page_response is None:
        return []

    page_soup = BeautifulSoup(page_response.content, "html.parser")

    review_links = []
    for review_div in page_soup.find_all("div", {"class": "review"}):
        link = review_div.find("a", {"class": "review__link"})
        # Skip review blocks without a usable link rather than failing the whole page.
        if link is not None and link.get("href"):
            review_links.append(p4k_base + link["href"])

    return review_links

In [40]:
# Lazily build one unsent GET per listing page, then send them as a batch.
page_requests = map(grequests.get, page_urls)
page_responses = grequests.map(page_requests)

**Single Process** Get Review Links **GRequests**

In [41]:
%%time
# No multi
# Gather the links from every page into one flat list. Assigning inside a plain
# loop would overwrite the previous result and keep only the last page's links.
review_links = [
    link
    for page_response in page_responses
    for link in g_get_review_links(page_response)
]

CPU times: user 7.05 s, sys: 23.6 ms, total: 7.07 s
Wall time: 7.62 s


**Multi-Process** Get Review Links **GRequests**

In [42]:
%%time
# Multi
# Parse each listing page in a worker process, then flatten the per-page
# lists into a single list of links.
with multiprocessing.Pool() as pool:
    review_links = pool.map(g_get_review_links, page_responses)
review_links = [link for page_links in review_links for link in page_links]

CPU times: user 8.85 ms, sys: 11.4 ms, total: 20.3 ms
Wall time: 7.83 s


In [43]:
# Report how many review links are currently held in `review_links`.
print(f"Found {len(review_links)} links for reviews")

Found 60 links for reviews


In [44]:
# Lazily build one unsent GET per review link, then send them as a batch.
review_requests = map(grequests.get, review_links)
review_responses = grequests.map(review_requests)

**Single Process** Scrape Reviews **GRequests**

In [45]:
%%time
# No multi
# Sequential baseline: parse every fetched review in the notebook process.
for response in review_responses:
    g_scrape_review(response)

CPU times: user 1min 16s, sys: 323 ms, total: 1min 17s
Wall time: 1min 33s


**Multi-Process** Scrape Reviews **GRequests**


In [46]:
%%time
# Multi
# Same scraping pass, fanned out over a pool of worker processes.
with multiprocessing.Pool() as pool:
    pool.map(g_scrape_review, review_responses)

CPU times: user 139 ms, sys: 27.4 ms, total: 166 ms
Wall time: 1min 37s


In [47]:
# Export left disabled. NOTE(review): none of the visible cells add rows to
# review_df, so enabling this would write a header-only CSV — confirm before use.
#review_df.to_csv(csv_fname)