In [1]:
import sys
import requests
import spacy
from bs4 import BeautifulSoup
import re
import pickle
import numpy as np
import pandas as pd
import json
import scipy as sp
import feather
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS 
from collections import Counter
from plotnine import *
import janitor

the 1.0 release. Instead of importing the Janitor DataFrame, please instead
`import janitor`, and use the functions directly attached to native pandas
dataframe.


In [None]:
def generate_urls(base_string, num_urls):
    """Build the paginated URLs ``base_string + page`` for pages 1..num_urls.

    Parameters
    ----------
    base_string : str
        URL prefix to which the page number is appended.
    num_urls : int
        Number of URLs to generate. Pages are numbered from 1, so the last
        URL ends in ``str(num_urls)``.

    Returns
    -------
    list of str
        Exactly ``num_urls`` URLs (an empty list when ``num_urls`` <= 0).
    """
    # range() excludes its stop value, so the stop has to be num_urls + 1 for
    # the final page to be included; range(1, num_urls) dropped the last page.
    return [base_string + str(page) for page in range(1, num_urls + 1)]

def generate_artist_album_data(url_list):
    """Collect the absolute review links found on each listing page.

    Every URL in ``url_list`` is fetched with ``requests`` and parsed with
    BeautifulSoup. Each ``<a>`` tag with class ``review__link`` contributes
    one link, built as ``'https://pitchfork.com' + href``.

    Despite the name, only the links are returned; artist and album details
    are extracted later from the individual review pages.

    Parameters
    ----------
    url_list : list of str
        Listing-page URLs to fetch, in order.

    Returns
    -------
    list of str
        Absolute review URLs in the order they were encountered.
    """
    base_link = 'https://pitchfork.com'
    link_list = []

    for counter, url in enumerate(url_list, start=1):
        print('Retrieving {}. {} of {} retrieved.'.format(url, counter, len(url_list)))
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')

        # Only the review links are part of the return value, so the artist
        # and album queries (whose results were discarded) are not run.
        for link in soup.findAll(['a'], attrs={'class': 'review__link'}):
            link_list.append(base_link + link['href'])

    return link_list

def _clean_display_text(text):
    """Replace the HTML entities handled by the scraper ('&amp;', '&quot;', '-&gt;')."""
    for entity, replacement in (('&amp;', 'and'), ('&quot;', ''), ('-&gt;', '')):
        text = text.replace(entity, replacement)
    return text


def get_album_data(urls):
    """Scrape each review page and return one row per successfully parsed review.

    For every URL the page is fetched and parsed, then:

    * the review text is taken from the first ``div`` with class
      ``contents dropcap``, falling back to ``review-detail__article-content``;
    * publication date, author, artist and album come from the JSON assigned
      to ``window.digitalData`` in the page's third ``text/javascript`` script;
    * the score comes from the JSON assigned to ``window.App``.

    A page that cannot be fetched or parsed is reported and skipped, so one
    bad review does not abort a long scrape.

    Parameters
    ----------
    urls : list of str
        Review-page URLs.

    Returns
    -------
    pandas.DataFrame
        Columns ``publication_date``, ``author``, ``artist``, ``album``,
        ``score`` (float) and ``review``; empty when nothing was parsed.
    """
    columns = ['publication_date', 'author', 'artist', 'album', 'score', 'review']
    # Rows are collected in a list and turned into a DataFrame once at the
    # end: DataFrame.append() copied the whole frame on every call and no
    # longer exists in pandas 2.x.
    records = []

    for counter, url in enumerate(urls, start=1):
        try:
            # Read in HTML from link
            page = requests.get(url)
            soup = BeautifulSoup(page.text, 'html.parser')

            review = soup.findAll(['div'], attrs={'class': ['contents dropcap']})
            if not review:
                review = soup.findAll(['div'], attrs={'class': ['review-detail__article-content']})

            scripts = soup.findAll(['script'], attrs={'type': ["text/javascript"]})
            # NOTE(review): index 2 assumes digitalData lives in the third
            # script tag; any layout change ends up in the except below.
            digital_data = scripts[2].string.split("window.digitalData=", 1)[1]
            digital_data = json.loads(digital_data)

            publication_date = pd.to_datetime(digital_data['publishDate'])
            author = digital_data['authors']
            # 'display' is split on ':' into artist (before) and album (after).
            display_parts = digital_data['display'].split(':')
            artist = _clean_display_text(display_parts[0].rstrip())
            album = _clean_display_text(display_parts[1].lstrip())

            app_state = soup(text=re.compile('window.App'))[0]
            app_state = json.loads(app_state.split("window.App=")[1].rstrip(';'))
            items = app_state['context']['dispatcher']['stores']['ReviewsStore']['items']
            first_key = next(iter(items))
            score = items[first_key]['tombstone']['albums'][0]['rating']['rating']

            print('Artist: {}, Album: {}'.format(artist, album))

            records.append({
                'publication_date': publication_date,
                'author': author,
                'artist': artist,
                'album': album,
                'score': score,
                'review': review[0].text,
            })
            print('{} of {} completed'.format(counter, len(urls)))
        except Exception as exc:
            # Exception (not a bare except) so Ctrl-C / kernel interrupt can
            # still stop a multi-hour scrape.
            print('Could not extract {}: {!r}'.format(url, exc))

    album_df = pd.DataFrame(records, columns=columns)
    album_df['score'] = album_df['score'].astype(float)

    return album_df

def tokenizeText(sample):
    """Turn a parsed document into a space-joined string of stemmed tokens.

    Steps, in order:

    1. lower-case every token and drop stop words and tokens whose
       part-of-speech is excluded;
    2. drop a fixed list of punctuation / contraction fragments;
    3. Porter-stem what is left;
    4. drop tokens that are empty or pure whitespace.

    Parameters
    ----------
    sample : iterable
        Tokens exposing ``lower_`` and ``pos_`` (e.g. a spaCy ``Doc``).

    Returns
    -------
    str
        The surviving stems joined with single spaces ('' if none survive).
    """
    # NOTE(review): '-PRON-' looks like spaCy v2's pronoun *lemma* rather than
    # a POS tag, so this entry probably never matches; it is kept so the
    # filter behaves exactly as before. Confirm whether 'PRON' was intended.
    excluded_pos = ('-PRON-', 'SYM', 'PUNCT')
    excluded_tokens = {'n’t', '"', ',', ':', '.', '/', '-', '’s', '\n', '—', '’'}
    whitespace_tokens = {'', ' ', '\n', '\n\n'}

    stemmer = PorterStemmer()

    # Membership is tested against STOP_WORDS directly instead of rebuilding
    # list(stopwords) for every token.
    tokens = [
        token.lower_
        for token in sample
        if token.lower_ not in STOP_WORDS and token.pos_ not in excluded_pos
    ]
    tokens = [stemmer.stem(word) for word in tokens if word not in excluded_tokens]

    # remove large strings of whitespace (single pass instead of repeated
    # list.remove() scans)
    tokens = [word for word in tokens if word not in whitespace_tokens]

    return ' '.join(tokens)

def create_corpus(df):
    """Tokenize every review in ``df['review']`` into a cleaned text document.

    Each review is parsed with the spaCy ``en_core_web_sm`` pipeline and
    passed through ``tokenizeText``.

    The returned list always has one entry per row of ``df``, in row order.
    A review that cannot be processed (for example a missing value) is
    reported and represented by an empty string. Skipping it instead would
    shift every later document up by one and misalign the corpus with ``df``
    when the two are combined column-wise afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``review`` column.

    Returns
    -------
    list of str
        One cleaned document per row of ``df``.
    """
    # Loaded outside the try block: a missing model should fail loudly rather
    # than turn every document into an empty string.
    nlp = spacy.load('en_core_web_sm')
    doc_list_new = []

    for position, review in enumerate(list(df['review'])):
        try:
            doc_list_new.append(tokenizeText(nlp(review)))
        except Exception as exc:
            print('Could not process review at position {}: {!r}'.format(position, exc))
            doc_list_new.append('')

    return doc_list_new

def split_corpus(corpus, num_words):
    """Return the ``num_words`` most frequent words across all documents.

    Each document is split on single spaces and the words are counted over
    the whole corpus. Words are returned most frequent first, without their
    counts.
    """
    word_freq = Counter(
        word
        for document in corpus
        for word in document.split(' ')
    )
    return [word for word, _count in word_freq.most_common(num_words)]

def get_best_new_music(num_urls):
    """Scrape artist and album names from the Best New Music listing pages.

    Pages ``1..num_urls`` of
    ``https://pitchfork.com/reviews/best/albums/?page=N`` are fetched. On each
    page the JSON assigned to ``window.App`` is read from the ninth
    ``<script>`` tag, and one row is produced per item in its ``ReviewsStore``.

    Parameters
    ----------
    num_urls : int
        Number of listing pages to fetch, starting at page 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``artist`` and ``album``, one row per listed review, with
        '&amp;' replaced by 'and' and '&quot;' / '-&gt;' removed.
    """
    entity_replacements = (('&amp;', 'and'), ('&quot;', ''), ('-&gt;', ''))
    # Rows are collected in a list and converted once: DataFrame.append()
    # copied the frame on every call and no longer exists in pandas 2.x.
    records = []

    # + 1 because range() excludes its stop value; without it the last of the
    # num_urls pages was never fetched.
    for page_number in range(1, num_urls + 1):
        url = 'https://pitchfork.com/reviews/best/albums/?page=' + str(page_number)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        scripts = soup.findAll(['script'])
        # NOTE(review): index 8 assumes window.App is in the ninth script tag.
        app_state = scripts[8].string.split("window.App=")[1].rstrip(';')
        app_state = json.loads(app_state)
        items = app_state['context']['dispatcher']['stores']['ReviewsStore']['items']

        for review_id in items:
            album_info = items[review_id]['tombstone']['albums'][0]['album']
            album = album_info['display_name']

            # The artist comes from the album's artist list. Previously the
            # album's own display name was stored as the artist whenever it
            # was non-empty. The album name remains the fallback when no
            # artist is listed.
            artists = album_info.get('artists')
            artist = artists[0]['display_name'] if artists else album

            for entity, replacement in entity_replacements:
                artist = artist.replace(entity, replacement)
                album = album.replace(entity, replacement)

            records.append({'artist': artist, 'album': album})

    return pd.DataFrame(records, columns=['artist', 'album'])

def filter_by_frequency(zipped_items, top_items):
    """Return the first element of each of the first ``top_items`` entries.

    ``zipped_items`` is walked in order. For every entry taken, its first
    element is printed, stored, and followed by a printed running count.
    Iteration stops once ``top_items`` entries have been collected.
    """
    selected = []
    for taken, entry in enumerate(zipped_items):
        if taken == top_items:
            break
        label = entry[0]
        print(label)
        selected.append(label)
        print(taken + 1)
    return selected


### Get the data if it hasn't been retrieved already
This process takes a while, so it is best run while you are at work or asleep.

In [None]:
# There are currently 59 pages of best new music.
# Data is output as a df.
best_new_music = get_best_new_music(59)
# Flag every scraped row as Best New Music. The flag must be set on
# `best_new_music` itself: `best_new` is not defined until the merge cell,
# so assigning to it here raised a NameError and the flag was never saved.
best_new_music['category'] = 1

# Output the data to a csv
best_new_music.to_csv('best_new_music.csv', index=False)

# Output the data to a more performant format, like Apache Feather
feather.write_dataframe(best_new_music, 'best_new_music.feather')

In [None]:
# Now we have to scrape all the other data.
# There are ~22,000 reviews, so it takes some time.
# Data is output as a df.
# Step 1: build the listing-page URLs; step 2: collect the review links found
# on those pages; step 3: scrape every review page into a DataFrame.
url_list = generate_urls('https://pitchfork.com/reviews/albums/?page=', 1703)
urls = generate_artist_album_data(url_list)
album_df = get_album_data(urls)

In [None]:
# Write the raw data to a csv and feather.
# index=False matches the best_new_music export and keeps the pandas row index
# out of the file; otherwise it comes back as an 'Unnamed: 0' column when the
# csv is read again.
album_df.to_csv('pitchfork_reviews.csv', index=False)
feather.write_dataframe(album_df, 'pitchfork_reviews.feather')

### Read in the data if it's already been generated.

In [None]:
# Load the cached reviews and expose the artist column as 'artist_name'.
pitchfork_data = (
    feather.read_dataframe('pitchfork_reviews.feather')
    .rename(columns={'artist': 'artist_name'})
)

#### Create a corpus of data using the reviews.
This is accomplished mainly with the `spaCy` library, with some help from `nltk` to do Porter stemming instead of the default lemmatization that spaCy does. Lemmatization tends to lead to even sparser matrices, which could be good (more data!) or bad (resource intensive for anything feature-engineering or ML oriented).

I have a set of helper functions that clean up the text data, remove symbols and stop words, and handle every other random quirk I found while doing this. This part alone took a significant part of the time.

In [None]:
# Build the list of cleaned review texts from the 'review' column.
corpus = create_corpus(pitchfork_data)

In [None]:
# Use scikit-learn to do TF-IDF vectorization. You can swap this out with simple counts
# by using CountVectorizer().
# This first fit uses the full vocabulary found in `corpus`; it is only used
# below to rank the terms.

cv = TfidfVectorizer()
matrix = cv.fit_transform(corpus)

In [None]:
# Rank every vocabulary term by its total TF-IDF weight over all reviews
# (the column sums of `matrix`), highest first, as a list of (term, weight)
# tuples. Use a helper function to retrieve the top n (here, 5000) terms.

# The rationale here is mainly that the data can get very large (20000 columns or so),
# so it's good to only take what you need.

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())

zipped_frequency_list = sorted(
    zip(feature_names, np.asarray(matrix.sum(axis=0)).ravel()),
    key=lambda x: x[1], reverse=True)

vocab_list = filter_by_frequency(zipped_frequency_list, 5000)

In [None]:
# Refit our data, but this time only with the list of top n words we chose.
# Note that `cv` and `matrix` are rebound here and replace the full-vocabulary
# versions, so the ranking cell above must not be re-run after this one
# without first re-running the original fit.
cv = TfidfVectorizer(vocabulary=vocab_list)
matrix = cv.fit_transform(corpus)

In [None]:
# Concat the review data and the TF-IDF matrix together to get a neater df.
# This step isn't necessary, but I find dfs easier to examine the data with;
# numpy arrays are more resource efficient.

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())

# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# NOTE(review): ffill() overwrites any NaN with the value from the row above.
# If `matrix` has fewer rows than `pitchfork_data`, the unmatched rows silently
# inherit the previous review's weights -- confirm this is intended.
album_df2 = pd.concat(
    [pitchfork_data, pd.DataFrame(matrix.toarray(), columns=feature_names)],
    axis=1,
).ffill()

#### Merge the Best New Music labels into the review data.

In [None]:
# 'Unnamed: 0' only exists when the reviews went through a csv that was written
# with its index. errors='ignore' lets the cell run when the data came straight
# from the feather file, where the column is absent and drop() raised a KeyError.
album_df2 = album_df2.drop('Unnamed: 0', axis=1, errors='ignore')

best_new = feather.read_dataframe('best_new_music.feather')
best_new = best_new.rename({'artist': 'artist_name'}, axis=1)

# Left join keeps every review; reviews without a Best New Music match get
# category 0.
# NOTE(review): the join key is the artist only, so every album by a matching
# artist is flagged, and an artist with several Best New Music rows duplicates
# that artist's reviews -- confirm this is intended.
album_df2 = album_df2.merge(best_new, how='left', on='artist_name')
album_df2[['category']] = album_df2[['category']].fillna(value=0)
album_df2 = album_df2.clean_names()
# 'album_y' is the album column brought in from best_new by the merge.
album_df2 = album_df2.drop('album_y', axis=1)
feather.write_dataframe(album_df2, 'pitchfork_tfidf.feather')