In [1]:
import sys
import requests
import spacy
from bs4 import BeautifulSoup
import re
import pickle
import numpy as np
import pandas as pd
import json
import scipy as sp
import feather
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS 
from collections import Counter
from plotnine import *
import janitor
import pickle
   

the 1.0 release. Instead of importing the Janitor DataFrame, please instead
`import janitor`, and use the functions directly attached to native pandas
dataframe.


In [2]:
def generate_urls(base_string, num_urls):
    string_list = []
    for i in range(1,num_urls):
        new_string = base_string + str(i)
        string_list.append(new_string)
    return string_list

def generate_artist_album_data(url_list):

    artist_list = []
    album_list = []
    link_list = []
    author_list = []
    score_list = []
    text_list = []
    pub_date = []
    
    counter = 1
    for url in url_list:
        print('Retrieving {}. {} of {} retrieved.'.format(url,counter,len(url_list)))
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        artist_info = soup.findAll(['a', 'ul', 'h2'], attrs={'class': ['artist-list review__title-artist']})
        album_info = soup.findAll(['a', 'ul', 'h2'], attrs={'class': 'review__title-album' })
        link_info = soup.findAll(['a'], attrs={'class': 'review__link'})

        for artist in artist_info:
            artist_list.append(artist.text)


        for album in album_info:
            album_list.append(album.text)

        for link in link_info:
            base_link = 'https://pitchfork.com'
            link_list.append(base_link + link['href'])
        counter += 1
    
    return link_list

def get_album_data(urls):
    
    album_df = pd.DataFrame({'publication_date': [], 'author': [], 'artist':[], 'album': [], 'score':[], 'review': []})
    
    counter = 1
    for url in urls:
        # Read in HTML from link
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        
        if soup.findAll(['div'], attrs={'class': ['contents dropcap']}):
            review = soup.findAll(['div'], attrs={'class': ['contents dropcap']})

        else:
            review = soup.findAll(['div'], attrs={'class': ['review-detail__article-content']})
        try:
            x=soup.findAll(['script'], attrs={'type': ["text/javascript"]})
            x = x[2].string
            data = x.split("window.digitalData=", 1)[1]
            data = json.loads(data)

            publication_date = pd.to_datetime(data['publishDate'])
            author = data['authors']
            artist = data['display'].split(':')[0].rstrip()
            artist = artist.replace('&amp;', 'and')
            artist = artist.replace('&quot;', '')
            artist = artist.replace('-&gt;', '')
            album = data['display'].split(':')[1].lstrip()
            album = album.replace('&amp;', 'and')
            album = album.replace('&quot;', '')
            album = album.replace('-&gt;', '')

            score = soup(text=re.compile('window.App'))[0]
            score = score.split("window.App=")[1].rstrip(';')
            score = json.loads(score)
            score=score['context']['dispatcher']['stores']['ReviewsStore']['items']
            key = [i for i in score][0]
            score = score[key]['tombstone']['albums'][0]['rating']['rating']

            print('Artist: {}, Album: {}'.format(artist,album))

            df_to_append = pd.DataFrame({'publication_date':[ publication_date], 'author': [author], 'artist':[artist], 'album': [album], 'score':[score], 'review': [review[0].text]})

            album_df = album_df.append(df_to_append, ignore_index=True)
            print('{} of {} completed'.format(counter,len(urls)))
            counter += 1
        except:
            print('Could not extract {}'.format(url))
            counter +=1
    
    album_df['score'] = album_df['score'].astype(float)
    
    return album_df

def tokenizeText(sample):
    stopwords = list(STOP_WORDS)

    # lemmatize
    #tokens = [i.lemma_ for i in sample]
    #tokens = [i for i in tokens if i not in stopwords]
    #tokens = [i for i in tokens if i != '-PRON-']
    #tokens = [i for i in tokens if i.pos_ != 'SYM']
    #tokens = [i for i in tokens if i.pos_ != 'PUNCT']
    
    stemmer=PorterStemmer()
    tokens = [i.lower_ for i in sample if i.lower_ not in list(stopwords) and i.pos_ != '-PRON-' and i.pos_ != 'SYM' and i.pos_ != "PUNCT"]
    tokens = [i for i in tokens if i not in ['n’t','"', ',', ',', ':', '.', '/', '-', '’s', '\n', '—', '’', '’s']]
    tokens = [stemmer.stem(i) for i in tokens]
    #tokens = [i for i in freq_list if i in tokens]
    
    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")
    
    doc = ' '.join(tokens)

    return doc

def create_corpus(df):
    nlp = spacy.load('en_core_web_sm')
    doc_list = list(df['review'])
    doc_list_new = []
    
    for doc in doc_list:
        try:
            doc = nlp(doc)
            doc = tokenizeText(doc)
            doc_list_new.append(doc)
        except:
            pass
    
    return doc_list_new

def split_corpus(corpus, num_words):
    word_list = []
    for doc in corpus:
        doc = doc.split(' ')
        for word in doc:
            word_list.append(word)
    word_freq = Counter(word_list)
    common_words = word_freq.most_common(num_words)
    
    most_freq_list = []
    for i in common_words:
        most_freq_list.append(i[0])
    
    return most_freq_list

def get_best_new_music(num_urls):
    album_df = pd.DataFrame({'artist':[], 'album': []})

    for i in range(1,num_urls): 
        url = 'https://pitchfork.com/reviews/best/albums/?page=' + str(i)
        print(url)
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'html.parser')
        soup = soup.findAll(['script'])
        soup=soup[8].string.split("window.App=")[1]
        soup = soup.rstrip(';')
        soup = json.loads(soup)
        data = soup['context']['dispatcher']['stores']['ReviewsStore']['items']
        id_list = list(data)
        for i in id_list:
            #print(i)
            if data[i]['tombstone']['albums'][0]['album']['display_name']:
                artist = data[i]['tombstone']['albums'][0]['album']['display_name']
            else:
                artist = data[i]['tombstone']['albums'][0]['album']['artists'][0]['display_name']
            
            artist = artist.replace('&amp;', 'and')
            artist = artist.replace('&quot;', '')
            artist = artist.replace('-&gt;', '')
            
            album = data[i]['tombstone']['albums'][0]['album']['display_name']
            album = album.replace('&amp;', 'and')
            album = album.replace('&quot;', '')
            album = album.replace('-&gt;', '')
            
            df_to_append = pd.DataFrame({'artist':[artist], 'album': [album]})
            album_df = album_df.append(df_to_append,ignore_index=True)
            
    return album_df

def filter_by_frequency(zipped_items, top_items):
    zip_list = []
    counter = 0
    for i in zipped_items:
        if counter == top_items:
            break
        else:
            print(i[0])
            zip_list.append(i[0])
            counter += 1
            print(counter)
    return zip_list


### Get the data if it hasn't been retrieved already
This process takes a while, best to be done while you're gone at work or asleep.

In [None]:
# There are currently 59 pages of best new music.
# Data is output as a df.
best_new_music = get_best_new_music(59)
best_new['category'] = 1

# Output the data to a csv
best_new_music.to_csv('best_new_music.csv', index=False)

# Output the data to a more performance format, like Apache Feather
feather.write_dataframe(best_new_music, 'best_new_music.feather')

In [None]:
# Now we have to scrape al lthe other data.
# There are ~22000K reviews, it takes some time.
# Data is output as a df.
url_list = generate_urls('https://pitchfork.com/reviews/albums/?page=', 1703)
urls = generate_artist_album_data(url_list)
album_df = get_album_data(urls)

In [3]:
# Write the raw data to a csv and feather
album_df.to_csv('pitchfork_reviews.csv')
feather.write_dataframe(album_df,'pitchfork_reviews.feather')

NameError: name 'album_df' is not defined

### Read in the data if it's already been generated.

In [4]:
pitchfork_data = feather.read_dataframe('pitchfork_reviews_aug_29.feather')
pitchfork_data.rename({'artist': 'artist_name'}, axis=1, inplace=True)

#### Create a corpus of data using the reviews.
This is accomplished using mainly the `spaCy` library, with some help from `nltk` to do PorterStemming instead of default lemmatization that spacy does. Lemmatization tends to lead to even more sparse matrices, which could be good (more data!) or bad (resource intensive for anything Feature Engineering or ML oriented).

I have a set of helper function that cleans up all text data, removes symbols, stop words, and every other random quirk I found doing this. This part alone took a significant part of the time.

In [5]:
# Read in the corpus if it's created already:
corpus = pickle.load( open( "/home/michelle/Documents/data projects/pitchfork_data_analysis/data/processed/corpus_aug_28th.p", "rb" ) )

In [None]:
corpus = create_corpus(pitchfork_data)

In [None]:


pickle.dump(corpus, open( "/home/michelle/Documents/data projects/pitchfork_data_analysis/data/processed/corpus_aug_28th.p", "wb" ) )

In [6]:
# Use scikit to do TFIDF vectorization. You can swap this out with simple counts
# by using CountVectorizer().

cv = TfidfVectorizer()
matrix = cv.fit_transform(corpus)

In [7]:
# Get word frequencies in a list of tuples. Use a helper function
# to retrieve the top n (here, 5000) words. 

# The rationale here is mainly that the data can get very large (20000 columns or so),
# so it's good to only take what you need.

zipped_frequency_list = sorted(zip(cv.get_feature_names(),
    np.asarray(matrix.sum(axis=0)).ravel()), key=lambda x: x[1], reverse=True)

vocab_list = filter_by_frequency(zipped_frequency_list, 4000)

like
1
song
2
album
3
band
4
sound
5
music
6
record
7
track
8
guitar
9
rock
10
time
11
pop
12
feel
13
new
14
year
15
work
16
way
17
love
18
vocal
19
re
20
come
21
play
22
releas
23
make
24
thing
25
voic
26
listen
27
best
28
minut
29
beat
30
lyric
31
littl
32
sing
33
drum
34
live
35
open
36
good
37
ve
38
end
39
moment
40
titl
41
rap
42
long
43
line
44
instrument
45
set
46
world
47
melodi
48
know
49
group
50
turn
51
produc
52
singl
53
point
54
artist
55
day
56
punk
57
kind
58
black
59
synth
60
find
61
debut
62
sens
63
bass
64
life
65
ep
66
old
67
danc
68
want
69
self
70
style
71
cover
72
mix
73
place
74
bit
75
metal
76
second
77
peopl
78
start
79
indi
80
hear
81
product
82
piano
83
right
84
label
85
hard
86
piec
87
solo
88
half
89
great
90
hous
91
tri
92
hand
93
man
94
better
95
big
96
let
97
think
98
earli
99
electron
100
follow
101
word
102
origin
103
get
104
past
105
hit
106
take
107
actual
108
nois
109
collect
110
power
111
post
112
differ
113
look
114
got
115
far
116
note
117
rhythm

1725
hiss
1726
highli
1727
command
1728
shop
1729
knock
1730
suck
1731
georg
1732
legaci
1733
preced
1734
persona
1735
phase
1736
chamber
1737
area
1738
advanc
1739
gather
1740
delay
1741
imprint
1742
zone
1743
descend
1744
bitch
1745
multipl
1746
quot
1747
gain
1748
bound
1749
complic
1750
lord
1751
disappear
1752
anchor
1753
thread
1754
collaps
1755
vital
1756
cliché
1757
joe
1758
1960
1759
stumbl
1760
moor
1761
male
1762
skip
1763
thousand
1764
cohen
1765
squar
1766
devil
1767
eleg
1768
toy
1769
violent
1770
honest
1771
link
1772
toss
1773
fragment
1774
chase
1775
event
1776
1999
1777
milk
1778
larger
1779
crisp
1780
unpredict
1781
veer
1782
restless
1783
underst
1784
okay
1785
upbeat
1786
darker
1787
stun
1788
leap
1789
anxieti
1790
refus
1791
vein
1792
crucial
1793
amaz
1794
offici
1795
moan
1796
subsequ
1797
primari
1798
belong
1799
warn
1800
and
1801
impuls
1802
weav
1803
thunder
1804
brutal
1805
gestur
1806
winter
1807
finest
1808
radiohead
1809
partner
1810
silli
1811
nail
181

vice
3475
heavier
3476
embellish
3477
steep
3478
estat
3479
bruis
3480
enorm
3481
sneak
3482
utter
3483
govern
3484
summon
3485
chatter
3486
domest
3487
riley
3488
sampler
3489
eras
3490
youtub
3491
vamp
3492
oppress
3493
connor
3494
guard
3495
perman
3496
terrain
3497
secur
3498
headlin
3499
henri
3500
cost
3501
pill
3502
deft
3503
consequ
3504
curtain
3505
marvel
3506
sufjan
3507
tengo
3508
satan
3509
affirm
3510
strictli
3511
director
3512
gurgl
3513
rat
3514
confirm
3515
download
3516
glimmer
3517
disconnect
3518
worthwhil
3519
chapter
3520
tweedi
3521
satur
3522
hebden
3523
in
3524
formid
3525
complain
3526
tread
3527
alright
3528
accident
3529
pride
3530
uneven
3531
attach
3532
numb
3533
vampir
3534
galaxi
3535
roger
3536
softli
3537
pretens
3538
traffic
3539
civil
3540
foil
3541
respond
3542
brick
3543
shini
3544
brazilian
3545
tug
3546
bolster
3547
outer
3548
baltimor
3549
cryptic
3550
castl
3551
catalogu
3552
fennesz
3553
defens
3554
gloss
3555
burden
3556
europ
3557
anniversa

In [8]:
# Refit our data, but this time only with the list of top n words we chose.
cv = TfidfVectorizer(vocabulary=vocab_list)
matrix = cv.fit_transform(corpus)

In [9]:
# Concat matrices toasdfa gether to get a neater df. This step isn't necessary,
# but I find dfs easier to examine the data with. Numpy arrays are more resource
# efficient so this step isn't necessary.
album_df2 = pd.concat([pitchfork_data, pd.DataFrame(matrix.todense(), columns=cv.get_feature_names())], axis=1).ffill()
del matrix

#### Do a bunch of stuff to merge best new music to review data.

In [11]:

best_new = pd.read_csv('/home/michelle/Documents/data projects/pitchfork_data_analysis/data/processed/best_new_music.csv')
best_new.rename({'artist':'artist_name'}, axis=1,inplace=True)
best_new['category'] = 1

In [12]:
class renamer():
    def __init__(self):
        self.d = dict()

    def __call__(self, x):
        if x not in self.d:
            self.d[x] = 0
            return x
        else:
            self.d[x] += 1
            return "%s_%d" % (x, self.d[x])
#album_df2[['category']] = album_df2[['category']].fillna(value=0)
album_df2 = album_df2.clean_names()
#album_df2.drop('album_y', axis=1,inplace=True)
album_df2 = album_df2.rename(columns=renamer())

In [13]:
album_df2=album_df2.merge(best_new, how='left', on='artist_name')
feather.write_dataframe(album_df2, '/home/michelle/Documents/data projects/pitchfork_data_analysis/data/processed/pitchfork_tfidf_aug30.feather')

In [None]:
album_df2.to_csv('/home/michelle/Documents/data projects/pitchfork_data_analysis/data/processed/pitchfork_tfidf_aug30.csv')

In [None]:
names = album_df2.columns

In [None]:
names = pd.DataFrame({'names':names})

In [None]:
x=names.groupby('names').size()

In [None]:
x[x>1]