# Adjective-Assisted Album Aggregator

I want to create a web app where a user can enter various adjectives and recieve recommendations for music based on those adjectives. For example, if a user entered "dreamy," "atmospheric," and "sleepy," they might get a Beach House album recommended to them, because maybe a review said those adjectives.

This will be powered by a web scraped database of Pitchfork album reviews.

In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from textblob import TextBlob
from IPython.display import clear_output
from PyDictionary import PyDictionary
import time

# Functions

In [2]:
def adjScraper(url):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    
    blob = TextBlob(soup.text)
    
    sentence = [word for (word,tag) in blob.tags if tag == 'JJ']
    
    sentence2 = ' '.join(sentence)
    
    return sentence2

In [32]:
def fullScraper(url):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    
    # pulling adjectives
    blob = TextBlob(soup.text)
    adj_list = [word for (word, tag) in blob.tags if tag == 'JJ']
    adj = ' '.join(adj_list)
    
    # pulling title
    title = soup.head.find('title').text
    
    # pulling score
    score = soup.body.find('span', class_='score').text
    
    return [title, score, adj]

In [3]:
def nameScraper(url):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    
    text = soup.text.strip()
    
    blob = TextBlob(soup.text)
    
    name = soup.head.find('title').text
    
    return name

In [4]:
def scoreScraper(url):
    response = requests.get(url)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    
    score = soup.body.find('span', class_='score').text

    return score

In [5]:
def get_albums(n_pages):
    '''
    This function retrieves names, scores, and adjectives from a
    number of pages of Pitchfork album reviews. It returns a pandas
    DataFrame of results. 
    
    n_pages (int): the number of pages to scrape. 
    '''
    
    # generating list of album review links
    links = []
    
    for i in range(1,n_pages):
        url = f'https://pitchfork.com/reviews/albums/?page={i}'
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, 'lxml')

        for link in soup.findAll('a'):
            link = link.get('href')
            if '/reviews/albums/' in link and '?' not in link and link != '/reviews/albums/':
                links.append(link)
                
        print(f'Pulling album links: {round((i/n_pages)*100)}% done')
        clear_output(wait=True)
    # now pull name and text from each review
    reviews = list()
    names = list()
    scores = list()
    album_n = 0
    
    for link in links:
        url = f'https://pitchfork.com{link}'
        review = reviewScraper(url)
        name = nameScraper(url)
        score = scoreScraper(url)
        names.append(name)
        reviews.append(review)
        scores.append(score)
        album_n += 1
        
        print(f'Scraping albums: {round((album_n/len(links))*100)}% done')
        clear_output(wait=True)
    
    df = pd.DataFrame([names, scores, reviews,]).transpose()
    
    df.rename(columns = {0:'name',1:'score', 2:'adjs'}, inplace = True)
    df['name'] = df['name'].apply(lambda x: x.replace('Album Review | Pitchfork', ''))
    
    # vectorizing adjectives
    corpus = df['adjs'].to_list()
    vec = CountVectorizer()
    adj_vec = vec.fit_transform(corpus)
    adjs = adj_vec.toarray()
    adj_df = pd.DataFrame(adjs, columns = vec.get_feature_names())
    reviews_vecd = pd.merge(df, adj_df, left_index = True, right_index=True)
    reviews_vecd = reviews_vecd.drop(columns='adjs')

    # double checking to remove non-adjectives
    return reviews_vecd

In [6]:
def find_album(df, adjs):
    result = df
    for adj in adjs:
        adj = adj.lower()

        if adj not in df.columns:
            return 'That adjective or combination of adjectives has not appeared in an album review.'
        
        result = result[result[adj] == 1]
        result = result.sort_values(['scores'], ascending=False)
    

    for index, row in result.head().iterrows():
        print(f"{row['name_x']} -- {row['scores']}\n")
    
    if result.shape[0] == 0:
        print('Sorry! No albums fit that description, according to Pitchfork.')
        
    return result.shape

In [12]:
# lets see if we can vectorize this work to speed it up
def get_albums2(n_pages):
    '''
    This function retrieves names, scores, and adjectives from a
    number of pages of Pitchfork album reviews. It returns a pandas
    DataFrame of results. 
    
    n_pages (int): the number of pages to scrape. 
    '''
    # generating list of album review links
    links = pd.Series(dtype='str')
    
    link_start = time.time()
    for i in range(1,n_pages):
        url = f'https://pitchfork.com/reviews/albums/?page={i}'
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        #print(page)

        for link in soup.findAll('a'):
            #print(link)
            link = link.get('href')
            if '/reviews/albums/' in link and '?' not in link and link != '/reviews/albums/':
                #print('appending link')
                links = links.append(pd.Series(link))
    # now pull name and text from each review
    link_end = time.time()
    print(f'Scraping took {round((link_end - link_start)/i,2)}s per page')
    
    reviews = pd.Series(dtype='str')
    names = pd.Series(dtype='str')
    scores = pd.Series(dtype='str')
    
    #formatting links for scraping
    df = pd.DataFrame(links, columns=['links']).reset_index().drop(columns='index')
    df['links'] = df['links'].apply(lambda x: f'https://pitchfork.com{x}')
    print('Formatted the links')
    
    # getting album names
    names_start = time.time()
    df['name'] = df['links'].apply(lambda x: nameScraper(x).replace('Album Review | Pitchfork', ''))
    #df['name'] = df['name'].apply(lambda x: x.replace('Album Review | Pitchfork', ''))
    names_end = time.time()
    print(f'Getting album names took {round((names_end - names_start)/links.shape[0],2)}s per album.')
    
    # getting album scores
    scores_start = time.time()
    df['scores'] = df['links'].apply(lambda x: scoreScraper(x))
    scores_end = time.time()
    print(f'Getting scores took {round((scores_end - scores_start)/links.shape[0],2)} s per album.')
    
    # getting adjectives
    adj_start = time.time()
    df['adjs'] = df['links'].apply(lambda x: adjScraper(x))
    adj_end = time.time()
    print(f'Getting adjs took {round((adj_end - adj_start)/links.shape[0],2)} s per album.')
    
    df.drop(columns='links', inplace=True)
    
    # now time to vectorize the adjectives
    
    corpus = df['adjs'].to_list()
    vec = CountVectorizer()
    corpus_vec = vec.fit_transform(corpus)
    adjs = corpus_vec.toarray()
    print('Vectorized the adjectives')
    
    adj_df = pd.DataFrame(adjs, columns = vec.get_feature_names())
    reviews_vecd = pd.merge(df, adj_df, left_index = True, right_index=True)
    reviews_vecd = reviews_vecd.drop(columns='adjs')
    
    final = time.time()
    print(f'\nOverall process took {round((final - link_start)/links.shape[0],2)} s per album.')
    
    return reviews_vecd

In [43]:
def get_albums3(n_pages):
    '''
    This function retrieves names, scores, and adjectives from a
    number of pages of Pitchfork album reviews. It returns a pandas
    DataFrame of results. 
    
    n_pages (int): the number of pages to scrape. 
    '''
    # generating list of album review links
    links = pd.Series(dtype='str')
    
    link_start = time.time()
    for i in range(1,n_pages):
        url = f'https://pitchfork.com/reviews/albums/?page={i}'
        response = requests.get(url)
        page = response.text
        soup = BeautifulSoup(page, 'lxml')
        #print(page)

        for link in soup.findAll('a'):
            #print(link)
            link = link.get('href')
            if '/reviews/albums/' in link and '?' not in link and link != '/reviews/albums/':
                #print('appending link')
                links = links.append(pd.Series(link))
                
    # now pull name and text from each review
    link_end = time.time()
    print(f'Scraping took {round((link_end - link_start)/i,2)}s per page')
    
    info = pd.DataFrame()
    
    #formatting links for scraping
    df = pd.DataFrame(links, columns=['links']).reset_index().drop(columns='index')
    df['links'] = df['links'].apply(lambda x: f'https://pitchfork.com{x}')
    print('Formatted the links')
    
    # getting album info
    info_start = time.time()
    df['info'] = df['links'].apply(lambda x: fullScraper(x))
    #df['name'] = df['name'].apply(lambda x: x.replace('Album Review | Pitchfork', ''))
    info_end = time.time()
    print(f'Getting album info took {round((info_end - info_start)/links.shape[0],2)}s per album.')
    
    df.drop(columns='links', inplace=True)
    
    # separating album info
    sep_start = time.time()
    df['title'] = df['info'].apply(lambda x: x[0])
    df['score'] = df['info'].apply(lambda x: x[1])
    df['adjs'] = df['info'].apply(lambda x: x[2])
    sep_end = time.time()
    
    print(f'Separating album info took {round((sep_end - sep_start)/links.shape[0],2)}s per album.')
                                                                                                            
    # now time to vectorize the adjectives
    corpus = df['adjs'].to_list()
    vec = CountVectorizer()
    corpus_vec = vec.fit_transform(corpus)
    adjs = corpus_vec.toarray()
    print('Vectorized the adjectives')
    
    adj_df = pd.DataFrame(adjs, columns = vec.get_feature_names())
    reviews_vecd = pd.merge(df, adj_df, left_index = True, right_index=True)
    reviews_vecd = reviews_vecd.drop(columns=['adjs','info'])
    
    final = time.time()
    print(f'\nOverall process took {round((final - link_start)/links.shape[0],2)} s per album.')
    
    return reviews_vecd

In [8]:
def find_album2(df, adjs):
    pydict = PyDictionary()
    result = df

    for adj in adjs:
        syns = pydict.synonym(adj)
        syns += [adj]
        print(f"\nSearching for albums described as {adj.upper()}. Also looking for albums described as: {', '.join(syns[:-1])}.")
        adj_df = pd.DataFrame()

        for syn in syns:
            if syn in result.columns:
                syn_df = result[result[syn] >= 1]
                #print(f'There are {syn_df.shape[0]} albums with "{syn}"\n')
                adj_df = pd.concat([adj_df, syn_df])

        #print(f'The synonyms of {adj} have {adj_df.shape[0]} entries.\n')

        result = pd.merge(result, adj_df, how = 'inner', 
                          left_index=True, right_index=True,
                          suffixes = (None,'_drop')).drop_duplicates()
        
        #sorting result based on score
        result.sort_values('scores', inplace=True, ascending=False)
        
    # output
    print(f'\nPitchfork has described {result.shape[0]} albums in that way.\nHere are the top scoring:\n')
    for index, row in result.head().iterrows():
        print(f"{row['name_x']} -- {row['scores']}\n")
                                                                                                                    

# Workspace

In [48]:
df_official = get_albums3(60)

Scraping took 1.05s per page
Formatted the links
Getting album info took 1.34s per album.
Separating album info took 0.0s per album.
Vectorized the adjectives

Overall process took 1.43 s per album.


In [56]:
df_official.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 708 entries, 0 to 707
Columns: 8825 entries, title_x to судно
dtypes: int64(8823), object(2)
memory usage: 47.7+ MB


In [28]:
find_album2(df_official, ['fast','aggressive'])


Searching for albums described as FAST. Also looking for albums described as: double-quick, speedy, express, hurrying, fastness, smart, quick, red-hot, high-velocity, winged, hot, meteoric, hurried, straightaway, alacritous, speed, scurrying, rapid, fleet, accelerated, immediate, blistering, high-speed, fast-paced, prompt, instant, sudden, instantaneous, windy, swift, swiftness, fast-breaking.

Searching for albums described as AGGRESSIVE. Also looking for albums described as: hard-hitting, truculent, vulturous, obstreperous, combative, hostile, battleful, self-assertive, militant, self-asserting, rapacious, assertive, pugnacious, scrappy, in-your-face, raptorial, competitive, vulturine, predatory, ravening, offensive, bellicose, rough, high-pressure.

Searching for albums described as PROFANE. Also looking for albums described as: blasphemous, dirty, blue.

Pitchfork has described 0 albums in that way.
Here are the top scoring:



# Saving dataframe 4 l8r

In [52]:
df_official.to_pickle('/Users/patricknorman/Documents/Python/Data/albums.pkl')

In [388]:
df_official = pd.read_pickle('/Users/patricknorman/Documents/Python/Data/albums.pkl')