In [68]:
%matplotlib inline

import requests
import time
import re
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager


## Scraping the data from the Top Albums of All Time

### Sputnikmusic
We need the contents of the site's top-albums-of-all-time list. The rankings page opens on a single year, so we first use the Selenium webdriver to select 'All Time' from the year dropdown, and then load the resulting page into a BeautifulSoup object.

In [34]:
def get_sput_alltime(current_year="2020"):
    """
    Returns a Beautiful Soup object containing the contents of the Top Albums of All Time page from sputnikmusic.com.
    Returns None if the year dropdown cannot be located on the loaded page.

    Params:
    - current_year: Year (as a string) of the rankings page that is opened first; its
      dropdown entry is used to find the dropdown element. Default is "2020".
    """

    rankings_url = "https://www.sputnikmusic.com/best/albums/" + current_year + "/"

    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.maximize_window()
        driver.get(rankings_url)

        # Wait for the page to load
        time.sleep(5)

        # page_source is already a str; parsing it directly avoids a bytes round trip
        # after which BeautifulSoup would have to guess the encoding again.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Find the correct id for the dropdown
        yr_dd = soup.find("span", text=current_year)
        if yr_dd is None:
            return None

        dd_id = yr_dd.parent.parent.get('id')
        if dd_id is None:
            return None

        # Select 'All Time' from the dropdown
        driver.find_element_by_id(dd_id).click()
        driver.find_element_by_id(dd_id + "_o_2").click()

        # Wait for webpage to load and then create the parse tree from the HTML
        time.sleep(5)
        return BeautifulSoup(driver.page_source, 'html.parser')

    finally:
        # Always close the browser, even when a lookup or click above raises
        driver.quit()


[WDM] - Current google-chrome version is 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183
[WDM] - Driver [C:\Users\mattd\.wdm\drivers\chromedriver\win32\85.0.4183.87\chromedriver.exe] found in cache


 


We create a pandas DataFrame object from the data obtained above and store it in a variable called data.

In [64]:
def create_data_frame(soup):
    """
    Builds the DataFrame for the all-time list from the parsed rankings page.

    The table is reached through the first "alt1" row found in soup. Its "blackbox"
    cells are read in pairs: the first cell of a pair holds the rank, the second holds
    the font tags from which artist, album, score and vote count are taken.

    Returns a DataFrame with the columns Rank, Artist, Album, Score and Ratings,
    all stored as strings.
    """

    votes_suffix = " votes"

    chart_table = soup.find("tr", class_="alt1").parent.parent
    blackbox_cells = chart_table.find_all("td", class_="blackbox")

    def build_record(pos):
        # The int() round trip removes leading 0's from the rank
        rank = str(int(blackbox_cells[pos].text))

        font_tags = blackbox_cells[pos + 1].find_all("font")
        return {
            'Rank': rank,
            'Artist': font_tags[0].text,
            'Album': font_tags[1].text,
            # font_tags[2] is not used
            'Score': font_tags[3].text,
            # Slice off as many characters as " votes" is long
            'Ratings': font_tags[4].text[: -len(votes_suffix)],
        }

    records = [build_record(pos) for pos in range(0, len(blackbox_cells), 2)]
    return pd.DataFrame(records)
    
# Fetch the all-time rankings page. get_sput_alltime() opens a Chrome window through
# Selenium, so the parsed page is kept in soup_all_time for the cells further down.
soup_all_time = get_sput_alltime()
if soup_all_time is None:
    raise RuntimeError("Could not load the Sputnikmusic all-time rankings page.")

data = create_data_frame(soup_all_time)
print(data)
    

    Rank                Artist                                          Album  \
0      1            Pink Floyd                             Wish You Were Here   
1      2  Ludwig van Beethoven             Symphony No. 9 in D minor, Op. 125   
2      3        Charles Mingus            The Black Saint and the Sinner Lady   
3      4                   Nas                                       Illmatic   
4      5           Miles Davis                                   Kind of Blue   
..   ...                   ...                                            ...   
195  196           Suffocation                            Pierced from Within   
196  197         Leonard Cohen                         Songs of Leonard Cohen   
197  198            Koji Kondo  The Legend of Zelda 25th Anniversary Symphony   
198  199             Sigur Ros                                 AgÃ¦tis byrjun   
199  200                Slayer                                 Reign in Blood   

    Score Ratings  
0    4.

Unfortunately, the data on this page does not include the release year or genre(s) of each album, so we need to scrape those from the corresponding album and artist pages, respectively. This must be done carefully so as not to overload the site with too many requests in a short time.

In [42]:
def get_album_links(soup):
    """
    Collects the link of every anchor inside the sputnik all time list table.

    Each href is resolved against the site root, so the returned list holds
    absolute URLs in the order the anchors appear in the table.
    """

    site_root = "https://www.sputnikmusic.com"

    # The table is reached through the first "alt1" row found in the page
    chart_table = soup.find("tr", class_="alt1").parent.parent

    album_urls = []
    for anchor in chart_table.find_all("a"):
        album_urls.append(urljoin(site_root, anchor.get('href')))

    return album_urls

In [128]:
def _parse_album_page(soup):
    """Returns (release year, artist page href) read from an album page; either may be None."""

    year_pattern = re.compile(r"\d+")
    band_href = re.compile("bands")

    div = soup.find("div", {"style": re.compile("silverbar")})

    # Need to check case if we are on a review page or on a soundoff page
    if div is not None:
        # We are on a review page (usual case)
        anchor = div.find("a", {"href": band_href})

        para = div.find("p")
        match = year_pattern.search(para.text) if para is not None else None
        year = match.group() if match is not None else None

    else:
        # We are on a soundoff page
        tab = soup.find("table", class_="tableborder")
        if tab is None:
            return None, None

        anchor = tab.find("a", {"href": band_href})

        # The last bold tag containing digits is taken as the year
        bolds = tab.find_all("b", text=year_pattern)
        year = bolds[-1].text if bolds else None

    href = anchor.get('href') if anchor is not None else None
    return year, href


def _parse_artist_genres(soup):
    """Returns the genre names listed in the "tagwrap" div of an artist page ([] if absent)."""

    genre_div = soup.find("div", class_="tagwrap")
    if genre_div is None:
        return []

    # .string is None for anchors with nested tags, so fall back to the full text
    return [(a.string or a.get_text()).strip() for a in genre_div.find_all("a")]


def get_other_data(links, start, num = 20):
    """
    Collect data for release year and genres for Sputnik albums.

    Params:
    - links: Full list of album page links
    - start: index of first element in list to iterate over. First index is 0
    - num: Number of links to iterate over (default is 20)

    Returns a tuple (years, genres). Both lists always have the same length and entry k
    belongs to links[start + k]. A year that cannot be found is stored as None and
    missing genres as an empty list, so positions stay aligned with links. If a request
    fails, the loop stops and only the albums completed so far are returned.
    """

    base_url = "https://www.sputnikmusic.com"

    years = []
    genres = []

    # Never run past the end of the list when fewer than num links remain
    stop = min(start + num, len(links))

    for i in range(start, stop):
        print(f"Iteration {i}:")

        r = requests.get(links[i])
        if r.status_code != 200:
            print(f"Get request to {links[i]} failed.")
            print(f"Status code: {r.status_code}")
            break

        year, artist_href = _parse_album_page(BeautifulSoup(r.content, "lxml"))
        print(f"Found album release year: {year}")

        gen = []
        if artist_href is None:
            print("No artist link found on album page, genres left empty.")

        else:
            artist_link = urljoin(base_url, artist_href)

            # Wait 3 seconds before visiting artist page
            time.sleep(3)
            r2 = requests.get(artist_link)

            if r2.status_code != 200:
                print(f"Get request to {artist_link} failed.")
                print(f"Status code: {r2.status_code}")
                break

            print(f"Visiting artist page: {artist_link}")
            gen = _parse_artist_genres(BeautifulSoup(r2.content, "lxml"))
            print(f"Found genres: {gen}")

        # Record year and genres together so the two lists can never get out of step
        years.append(year)
        genres.append(gen)

        # Wait 3 seconds before next iteration
        time.sleep(3)

    return years, genres
    


In [124]:
# Start over with empty lists for the release years and the genres.
# Skip this cell if you only want to keep extending lists that already exist.
years = []
genres = []

In [161]:
links = get_album_links(soup_all_time)

# Continue from where the previous run stopped instead of editing a start index by hand.
# If an earlier run left the two lists with different lengths, drop the unmatched tail so
# that entry k of both lists belongs to links[k].
start = min(len(years), len(genres))
del years[start:]
del genres[start:]

# Only 20 albums are fetched per run to avoid making too many requests at a time;
# re-run this cell until every link has been processed.
remaining = len(links) - start
if remaining > 0:
    new_years, new_genres = get_other_data(links, start, min(20, remaining))
    years.extend(new_years)
    genres.extend(new_genres)

print(years)
print(genres)

Iteration 180:
Found album release year: 1998
Visiting artist page: https://www.sputnikmusic.com/bands/Death/657/
Found genres: ['Death Metal', 'Progressive Metal', 'Thrash Metal']
Iteration 181:
Found album release year: 1999
Visiting artist page: https://www.sputnikmusic.com/bands/Mr.-Bungle/982/
Found genres: ['Experimental', 'Metal', 'Alternative Rock']
Iteration 182:
Found album release year: 1968
Visiting artist page: https://www.sputnikmusic.com/bands/The-Zombies/560/
Found genres: ['Psychedelic', 'Pop']
Iteration 183:
Found album release year: 1997
Visiting artist page: https://www.sputnikmusic.com/bands/Strapping-Young-Lad/1747/
Found genres: ['Industrial', 'Death Metal', 'Progressive Metal']
Iteration 184:
Found album release year: 1995
Visiting artist page: https://www.sputnikmusic.com/bands/Ulver/954/
Found genres: ['Experimental', 'Black Metal', 'Electronic']
Iteration 185:
Found album release year: 1976
Visiting artist page: https://www.sputnikmusic.com/bands/Camel/2508/


In [191]:
# Add columns for release years and genres.
# NOTE(review): years and genres are filled in batches by the cells above and are
# presumably expected to hold one entry per row of data before this cell runs.
data["Release Year"] = years
data["Genres"] = genres

print(data)


200
200
    Rank                Artist                                          Album  \
0      1            Pink Floyd                             Wish You Were Here   
1      2  Ludwig van Beethoven             Symphony No. 9 in D minor, Op. 125   
2      3        Charles Mingus            The Black Saint and the Sinner Lady   
3      4                   Nas                                       Illmatic   
4      5           Miles Davis                                   Kind of Blue   
..   ...                   ...                                            ...   
195  196           Suffocation                            Pierced from Within   
196  197         Leonard Cohen                         Songs of Leonard Cohen   
197  198            Koji Kondo  The Legend of Zelda 25th Anniversary Symphony   
198  199             Sigur Ros                                 AgÃ¦tis byrjun   
199  200                Slayer                                 Reign in Blood   

    Score Ratings R

In [192]:
# Write data to sput_data.csv, leaving out the DataFrame index.
data.to_csv("sput_data.csv", index=False)

Now we need to get the top 200 albums from rateyourmusic.com. To do this, we again use Selenium webdriver so that we can click through the first 5 pages of the rankings.

In [194]:
def _collect_rym_page(table, data_dict):
    """Appends the rows of one rankings table to the column lists in data_dict."""

    year_pattern = re.compile(r"\d{4}")

    # Get all rankings from current page
    rank_spans = table.find_all("span", class_="ooookiig")
    data_dict['Ranking'].extend(r.text for r in rank_spans)

    for cd in table.find_all("div", class_="chart_detail"):
        # Albums with multiple artists get combined into single string
        artists = [a.text for a in cd.find_all("a", class_="artist")]
        data_dict['Artist'].append(", ".join(artists))

        album_tag = cd.find("a", class_="album")
        data_dict['Album'].append(album_tag.text if album_tag is not None else None)

        # The first four-digit number in the chart_year div is the release year;
        # None keeps the columns aligned when it is missing
        year_div = cd.find("div", class_="chart_year")
        match = year_pattern.search(year_div.text) if year_div is not None else None
        data_dict['Release Year'].append(match.group() if match is not None else None)

        data_dict['Genres'].append([g.text for g in cd.find_all("a", class_="genre")])

    # Get score and number of ratings for each album on the page
    for cs in table.find_all("div", class_="chart_stats"):
        bolds = cs.find_all("b")
        data_dict['Score'].append(bolds[0].text)
        data_dict['Ratings'].append(bolds[1].text)


def get_rym_alltime():
    """
    Collects data from rateyourmusic's top 200 albums of all time list. Returns a pandas DataFrame.
    Selenium webdriver is used to move to each page.

    The DataFrame has the columns Ranking, Artist, Album, Score, Ratings, Release Year
    and Genres. If the rankings table cannot be found on a page, scraping stops there and
    the rows collected so far are returned.
    """

    base_url = "https://rateyourmusic.com/customchart"
    num_pages = 5

    data_dict = {'Ranking': [], 'Artist': [], 'Album': [], 'Score': [], 'Ratings': [], 'Release Year': [], 'Genres': []}

    driver = webdriver.Chrome(ChromeDriverManager().install())
    try:
        driver.maximize_window()
        driver.get(base_url)

        for i in range(num_pages):

            # Wait for page to load and get contents
            time.sleep(5)

            # page_source is already a str, so it is parsed directly instead of being
            # encoded to bytes and decoded again by BeautifulSoup
            soup = BeautifulSoup(driver.page_source, "lxml")
            table = soup.find("table", class_="mbgen")

            if table is None:
                print(f"Failed to load page {i+1}")
                break

            _collect_rym_page(table, data_dict)

            # Go to next page if not on the last page
            if i < num_pages - 1:
                driver.find_element_by_link_text(str(i+2)).click()

    finally:
        # Always close the browser, even when parsing or navigation raises
        driver.quit()

    rym_data = pd.DataFrame(data_dict)
    return rym_data

In [195]:
# Run the scraper (get_rym_alltime drives a Chrome window through Selenium)
# and print the DataFrame it returns.
rym_data = get_rym_alltime()
print(rym_data)


[WDM] - Current google-chrome version is 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183
[WDM] - Driver [C:\Users\mattd\.wdm\drivers\chromedriver\win32\85.0.4183.87\chromedriver.exe] found in cache


 
    Ranking             Artist                             Album Score  \
0         1          Radiohead                       OK Computer  4.23   
1         2          Radiohead                             Kid A  4.23   
2         3         Pink Floyd         The Dark Side of the Moon  4.23   
3         4         Pink Floyd                Wish You Were Here  4.29   
4         5       King Crimson  In the Court of the Crimson King  4.31   
..      ...                ...                               ...   ...   
195     196             Burzum                         Filosofem  3.91   
196     197            MF DOOM                         Mm.. Food  3.97   
197     198      Black Sabbath            Sabbath Bloody Sabbath  3.99   
198     199  Bruce Springsteen                       Born to Run  3.96   
199     200          The Doors                      Strange Days  3.94   

    Ratings Release Year                                     Genres  
0    59,967         1997               

In [196]:
# Write rym_data to rym_data.csv, leaving out the DataFrame index.
rym_data.to_csv("rym_data.csv", index=False)