In [68]:
%matplotlib inline

import requests
import time
import re
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager


## Scraping the data from the Top Albums of All Time

### Sputnikmusic
We need to get the content from the top albums of all time list. To do this, we first use Selenium webdriver to select 'all time' from the dropdown list. Then we can extract the content into a BeautifulSoup object.

In [34]:
def get_sput_alltime():
    """
    Returns a Beautiful Soup object containing the contents of the Top Albums of All Time page from sputnikmusic.com.
    Returns None if content fails to load.
    """
    
    current_year = "2020"
    rankings_url = "https://www.sputnikmusic.com/best/albums/" + current_year + "/"

    driver = webdriver.Chrome(ChromeDriverManager().install())
    driver.maximize_window()
    driver.get(rankings_url)

    time.sleep(5)
    content = driver.page_source.encode('utf-8').strip()

    if content is not None:
        soup = BeautifulSoup(content, 'html.parser')

        # Find the correct id for the dropdown
        yr_dd = soup.find("span", text=current_year)
        dd_id = yr_dd.parent.parent.get('id')

        # Select 'All Time' from the dropdown
        driver.find_element_by_id(dd_id).click()
        driver.find_element_by_id(dd_id + "_o_2").click()

        # Wait for webpage to load and then create the parse tree from the HTML
        time.sleep(5)
        content_all_time = driver.page_source.encode('utf-8').strip()

        if content_all_time is not None:
            soup_all_time = BeautifulSoup(content_all_time, 'html.parser')
            driver.quit()
            
            return soup_all_time


    driver.quit()
    return None


[WDM] - Current google-chrome version is 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183
[WDM] - Driver [C:\Users\mattd\.wdm\drivers\chromedriver\win32\85.0.4183.87\chromedriver.exe] found in cache


 


We create a pandas DataFrame object from the data obtained above and store it in a variable called data.

In [64]:
def create_data_frame(soup):
    """
    Creates the main data frame for the all-time list.
    """
    
    table = soup.find("tr", class_="alt1").parent.parent
    cells = table.find_all("td", class_="blackbox")
    
    common_str = " votes"
    data = []
    for i in range(0, len(cells), 2):
        row = {}
        row['Rank'] = str(int(cells[i].text)) # Remove leading 0's
        
        fonts = cells[i+1].find_all("font")
        row['Artist'] = fonts[0].text 
        row['Album'] = fonts[1].text
        row['Score'] = fonts[3].text
        row['Ratings'] = fonts[4].text[: -len(common_str)]
        
        data.append(row)
    
    return pd.DataFrame(data)
    
data = create_data_frame(soup_all_time)
print(data)
    

    Rank                Artist                                          Album  \
0      1            Pink Floyd                             Wish You Were Here   
1      2  Ludwig van Beethoven             Symphony No. 9 in D minor, Op. 125   
2      3        Charles Mingus            The Black Saint and the Sinner Lady   
3      4                   Nas                                       Illmatic   
4      5           Miles Davis                                   Kind of Blue   
..   ...                   ...                                            ...   
195  196           Suffocation                            Pierced from Within   
196  197         Leonard Cohen                         Songs of Leonard Cohen   
197  198            Koji Kondo  The Legend of Zelda 25th Anniversary Symphony   
198  199             Sigur Ros                                 AgÃ¦tis byrjun   
199  200                Slayer                                 Reign in Blood   

    Score Ratings  
0    4.

Unfortunately, the data on this page does not include the release year or genre(s) of each album, so we need to scrape those from their corresponding album and artist pages respectively. This needs to be done carefully so as to not overload the site with many requests per second.

In [42]:
def get_album_links(soup):
    """
    Returns the album page links for each album in sputnik all time list.
    """
    
    base_url = "https://www.sputnikmusic.com"
    table = soup.find("tr", class_="alt1").parent.parent
    
    links = [urljoin(base_url, a.get('href')) for a in table.find_all("a")]
    return links

In [127]:
def get_other_data(links, start, num = 20):
    """
    Collect data for release year and genres for Sputnik albums.
    
    Params:
    - links: Full list of album page links
    - start: index of first element in list to iterate over. First index is 0
    - num: Number of links to iterate over (default is 20)
    """

    base_url = "https://www.sputnikmusic.com"

    years = []
    genres = []

    for i in range(start, start + num):
        print(f"Iteration {i}:")

        r = requests.get(links[i])
        if r.status_code == 200:
            # Find the correct div
            soup = BeautifulSoup(r.content, "lxml")
            div = soup.find("div", {"style": re.compile("silverbar")})

            # Need to check case if we are on a review page or on a soundoff page
            if div is not None:
                # We are on a review page (usual case)
                a = div.find("a", {"href": re.compile("bands")})
                
                pattern = re.compile(r"\d+")
                m = pattern.search(div.find("p").text)
                year = m.group()
            
            else:
                # We are on a soundoff page
                tab = soup.find("table", class_="tableborder")
                a = tab.find("a", {"href": re.compile("bands")})
                
                b = tab.find("b", text=re.compile(r"\d+"))
                year = b.text
                    
            artist_link = urljoin(base_url, a.get('href'))
            years.append(year)

            print(f"Found album release year: {year}")

            # Wait 3 seconds before visiting artist page
            time.sleep(3)
            r2 = requests.get(artist_link)

            if r2.status_code == 200:
                print(f"Visiting artist page: {artist_link}")
                soup = BeautifulSoup(r2.content, "lxml")

                genre_div = soup.find("div", class_="tagwrap")
                anchors = genre_div.find_all("a")
                gen = [a.string.strip() for a in anchors]
                genres.append(gen)

                print(f"Found genres: {gen}")

            else:
                print(f"Get request to {artist_link} failed.")
                print(f"Status code: {r2.status_code}")
                break

        else:
            print(f"Get request to {links[i]} failed.")
            print(f"Status code: {r.status_code}")
            break

        # Wait 3 seconds before next iteration
        time.sleep(3)
        
    return years, genres
    


In [124]:
# Creates new empty list for years and genres
# Do not run if you only wish to extend already existent list
years, genres = [], []

In [126]:
links = get_album_links(soup_all_time)

# Choose starting position, get_other_data() gets only 20 data points to avoid making too many requests at a time 
new_years, new_genres = get_other_data(links, 20)
years.extend(new_years)
genres.extend(new_genres)

print(years)
print(genres)

Iteration 10:
Found album release year: 1969
Visiting artist page: https://www.sputnikmusic.com/bands/King-Crimson/82/
Found genres: ['Progressive Rock', 'Jazz Fusion', 'Experimental']
Iteration 11:
Found album release year: 1969
Visiting artist page: https://www.sputnikmusic.com/bands/The-Beatles/73/
Found genres: ['Rock', 'Pop', 'Psychedelic']
Iteration 12:
Found album release year: 1997
Visiting artist page: https://www.sputnikmusic.com/bands/Radiohead/86/
Found genres: ['Alternative Rock', 'Electronic', 'Experimental']
Iteration 13:
Found album release year: 1973
Visiting artist page: https://www.sputnikmusic.com/bands/Pink-Floyd/110/
Found genres: ['Progressive Rock', 'Psychedelic']
Iteration 14:
Found album release year: 1995
Visiting artist page: https://www.sputnikmusic.com/bands/GZA/3419/
Found genres: ['Hip-Hop']
Iteration 15:
Found album release year: 1972
Visiting artist page: https://www.sputnikmusic.com/bands/David-Bowie/493/
Found genres: ['Rock', 'Experimental', 'Post P