In [2]:
import yaml

def get_config():    
    with open("../config.yaml", "r") as stream:
        return yaml.safe_load(stream)
    
config = get_config()

lastfm_api_key = config.get("lastfm").get("api_key")

In [3]:
import requests
from bs4 import BeautifulSoup

def get_track_info(artist, track):
    url = f"https://ws.audioscrobbler.com/2.0/?method=track.getInfo&api_key={lastfm_api_key}&artist={artist}&track={track}&format=json"
    response = requests.get(url)
    return response.json()

def get_track_eras(url):
    markup = requests.get(url).text
    soup = BeautifulSoup(markup, "html.parser")
    eras = soup.find_all("li", class_="era")
    return [era.text.lower().strip().replace(" ", "-") for era in eras]

def filter_eras(eras):
    blacklist = ["piano", "composers"]

    return list(set([era for era in eras if era not in blacklist]))

track_info = get_track_info("Franz Schubert", "Sonata for Piano No. 17 in D, D850")
print(track_info["track"])

track_eras = get_track_eras(track_info["track"]["url"])
print(track_eras)

filtered_eras = filter_eras(track_eras)
print(filtered_eras)

{'name': 'Sonata for Piano No. 17 in D, D850', 'url': 'https://www.last.fm/music/Franz+Schubert/_/Sonata+for+Piano+No.+17+in+D,+D850', 'duration': '0', 'streamable': {'#text': '0', 'fulltrack': '0'}, 'listeners': '316', 'playcount': '1029', 'artist': {'name': 'Franz Schubert', 'mbid': 'f91e3a88-24ee-4563-8963-fab73d2765ed', 'url': 'https://www.last.fm/music/Franz+Schubert'}, 'toptags': {'tag': []}}
['classical', 'romantic', 'piano', 'composers', 'classical', 'romantic', 'piano']
['classical', 'romantic']


In [4]:
import pandas as pd

raw_metadata = pd.read_csv("../data/raw/metadata.csv")
clean_metadata = pd.read_csv("../data/clean/metadata.csv")

joined_metadata = pd.merge(raw_metadata, clean_metadata, on="midi_filename", how="inner")

display(joined_metadata.head())


Unnamed: 0,canonical_composer,canonical_title,split_x,year,midi_filename,audio_filename,duration_x,split_y,duration_y
0,Alban Berg,Sonata Op. 1,train,2017,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433,train,464.649433
1,Alexander Scriabin,"24 Preludes Op. 11, No. 13-24",train,2004,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588,train,872.640588
2,Alexander Scriabin,"3 Etudes, Op. 65",validation,2006,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508,validation,397.857508
3,Alexander Scriabin,"5 Preludes, Op.15",validation,2009,2009/MIDI-Unprocessed_07_R1_2009_04-05_ORIG_MI...,2009/MIDI-Unprocessed_07_R1_2009_04-05_ORIG_MI...,400.557826,validation,400.557826
4,Alexander Scriabin,"Entragete, Op.63",test,2009,2009/MIDI-Unprocessed_11_R1_2009_06-09_ORIG_MI...,2009/MIDI-Unprocessed_11_R1_2009_06-09_ORIG_MI...,163.74583,test,163.74583


In [5]:
import time

def apply_eras(row):
    composer = row["canonical_composer"].replace("#", "%23")
    title = row["canonical_title"].replace("#", "%23")

    filtered_eras = []
    for i in range(3):
        if len(filtered_eras) > 0: break

        try:
            track_info = get_track_info(composer, title)
            if "track" not in track_info: return []

            track_eras = get_track_eras(track_info["track"]["url"])
            filtered_eras = filter_eras(track_eras)
        except:
            time.sleep(1)
            continue


    return filtered_eras

for i in range(0, len(clean_metadata), 50):
    batch = joined_metadata.iloc[i:i + 50]
    
    time.sleep(1)
    for idx, row in batch.iterrows():
        clean_metadata.loc[idx, "eras"] = "+".join(apply_eras(row))
    
    clean_metadata.to_csv("../data/clean/eras.csv", index=False)

In [7]:
import requests
from bs4 import BeautifulSoup

def get_track_info_allmusic(artist, track):
    artist = artist.replace(" ", "%20")
    track = track.replace(" ", "%20")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
    }
    url = f"https://www.allmusic.com/search/all/{artist}%2C%20{track}"
    markup = requests.get(url, headers=headers).text
    soup = BeautifulSoup(markup, "html.parser")
    
    composition_url = soup.select_one("#pageContainer .composition .title a").attrs["href"]
    composition_markup = requests.get(composition_url, headers=headers).text
    composition_soup = BeautifulSoup(composition_markup, "html.parser")
    
    result = "classical"
    try:
        period = composition_soup.select_one(".period div")
        print(period)
        if period: result = period.text.strip().lower()
        else:
            genre = composition_soup.select_one(".genre div")
            print(genre)
            if genre: result = genre.text.strip().lower()
    except:
        result = "classical"

    return result

def get_eras_allmusic(row):
    composer = row["canonical_composer"].replace("#", "%23")
    title = row["canonical_title"].replace("#", "%23")

    try:
        return get_track_info_allmusic(composer, title)
    except Exception as e:
        print(e)
        print(f"Failed to get info for {composer} - {title}")


for i in range(0, len(clean_metadata), 50):
    batch = joined_metadata.iloc[i:i + 50]
    
    time.sleep(1)
    for idx, row in batch.iterrows():
        if not clean_metadata.loc[idx, "eras"]:
            clean_metadata.loc[idx, "eras"] = get_eras_allmusic(row)
    
clean_metadata.to_csv("../data/clean/eras.csv", index=False)

'NoneType' object has no attribute 'attrs'
Failed to get info for Alexander Scriabin - Entragete, Op.63
'NoneType' object has no attribute 'attrs'
Failed to get info for Alexander Scriabin - Etude Op. 42, Nos. 4 & 5
'NoneType' object has no attribute 'attrs'
Failed to get info for Alexander Scriabin - Etudes from Op.8
'NoneType' object has no attribute 'attrs'
Failed to get info for Alexander Scriabin - Fragilite, Op.51
<div>
                                            Post-Romantic                                    </div>
<div>
                                            Romantic                                    </div>
<div>
                                            Modern                                    </div>
'NoneType' object has no attribute 'attrs'
Failed to get info for Alexander Scriabin - Two Impromptus, Op. 14 (Complete)
'NoneType' object has no attribute 'attrs'
Failed to get info for Carl Maria von Weber - Sonata No. 1, 2nd, 3rd and 4th movements
'NoneType' object h

In [14]:
top_eras = ["post-romantic", "romantic", "baroque", "impressionist", "modern", "classical"]

eras_df = pd.read_csv("../data/clean/eras.csv")
eras_df.head()

Unnamed: 0,split,midi_filename,duration,tags
0,train,2017/MIDI-Unprocessed_066_PIANO066_MID--AUDIO-...,464.649433,contemporary-classical+20th-century-classical+...
1,train,2004/MIDI-Unprocessed_XP_21_R1_2004_01_ORIG_MI...,872.640588,post-romantic
2,validation,2006/MIDI-Unprocessed_17_R1_2006_01-06_ORIG_MI...,397.857508,composer+classical+russian+romantic
3,validation,2009/MIDI-Unprocessed_07_R1_2009_04-05_ORIG_MI...,400.557826,composer+classical+russian+romantic
4,test,2009/MIDI-Unprocessed_11_R1_2009_06-09_ORIG_MI...,163.74583,


In [22]:
for i, row in eras_df.iterrows():
    if not row["eras"]: continue

    eras = str(row["eras"]).split("+")

    for era in top_eras:
        if era in eras or era in str(row["eras"]):
            eras_df.loc[i, "eras"] = era.strip().lower()
            break

eras_df.to_csv("../data/clean/eras.csv", index=False)

In [23]:
eras_df["eras"].value_counts()

tags
romantic         432
baroque           74
classical         40
modern            18
impressionist     12
post-romantic     11
contemporary       2
Name: count, dtype: int64

In [21]:
eras_df.dropna(subset=["eras"], inplace=True)
eras_df["eras"].isna().sum()

0