# Imports and Constants

A Last.fm API key is required to run this notebook; you must register for a Last.fm API account first to obtain one.

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import requests
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil import parser

from dotenv import load_dotenv

from tqdm import tqdm, tqdm_notebook

# import metalhistory as mh
import metalhistory.data_query_functions as dqf

In [3]:
# Register tqdm's pandas integration; the `progress_apply` calls further down depend on it.
tqdm.pandas()

# For devs: Overview of functions
These functions from `metalhistory/data_query_functions.py` work so far (they are not yet covered by tests):

In [4]:
# Smoke-test the Last.fm client with a single album lookup.
# NOTE(review): judging by this cell's output, verbose=1 echoes the generated request URL,
# api_key parameter included -- clear the output before sharing or committing the notebook.
lastfm = dqf.LastFM()
lastfm.get_album_info('Death', 'Symbolic', verbose=1)
# Other lookups offered by the same client (disabled here):
# lastfm.get_album_matches('Burzum', 'Filosofem',  verbose=1)
# lastfm.get_track_info('Black Sabbath', 'Paranoid', 'War Pigs', verbose=1)

Generated API Request: http://ws.audioscrobbler.com/2.0/?&api_key=<REDACTED>&method=album.getinfo&artist=Death&album=Symbolic&format=json


{'name': 'Symbolic',
 'artist': 'Death',
 'mbid': '321a3c33-9310-4b9f-b104-762e465ec60f',
 'url': 'https://www.last.fm/music/Death/Symbolic',
 'image': [{'#text': 'https://lastfm.freetls.fastly.net/i/u/34s/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'small'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/64s/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'medium'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/174s/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'large'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'extralarge'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'mega'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': ''}],
 'listeners': '180443',
 'playcount': '5710284',
 'tracks': {'track': [{'name': 'Symbolic',
    'url': 'https://www.last.fm/music/Death/_/Symboli

# Data Preprocessing

In [5]:
# Load the album list (columns: artist, album, MA_score) that drives every query below.
# Rows with missing values are kept; the dropna step stays disabled:
# df_csv = df_csv.dropna(axis=0)
albums_csv_path = 'data/MA_10k_albums.csv'
df_csv = pd.read_csv(albums_csv_path)
df_csv

Unnamed: 0,artist,album,MA_score
0,Slayer,Reign in Blood,36.01
1,Metallica,Kill 'Em All,33.39
2,Hades Archer,Penis Metal,32.67
3,Iron Maiden,Iron Maiden,32.38
4,Metallica,Master of Puppets,31.83
...,...,...,...
9995,Iron Maiden,Live at the Rainbow,1.92
9996,Jorn,Worldchanger,1.92
9997,Juggernaut,Trouble Within,1.92
9998,Lacrimas Profundere,Memorandum,1.92


## Collect all instances of an album

In [6]:
lastfm = dqf.LastFM()

# Number of albums (taken from the top of df_csv) to query.
LIMIT = 5

df_head = df_csv.head(LIMIT)

# One dict per kept match. The frame is built once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on every call.
records = []

for _, row in tqdm_notebook(df_head.iterrows(), total=df_head.shape[0], desc='artists'):
    artist = row['artist']
    album = row['album']

    print('Querying for', artist, '-', album)
    # The match list is the first value stored under results -> albummatches.
    matches = list(lastfm.get_album_matches(artist, album)['results']['albummatches'].values())[0]

    nr_matches = len(matches)
    nr_kept_matches = 0

    for match in tqdm_notebook(matches, desc='album matches', leave=False):
        # Keep only matches credited to exactly the queried artist (case-sensitive comparison).
        if match['artist'] == artist:
            records.append({
                'artist': artist,
                'album': album,
                'album_instance': match['name'],
                # Full album info for this specific release; costs one extra request per kept match.
                'lastfm_info': lastfm.get_album_info(match['artist'], match['name'])
            })
            nr_kept_matches += 1
    print('Kept', nr_kept_matches, 'out of', nr_matches, 'matches for', artist, '-', album, end='\n\n')

# Explicit column list keeps the displayed column order and yields a well-formed
# (empty) frame when nothing was kept.
results_df = pd.DataFrame(records, columns=['album', 'album_instance', 'artist', 'lastfm_info'])

artists:   0%|          | 0/5 [00:00<?, ?it/s]

Querying for Slayer - Reign in Blood


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 44 out of 50 matches for Slayer - Reign in Blood

Querying for Metallica - Kill 'Em All


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 29 out of 50 matches for Metallica - Kill 'Em All

Querying for Hades Archer - Penis Metal


album matches:   0%|          | 0/21 [00:00<?, ?it/s]

Kept 7 out of 21 matches for Hades Archer - Penis Metal

Querying for Iron Maiden - Iron Maiden


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 50 out of 50 matches for Iron Maiden - Iron Maiden

Querying for Metallica - Master of Puppets


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 35 out of 50 matches for Metallica - Master of Puppets



In [7]:
# One row per kept Last.fm match: the queried artist/album, the matched release name
# (album_instance) and the raw album-info dict fetched for it (lastfm_info).
results_df

Unnamed: 0,album,album_instance,artist,lastfm_info
0,Reign in Blood,Reign in Blood,Slayer,"{'name': 'Reign in Blood', 'artist': 'Slayer',..."
1,Reign in Blood,Reign In Blood (Expanded),Slayer,"{'name': 'Reign In Blood (Expanded)', 'artist'..."
2,Reign in Blood,Reign In Blood (Expanded Edition),Slayer,"{'name': 'Reign In Blood (Expanded Edition)', ..."
3,Reign in Blood,Reign In Blood (1994 Reissue),Slayer,"{'name': 'Reign In Blood (1994 Reissue)', 'art..."
4,Reign in Blood,Reign In Blood (Remastered),Slayer,"{'name': 'Reign In Blood (Remastered)', 'artis..."
...,...,...,...,...
160,Master of Puppets,Master Of Puppets (UK CD MFN 60),Metallica,"{'name': 'Master Of Puppets (UK CD MFN 60)', '..."
161,Master of Puppets,Master Of Puppets (Remastered) [Explicit],Metallica,{'name': 'Master Of Puppets (Remastered) [Expl...
162,Master of Puppets,Master Of Puppets (UK Version),Metallica,"{'name': 'Master Of Puppets (UK Version)', 'ar..."
163,Master of Puppets,Master of Puppets {DCC 24K},Metallica,"{'name': 'Master of Puppets {DCC 24K}', 'artis..."


Add selected entries of the Last.fm info as columns to the dataframe.

In [15]:
# Work on a copy: `df = results_df` would only alias the frame, so adding columns
# here would silently change `results_df` after it was displayed above.
df = results_df.copy()

# Last.fm reports both counters as strings; convert them to int so they can be summed later.
# Series.progress_apply is provided by the earlier tqdm.pandas() call.
df['listeners'] = df['lastfm_info'].progress_apply(lambda info: int(info['listeners']))
df['playcount'] = df['lastfm_info'].progress_apply(lambda info: int(info['playcount']))
df

100%|██████████| 165/165 [00:00<00:00, 98555.99it/s]
100%|██████████| 165/165 [00:00<00:00, 96927.19it/s]


Unnamed: 0,album,album_instance,artist,lastfm_info,listeners,playcount
0,Reign in Blood,Reign in Blood,Slayer,"{'name': 'Reign in Blood', 'artist': 'Slayer',...",822151,15579798
1,Reign in Blood,Reign In Blood (Expanded),Slayer,"{'name': 'Reign In Blood (Expanded)', 'artist'...",165687,2127792
2,Reign in Blood,Reign In Blood (Expanded Edition),Slayer,"{'name': 'Reign In Blood (Expanded Edition)', ...",12823,372560
3,Reign in Blood,Reign In Blood (1994 Reissue),Slayer,"{'name': 'Reign In Blood (1994 Reissue)', 'art...",4856,130026
4,Reign in Blood,Reign In Blood (Remastered),Slayer,"{'name': 'Reign In Blood (Remastered)', 'artis...",2092,75516
...,...,...,...,...,...,...
160,Master of Puppets,Master Of Puppets (UK CD MFN 60),Metallica,"{'name': 'Master Of Puppets (UK CD MFN 60)', '...",175,3179
161,Master of Puppets,Master Of Puppets (Remastered) [Explicit],Metallica,{'name': 'Master Of Puppets (Remastered) [Expl...,307,2160
162,Master of Puppets,Master Of Puppets (UK Version),Metallica,"{'name': 'Master Of Puppets (UK Version)', 'ar...",650,3254
163,Master of Puppets,Master of Puppets {DCC 24K},Metallica,"{'name': 'Master of Puppets {DCC 24K}', 'artis...",366,3066


Sum up the different entries that correspond to the same album:

In [16]:
#TODO: Should probably be part of the utils.py
# Sum listeners and playcount over all release variants of the same (artist, album).
# The numeric columns are selected explicitly: summing the dict-valued `lastfm_info`
# column only worked while pandas silently dropped non-numeric columns, and raises
# a TypeError on pandas 2.x.
cumulative_df = df.groupby(['artist', 'album'])[['listeners', 'playcount']].sum()
cumulative_df

Unnamed: 0_level_0,Unnamed: 1_level_0,listeners,playcount
artist,album,Unnamed: 2_level_1,Unnamed: 3_level_1
Hades Archer,Penis Metal,489,9519
Iron Maiden,Iron Maiden,8060736,137681269
Metallica,Kill 'Em All,673071,16147372
Metallica,Master of Puppets,1231515,25623793
Slayer,Reign in Blood,1027288,18550929


## Get tags of the most relevant album match

In [17]:
#TODO: Move these functions to the data_query_functions.py !
def get_tags(x):
    """Return the tag names contained in a Last.fm album-info mapping.

    Args:
        x: Album info mapping; tags are read from ``x['tags']['tag']``.

    Returns:
        list: The ``'name'`` of every tag, in the order given. Empty when the
        mapping carries no usable tag information.
    """
    tags = x.get('tags')
    # An album without tags may carry a non-dict placeholder (e.g. '') or no 'tags' key at all.
    if not isinstance(tags, dict):
        return []
    tag_entries = tags.get('tag') or []
    # A single tag may arrive as a bare dict rather than a one-element list.
    if isinstance(tag_entries, dict):
        tag_entries = [tag_entries]
    return [tag['name'] for tag in tag_entries]

def get_url(x):
    """Return the value stored under the ``'url'`` key of the album-info mapping ``x``."""
    url = x['url']
    return url

def get_info(x):
    """Fetch the Last.fm album info for a row labelled by (artist, album).

    Args:
        x: Row whose ``name`` holds the artist at position 0 and the album at
           position 1 (e.g. a row of a frame indexed by artist and album).

    Returns:
        Whatever ``LastFM.get_album_info`` returns for that artist/album pair.
    """
    key = x.name
    artist, album = key[0], key[1]
    # A fresh client is created on every call, as before.
    return dqf.LastFM().get_album_info(artist, album)

def get_img(x):
    """Return the value stored under the ``'image'`` key of the album-info mapping ``x``."""
    images = x['image']
    return images

# TODO: Release date not present. Should use another API for that?
# def get_releasedate(x):
#     return x['releasedate']


In [18]:
# Look up the Last.fm info of the canonical (artist, album) pair for every row;
# get_info reads both values from the row label, so a row-wise apply is needed here.
cumulative_df['lastfm_info'] = cumulative_df.apply(get_info, axis=1)

# Derive the remaining columns from the fetched info dicts.
# The release date is not added: get_releasedate is still commented out above.
for column, extract in (('tags', get_tags), ('url', get_url), ('images', get_img)):
    cumulative_df[column] = cumulative_df['lastfm_info'].map(extract)

cumulative_df


Unnamed: 0_level_0,Unnamed: 1_level_0,listeners,playcount,lastfm_info,tags,url,images
artist,album,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Hades Archer,Penis Metal,489,9519,"{'name': 'Penis Metal', 'artist': 'Hades Arche...",[],https://www.last.fm/music/Hades+Archer/Penis+M...,[{'#text': 'https://lastfm.freetls.fastly.net/...
Iron Maiden,Iron Maiden,8060736,137681269,"{'name': 'Iron Maiden', 'artist': 'Iron Maiden...","[heavy metal, albums I own, NWOBHM, metal, 1980]",https://www.last.fm/music/Iron+Maiden/Iron+Maiden,[{'#text': 'https://lastfm.freetls.fastly.net/...
Metallica,Kill 'Em All,673071,16147372,"{'name': 'Kill 'Em All', 'artist': 'Metallica'...","[thrash metal, albums I own, metal, heavy meta...",https://www.last.fm/music/Metallica/Kill+%27Em...,[{'#text': 'https://lastfm.freetls.fastly.net/...
Metallica,Master of Puppets,1231515,25623793,"{'name': 'Master of Puppets', 'artist': 'Metal...","[thrash metal, albums I own, metal, heavy meta...",https://www.last.fm/music/Metallica/Master+of+...,[{'#text': 'https://lastfm.freetls.fastly.net/...
Slayer,Reign in Blood,1027288,18550929,"{'name': 'Reign in Blood', 'artist': 'Slayer',...","[thrash metal, albums I own, metal, speed meta...",https://www.last.fm/music/Slayer/Reign+in+Blood,[{'#text': 'https://lastfm.freetls.fastly.net/...


In [19]:
# Export without the bulky raw info dicts, most-listened albums first.
albums_by_listeners = cumulative_df.drop('lastfm_info', axis=1)
albums_by_listeners = albums_by_listeners.sort_values(by='listeners', ascending=False)
albums_by_listeners.to_csv('./data/proc_MA_10k_albums.csv')