# Imports and Constants

An API key must be given. Registration required first.

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import requests
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil import parser

from dotenv import load_dotenv

from tqdm import tqdm, tqdm_notebook

# import metalhistory as mh
import metalhistory.data_query_functions as dqf

In [3]:
tqdm.pandas()

# For devs: Overview of functions
These functions from utils.py work (untested) so far:

In [4]:
lastfm = dqf.LastFM()
lastfm.get_album_info('Death', 'Symbolic', verbose=1)
# lastfm.get_album_matches('Burzum', 'Filosofem',  verbose=1)
# lastfm.get_track_info('Black Sabbath', 'Paranoid', 'War Pigs', verbose=1)

Generated API Request: http://ws.audioscrobbler.com/2.0/?&api_key=2e171c2f29c3f9b0258954e2edb289c3&method=album.getinfo&artist=Death&album=Symbolic&format=json


{'name': 'Symbolic',
 'artist': 'Death',
 'mbid': '321a3c33-9310-4b9f-b104-762e465ec60f',
 'url': 'https://www.last.fm/music/Death/Symbolic',
 'image': [{'#text': 'https://lastfm.freetls.fastly.net/i/u/34s/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'small'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/64s/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'medium'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/174s/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'large'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'extralarge'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': 'mega'},
  {'#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/459b51d39e5447e8c7f86ea0a8b34487.png',
   'size': ''}],
 'listeners': '180443',
 'playcount': '5710284',
 'tracks': {'track': [{'name': 'Symbolic',
    'url': 'https://www.last.fm/music/Death/_/Symboli

# Data Preprocessing

In [5]:
df_csv = pd.read_csv('data/MA_10k_albums.csv')
# df_csv = df_csv.dropna(axis=0)
df_csv

Unnamed: 0,artist,album,MA_score
0,Slayer,Reign in Blood,36.01
1,Metallica,Kill 'Em All,33.39
2,Hades Archer,Penis Metal,32.67
3,Iron Maiden,Iron Maiden,32.38
4,Metallica,Master of Puppets,31.83
...,...,...,...
9995,Iron Maiden,Live at the Rainbow,1.92
9996,Jorn,Worldchanger,1.92
9997,Juggernaut,Trouble Within,1.92
9998,Lacrimas Profundere,Memorandum,1.92


## Collected all instances of an album

In [10]:
lastfm = dqf.LastFM()
results_df = pd.DataFrame()

LIMIT=37 # Obituary - Cause of Death is causing some error here at idx 38!

df_head = df_csv.head(LIMIT)

for idx, row in tqdm_notebook(df_head.iterrows(), total=df_head.shape[0], desc='artists'):
    artist = row['artist']
    album = row['album']
    
    print(idx,': Querying for', artist, '-', album)
    matches = list(lastfm.get_album_matches(artist, album)['results']['albummatches'].values())[0]
    # print(matches)

    nr_matches =  len(matches)
    nr_kept_matches = 0

    for match in tqdm_notebook(matches, desc='album matches', leave=False):
        if match['artist'] == artist:
            results_df = results_df.append({
                'artist': artist,
                'album': album,
                'album_instance': match['name'],
                'lastfm_info': lastfm.get_album_info(match['artist'], match['name'])
            }, ignore_index=True)
            nr_kept_matches += 1        
    print('Kept', nr_kept_matches, 'out of', nr_matches, 'matches for', artist, '-', album, end='\n\n')

artists:   0%|          | 0/37 [00:00<?, ?it/s]

0 : Querying for Slayer - Reign in Blood


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 44 out of 50 matches for Slayer - Reign in Blood

1 : Querying for Metallica - Kill 'Em All


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 29 out of 50 matches for Metallica - Kill 'Em All

2 : Querying for Hades Archer - Penis Metal


album matches:   0%|          | 0/21 [00:00<?, ?it/s]

Kept 7 out of 21 matches for Hades Archer - Penis Metal

3 : Querying for Iron Maiden - Iron Maiden


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 50 out of 50 matches for Iron Maiden - Iron Maiden

4 : Querying for Metallica - Master of Puppets


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 35 out of 50 matches for Metallica - Master of Puppets

5 : Querying for Iron Maiden - The Number of the Beast


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 41 out of 50 matches for Iron Maiden - The Number of the Beast

6 : Querying for Megadeth - Rust in Peace


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 47 out of 50 matches for Megadeth - Rust in Peace

7 : Querying for Agalloch - The Mantle


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 11 out of 50 matches for Agalloch - The Mantle

8 : Querying for Metallica - Ride the Lightning


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 40 out of 50 matches for Metallica - Ride the Lightning

9 : Querying for Black Sabbath - Paranoid


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 17 out of 50 matches for Black Sabbath - Paranoid

10 : Querying for Morbid Angel - Altars of Madness


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 45 out of 50 matches for Morbid Angel - Altars of Madness

11 : Querying for Burzum - Hvis lyset tar oss


album matches:   0%|          | 0/49 [00:00<?, ?it/s]

Kept 39 out of 49 matches for Burzum - Hvis lyset tar oss

12 : Querying for Burzum - Filosofem


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 17 out of 50 matches for Burzum - Filosofem

13 : Querying for Death - Symbolic


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 24 out of 50 matches for Death - Symbolic

14 : Querying for Dissection - Storm of the Light's Bane


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 49 out of 50 matches for Dissection - Storm of the Light's Bane

15 : Querying for Metallica - ...and Justice for All


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 41 out of 50 matches for Metallica - ...and Justice for All

16 : Querying for Iron Maiden - Powerslave


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 40 out of 50 matches for Iron Maiden - Powerslave

17 : Querying for Sepultura - Beneath the Remains


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 41 out of 50 matches for Sepultura - Beneath the Remains

18 : Querying for Megadeth - Dystopia


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 1 out of 50 matches for Megadeth - Dystopia

19 : Querying for Black Sabbath - Black Sabbath


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 50 out of 50 matches for Black Sabbath - Black Sabbath

20 : Querying for Sodom - Agent Orange


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 11 out of 50 matches for Sodom - Agent Orange

21 : Querying for Death - Scream Bloody Gore


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 49 out of 50 matches for Death - Scream Bloody Gore

22 : Querying for Wintersun - Time I


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 1 out of 50 matches for Wintersun - Time I

23 : Querying for Black Sabbath - 13


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 2 out of 50 matches for Black Sabbath - 13

24 : Querying for Megadeth - Countdown to Extinction


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 48 out of 50 matches for Megadeth - Countdown to Extinction

25 : Querying for Judas Priest - Painkiller


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 7 out of 50 matches for Judas Priest - Painkiller

26 : Querying for Dark Angel - Darkness Descends


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 33 out of 50 matches for Dark Angel - Darkness Descends

27 : Querying for Immortal - At the Heart of Winter


album matches:   0%|          | 0/49 [00:00<?, ?it/s]

Kept 31 out of 49 matches for Immortal - At the Heart of Winter

28 : Querying for Behemoth - The Satanist


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 22 out of 50 matches for Behemoth - The Satanist

29 : Querying for Demolition Hammer - Epidemic of Violence


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 42 out of 50 matches for Demolition Hammer - Epidemic of Violence

30 : Querying for Iron Maiden - Seventh Son of a Seventh Son


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 46 out of 50 matches for Iron Maiden - Seventh Son of a Seventh Son

31 : Querying for Iron Maiden - Killers


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 5 out of 50 matches for Iron Maiden - Killers

32 : Querying for Slayer - Show No Mercy


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 33 out of 50 matches for Slayer - Show No Mercy

33 : Querying for Death - Human


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 1 out of 50 matches for Death - Human

34 : Querying for Children of Bodom - Follow the Reaper


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 39 out of 50 matches for Children of Bodom - Follow the Reaper

35 : Querying for Mayhem - Deathcrush


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 24 out of 50 matches for Mayhem - Deathcrush

36 : Querying for Wintersun - Wintersun


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 33 out of 50 matches for Wintersun - Wintersun



In [15]:
results_df.head(535).tail()

Unnamed: 0,album,album_instance,artist,lastfm_info
530,...and Justice for All,...And Justice For All (1988),Metallica,"{'name': '...And Justice For All (1988)', 'art..."
531,...and Justice for All,...And Justice For All (Super Deluxe Edition),Metallica,{'name': '...And Justice For All (Super Deluxe...
532,...and Justice for All,...And Justice For All!,Metallica,"{'name': '...And Justice For All!', 'artist': ..."
533,...and Justice for All,...And Justice For All + Bonus Tracks,Metallica,
534,...and Justice for All,...And Justice For All (Ultimate),Metallica,"{'name': '...And Justice For All (Ultimate)', ..."


Add some entries of the lastfm info as column to the dataframe.

In [33]:
def get_listeners(x):
    if isinstance(x, dict):
        return int(x['listeners'])
    else:
        return None

def get_playcounts(x):
    if isinstance(x, dict):
        return int(x['playcount'])
    else:
        return None


In [35]:
df = results_df
df['listeners'] = df.progress_apply(lambda row: get_listeners(row['lastfm_info']), axis=1)
df['playcount'] = df.progress_apply(lambda row: get_playcounts(row['lastfm_info']), axis=1)
df

100%|██████████| 1095/1095 [00:00<00:00, 150834.60it/s]
100%|██████████| 1095/1095 [00:00<00:00, 142885.32it/s]


Unnamed: 0,album,album_instance,artist,lastfm_info,listeners,playcount
0,Reign in Blood,Reign in Blood,Slayer,"{'name': 'Reign in Blood', 'artist': 'Slayer',...",822151.0,15579798.0
1,Reign in Blood,Reign In Blood (Expanded),Slayer,"{'name': 'Reign In Blood (Expanded)', 'artist'...",165687.0,2127792.0
2,Reign in Blood,Reign In Blood (Expanded Edition),Slayer,"{'name': 'Reign In Blood (Expanded Edition)', ...",12823.0,372560.0
3,Reign in Blood,Reign In Blood (1994 Reissue),Slayer,"{'name': 'Reign In Blood (1994 Reissue)', 'art...",4856.0,130026.0
4,Reign in Blood,Reign In Blood (Remastered),Slayer,"{'name': 'Reign In Blood (Remastered)', 'artis...",2092.0,75516.0
...,...,...,...,...,...,...
1090,Wintersun,(2004) - Wintersun,Wintersun,"{'name': '(2004) - Wintersun', 'artist': 'Wint...",11.0,1383.0
1091,Wintersun,Jari Mäenpää,Wintersun,"{'name': 'Jari Mäenpää', 'artist': 'Wintersun'...",241.0,7222.0
1092,Wintersun,(2004) Wintersun,Wintersun,"{'name': '(2004) Wintersun', 'artist': 'Winter...",35.0,930.0
1093,Wintersun,Wintersun (Japan Release),Wintersun,"{'name': 'Wintersun (Japan Release)', 'artist'...",2.0,665.0


Sum up the different entries that correspond to the same album:

In [36]:
#TODO: Should probably be part of the utils.py
cumulative_df = df.drop(['album_instance'], axis=1).groupby(['artist','album']).sum()
cumulative_df

Unnamed: 0_level_0,Unnamed: 1_level_0,listeners,playcount
artist,album,Unnamed: 2_level_1,Unnamed: 3_level_1
Agalloch,The Mantle,352832.0,12132834.0
Behemoth,The Satanist,121675.0,5022416.0
Black Sabbath,13,194395.0,3628614.0
Black Sabbath,Black Sabbath,6272170.0,78871048.0
Black Sabbath,Paranoid,1937226.0,21612793.0
Burzum,Filosofem,594290.0,13026055.0
Burzum,Hvis lyset tar oss,224814.0,3189025.0
Children of Bodom,Follow the Reaper,398938.0,9722337.0
Dark Angel,Darkness Descends,78794.0,1750565.0
Death,Human,149273.0,3632595.0


## Get tags of the most relevant album match

In [59]:
#TODO: Move these functions to the data_query_functions.py !
def get_tags(x):
    tag_names = []
    for tag in x['tags']['tag']:
        tag_names.append(tag['name'])
    return tag_names

def get_url(x):
    return x['url']

def get_info(x):
    artist = x.name[0]
    album = x.name[1]
    lastfm = dqf.LastFM()
    album_info = lastfm.get_album_info(artist, album)
    return album_info

def get_img(x):
    return x['image']

# TODO: Release date not present. Should use another API for that?
# def get_releasedate(x):
#     return x['releasedate']

def get_mbid(x):
    try:
        return x['mbid']
    except KeyError:
        return None

import requests
import pandas as pd
import xmltodict
import time

def get_release_date(x):
    print(x)
    if x is not None:
        response = requests.get('http://musicbrainz.org/ws/2/release/' + str(x) + '?inc=release-groups&fmt=xml')
        while response.status_code == 503:
            print('503, wating for five seconds.')
            time.sleep(5)
            response = requests.get('http://musicbrainz.org/ws/2/release/' + str(x) + '?inc=release-groups&fmt=xml')

        response_dict = xmltodict.parse(response.text)

        return response_dict['metadata']['release']['release-group']['first-release-date']

    else:
        return None




In [60]:
cumulative_df['lastfm_info'] = cumulative_df.apply(lambda row: get_info(row), axis=1)
cumulative_df['tags'] = cumulative_df.apply(lambda row: get_tags(row['lastfm_info']), axis=1)
cumulative_df['url'] = cumulative_df.apply(lambda row: get_url(row['lastfm_info']), axis=1)
cumulative_df['images'] = cumulative_df.apply(lambda row: get_img(row['lastfm_info']), axis=1)
cumulative_df['mbid'] = cumulative_df.apply(lambda row: get_mbid(row['lastfm_info']), axis=1)
cumulative_df['release_date'] = cumulative_df.apply(lambda row: get_release_date(row['mbid']), axis=1)
cumulative_df.drop(['lastfm_info', 'url', 'images'], axis=1)


7ac99528-77a9-3624-84b7-3400f6f56e47
8f2a9448-cec0-41aa-b518-3e067c65a336
7dbf4b1f-d3e9-47bc-9194-d15b31017bd6
d4d6b8d9-413f-3aa6-9f4b-d51be1eb740c
2982b682-36ea-3605-b959-04e746736070
1ce9177c-62a0-4403-a7ee-7359026fcbf6
b6928219-0772-39ac-8156-91a609b2fd5e
6b15fc3b-ea9f-30a9-9209-c6743add0913
383aedc8-841b-40a9-b17f-00b5eb8a18dd
c5ca70aa-d86b-4a0d-84fc-910ca6011881
2ba42c9d-f654-4214-9fbb-a5d694d955d1
503, wating for five seconds.
321a3c33-9310-4b9f-b104-762e465ec60f
5b18c38f-a133-4bd4-a2f2-da6272a189df
76a7d348-f6e2-4e36-b529-b3cc003c3231
None
8c8395a0-3a02-35d6-a975-535112ec057e
25da813d-4dbd-32c0-aef0-307e790f0709
91ddcf18-98af-4f73-890c-bfc44c1d91e2
556c0066-8114-33ad-aa45-ab2f203e2777
53a1a90f-cc58-39ea-9fc2-20dd103bda01
6ea83f20-d053-3495-bc25-76aa61da13ab
7e9b0af2-5fba-4cfa-8258-23be6afe768d
08f88d68-c0bf-474a-8707-43cd58e28ab6
1a77f8a7-54ab-4568-8003-42240cd29ab0
None
2b904e74-daba-397c-a151-bafb125ceb44
4dede30f-58cf-4d43-a857-b342ad7be945
c06ed440-f25d-3127-aadb-ebe9c685b3d

Unnamed: 0_level_0,Unnamed: 1_level_0,listeners,playcount,tags,mbid,release_date
artist,album,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Agalloch,The Mantle,352832.0,12132834.0,"[folk metal, doom metal, black metal, albums I...",7ac99528-77a9-3624-84b7-3400f6f56e47,2002-08-13
Behemoth,The Satanist,121675.0,5022416.0,"[2014, Blackened Death Metal, black metal, dea...",8f2a9448-cec0-41aa-b518-3e067c65a336,2014-02-03
Black Sabbath,13,194395.0,3628614.0,"[heavy metal, 2013, doom metal, hard rock, metal]",7dbf4b1f-d3e9-47bc-9194-d15b31017bd6,2013-06-07
Black Sabbath,Black Sabbath,6272170.0,78871048.0,"[heavy metal, hard rock, albums I own, 1970, c...",d4d6b8d9-413f-3aa6-9f4b-d51be1eb740c,1970-02-13
Black Sabbath,Paranoid,1937226.0,21612793.0,"[heavy metal, hard rock, albums I own, classic...",2982b682-36ea-3605-b959-04e746736070,1970-09-18
Burzum,Filosofem,594290.0,13026055.0,"[black metal, ambient black metal, atmospheric...",1ce9177c-62a0-4403-a7ee-7359026fcbf6,1996-01-31
Burzum,Hvis lyset tar oss,224814.0,3189025.0,"[black metal, ambient black metal, atmospheric...",b6928219-0772-39ac-8156-91a609b2fd5e,1994-04
Children of Bodom,Follow the Reaper,398938.0,9722337.0,"[Melodic Death Metal, albums I own, Power meta...",6b15fc3b-ea9f-30a9-9209-c6743add0913,2000-10-30
Dark Angel,Darkness Descends,78794.0,1750565.0,"[thrash metal, albums I own, 1986, bay area th...",383aedc8-841b-40a9-b17f-00b5eb8a18dd,1986-11-17
Death,Human,149273.0,3632595.0,"[death metal, Technical Death Metal, albums I ...",c5ca70aa-d86b-4a0d-84fc-910ca6011881,1991-10-22


In [64]:
attr_list = ['release_date', 'listeners', 'playcount', 'tags', 'mbid', 'url', 'images']
cumulative_df[attr_list].sort_values(by='listeners', ascending=False).to_csv('./data/proc_MA_10k_albums.csv')