# Imports and Constants

An API key must be given. Registration required first.

In [1]:
%load_ext autoreload

In [2]:
%autoreload 2
import requests
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil import parser

from dotenv import load_dotenv

from tqdm import tqdm, tqdm_notebook

# import metalhistory as mh
import metalhistory.data_query_functions as dqf

import requests
import pandas as pd
import xmltodict
import time

In [3]:
tqdm.pandas()

# For devs: Overview of functions
These functions from utils.py work (untested) so far:

In [4]:
# lastfm = dqf.LastFM()
# lastfm.get_album_info('Death', 'Symbolic', verbose=1)
# lastfm.get_album_matches(album='Filosofem')
# lastfm.get_track_info('Black Sabbath', 'Paranoid', 'War Pigs', verbose=1)

# Data Preprocessing

In [5]:
df_csv = pd.read_csv('data/MA_10k_albums.csv')
df_csv

Unnamed: 0,artist,album,MA_score
0,Slayer,Reign in Blood,36.01
1,Metallica,Kill 'Em All,33.39
2,Hades Archer,Penis Metal,32.67
3,Iron Maiden,Iron Maiden,32.38
4,Metallica,Master of Puppets,31.83
...,...,...,...
9995,Iron Maiden,Live at the Rainbow,1.92
9996,Jorn,Worldchanger,1.92
9997,Juggernaut,Trouble Within,1.92
9998,Lacrimas Profundere,Memorandum,1.92


## Collected all instances of an album

In [6]:
lastfm = dqf.LastFM()
results_df = pd.DataFrame()

LIMIT=10 # Obituary - Cause of Death is causing some error here at idx 38 URL special key probably causes that issue!

df_head = df_csv.head(LIMIT).tail(2)

for idx, row in tqdm_notebook(df_head.iterrows(), total=df_head.shape[0], desc='artists'):
    artist = row['artist']
    album = row['album']
    
    print(idx,': Querying for', artist, '-', album)
    matches = list(lastfm.get_album_matches(album=album)['results']['albummatches'].values())[0]

    nr_matches =  len(matches)
    nr_kept_matches = 0

    for match in tqdm_notebook(matches, desc='album matches', leave=False):
        if match['artist'] == artist:
            results_df = results_df.append({
                'artist': artist,
                'album': album,
                'album_instance': match['name'],
                'lastfm_info': lastfm.get_album_info(artist=match['artist'], album=match['name'])
            }, ignore_index=True)
            nr_kept_matches += 1        
    print('Kept', nr_kept_matches, 'out of', nr_matches, 'matches for', artist, '-', album, end='\n\n')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for idx, row in tqdm_notebook(df_head.iterrows(), total=df_head.shape[0], desc='artists'):


artists:   0%|          | 0/2 [00:00<?, ?it/s]

8 : Querying for Metallica - Ride the Lightning


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for match in tqdm_notebook(matches, desc='album matches', leave=False):


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 40 out of 50 matches for Metallica - Ride the Lightning

9 : Querying for Black Sabbath - Paranoid


album matches:   0%|          | 0/50 [00:00<?, ?it/s]

Kept 17 out of 50 matches for Black Sabbath - Paranoid



Add some entries of the lastfm info as column to the dataframe.

In [7]:
def get_listeners(x):
    if isinstance(x, dict):
        return int(x['listeners'])
    else:
        return None

def get_playcounts(x):
    if isinstance(x, dict):
        return int(x['playcount'])
    else:
        return None


In [8]:
df = results_df
df['listeners'] = df.progress_apply(lambda row: get_listeners(row['lastfm_info']), axis=1)
df['playcount'] = df.progress_apply(lambda row: get_playcounts(row['lastfm_info']), axis=1)
df

100%|██████████| 57/57 [00:00<00:00, 34533.49it/s]
100%|██████████| 57/57 [00:00<00:00, 53246.18it/s]


Unnamed: 0,album,album_instance,artist,lastfm_info,listeners,playcount
0,Ride the Lightning,Ride the Lightning,Metallica,"{'name': 'Ride the Lightning', 'artist': 'Meta...",802823,19338958
1,Ride the Lightning,Ride The Lightning (Remastered),Metallica,"{'name': 'Ride The Lightning (Remastered)', 'a...",124326,1784844
2,Ride the Lightning,Ride the Lightning (Deluxe / Remastered),Metallica,{'name': 'Ride the Lightning (Deluxe / Remaste...,105070,975581
3,Ride the Lightning,Ride the Lightning (Deluxe Remaster),Metallica,{'name': 'Ride the Lightning (Deluxe Remaster)...,99819,943034
4,Ride the Lightning,Ride_The_Lightning,Metallica,"{'name': 'Ride_The_Lightning', 'artist': 'Meta...",13751,242036
5,Ride the Lightning,Ride The Lightning (DCC Gold CD),Metallica,"{'name': 'Ride The Lightning (DCC Gold CD)', '...",4045,87844
6,Ride the Lightning,"Ride The Lightning [1988, 25DP 5340]",Metallica,"{'name': 'Ride The Lightning [1988, 25DP 5340]...",1593,32735
7,Ride the Lightning,Ride the Lightning (2006 Japaneese Reissue),Metallica,{'name': 'Ride the Lightning (2006 Japaneese R...,1918,27422
8,Ride the Lightning,Ride The Lightning (DCC 24K Gold Remaster),Metallica,{'name': 'Ride The Lightning (DCC 24K Gold Rem...,1204,29149
9,Ride the Lightning,"Ride the Lightning (2000 DCC Remastered, Gold ...",Metallica,{'name': 'Ride the Lightning (2000 DCC Remaste...,901,25602


Sum up the different entries that correspond to the same album:

In [9]:
#TODO: Should probably be part of the utils.py
cumulative_df = df.drop(['album_instance'], axis=1).groupby(['artist','album']).sum()
cumulative_df

Unnamed: 0_level_0,Unnamed: 1_level_0,listeners,playcount
artist,album,Unnamed: 2_level_1,Unnamed: 3_level_1
Black Sabbath,Paranoid,1939701,21640048
Metallica,Ride the Lightning,1164875,23631002


## Get tags of the most relevant album match

In [10]:
#TODO: Move these functions to the data_query_functions.py !
def get_tags(x):
    tag_names = []
    for tag in x['tags']['tag']:
        tag_names.append(tag['name'])
    return tag_names

def get_url(x):
    return x['url']

def get_info(x):
    artist = x.name[0]
    album = x.name[1]
    lastfm = dqf.LastFM()
    album_info = lastfm.get_album_info(artist=artist, album=album)
    return album_info

def get_img(x):
    return x['image']

def get_mbid(x):
    try:
        return x['mbid']
    except KeyError:
        return None

def get_release_date(x):
    print(x)
    if x is not None:
        response = requests.get('http://musicbrainz.org/ws/2/release/' + str(x) + '?inc=release-groups&fmt=xml')
        while response.status_code == 503:
            print('503, wating for five seconds.')
            time.sleep(5)
            response = requests.get('http://musicbrainz.org/ws/2/release/' + str(x) + '?inc=release-groups&fmt=xml')

        response_dict = xmltodict.parse(response.text)

        return response_dict['metadata']['release']['release-group']['first-release-date']

    else:
        return None




In [11]:
cumulative_df['lastfm_info'] = cumulative_df.apply(lambda row: get_info(row), axis=1)
cumulative_df['tags'] = cumulative_df.apply(lambda row: get_tags(row['lastfm_info']), axis=1)
cumulative_df['url'] = cumulative_df.apply(lambda row: get_url(row['lastfm_info']), axis=1)
cumulative_df['images'] = cumulative_df.apply(lambda row: get_img(row['lastfm_info']), axis=1)
cumulative_df['mbid'] = cumulative_df.apply(lambda row: get_mbid(row['lastfm_info']), axis=1)
cumulative_df['release_date'] = cumulative_df.apply(lambda row: get_release_date(row['mbid']), axis=1)
cumulative_df.drop(['lastfm_info', 'url', 'images'], axis=1)


2982b682-36ea-3605-b959-04e746736070
2236dd07-a2f3-466a-973d-9069001a89da


Unnamed: 0_level_0,Unnamed: 1_level_0,listeners,playcount,tags,mbid,release_date
artist,album,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Black Sabbath,Paranoid,1939701,21640048,"[heavy metal, hard rock, albums I own, classic...",2982b682-36ea-3605-b959-04e746736070,1970-09-18
Metallica,Ride the Lightning,1164875,23631002,"[thrash metal, albums I own, metal, heavy meta...",2236dd07-a2f3-466a-973d-9069001a89da,1984-07-30


In [None]:
attr_list = ['release_date', 'listeners', 'playcount', 'tags', 'mbid', 'url', 'images']
cumulative_df[attr_list].sort_values(by='listeners', ascending=False).to_csv('./data/proc_MA_10k_albums.csv')