# Imports and Constants

A Last.fm API key is required to run this notebook. To obtain one, you first have to register for a Last.fm API account.

In [16]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
%autoreload 2
import requests
import pandas as pd
import numpy as np

from datetime import datetime
from dateutil import parser

from dotenv import load_dotenv

import metalhistory as mh
import metalhistory.utils as utils

# For devs: Overview of functions
These functions from `utils.py` are implemented so far (they are not yet covered by tests):

In [18]:
# Call each of the three LastFM helpers once for the same album.
# NOTE: verbose=1 prints the generated request URL, which contains the
# api_key query parameter -- clear this cell's output before sharing.
lastfm = utils.LastFM()
demo_artist, demo_album = 'Black Sabbath', 'Paranoid'
lastfm.get_album_info(demo_artist, demo_album, verbose=1)
lastfm.get_album_matches(demo_artist, demo_album, verbose=1)
# Last expression: the track info dict is rendered as the cell output.
lastfm.get_track_info(demo_artist, demo_album, 'War Pigs', verbose=1)

Generated API Request: http://ws.audioscrobbler.com/2.0/?&api_key=<REDACTED>&method=album.getinfo&artist=Black Sabbath&album=Paranoid&format=json
Generated API Request: http://ws.audioscrobbler.com/2.0/?&api_key=<REDACTED>&method=album.search&artist=Black Sabbath&album=Paranoid&format=json
Generated API Request: http://ws.audioscrobbler.com/2.0/?&api_key=<REDACTED>&method=track.getinfo&artist=Black Sabbath&album=Paranoid&track=War Pigs&format=json


{'name': 'War Pigs',
 'mbid': 'c2786bd8-7dc7-4633-ab6c-70c70ebd432f',
 'url': 'https://www.last.fm/music/Black+Sabbath/_/War+Pigs',
 'duration': '476000',
 'streamable': {'#text': '0', 'fulltrack': '0'},
 'listeners': '673385',
 'playcount': '3737549',
 'artist': {'name': 'Black Sabbath',
  'mbid': '5182c1d9-c7d2-4dad-afa0-ccfeada921a8',
  'url': 'https://www.last.fm/music/Black+Sabbath'},
 'album': {'artist': 'Black Sabbath',
  'title': 'Greatest Hits',
  'mbid': '4d7125aa-0352-4b93-a769-7179c5bdd255',
  'url': 'https://www.last.fm/music/Black+Sabbath/Greatest+Hits',
  'image': [{'#text': 'https://lastfm.freetls.fastly.net/i/u/34s/e88b3980787c4714b79c3cab44986595.png',
    'size': 'small'},
   {'#text': 'https://lastfm.freetls.fastly.net/i/u/64s/e88b3980787c4714b79c3cab44986595.png',
    'size': 'medium'},
   {'#text': 'https://lastfm.freetls.fastly.net/i/u/174s/e88b3980787c4714b79c3cab44986595.png',
    'size': 'large'},
   {'#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/e88

# Data analysis

Show the first 5 entries of the Excel file.

In [19]:
# Load the album list from the 'Gesamt' sheet.
# `pd.read_csv` has no `sheet_name` parameter (passing it raises TypeError);
# sheets are an Excel concept, so the workbook is read with `pd.read_excel`.
# NOTE(review): the path still ends in '.csv' -- confirm that the file on
# disk really is an Excel workbook and rename it accordingly.
df = pd.read_excel('data/MA_10k_albums.csv', sheet_name='Gesamt')
# Remove the columns that the rest of the notebook does not read
# (only 'Band' and 'Album' are used further down).
df = df.drop(columns=['Unnamed: 9', 0.5, 'Year', 'Month', 'Day', 'Spotify', 'Factor', 'Score'])
# Discard every row that still contains a missing value.
df = df.dropna(axis=0)
df.head()

Unnamed: 0,Number,Band,Album
0,1.0,Black Sabbath,Paranoid
1,2.0,Led Zeppelin,Led Zeppelin IV
2,3.0,Deep Purple,Machine Head
3,4.0,Queen,A Night at the Opera
4,5.0,Queen,Jazz


Many albums occur several times with similar names. Their counts should be summed up.
See the following example for "Black Sabbath - Paranoid" and "Led Zeppelin - Led Zeppelin IV".

In [20]:
# For the first two albums of the list, collect every Last.fm search hit
# whose artist name matches the band exactly, together with its album info.
lastfm = utils.LastFM()
album_records = []
for _, row in df.head(2).iterrows():
    artist = row['Band']
    album = row['Album']
    matches = lastfm.get_album_matches(artist, album)

    for match_list in matches['results']['albummatches'].values():
        for candidate in match_list:
            # Skip hits that are credited to a different artist.
            if candidate['artist'] != artist:
                continue
            album_records.append({
                'artist': artist,
                'album': album,
                'album_instance': candidate['name'],
                'lastfm_info': lastfm.get_album_info(candidate['artist'], candidate['name'])
            })
            print(candidate['artist'], '-', candidate['name'])

# Build the frame once from the collected records: `DataFrame.append` was
# removed in pandas 2.0 and re-copied the whole frame on every call.
# The explicit column list keeps the schema (and the column order of the
# saved outputs) stable even when no match was found.
results_df = pd.DataFrame(
    album_records,
    columns=['album', 'album_instance', 'artist', 'lastfm_info'],
)

Black Sabbath - Paranoid
Black Sabbath - Paranoid (2009 Remastered Version)
Black Sabbath - Paranoid (Remastered)
Black Sabbath - Paranoid (2014 Remaster)
Black Sabbath - Paranoid (Remaster)
Black Sabbath - Paranoid (Remastered Edition)
Black Sabbath - Paranoid (Deluxe Edition)
Black Sabbath - Paranoid (Deluxe Expanded Edit
Black Sabbath - Paranoid (Original Album) Disc 1
Black Sabbath - Paranoid (Deluxe Expanded Edition) CD1
Black Sabbath - Paranoid (Black Box: Disc 2)
Black Sabbath - Paranoid (Us 1st Press, Wbm 3104-2)
Black Sabbath - Paranoid (Deluxe Expanded Edition, 2cd, Sanctuary 1782444)
Black Sabbath - Paranoid [2009, Sanctuary, 1782444]
Black Sabbath - Paranoid - Black Box -  The Complete Original Black Sabbath
Black Sabbath - Paranoid [1989, Vertigo, 23PD-134]
Black Sabbath - Paranoid [Deluxe Edition]
Black Sabbath - Paranoid (Black Box, R2 73923-B)
Led Zeppelin - Led Zeppelin IV
Led Zeppelin - Led Zeppelin IV (Deluxe Edition)
Led Zeppelin - Led Zeppelin IV (1994 Remaster
Led

Add some entries of the Last.fm info as columns to the dataframe.

In [21]:
# Work on a copy so the raw search results in `results_df` are not mutated
# by the column assignments below.
df = results_df.copy()
album_info = df['lastfm_info']
# Pull single fields out of the info dicts column-wise (`Series.map`)
# instead of row-wise `DataFrame.apply(axis=1)`; this also works on an
# empty frame. The counters are converted to int so they can be summed.
df['listeners'] = album_info.map(lambda info: int(info['listeners']))
df['playcount'] = album_info.map(lambda info: int(info['playcount']))
# TODO: add a 'release' column once a get_release_date(lastfm_info) helper exists.
df['url'] = album_info.map(lambda info: info['url'])
df

Unnamed: 0,album,album_instance,artist,lastfm_info,listeners,playcount,url
0,Paranoid,Paranoid,Black Sabbath,"{'name': 'Paranoid', 'artist': 'Black Sabbath'...",1259763,15301867,https://www.last.fm/music/Black+Sabbath/Paranoid
1,Paranoid,Paranoid (2009 Remastered Version),Black Sabbath,"{'name': 'Paranoid (2009 Remastered Version)',...",253344,2837094,https://www.last.fm/music/Black+Sabbath/Parano...
2,Paranoid,Paranoid (Remastered),Black Sabbath,"{'name': 'Paranoid (Remastered)', 'artist': 'B...",170170,1435428,https://www.last.fm/music/Black+Sabbath/Parano...
3,Paranoid,Paranoid (2014 Remaster),Black Sabbath,"{'name': 'Paranoid (2014 Remaster)', 'artist':...",78861,543345,https://www.last.fm/music/Black+Sabbath/Parano...
4,Paranoid,Paranoid (Remaster),Black Sabbath,"{'name': 'Paranoid (Remaster)', 'artist': 'Bla...",49957,274526,https://www.last.fm/music/Black+Sabbath/Parano...
...,...,...,...,...,...,...,...
59,Led Zeppelin IV,Led Zeppelin IV [Deluxe Edition] Disc 1,Led Zeppelin,{'name': 'Led Zeppelin IV [Deluxe Edition] Dis...,108,2673,https://www.last.fm/music/Led+Zeppelin/Led+Zep...
60,Led Zeppelin IV,Led Zepplin IV,Led Zeppelin,"{'name': 'Led Zepplin IV', 'artist': 'Led Zepp...",1042,9425,https://www.last.fm/music/Led+Zeppelin/Led+Zep...
61,Led Zeppelin IV,Led Zeppelin IV - Super Deluxe Edition (CD1),Led Zeppelin,{'name': 'Led Zeppelin IV - Super Deluxe Editi...,195,2706,https://www.last.fm/music/Led+Zeppelin/Led+Zep...
62,Led Zeppelin IV,IV [Box Set Remaster],Led Zeppelin,"{'name': 'IV [Box Set Remaster]', 'artist': 'L...",373,8263,https://www.last.fm/music/Led+Zeppelin/IV+%5BB...


Sum up the different entries that correspond to the same album:

In [22]:
# TODO: Should probably be part of the utils.py
# Sum listeners and playcount over all Last.fm entries of the same album.
# The numeric columns are selected explicitly: the frame also holds dict and
# string columns ('lastfm_info', 'url'), which `.sum()` no longer drops
# silently in pandas >= 2.0.
cumulative_df = df.groupby(['artist', 'album'])[['listeners', 'playcount']].sum()
cumulative_df

Unnamed: 0_level_0,Unnamed: 1_level_0,listeners,playcount
artist,album,Unnamed: 2_level_1,Unnamed: 3_level_1
Black Sabbath,Paranoid,1936391,21611158
Led Zeppelin,Led Zeppelin IV,1895932,26756229


For each album, get the Last.fm album entry with the most listeners.

In [23]:
# Boolean mask that is True for every row whose listener count equals the
# maximum within its (artist, album) group; tied rows are all kept.
# The aggregation is named by the string 'max' because passing the builtin
# `max` to `transform` is deprecated in recent pandas versions.
is_top_instance = (
    df.groupby(['artist', 'album'])['listeners'].transform('max') == df['listeners']
)
top_albums_df = df[is_top_instance]
top_albums_df

Unnamed: 0,album,album_instance,artist,lastfm_info,listeners,playcount,url
0,Paranoid,Paranoid,Black Sabbath,"{'name': 'Paranoid', 'artist': 'Black Sabbath'...",1259763,15301867,https://www.last.fm/music/Black+Sabbath/Paranoid
18,Led Zeppelin IV,Led Zeppelin IV,Led Zeppelin,"{'name': 'Led Zeppelin IV', 'artist': 'Led Zep...",1048970,17008960,https://www.last.fm/music/Led+Zeppelin/Led+Zep...


Get listeners per song

In [24]:
# Fetch the Last.fm track info for every track listed in the album info of
# each top album entry.
track_records = []

for _, row in top_albums_df.iterrows():
    artist = row['artist']
    album = row['album']
    tracks = row['lastfm_info']['tracks']['track']

    for track in tracks:
        track_records.append({
            'artist': artist,
            'album': album,
            'track': track['name'],
            'lastfm_info': lastfm.get_track_info(artist, album, track['name'])
        })

# Build the frame once from the collected records: `DataFrame.append` was
# removed in pandas 2.0 and re-copied the whole frame on every call.
# The explicit column list keeps the schema (and the column order of the
# saved outputs) stable even when no track was found.
tracks_df = pd.DataFrame(
    track_records,
    columns=['album', 'artist', 'lastfm_info', 'track'],
)

In [25]:
# Derive the per-track counters into a NEW frame instead of overwriting
# `tracks_df`: the original cell dropped 'lastfm_info' from `tracks_df`
# itself, so running it a second time failed on the missing column.
track_info = tracks_df['lastfm_info']
track_stats_df = tracks_df.drop(columns=['lastfm_info']).assign(
    listeners=track_info.map(lambda info: int(info['listeners'])),
    playcount=track_info.map(lambda info: int(info['playcount'])),
)

# Within each album, order the tracks by listeners (most listened first).
# The result is indexed by (album, original row label).
top_tracks_df = track_stats_df.groupby(['album']).apply(
    lambda album_tracks: album_tracks.sort_values(['listeners'], ascending=False)
)
top_tracks_df

Unnamed: 0_level_0,Unnamed: 1_level_0,album,artist,track,listeners,playcount
album,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Led Zeppelin IV,11,Led Zeppelin IV,Led Zeppelin,Stairway to Heaven,1310821,11171648
Led Zeppelin IV,8,Led Zeppelin IV,Led Zeppelin,Black Dog,897099,5855617
Led Zeppelin IV,9,Led Zeppelin IV,Led Zeppelin,Rock and Roll,743872,4834377
Led Zeppelin IV,14,Led Zeppelin IV,Led Zeppelin,Going to California,618739,3818484
Led Zeppelin IV,15,Led Zeppelin IV,Led Zeppelin,When the Levee Breaks,609603,3742656
Led Zeppelin IV,10,Led Zeppelin IV,Led Zeppelin,The Battle of Evermore,513801,2805326
Led Zeppelin IV,12,Led Zeppelin IV,Led Zeppelin,Misty Mountain Hop,513397,2657664
Led Zeppelin IV,13,Led Zeppelin IV,Led Zeppelin,Four Sticks,391363,1931674
Paranoid,1,Paranoid,Black Sabbath,Paranoid,1310226,9155686
Paranoid,3,Paranoid,Black Sabbath,Iron Man,968142,5655105


# Export functions

Export results to Excel sheet

In [26]:
# Write each result frame to its own workbook; every entry is
# (frame, target path, sheet name) and the files are written in this order.
excel_exports = [
    (cumulative_df, 'data/history_of_metal_lastfm_albums_cumulative.xlsx', 'albums_(cumulative)'),
    (df, 'data/history_of_metal_lastfm_albums_raw.xlsx', 'albums_(raw)'),
    (top_tracks_df, 'data/history_of_metal_lastfm_tracks.xlsx', 'tracks'),
]
for frame, target_path, sheet in excel_exports:
    frame.to_excel(target_path, sheet_name=sheet)