In [1]:
#pip install requests-cache

In [2]:
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import requests_cache
import seaborn as sns
from IPython.display import clear_output
from scipy import stats
from tqdm.notebook import tqdm
tqdm.pandas()


In [3]:
# Last.fm credentials.
# The API key is read from the LASTFM_API_KEY environment variable so that the
# secret is never saved inside (or committed with) the notebook.
API_KEY = os.environ.get('LASTFM_API_KEY', '')
USER_AGENT = 'BaggerMagger'

if not API_KEY:
    # Fail loudly but keep the kernel usable: the first request would otherwise
    # be the first place the missing key shows up.
    print('LASTFM_API_KEY is not set; export it before running the API cells.')

In [4]:
# Smoke test: one direct request with the key and user agent to confirm both are accepted

payload = {'api_key': API_KEY, 'method': 'chart.getTopArtists', 'format': 'json'}
headers = {'user-agent': USER_AGENT}

response = requests.get(
    'https://ws.audioscrobbler.com/2.0/',
    params=payload,
    headers=headers,
)
response.status_code

200

In [5]:
# Helper for sending a GET request to the Last.fm API

def lastfm_get(payload):
    """Send a GET request to the Last.fm web service.

    Parameters
    ----------
    payload : dict
        Method-specific query parameters (for example ``method`` and
        ``page``). The dictionary is left unmodified.

    Returns
    -------
    requests.Response
        The raw response; callers check ``status_code`` and decode the body.
    """
    headers = {'user-agent': USER_AGENT}
    url = 'https://ws.audioscrobbler.com/2.0/'

    # Build the query on a copy: writing the API key and format into the
    # caller's dict would leak the key into any payload that is later printed
    # or reused.
    params = {**payload, 'api_key': API_KEY, 'format': 'json'}

    return requests.get(url, headers=headers, params=params)

In [6]:
# Testing function: repeat the chart.getTopArtists request through the helper.
# `payload` is the dict built in the earlier test cell, so that cell must run first.

response = lastfm_get(payload)
response.status_code

200

In [7]:
# Pretty-printer for JSON returned by the API

def jprint(json_object):
    """Print a JSON-serialisable object with sorted keys and 4-space indentation."""
    print(json.dumps(json_object, indent=4, sort_keys=True))

In [8]:
# Inspect the full JSON body of the test response
jprint(response.json())

{
    "artists": {
        "@attr": {
            "page": "1",
            "perPage": "50",
            "total": "5759937",
            "totalPages": "115199"
        },
        "artist": [
            {
                "image": [
                    {
                        "#text": "https://lastfm.freetls.fastly.net/i/u/34s/2a96cbd8b46e442fc41c2b86b821562f.png",
                        "size": "small"
                    },
                    {
                        "#text": "https://lastfm.freetls.fastly.net/i/u/64s/2a96cbd8b46e442fc41c2b86b821562f.png",
                        "size": "medium"
                    },
                    {
                        "#text": "https://lastfm.freetls.fastly.net/i/u/174s/2a96cbd8b46e442fc41c2b86b821562f.png",
                        "size": "large"
                    },
                    {
                        "#text": "https://lastfm.freetls.fastly.net/i/u/300x300/2a96cbd8b46e442fc41c2b86b821562f.png",
                        "s

In [9]:
# Show only the paging metadata (page, perPage, total, totalPages)
jprint(response.json()['artists']['@attr'])

{
    "page": "1",
    "perPage": "50",
    "total": "5759937",
    "totalPages": "115199"
}


In [10]:
# Download every page of the chart.getTopArtists listing from Last.fm
# Running loop typically takes roughly 1.5 hours

responses = []

page = 1
total_pages = 2     # Placeholder > 1 so the loop starts; replaced by the API's count after the first request

while page <= total_pages:
    payload = {
        'method': 'chart.getTopArtists',
        'limit': 500,
        'page': page
    }

    # Show progress; clear_output(wait=True) defers the clear until the next
    # output arrives, so only the latest status line stays visible
    print(f"Requesting page {page}/{total_pages}")
    clear_output(wait = True)

    # API call
    response = lastfm_get(payload)

    # Stop on an HTTP error and show what the server said
    if response.status_code != 200:
        print(response.text)
        break

    # Decode the body once per page. A 200 response without an 'artists'
    # section cannot be paged through, so treat it like an error.
    chart = response.json().get('artists')
    if chart is None:
        print(response.text)
        break

    # The paging attributes arrive as strings; they drive the loop condition
    page = int(chart['@attr']['page'])
    total_pages = int(chart['@attr']['totalPages'])

    responses.append(response)

    # Throttle live requests only. `from_cache` is read with a False default,
    # so responses without that attribute are treated as live.
    # NOTE(review): no requests_cache.install_cache() call is visible in this
    # notebook - confirm whether caching is enabled elsewhere.
    if not getattr(response, 'from_cache', False):
        time.sleep(0.25)

    page += 1

Requesting page 11522/11522


In [11]:
# Convert the first page to a DataFrame as a sanity check
# NOTE(review): the recorded output below shows 50 rows although limit=500 was
# requested - the API appears to cap the page size; confirm.

r0 = responses[0]
r0_json = r0.json()
r0_artists = r0_json['artists']['artist']
r0_df = pd.DataFrame(r0_artists)
r0_df.shape # One row per artist on the page; expected to be at most the requested limit

(50, 7)

In [12]:
# Combine each page into one df

frames = [pd.DataFrame(r.json()['artists']['artist']) for r in responses]
# ignore_index gives the combined frame a unique 0..n-1 index. Without it every
# page restarts at 0, so a label lookup such as artists.loc[0] returns one row
# per page instead of a single artist.
artists = pd.concat(frames, ignore_index=True)
artists.head()

Unnamed: 0,name,playcount,listeners,mbid,url,streamable,image
0,Skrillex,75604223,2488974,ae002c5d-aac6-490b-a39a-30aa9e2edf2b,https://www.last.fm/music/Skrillex,0,[{'#text': 'https://lastfm.freetls.fastly.net/...
1,Duran Duran,48955941,2381125,1a1cd7f3-e5df-4eca-bae2-2757c9e656b5,https://www.last.fm/music/Duran+Duran,0,[{'#text': 'https://lastfm.freetls.fastly.net/...
2,TLC,22548288,2020846,99790314-885a-4975-8614-9c5bc890364d,https://www.last.fm/music/TLC,0,[{'#text': 'https://lastfm.freetls.fastly.net/...
3,My Bloody Valentine,94671048,1648537,8ca01f46-53ac-4af2-8516-55a909c0905e,https://www.last.fm/music/My+Bloody+Valentine,0,[{'#text': 'https://lastfm.freetls.fastly.net/...
4,Wiz Khalifa,66854659,3239453,f5dfa020-ad69-41cd-b3d4-fd7af0414e94,https://www.last.fm/music/Wiz+Khalifa,0,[{'#text': 'https://lastfm.freetls.fastly.net/...


In [13]:
# Shape of the combined DataFrame (rows across all downloaded pages, columns)

artists.shape

(6850, 7)

In [14]:
# Discard the image column

artists = artists.drop(columns='image')
artists.head()

Unnamed: 0,name,playcount,listeners,mbid,url,streamable
0,Skrillex,75604223,2488974,ae002c5d-aac6-490b-a39a-30aa9e2edf2b,https://www.last.fm/music/Skrillex,0
1,Duran Duran,48955941,2381125,1a1cd7f3-e5df-4eca-bae2-2757c9e656b5,https://www.last.fm/music/Duran+Duran,0
2,TLC,22548288,2020846,99790314-885a-4975-8614-9c5bc890364d,https://www.last.fm/music/TLC,0
3,My Bloody Valentine,94671048,1648537,8ca01f46-53ac-4af2-8516-55a909c0905e,https://www.last.fm/music/My+Bloody+Valentine,0
4,Wiz Khalifa,66854659,3239453,f5dfa020-ad69-41cd-b3d4-fd7af0414e94,https://www.last.fm/music/Wiz+Khalifa,0


In [15]:
# Which distinct values does the streamable column take?

artists['streamable'].unique()

array(['0'], dtype=object)

In [16]:
# Every row holds the same streamable value, so the column carries no information; drop it

artists = artists.drop(columns='streamable')
artists.head(30)

Unnamed: 0,name,playcount,listeners,mbid,url
0,Skrillex,75604223,2488974,ae002c5d-aac6-490b-a39a-30aa9e2edf2b,https://www.last.fm/music/Skrillex
1,Duran Duran,48955941,2381125,1a1cd7f3-e5df-4eca-bae2-2757c9e656b5,https://www.last.fm/music/Duran+Duran
2,TLC,22548288,2020846,99790314-885a-4975-8614-9c5bc890364d,https://www.last.fm/music/TLC
3,My Bloody Valentine,94671048,1648537,8ca01f46-53ac-4af2-8516-55a909c0905e,https://www.last.fm/music/My+Bloody+Valentine
4,Wiz Khalifa,66854659,3239453,f5dfa020-ad69-41cd-b3d4-fd7af0414e94,https://www.last.fm/music/Wiz+Khalifa
5,Neon Trees,24442717,1622917,16243662-8538-4746-a0fb-0d15b5828b8e,https://www.last.fm/music/Neon+Trees
6,U2,179339775,4311995,a3cb23fc-acd3-4ce0-8f36-1e5aa6a18432,https://www.last.fm/music/U2
7,Liana Flores,15309385,746662,,https://www.last.fm/music/Liana+Flores
8,Zedd,43600195,1964721,56c4b861-0922-4c3a-a9b9-3bfcb00f8274,https://www.last.fm/music/Zedd
9,Avenged Sevenfold,147191511,2324597,24e1b53c-3085-4581-8472-0b0088d2508c,https://www.last.fm/music/Avenged+Sevenfold


In [17]:
# Create function to look up the most popular tag for an artist

def get_artist_tags(artist):
    """Return the top Last.fm tag for ``artist``.

    Parameters
    ----------
    artist : str
        Artist name passed to the ``artist.getTopTags`` method.

    Returns
    -------
    list of str or None
        A list holding at most one tag name (empty when the artist has no
        tags), or ``None`` when the request failed or the response carries no
        ``toptags`` section.
    """
    response = lastfm_get({
        'method': 'artist.getTopTags',
        'artist':  artist
    })

    # Throttle after every live request, including failed ones, so a run of
    # error responses does not hit the API without any pause.
    if not getattr(response, 'from_cache', False):
        time.sleep(0.25)

    # Error check
    if response.status_code != 200:
        return None

    # A 200 response without 'toptags' is treated like any other failure
    # rather than raising KeyError in the middle of a long apply().
    top_tags = response.json().get('toptags')
    if top_tags is None:
        return None

    # Keep only the first tag
    return [t['name'] for t in top_tags.get('tag', [])[:1]]

In [18]:
# Look up the top tag for every artist; progress_apply is tqdm's apply variant and shows progress with an ETA

tag_lists = artists['name'].progress_apply(get_artist_tags)
artists['tags'] = tag_lists

  0%|          | 0/6850 [00:00<?, ?it/s]

In [19]:
# Preview the frame with the new tags column
artists.head()

Unnamed: 0,name,playcount,listeners,mbid,url,tags
0,Skrillex,75604223,2488974,ae002c5d-aac6-490b-a39a-30aa9e2edf2b,https://www.last.fm/music/Skrillex,[dubstep]
1,Duran Duran,48955941,2381125,1a1cd7f3-e5df-4eca-bae2-2757c9e656b5,https://www.last.fm/music/Duran+Duran,[new wave]
2,TLC,22548288,2020846,99790314-885a-4975-8614-9c5bc890364d,https://www.last.fm/music/TLC,[rnb]
3,My Bloody Valentine,94671048,1648537,8ca01f46-53ac-4af2-8516-55a909c0905e,https://www.last.fm/music/My+Bloody+Valentine,[shoegaze]
4,Wiz Khalifa,66854659,3239453,f5dfa020-ad69-41cd-b3d4-fd7af0414e94,https://www.last.fm/music/Wiz+Khalifa,[Hip-Hop]


In [20]:
# Create csv for uploading to MySQL server
# NOTE: to_csv returns None when given a file path, so artists_csv_data holds
# None; the data itself is written to artists.csv.

artists_csv_data = artists.to_csv('artists.csv', index = False) 

In [21]:
# Data wrangling continued in MySQL
# Final summary of the exported frame.
# NOTE(review): the recorded output reports count/unique/top/freq for playcount
# and listeners, which suggests they are stored as strings - confirm before
# doing any numeric analysis here.

artists.describe()

Unnamed: 0,name,playcount,listeners,mbid,url,tags
count,6850,6850,6850,6850.0,6850,6850
unique,6837,6834,6799,3599.0,6837,571
top,YOUNG MULTI,340350,54556,,https://www.last.fm/music/YOUNG+MULTI,[pop]
freq,2,2,3,3241.0,2,393
