In [1]:
#pip install requests-cache

In [2]:
import json
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import requests_cache
import seaborn as sns
from IPython.display import clear_output
from scipy import stats
from tqdm.notebook import tqdm
tqdm.pandas()


In [3]:
# Credentials for the Last.fm API. Set LASTFM_API_KEY in the environment to
# override the key without editing the notebook.
# FIXME(security): the fallback literal is a key committed to source - rotate
# it and remove the default once every runner sets the environment variable.
API_KEY = os.environ.get('LASTFM_API_KEY', '06ac7ca40cbf657711947df768107808')
USER_AGENT = 'BaggerMagger'

In [4]:
# Smoke test: issue one request with the configured key and user agent,
# then display the HTTP status code.

headers = {'user-agent': USER_AGENT}
payload = {'api_key': API_KEY, 'method': 'chart.getTopArtists', 'format': 'json'}

response = requests.get(
    'https://ws.audioscrobbler.com/2.0/',
    params=payload,
    headers=headers,
)
response.status_code

200

In [5]:
# Function to get from Last FM api

def lastfm_get(payload):
    """Send a GET request to the Last.fm API and return the raw response.

    Parameters
    ----------
    payload : dict
        Query parameters for the call (e.g. ``method``, ``page``, ``limit``).
        The dict is copied before the request is built, so the caller's
        object is left untouched.

    Returns
    -------
    requests.Response
        The unparsed response; callers inspect ``status_code`` and call
        ``.json()`` themselves.
    """
    headers = {'user-agent': USER_AGENT}
    url = 'https://ws.audioscrobbler.com/2.0/'

    # Work on a copy: writing into the caller's dict would inject the API key
    # into a payload that may later be printed or reused.
    params = dict(payload)

    # Add API key and format to the request parameters
    params['api_key'] = API_KEY
    params['format'] = 'json'

    return requests.get(url, headers=headers, params=params)

In [6]:
# Smoke-test lastfm_get using the payload dict built in the earlier test cell;
# the cell displays the HTTP status code of the response.

response = lastfm_get(payload)
response.status_code

200

In [7]:
# Pretty-printer for JSON-compatible objects returned by the API

def jprint(json_object):
    """Print ``json_object`` as JSON text with sorted keys and a 4-space indent."""
    print(json.dumps(json_object, indent=4, sort_keys=True))

In [8]:
# Pretty-print the full JSON body of the test response to inspect its structure
jprint(response.json())

{
    "artists": {
        "@attr": {
            "page": "1",
            "perPage": "50",
            "total": "5588168",
            "totalPages": "111764"
        },
        "artist": [
            {
                "image": [
                    {
                        "#text": "https://lastfm.freetls.fastly.net/i/u/34s/2a96cbd8b46e442fc41c2b86b821562f.png",
                        "size": "small"
                    },
                    {
                        "#text": "https://lastfm.freetls.fastly.net/i/u/64s/2a96cbd8b46e442fc41c2b86b821562f.png",
                        "size": "medium"
                    },
                    {
                        "#text": "https://lastfm.freetls.fastly.net/i/u/174s/2a96cbd8b46e442fc41c2b86b821562f.png",
                        "size": "large"
                    },
                    {
                        "#text": "https://lastfm.freetls.fastly.net/i/u/300x300/2a96cbd8b46e442fc41c2b86b821562f.png",
                        "s

In [9]:
# Pretty-print only the '@attr' entry under 'artists' (the paging fields of the response)
jprint(response.json()['artists']['@attr'])

{
    "page": "1",
    "perPage": "50",
    "total": "5588168",
    "totalPages": "111764"
}


In [10]:
# Loop to receive information about top artists from last.fm
# A cold run typically takes roughly 1.5 hours. Installing the HTTP cache
# first lets a re-run be served from the cache instead of the network; it is
# also what gives responses the `from_cache` attribute checked below.

requests_cache.install_cache('lastfm_cache')

responses = []

page = 1
total_pages = 2     # Dummy number to start the loop, set as any number higher than 1

while page <= total_pages:
    payload = {
        'method': 'chart.getTopArtists',
        'limit': 500,
        'page': page
    }

    # Print status and clear each loop
    print(f"Requesting page {page}/{total_pages}")
    clear_output(wait = True)

    # API call
    response = lastfm_get(payload)

    # Check response for error, break loop if true
    if response.status_code != 200:
        print(response.text)
        break

    # Parse the body once and stop cleanly if it carries no artist data,
    # rather than failing with a KeyError part-way through the crawl.
    body = response.json()
    if 'artists' not in body:
        print(response.text)
        break

    # Only the page count is taken from the response. The loop counter stays
    # under local control: adopting the page number echoed by the server could
    # repeat or skip pages if it ever differed from the page requested.
    total_pages = int(body['artists']['@attr']['totalPages'])

    responses.append(response)

    # Check if cached, if not sleep to prevent too many API calls
    if not getattr(response, 'from_cache', False):
        time.sleep(0.25)

    page += 1

Requesting page 11181/11181


In [11]:
# Convert the first collected page into a DataFrame as a sanity check
# before combining all pages.

r0 = responses[0]
r0_json = r0.json()
r0_artists = r0_json['artists']['artist']  # list of artist records on this page
r0_df = pd.DataFrame(r0_artists)
r0_df.shape # Check shape, should be 500 rows or as specified in limit

(500, 7)

In [12]:
# Combine each page into one df

frames = [pd.DataFrame(r.json()['artists']['artist']) for r in responses]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Each page
# frame is indexed from 0, so without it the row labels would repeat once per
# page and label-based lookups would match several rows.
artists = pd.concat(frames, ignore_index=True)
artists.head()

Unnamed: 0,name,playcount,listeners,mbid,url,streamable,image
0,Taylor Swift,1772569650,4295299,20244d07-534f-4eff-b4d4-930878889970,https://www.last.fm/music/Taylor+Swift,0,[{'#text': 'https://lastfm.freetls.fastly.net/...
1,The Weeknd,545335861,3569189,c8b03190-306c-4120-bb0b-6f2ebfc06ea9,https://www.last.fm/music/The+Weeknd,0,[{'#text': 'https://lastfm.freetls.fastly.net/...
2,Drake,561148590,5218186,b49b81cc-d5b7-4bdd-aadb-385df8de69a6,https://www.last.fm/music/Drake,0,[{'#text': 'https://lastfm.freetls.fastly.net/...
3,Kanye West,858028673,6261136,164f0d73-1234-4e2c-8743-d77bf2191051,https://www.last.fm/music/Kanye+West,0,[{'#text': 'https://lastfm.freetls.fastly.net/...
4,Lana Del Rey,771388485,3703443,b7539c32-53e7-4908-bda3-81449c367da6,https://www.last.fm/music/Lana+Del+Rey,0,[{'#text': 'https://lastfm.freetls.fastly.net/...


In [13]:
# Shape (rows, columns) of the combined artists DataFrame

artists.shape

(10000, 7)

In [14]:
# Drop the image column and preview the result

artists = artists.drop(columns=['image'])
artists.head()

Unnamed: 0,name,playcount,listeners,mbid,url,streamable
0,Taylor Swift,1772569650,4295299,20244d07-534f-4eff-b4d4-930878889970,https://www.last.fm/music/Taylor+Swift,0
1,The Weeknd,545335861,3569189,c8b03190-306c-4120-bb0b-6f2ebfc06ea9,https://www.last.fm/music/The+Weeknd,0
2,Drake,561148590,5218186,b49b81cc-d5b7-4bdd-aadb-385df8de69a6,https://www.last.fm/music/Drake,0
3,Kanye West,858028673,6261136,164f0d73-1234-4e2c-8743-d77bf2191051,https://www.last.fm/music/Kanye+West,0
4,Lana Del Rey,771388485,3703443,b7539c32-53e7-4908-bda3-81449c367da6,https://www.last.fm/music/Lana+Del+Rey,0


In [15]:
# List the distinct values present in the streamable column

artists['streamable'].unique()

array(['0'], dtype=object)

In [16]:
# Every row holds the same streamable value, so the column is redundant; drop it

artists = artists.drop(columns=['streamable'])
artists.head(30)

Unnamed: 0,name,playcount,listeners,mbid,url
0,Taylor Swift,1772569650,4295299,20244d07-534f-4eff-b4d4-930878889970,https://www.last.fm/music/Taylor+Swift
1,The Weeknd,545335861,3569189,c8b03190-306c-4120-bb0b-6f2ebfc06ea9,https://www.last.fm/music/The+Weeknd
2,Drake,561148590,5218186,b49b81cc-d5b7-4bdd-aadb-385df8de69a6,https://www.last.fm/music/Drake
3,Kanye West,858028673,6261136,164f0d73-1234-4e2c-8743-d77bf2191051,https://www.last.fm/music/Kanye+West
4,Lana Del Rey,771388485,3703443,b7539c32-53e7-4908-bda3-81449c367da6,https://www.last.fm/music/Lana+Del+Rey
5,Kendrick Lamar,434769991,3259373,381086ea-f511-4aba-bdf9-71c753dc5077,https://www.last.fm/music/Kendrick+Lamar
6,Doja Cat,239833551,2223036,5df62a88-cac9-490a-b62c-c7c88f4020f4,https://www.last.fm/music/Doja+Cat
7,Arctic Monkeys,581012890,5299449,ada7a83c-e3e1-40f1-93f9-3e73dbc9298a,https://www.last.fm/music/Arctic+Monkeys
8,"Tyler, the Creator",437271830,2565689,f6beac20-5dfe-4d1f-ae02-0b0a740aafd6,"https://www.last.fm/music/Tyler,+the+Creator"
9,Rihanna,343702101,6391808,db36a76f-4cdf-43ac-8cd0-5e48092d2bae,https://www.last.fm/music/Rihanna


In [17]:
# Create function to add most popular tag for each artist

def get_artist_tags(artist):
    """Return the first tag listed by Last.fm's ``artist.getTopTags`` for ``artist``.

    Parameters
    ----------
    artist : str
        Artist name, sent as the ``artist`` parameter of the API call.

    Returns
    -------
    list of str or None
        A list holding the first tag name (empty when no tags are listed),
        or ``None`` when the request fails or the body carries no tag data.
    """
    response = lastfm_get({
        'method': 'artist.getTopTags',
        'artist':  artist
    })

    # Limit calls from API. Throttle before any early return so that a run of
    # failing requests is rate-limited as well.
    if not getattr(response, 'from_cache', False):
        time.sleep(0.25)

    # Error check
    if response.status_code != 200:
        return None

    # A body without 'toptags' is treated like a failed request instead of
    # raising KeyError, which would abort the whole apply over all artists.
    toptags = response.json().get('toptags')
    if not toptags:
        return None

    # Get first tag
    return [t['name'] for t in toptags.get('tag', [])[:1]]

In [18]:
# Run get_artist_tags once per artist name via tqdm's progress_apply, which
# displays progress and estimated time to completion. Results are stored in
# the new 'tags' column.

artists['tags'] = artists['name'].progress_apply(get_artist_tags)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [19]:
# Preview the frame with the newly added tags column
artists.head()

Unnamed: 0,name,playcount,listeners,mbid,url,tags
0,Taylor Swift,1772569650,4295299,20244d07-534f-4eff-b4d4-930878889970,https://www.last.fm/music/Taylor+Swift,[country]
1,The Weeknd,545335861,3569189,c8b03190-306c-4120-bb0b-6f2ebfc06ea9,https://www.last.fm/music/The+Weeknd,[rnb]
2,Drake,561148590,5218186,b49b81cc-d5b7-4bdd-aadb-385df8de69a6,https://www.last.fm/music/Drake,[Hip-Hop]
3,Kanye West,858028673,6261136,164f0d73-1234-4e2c-8743-d77bf2191051,https://www.last.fm/music/Kanye+West,[Hip-Hop]
4,Lana Del Rey,771388485,3703443,b7539c32-53e7-4908-bda3-81449c367da6,https://www.last.fm/music/Lana+Del+Rey,[female vocalists]


In [20]:
# Create csv for uploading to MySQL server (row index is not written)
# NOTE(review): per the pandas docs, to_csv returns None when given a path, so
# artists_csv_data presumably holds None - confirm and drop the name if unused.

artists_csv_data = artists.to_csv('artists.csv', index = False) 

In [21]:
# Summary statistics of the final frame as a last check.
# Data wrangling continued in MySQL

artists.describe()

Unnamed: 0,name,playcount,listeners,mbid,url,tags
count,10000,10000,10000,10000.0,10000,10000
unique,9813,9807,9729,5302.0,9813,674
top,José Feliciano,1320940,107810,,https://www.last.fm/music/Jos%C3%A9+Feliciano,[Hip-Hop]
freq,2,3,4,4599.0,2,605
