# __Spotify API__

In [1]:
# Data Manipulation and Analysis
import pandas as pd
#import math

# Web and API Interactions
import requests
import urllib.parse

# Data Serialization and Storage
import json

# Security and Secrets Management
import secrets
from dotenv import load_dotenv

# System and File Interactions
import os

# Time and Scheduling
from time import sleep

# Error messages
import logging

# Data Manipulation and Analysis
import ast

In [2]:
# Get the current working directory
this_dir = os.getcwd()

# Define directories for XML and data files
xml_dir = os.path.join(this_dir, 'xml_dir')
data_dir = os.path.join(this_dir, 'data')

# Load environment variables from .env file
load_dotenv()

# Get username and password from environment variables
username = os.getenv('USERNAME')
password = os.getenv('PASSWORD')

# Set redirect URI for Spotify authentication
redirect_uri = 'https://open.spotify.com/intl-it'

# Define endpoints for Spotify API interactions
e_points = {
    # Authorization endpoint for user authentication
    'auth': 'https://accounts.spotify.com/authorize',
    # Token endpoint for obtaining access token
    'token': 'https://accounts.spotify.com/api/token',
    # Login endpoint for user login
    'logint': 'https://accounts.spotify.com/it/login',
    # Endpoint for retrieving user's followed artists
    'artist': 'https://api.spotify.com/v1/me/following',
}

# Initialize an empty dictionary to store standard payload data
standard_payload = {}

# Get Spotify client ID from environment variable and add to payload
client_id = os.getenv('SPOTIFY_CLIENT_ID')
standard_payload['client_id'] = client_id  # Add client ID to payload

# Get Spotify client secret from environment variable and add to payload
client_secret = os.getenv('SPOTIFY_CLIENT_SECRET')
standard_payload['client_secret'] = client_secret  # Add client secret to payload

## Authorization

The first step is to request authorization from the user so that our app can access to the Spotify resources on the user's behalf.  
To do this, our application must build and __send a GET request__ to the /authorize endpoint with the following parameters:

```python
    AUTH_ENDPOINT = 'https://accounts.spotify.com/authorize'
```

<table data-encore-id="table" class="Table__TableElement-sc-evwssh-0 fPsMQq"><thead><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Query Parameter</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Relevance</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Value</th></tr></thead><tbody><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">client_id</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">The Client ID generated after registering your application.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">response_type</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Set to <code>code</code>.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">redirect_uri</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">The URI to redirect to after the user grants or denies permission. This URI needs to have been entered in the Redirect URI allowlist that you specified when you registered your application (See the <a data-encore-id="textLink" class="Link-sc-k8gsk-0 hkKYOq" href="https://developer.spotify.com/documentation/web-api/concepts/apps">app guide</a>). The value of <code>redirect_uri</code> here must exactly match one of the values you entered when you registered your application, including upper or lowercase, terminating slashes, and such.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">state</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Optional, but strongly recommended</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">This provides protection against attacks such as cross-site request forgery. See <a data-encore-id="textLink" class="Link-sc-k8gsk-0 hkKYOq" href="https://datatracker.ietf.org/doc/html/rfc6749#section-4.1">RFC-6749</a>.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">scope</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Optional</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">A space-separated list of <a data-encore-id="textLink" class="Link-sc-k8gsk-0 hkKYOq" href="https://developer.spotify.com/documentation/web-api/concepts/scopes">scopes</a>.If no scopes are specified, authorization will be granted only to access publicly available information: that is, only information normally visible in the Spotify desktop, web, and mobile players.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">show_dialog</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Optional</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Whether or not to force the user to approve the app again if they’ve already done so. If <code>false</code> (default), a user who has already approved the application may be automatically redirected to the URI specified by <code>redirect_uri</code>. If <code>true</code>, the user will not be automatically redirected and will have to approve the app again.</span></td></tr></tbody></table>

```python
    AUTH_ENDPOINT = 'https://accounts.spotify.com/authorize'
```

In [3]:
# Define query parameters for the authorization request
querystring = {
    # Client ID obtained from Spotify Developer Dashboard
    'client_id': client_id,
    # Response type set to 'code' for authorization code flow
    'response_type': 'code',
    # Redirect URI for after authorization
    'redirect_uri': redirect_uri,
    # Random state parameter to prevent CSRF attacks
    'state': secrets.token_hex(8),
    # Scope of permissions requested (in this case, user-follow-read)
    'scope': 'user-follow-read',
}

# Send a GET request to the Spotify authorization endpoint with query parameters
r = requests.get(e_points['auth'], params=urllib.parse.urlencode(querystring))

# Print the full URL of the authorization request
print(r.url)

https://accounts.spotify.com/it/login?continue=https%3A%2F%2Faccounts.spotify.com%2Fauthorize%3Fscope%3Duser-follow-read%26response_type%3Dcode%26redirect_uri%3Dhttps%253A%252F%252Fopen.spotify.com%252Fintl-it%26state%3Deaa7116a67afe613%26client_id%3D63ff05747c964b36b7464af88058c1e1


In [7]:
# Function to extract the authorization code from a URL
auth_code_url = 'https://open.spotify.com/intl-it?code=AQAjcSx_OPEELR83aM99gOdi3QtemX87D3-L8t_Yal0dmYuary4E0BhyTJI2s6FHuyEeFCoM9lcuo8z437o2fpxhOqxMNjLw8249So7f3AEWnIZldhGMK3GXGqAuRJmXEqQ99opMnKvTx2UcBjRFyVzWQ6L7fwW60naKjIwEpsX7aZ8yt4A60LJEm4HxZnDtPHFzWfza&state=eaa7116a67afe613'

# Extract the authorization code from the URL
# The authorization code is the value of the 'code' query string
# It is extracted using the split() method to divide the URL into parts
# and select the part that contains the authorization code
auth_code = auth_code_url.split('code=')[-1].split('&state=')[0]

## Tokens

### Access

If the user accepted your request, then your app is ready to exchange the authorization code for an access token.  
It can do this by __sending a POST request__ to the /api/token endpoint.  

The following headers must be included in the request:

<table data-encore-id="table" class="Table__TableElement-sc-evwssh-0 fPsMQq"><thead><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Header Parameter</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Relevance</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Value</th></tr></thead><tbody><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Authorization</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Base 64 encoded string that contains the client ID and client secret key. The field must have the format: <code>Authorization: Basic &lt;base64 encoded client_id:client_secret&gt;</code></span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Content-Type</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Set to <code>application/x-www-form-urlencoded</code>.</span></td></tr></tbody></table>

The body of this POST request must contain these parameters encoded in application/x-www-form-urlencoded:

<table data-encore-id="table" class="Table__TableElement-sc-evwssh-0 fPsMQq"><thead><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Body Parameters</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Relevance</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Value</th></tr></thead><tbody><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">grant_type</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">This field must contain the value <code>"authorization_code"</code>.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">code</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">The authorization code returned from the previous request.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">redirect_uri</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">This parameter is used for validation only (there is no actual redirection). The value of this parameter must exactly match the value of <code>redirect_uri</code> supplied when requesting the authorization code.</span></td></tr></tbody></table>

In [5]:
def get_tokens(auth_code: str) -> tuple:
    """
    Retrieves access and refresh tokens from Spotify API using authorization code.

    Args:
        auth_code (str): The authorization code obtained from the Spotify authorization flow.

    Returns:
        tuple: A tuple containing the access token and refresh token.

    Raises:
        Exception: If the response status code is not 200 OK.
    """
    # Create a copy of the standard payload dictionary
    payload = standard_payload.copy()

    # Add the required parameters to the payload dictionary
    payload.update({
        'grant_type': 'authorization_code',
        'code': auth_code,
        'redirect_uri': redirect_uri,
    })

    # Send a POST request to the Spotify token endpoint with the payload
    response = requests.post(e_points['token'], data=payload)

    # Check if the response status code is 200 OK
    if response.status_code == 200:
        # Print the response object for debugging purposes
        print('Response object:', json.dumps(response.json(), indent=4))
        # Extract the access token and refresh token from the response data
        data = response.json()
        return data['access_token'], data['refresh_token']
    else:
        # Raise an exception if the response status code is not 200 OK
        raise Exception(f"Error: {response.status_code}, {response.text}")


def get_new_access_token(r_token: str) -> str:
    """
    Retrieves a new access token from Spotify API using a refresh token.

    Args:
        r_token (str): The refresh token obtained from a previous authorization flow.

    Returns:
        str: The new access token.

    Raises:
        Exception: If the response status code is not 200 OK.
    """
    # Create a copy of the standard payload dictionary
    payload = standard_payload.copy()

    # Add the required parameters to the payload dictionary
    payload.update({
        'grant_type': 'refresh_token',
        'refresh_token': r_token,
    })

    # Send a POST request to the Spotify token endpoint with the payload
    response = requests.post(e_points['token'], data=payload)

    # Check if the response status code is 200 OK
    if response.status_code == 200:
        # Print the response object for debugging purposes
        print('Response object:', json.dumps(response.json(), indent=4))
        # Extract the new access token from the response data
        data = response.json()
        return data['access_token']
    else:
        # Raise an exception if the response status code is not 200 OK
        raise Exception(f"Error: {response.status_code}, {response.text}")

In [8]:
# Retrieve access and refresh tokens from Spotify API using the authorization code
access_token, refresh_token = get_tokens(auth_code)

Response object: {
    "access_token": "BQBcrv1Q_ojkaaYaHaYpDhdn4cXJAQadO4puxJVMW6OOTeAvGsdOeG-P7T76Ly7n26cTWWIeF5YeCo6s6xTw5nAglT-GdqEBYvCfv8KXG8a0EfK_alSWSKgmmfk3mkS-PasK-DCyM3NurX6aNwfpAURf9Z02LLvvTBwV543jlgwDZgZzLM_fCmcQ9PiHKARYrTY",
    "token_type": "Bearer",
    "expires_in": 3600,
    "refresh_token": "AQAlrrITdQ0IOFiUg_g9i-flED8ucU7gMa8XqWc41FZQrotznB9QzhNJIgsiKZQB6ESkkOdMqbvkA-F6Gn-QjH-09HiAsw1AH4XZpQN67SeerHSrHJTjX9DjaeRcq14_e_Y",
    "scope": "user-follow-read"
}


### Refresh

To refresh an access token, we must __send a POST request__ to the /api/token endpoint with the following headers:

<table data-encore-id="table" class="Table__TableElement-sc-evwssh-0 fPsMQq"><thead><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Header Parameter</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Relevance</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Value</th></tr></thead><tbody><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Content-Type</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Always set to <code>application/x-www-form-urlencoded</code>.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Authorization</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><strong>Only required for the</strong> <a data-encore-id="textLink" class="Link-sc-k8gsk-0 hkKYOq" href="https://developer.spotify.com/documentation/web-api/tutorials/code-flow">Authorization Code</a></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Base 64 encoded string that contains the client ID and client secret key. The field must have the format: <code>Authorization: Basic &lt;base64 encoded client_id:client_secret&gt;</code></span></td></tr></tbody></table>

and the following body:

<table data-encore-id="table" class="Table__TableElement-sc-evwssh-0 fPsMQq"><thead><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Body Parameter</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Relevance</th><th scope="col" class="TableHeaderCell__TableHeaderCellElement-sc-16kf5kl-0 gfUuhl encore-text-body-small-bold" data-encore-id="tableHeaderCell">Value</th></tr></thead><tbody><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">grant_type</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">Set it to <code>refresh_token</code>.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">refresh_token</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><em>Required</em></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">The refresh token returned from the authorization token request.</span></td></tr><tr data-encore-id="tableRow" class="TableRow__TableRowElement-sc-1kuhzdh-0 hoXXxN"><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">client_id</span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM"><strong>Only required for the</strong> <a data-encore-id="textLink" class="Link-sc-k8gsk-0 hkKYOq" href="https://developer.spotify.com/documentation/web-api/tutorials/code-flow">PKCE extension</a></span></td><td class="TableCell__TableCellElement-sc-1nn7cfv-0 gOsDpy encore-text-body-small" data-encore-id="tableCell"><span data-encore-id="type" class="Type__TypeElement-sc-goli3j-0 kqItdM">The client ID for your app, available from the developer dashboard.</span></td></tr></tbody></table>

In [9]:
new_access_token = get_new_access_token(refresh_token)

Response object: {
    "access_token": "BQB8mob6yveWag6IotXtt0d5qlRxh2_tStIC6623TGgeD0dmasmuKrbXfFvymzQ1P4OcwZwRHRT2VVWPC0kzuRm_tpCiZu0-_d9-tQlO_ZLbAb7mOyzBp9jp_mJh8atv2EKyhjE-7jBT7y1qFAIdzEsY9zNCUCGH1Ko362MDEPIcs2qTE_YITZ7prZEZbrc2Gnc",
    "token_type": "Bearer",
    "expires_in": 3600,
    "scope": "user-follow-read"
}


## API Call

In [10]:
def extendar(book: dict, pages: dict) -> dict:

    assert isinstance(book, dict), f'\'Book\' element must be a dict, not a {type(book).__name__.upper()}. Exiting...\n'
    assert isinstance(pages, dict), f'\'Pages\' element must be a dict, not a {type(pages).__name__.upper()}. Exiting...\n'

    book_copy = book.copy()
    for k, v in pages.items():

        if k in book_copy: # Should be always True!
            if isinstance(book_copy[k], list):
                if isinstance(v, list):
                    book_copy[k] = book_copy[k] + v
                else:
                    book_copy[k].append(v)
            else:
                book_copy[k] = [book_copy[k], v]
        else:
            book_copy[k] = v

    return book_copy

### Response (JSON)

For every call, the response is a JSON with a __dict__ object. This dict has a single key '_artists_'.  
Its value is a dict with 6 keys:  
__['items', 'next', 'total', 'cursors', 'limit', 'href']__

| i | Key     | Val type   |   Len | First Val
|---:|:--------|:-----------|:------|:----------
|  1 | __items__   | list       |    50 | Array of artist objects. (see below)|
|  2 | __next__    | str        |   | URL to the next page of items. ( null if none) |
|  3 | __total__   | int        |   | The total number of items available to return. |                                                                                                                                                                                                                    
|  4 | __cursors__ | dict       |     1 | 'after': the cursor to use as key to find the next page of items; 'before': the cursor to use as key to find the previous page of items.  |
|  5 | __limit__   | int        |   | The maximum number of items in the response (as set in the query or by default: 20). |
|  6 | __href__    | str        |   | A link to the Web API endpoint returning the full result of the request. |     

Values of '_items_' from the previous 'artists' dict have type dict with 10 keys:  
__['__external_urls__', '__followers__', '__genres__', 'href', '__id__', 'images', '__name__', 'popularity', '__type__', 'uri']__

| i | Key           | Val type   |   Len | First Val    |  What is? |
|----|---------------|------------|-------|------------------| ----------------- | 
|  1 | external_urls    |   dict    |    1  |   spotify |   key: 'spotify', value: 'the Spotify URL for the object'|
|  2 | followers    | dict       |  2    |  href | key: 'total', value: 'the total integer of followers' |
|  3 | __genres__   | list       |  6    |  c64 | list of genres the artist is associated with: ["Prog rock","Grunge"] |
|  4 | href | str   |   | /v1/artists/{artist_id} |     |
|  5 | __id__   | str   |   |   00Uv0804nrBM2RxUBTkyHj  |   Spotify ID for the artist   |
|  6 | images   |   list    |   3   |   {'height': '', 'url': '', 'width': ''}   |      |
|  7 | __name__ |   str |   |   Wobbler |   Name of the artist |
|  8 | popularity    | int        | | 17  |  |
|  9 | __type__          | str        | | artist  | "artist" |
| 10 | uri           | str        | | spotify:artist:00Uv0804nrBM2RxUBTkyHj   |  |

In [11]:
limit = 50
def get_follow_dict(init_dict: dict | None = None, after_value: str | None = None, access_token: str | None = None) -> dict:
    
    headers_flist = {'Authorization': 'Bearer ' + access_token}
    querystring_flist = {'type': 'artist', 'limit': limit}
    artist_epoint = e_points['artist']

    if not init_dict:
        try:
            response = requests.get(artist_epoint, params=urllib.parse.urlencode(querystring_flist), headers=headers_flist)
            response.raise_for_status()
            init_dict = response.json().get('artists')
            print(f'Initialized \'init_dict\' {type(init_dict).__name__.upper()}: {list(init_dict.keys())}.\n')
        except requests.exceptions.RequestException as e:
            logging.error(f"Error in 'get_follow_dict' function: {str(e)}\n")
            return {}
        
    if not after_value:
        if len(init_dict.get('items')) == init_dict.get('total'):
            print(f'LIST ALREADY EXISTS AND IT\'S FULL!!\n')
            return init_dict
        after_value = init_dict.get('cursors').get('after')
 
    querystring_flist['after'] = after_value
    try:
        response = requests.get(artist_epoint, params=urllib.parse.urlencode(querystring_flist), headers=headers_flist)
        response.raise_for_status()
        data = response.json().get('artists')
        return extendar(init_dict, data)
    
    except requests.exceptions.RequestException as e:
        logging.error(f"Error in 'get_follow_dict' function: {str(e)}")

In [12]:
follow_dict, after_value, actual_length, total = None, None, 0, 0
while True:
    follow_dict = get_follow_dict(follow_dict, after_value, access_token)

    if not total:    
        total = follow_dict.get('total')[-1]
    actual_length = len(follow_dict.get('items'))

    print(f'>>>Follow_dict has {actual_length} items\n')

    if actual_length >= total:
        print('\t!!List is full!!')
        print(f'\t\tFollow_dict \'total\' items: {total}')
        print(f'\t\tFollow_dict fetched \'items\': {actual_length}')
        print(f'\t\tFollow_dict type: {type(follow_dict).__name__.upper()}')
        print(f'\t\tFollow_dict keys: {list(follow_dict.keys())}\n')
        break

    after_value = follow_dict.get('cursors')[-1].get('after')

Initialized 'init_dict' DICT: ['items', 'next', 'total', 'cursors', 'limit', 'href'].

>>>Follow_dict has 100 items

>>>Follow_dict has 150 items

>>>Follow_dict has 200 items

>>>Follow_dict has 250 items

>>>Follow_dict has 257 items

	!!List is full!!
		Follow_dict 'total' items: 257
		Follow_dict fetched 'items': 257
		Follow_dict type: DICT
		Follow_dict keys: ['items', 'next', 'total', 'cursors', 'limit', 'href']



In [13]:
# Crea una lista di nomi di artisti seguiti
namelist = [follow_dict.get('items')[i].get('name') for i in range(actual_length)]

# Identifica e stampa i nomi di artisti duplicati nella lista
duplicates = set([name for name in namelist if namelist.count(name) > 1])
for name in duplicates:
    print(name)

Zeus


### Short (DataFrame)

Colonne:

- __id__: str long 22 (Spotify ID)
- __name__: str (Name of the artist)
- __genres__: list (list of genres)
- __count of genres__: int (number of genres in artist)
- __followers__: int (number of followers)
- __popularity__: int (Spotify score of the artist)

In [14]:
if not os.path.exists('spotify_followlist_short.csv'):
    relevant_cols = ['id', 'name', 'genres', 'followers', 'popularity']
    short_df = pd.DataFrame(follow_dict.get('items'), columns=relevant_cols)

    short_df['name'] = short_df['name'].str.lower()
    short_df['followers'] = short_df['followers'].apply(lambda x: x.get('total'))

    short_df['specific_genre'] = short_df['genres'].apply(lambda x: set(word for sublist in x for word in sublist.split() ))

    short_df.to_csv('spotify_followlist_short.csv', index=False)
    print('File created successfully')
elif len(pd.read_csv('spotify_followlist_short.csv')) == total:
    print('Nothing to do! File already exists and it\'s full!')
else:
    print('File already exists')
    short_df = pd.read_csv('spotify_followlist_short.csv')
    short_df['genres'] = short_df['genres'].apply(lambda x: ast.literal_eval(x))
    short_df['specific_genre'] = short_df['genres'].apply(lambda x: set(word for sublist in x for word in sublist.split()))

short_df.head(2)


File created successfully


Unnamed: 0,id,name,genres,followers,popularity,specific_genre
0,00Uv0804nrBM2RxUBTkyHj,wobbler,"[c64, italian progressive rock, neo-progressiv...",18632,18,"{italian, neo-progressive, norwegian, progress..."
1,02fDf7HEPtBZLtPzCyxSR2,lanark artefax,"[deconstructed club, fluxwork, mandible, scott...",17495,21,"{club, mandible, deconstructed, scottish, uk, ..."


### Long (DataFrame)

In [15]:
long_df = pd.DataFrame(follow_dict.get('items'))

long_df.head(3)

Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri
0,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 18632}","[c64, italian progressive rock, neo-progressiv...",https://api.spotify.com/v1/artists/00Uv0804nrB...,00Uv0804nrBM2RxUBTkyHj,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Wobbler,18,artist,spotify:artist:00Uv0804nrBM2RxUBTkyHj
1,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 17495}","[deconstructed club, fluxwork, mandible, scott...",https://api.spotify.com/v1/artists/02fDf7HEPtB...,02fDf7HEPtBZLtPzCyxSR2,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Lanark Artefax,21,artist,spotify:artist:02fDf7HEPtBZLtPzCyxSR2
2,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 146537}","[bergamo indie, italian alternative]",https://api.spotify.com/v1/artists/058thnz6oda...,058thnz6odaxoAN4n3cMBN,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Verdena,43,artist,spotify:artist:058thnz6odaxoAN4n3cMBN


### Genres (DataFrame)

In [16]:
# Set dei 'genres'
genre_set = set()
for genre in long_df['genres']:
    genre_set.update(genre)


# Esplodi la colonna 'genres' in righe separate
genres_df = long_df['genres'].explode()

# Conta le occorrenze di ogni elemento univoco
genres_df = genres_df.value_counts().sort_values(ascending=False)

# Converte la serie in un dataframe
genres_df = genres_df.to_frame().reset_index()
# Reindicizza il dataframe per riordinare gli indici
genres_df = genres_df.reindex(range(len(genres_df)))
# Rinomina le colonne
genres_df.columns = ['genre', 'count']

In [17]:
# Stampa il numero di genere univoci
print('Unique \'genres\' count:', len(genre_set))

# Stampa le 5 prime categorie più comuni
print(f'Top 5 genres over {len(long_df)} artists retrieved:')
genres_df.head(5)

Unique 'genres' count: 373
Top 5 genres over 257 artists retrieved:


Unnamed: 0,genre,count
0,djent,22
1,post-metal,20
2,progressive metal,16
3,progressive jazz fusion,14
4,instrumental rock,14


In [18]:
def gen_count(genre):

    lookup_res = genres_df.loc[genres_df['genre'] == genre, 'count']

    return lookup_res.values[0] if len(lookup_res) == 1 else lookup_res.iloc[0]


def most_genre_col(df):

    x = df.copy()
    column = x['genres'].apply(lambda x: sorted([(gen_count(l), l) for l in x], reverse=True))
    x = x.assign(most_genre=column)

    return x

In [19]:
long_df_file_file_file_file = most_genre_col(long_df)
long_df.head(2)


Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri
0,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 18632}","[c64, italian progressive rock, neo-progressiv...",https://api.spotify.com/v1/artists/00Uv0804nrB...,00Uv0804nrBM2RxUBTkyHj,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Wobbler,18,artist,spotify:artist:00Uv0804nrBM2RxUBTkyHj
1,{'spotify': 'https://open.spotify.com/artist/0...,"{'href': None, 'total': 17495}","[deconstructed club, fluxwork, mandible, scott...",https://api.spotify.com/v1/artists/02fDf7HEPtB...,02fDf7HEPtBZLtPzCyxSR2,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",Lanark Artefax,21,artist,spotify:artist:02fDf7HEPtBZLtPzCyxSR2


# BiTown DB

## XML Maps (binary)

In [20]:
def str_to_dict(s):
    return ast.literal_eval(s)

In [21]:
data_long = os.listdir(os.path.join(os.getcwd(), 'data', 'long'))
long_df_csv = os.path.join(os.getcwd(), 'data', 'long', data_long[0])

In [23]:
long_df_file = pd.read_csv(long_df_csv, sep=';')
long_df_file.head(3)

Unnamed: 0,external_urls,followers,genres,href,id,images,name,popularity,type,uri,most_genre,specific_genre
0,{'spotify': 'https://open.spotify.com/artist/0...,18621,"['c64', 'italian progressive rock', 'neo-progr...",https://api.spotify.com/v1/artists/00Uv0804nrB...,00Uv0804nrBM2RxUBTkyHj,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",wobbler,18,artist,spotify:artist:00Uv0804nrBM2RxUBTkyHj,"[(5, 'progressive rock'), (4, 'italian progres...","{'italian', 'rock', 'neo-progressive', 'progre..."
1,{'spotify': 'https://open.spotify.com/artist/0...,17465,"['deconstructed club', 'fluxwork', 'mandible',...",https://api.spotify.com/v1/artists/02fDf7HEPtB...,02fDf7HEPtBZLtPzCyxSR2,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",lanark artefax,21,artist,spotify:artist:02fDf7HEPtBZLtPzCyxSR2,"[(4, 'uk experimental electronic'), (4, 'mandi...","{'mandible', 'club', 'deconstructed', 'uk', 'f..."
2,{'spotify': 'https://open.spotify.com/artist/0...,146423,"['bergamo indie', 'italian alternative']",https://api.spotify.com/v1/artists/058thnz6oda...,058thnz6odaxoAN4n3cMBN,"[{'height': 640, 'url': 'https://i.scdn.co/ima...",verdena,43,artist,spotify:artist:058thnz6odaxoAN4n3cMBN,"[(5, 'italian alternative'), (1, 'bergamo indi...","{'indie', 'bergamo', 'italian', 'alternative'}"


In [24]:
long_df_file['genres'] = long_df_file['genres'].apply(lambda x: ast.literal_eval(x))

In [25]:
long_df_file['most_genre'] = long_df_file['most_genre'].apply(lambda x: ast.literal_eval(x))

In [26]:
for i in list(filter(lambda x: x != 'followers', long_df.columns)):
    val_df = long_df[i].values[0]
    val_file = long_df_file[i].values[0]
    print('Time for column', i)
    if val_df != val_file:
        print('\t',i, '...', type(val_df), 'VS' ,type(val_file), '\n')
        print('\t\t', val_df, 'VS', val_file, '\n')
    for j in range(len(long_df[i].values[0])):
        print(f'{j}/{len(val_df)}')
        if val_df.values[j] != val_file.values[j]:
            print('\t\t', val_df[j], 'VS', val_file[j], '\n')

Time for column external_urls
external_urls <class 'dict'> <class 'str'> 

		 {'spotify': 'https://open.spotify.com/artist/00Uv0804nrBM2RxUBTkyHj'} VS {'spotify': 'https://open.spotify.com/artist/00Uv0804nrBM2RxUBTkyHj'} 

0/1


TypeError: 'builtin_function_or_method' object is not subscriptable

In [None]:
display(long_df_file[:2])
display(long_df[:2])

In [None]:
long_df_file.equals(long_df)

In [146]:
import gzip
import xml.etree.ElementTree as ET
from io import BytesIO
import random


# URL for sitemap of Bandsintown, which is a gzipped XML file
domain = 'https://www.bandsintown.com'
sitemap = domain + "/sitemap/sitemap.xml.gz"

user_agents_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.3",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
]


# Headers for requests
headers_dict = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "it-IT,it;q=0.9,en-IT;q=0.8,en;q=0.7,en-US;q=0.6",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "sec-ch-ua": "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "referer": "https://www.google.com/",
    "cookie": "_csrf=70PMEtfImhL5LR29VJVKPOLx; bit_geo=%257B%2522latitude%2522%253A45.4722%252C%2522longitude%2522%253A9.1922%252C%2522name%2522%253A%2522Milan%252C%2520Italy%2522%257D; OptanonAlertBoxClosed=2024-10-03T08:27:05.419Z; eupubconsent-v2=CQF6ZfAQF6ZfAAcABBENBJFsAP_gAEPgACiQKftV_G__bWlr8X73aftkeY1P9_h77sQxBhfJE-4FzLvW_JwXx2ExNA36tqIKmRIAu3bBIQNlGJDUTVCgaogVryDMaE2coTNKJ6BkiFMRM2dYCF5vm4tj-QKY5vr991dx2B-t7dr83dzyz4VHn3a5_2a0WJCdA5-tDfv9bROb-9IOd_x8v4v8_F_rE2_eT1l_tWvp7D9-cts7_XW89_fff_9Ln_-uB_-_3_gp4ASYaFRAGWRISEGgYQQIAVBWEBFAgAAABIGiAgBMGBTsDAJdYSIAQAoABggBAACDIAEAAAkACEQAQAFAgAAgECgADAAgGAgAYGAAMAFgIBAACA6BimBBAoFgAkZkRCmBCEAkEBLZUIJAECCuEIRZ4FEAiJgoAAASACsAAQFgsDiSQErEggS4g2gAAIAEAggAKEUnZgCCAM2WovFk2jK0wLR8wXPaYBkgQAAA.f_wACHwAAAAA; ccuid=d36095b4-dee3-4167-9508-5fa73bbf6537; _cc_id=12e49cdacf035eb741fc38f74e9f69e1; panoramaId_expiry=1728548860319; panoramaId=1824a7993c31a1ff510cdc348d7e4945a7029be6f9be70b9d450219d66d0fbb4; panoramaIdType=panoIndiv; _au_1d=AU1D-0100-001725055099-ZI839P79-SH7O; _au_last_seen_iab_tcf=1727944026396; _gid=GA1.2.38992958.1727944026; bit_pc=3; OptanonConsent=isGpcEnabled=0&datestamp=Thu+Oct+03+2024+10%3A27%3A14+GMT%2B0200+(Ora+legale+dell%E2%80%99Europa+centrale)&version=202406.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0004%3A1%2CC0003%3A1%2CC0002%3A1%2CV2STACK42%3A1&geolocation=IT%3B25&AwaitingReconsent=false; __gads=ID=67ecf92d7d6a56a2:T=1727944061:RT=1727945506:S=ALNI_MZF4MFr5sCJ7lSosHemRPe9cJ_B-Q; __gpi=UID=00000f0113668b04:T=1727944061:RT=1727945506:S=ALNI_MYzDAVIT_Wq35N7GPdoMxQhsL7tqg; __eoi=ID=b8c47d98c1014060:T=1727944061:RT=1727945506:S=AA-AfjZXoUKSlWRyu6A6Eg428QcN; cto_bundle=Xda9bF9IYzhDblJDc3hnRU44JTJCSTdabE82ZFRGSDg3Vms0Q3NqWkphTERRaHJlS3N2Z0olMkJoNG9yUWglMkZTT0xrSDEyRkx5eEUlMkZBelJyaERPNWdGWllYNjBYVXFwQ0FEaUgwODRJZlRDNnRqJTJCNzRra2M2Qm9mck8lMkJHNEY1b2NWa1hDU0NNWEFrYW9Zd1VuWUlkSk9panpVNTJSeUt2d1lRUVR2NCUyQlNwNmtXU1hZTnVLMCUzRA; cto_bidid=CAtonF95ZnkxMTloJTJCMlptQXkzM09ZWnJzQUZjNnhNaEpoMjUwZm04ZVZROTlCVEI4ZEViNFRHRzFVam1JUjZTWDV5a1oxNzhTQzg5TW9BMHAxYzQxJTJCb0UzJTJCZCUyQk5KUmNySFdzN05wWE41NkIlMkJRTEwyWFpmUWdrMnlsZmxkQlo3S0tkTjU; _ga_7VSQQ2WNWN=GS1.1.1727944025.1.1.1727945473.0.0.0; _ga_FVWZ0RM4DH=GS1.1.1727944033.1.0.1727945473.60.0.0; _ga=GA1.1.1945502946.1727944026"
}

In [22]:
"""
Scarica e analizza il file XML compresso da un URL specificato.

La funzione scarica il file XML compresso da un URL specificato, lo decomprime e lo analizza per estrarre gli URL degli artisti presenti nel file.

Args:
    url (str): L'URL del file XML compresso da scaricare.

Returns:
    list: Una lista di URL degli artisti presenti nel file XML.

Raises:
    None: La funzione stampa un messaggio di errore e restituisce None se si verifica un errore durante il download o l'analisi del file.
"""
def xml_maptree(url):

    global s

    if not s:
        s = requests.Session()

    headers_dict['User-Agent'] = random.choice(user_agents_list)
    print('\nRequest for:', url)
    r = s.get(url, headers=headers_dict)

    if r.status_code == 200:
        if not isinstance(r.content, bytes):
            print('Error: Content is not bytes')
            return None
    else:
        print('Error:', r.status_code)
        return None

    with gzip.GzipFile(fileobj=BytesIO(r.content)) as gz:
        # READ
        xmlgz = gz.read()

        # WRITE
        xmlgz_f = os.path.join(xml_dir, os.path.basename(url))
        with open(xmlgz_f, 'wb') as f:
            f.write(xmlgz)

        print(f'Added {os.path.basename(xmlgz_f)}')
        
        # PARSE
        root = ET.fromstring(xmlgz)

        artists_urls_in_map = []
        for x in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
            if 'artist' in x.text:
                artists_urls_in_map.append(x.text)

    return artists_urls_in_map

In [23]:
"""
Retrieves an XML file from a URL, decompresses it, and saves it to a local directory.

Args:
    filelist (list): A list to store the names of the saved XML files.
    url (str): The URL of the XML file to retrieve.

Returns:
    list: The updated list of saved XML file names.

Notes:
    - The function checks if the response content is a bytes instance before attempting to decompress it.
    - If the response status code is not 200, a warning message is printed.
    - The saved XML file is named in the format 'XX_filename.xml', where 'XX' is a zero-padded index and 'filename' is the original file name from the URL.
"""
def xml_branch(filelist, url):
    
    global s

    if not s:
        s = requests.Session()


    headers_dict['User-Agent'] = random.choice(user_agents_list)
    print('\nRequest for:', url)
    r = s.get(url, headers=headers_dict)
    
    if r.status_code == 200:

        if not isinstance(r.content, bytes):
            print(f'Warning: {url.split("/")[-1]} is not bytes instance')

        else:       
            with gzip.GzipFile(fileobj=BytesIO(r.content)) as gz:
                # READ
                gz = gz.read()

                # SAVE
                index = str(i+1).zfill(2)
                fname = f'{index}_{os.path.basename(url)}'
                fname_f = os.path.join(xml_dir, fname)
                with open(fname_f, 'wb') as f:
                    f.write(gz)

                filelist.append(fname)
                print(f'Added {fname}')

    else: 
        print(f'Warning: {url.split("/")[-1]} request status code: {r.status_code}')

    return filelist

In [None]:
# Initiate session
s = requests.Session()

xfiles = os.listdir(xml_dir)
# Get sitemap of Bandsintown
if 'sitemap.xml.gz' in xfiles:
    print(f'{os.path.basename(sitemap)} already downloaded')
else:
    xml_urls = xml_maptree(sitemap)
    # Download all files from URLs inside the sitemap structure
    for i,xml_url in enumerate(xml_urls):
        if os.path.basename(xml_url) in [x.split('_')[-1] for x in xfiles]:
            print(f'{xml_url.split("/")[-1]} already downloaded')
            continue
        filelist = xml_branch(xfiles, xml_url)
        sleep(1)

## Artist URL (set/txt)

Ora abbiamo due __set__ ottenuti da bandsintown:

- `artist_url_set`: contiene gli URL alle pagine degli artisti
```plaintext  
    https://www.bandsintown.com/a/14870794-hend-lo
    https://www.bandsintown.com/a/8649612-equipo-latino-mundial
```
- `artist_set`: contiene i nomi degli artisti, estratti dagli URL
```plaintext  
    hend lo
    equipo latino mundial
```

Similmente gli stessi contenuti sono scritti in due file di __testo__:

- `artists_url_from_xml.txt`
- `artists_name_from_xml.txt`

In [None]:
skip = os.path.exists(os.path.join(xml_dir, 'artists_name_from_xml.txt')) and os.path.getsize(os.path.join(xml_dir, 'artists_name_from_xml.txt')) > 0

if skip:
    print('Files already!')
else:
    artist_url_set = set()
    artist_name_set = set()
    equal, j = 0, 0

    zipps = [x for x in xfiles if x.endswith('.xml.gz') and 'artists' in x]
    for j,xfile in enumerate(zipps):
        xfile = os.path.join(xml_dir, xfile)
        
        root = ET.parse(xfile).getroot()

        print(f"Parsing {str(j+1).zfill(2)}/{len(zipps)}: {xfile.split('\\')[-1]} -> Size: {os.path.getsize(xfile) / 1024**2:.2f} MB")

        for url in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
            
            artist_url_set.add(url.text)
            try:
                name = url.text.split('/')[-1]

                if len(name) > 0:
                    name = name.split('-', 1)[1].replace('-', ' ').lower()
                    artist_name_set.add(name)
                else:
                    raise IndexError
                
            except IndexError:
                continue

    with open(os.path.join(xml_dir, 'artists_url_from_xml.txt'), 'w', encoding='utf-8') as f:
        for artist_url in artist_url_set:
            f.write(artist_url + '\n')
        print(f'\nScritti {len(artist_url_set):,.0f} URL artisti in file \'artists_url_from_xml\'')

    with open(os.path.join(xml_dir, 'artists_name_from_xml.txt'), 'w', encoding='utf-8') as f:
        for artist_name in artist_name_set:
            f.write(artist_name + '\n')
        print(f'\nScritti {len(artist_name_set):,.0f} nomi in file \'artists_url_names_xml\'')
    
    if equal > 0:
        print(f'\nSkipped {equal:,.0f} files \'XX_artists_xml.gz\' ...')

### BiTown (DataFrame)

In [24]:
# 'URL' column
bintown_df = pd.read_csv(os.path.join(data_dir, 'BiT_Archive', 'BiT_urls_names.csv'))

def url_to_name(str):
    if '-' not in str:
        return
    return str.split('/')[-1].split('-', 1)[1].replace('-', ' ').lower()

# 'NAME' column
bintown_df['bitown_NAME'] = bintown_df['BiT_name'].map(url_to_name)


print(f'DF has {bintown_df.shape[0]:,.0f} rows. Columns: {list(bintown_df.columns)}')

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\mcontini\\Desktop\\GitHub\\public\\Season\\Bandsintown\\data\\artists_url_from_xml.txt'

In [None]:
# Count of artists with non latin characters (RU, JAP, ...)
nacount = bintown_df[bintown_df['bitown_NAME'].isna()].count().sum()
# Drop them
bintown_df = bintown_df.dropna(subset=['bitown_NAME'])


print(f'\'bintown_df\' contains all {bintown_df.shape[0]:,.0f} urls from scraped sitemap with their corresponding names.\nSome of them may be incorrect due to false endpoints!\nNote that {nacount:,.0f} URLs were dropped due to non latin characters or other inconsistent naming.\n\nURLs found: {bintown_df.shape[0]:,.0f}')

### Merged (DataFrame)

In [None]:
bintown_df = pd.read_csv('bintown_df.csv', sep=';', index_col=False, keep_default_na=False)
if not bintown_df.empty():
    pass

In [None]:
# MERGE!
# Left: eventi in bandsintown.com
# Right: artisti in Spotify followlist
merged_df = bintown_df.merge(short_df, how='right', left_on='bitown_NAME', right_on='name', indicator=True)

# Sort by 'popularity' and 'followers'
merged_df.sort_values(by=['popularity', 'followers'], ascending=False, inplace=True)


notfound = merged_df[merged_df['_merge']=='right_only'].drop(columns=['BiT_name', 'BiT_nurl', '_merge']).reset_index(drop=True)
print(f'\'notfound\': the following {notfound.shape[0]:,.0f} artists were not found in BiTown (Spotify list has {merged_df.shape[0]:,.0f}).')

notfound.head(3)

In [None]:
bandsinspot = merged_df[merged_df['_merge']=='both'].drop(columns=['bitown_NAME', 'id','_merge']).reset_index(drop=True)
bandsinspot['name'] = bandsinspot['name'].str.lower()

print(f'\'bandsinspot\': the other {bandsinspot.shape[0]:,.0f} artists were found in the Bandsintown archive.')
display(bandsinspot.head(3))


bandsinspot.to_csv('bandsinspot_df.csv', sep=';', index=False)

### Searchtools

Si vuole controllare se un artista `valuename` è presente in `short_df` e/o in `bintown_df`

```python
    # Get a certain row in 'bintown_df' and 'short_df' filtering by artist name
    valuename = 'chelsea wolfe'

    x = bintown_df[bintown_df['bitown_NAME'] == valuename.lower()]
    y = short_df[short_df['name'] == valuename.lower()]

    if not x.empty and not y.empty:
        print(x.iloc[0], '\n\n', y.iloc[0], sep='')
    else:
        print(f'\'{valuename}\' not found.')
```

In [None]:
"""
# Get a certain row in 'bintown_df' and 'short_df' filtering by artist name
valuename = 'chelsea wolfe'

x = bintown_df[bintown_df['bitown_NAME'] == valuename.lower()]
y = short_df[short_df['name'] == valuename.lower()]

if not x.empty and not y.empty:
    print(x.iloc[0], '\n\n', y.iloc[0], sep='')
else:
    print(f'\'{valuename}\' not found.')
"""

Il valore `search_for` è il valore da cercare in `in_column`, ad esempio se si vuole cercare un particolare _genere_ nelle liste __genres__ degli artisti

```python
    # Check if 'search_for' keyword (list or str) is in 'in_column' column
    search_for = 'psy'
    in_column = 'genres'

    k = short_df.copy()

    k['search'] = k[in_column].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
    k = k[ k['search'].str.contains(search_for, case=False) ]

    if not k.empty:
        k = k.sort_values(['popularity', 'followers'], ascending=False)

    k
```

In [None]:
"""
# Check if 'search_for' keyword (list or str) is in 'in_column' column
search_for = 'oriental'
in_column = 'genres'

k = short_df.copy()

k['search'] = k[in_column].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
k = k[ k['search'].str.contains(search_for, case=False) ]

# NO duplicated words in 'search'
k['search'] = k['search'].map(lambda x: {i.lower() for i in x.split()})
k['search'] = k['search'].map(lambda x: ', '.join(x))

if not k.empty:
    k = k.sort_values(['popularity', 'followers'], ascending=False)

print(f'{k.shape[0] if not k.empty else 'No'} artists with \'{search_for}\' in column {in_column}.')

k[['name', 'search']]
"""

## BeastSoup

### Async Scraping

In [260]:
bandsinspot = pd.read_csv(os.path.join(data_dir, 'BiT_Archive', 'BiT_Spotify_innerjoin.csv'), sep=';', index_col=False, keep_default_na=False)

In [None]:
from bs4 import BeautifulSoup
import random
#from concurrent.futures import ThreadPoolExecutor

s = requests.Session()
url_series = bandsinspot['BiT_url']

print(f'Number of URLs to scrape: {len(url_series)}')

user_agents_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.3",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.3",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPhone; CPU iPhone OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1_4 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_0_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
    "Mozilla/5.0 (iPad; CPU OS 12_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Mobile/15E148 Safari/604.1",
]


import requests

headers_dict = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "it-IT,it;q=0.9,en-IT;q=0.8,en;q=0.7,en-US;q=0.6",
    "priority": "u=0, i",
    #"sec-ch-ua": "\"Google Chrome\";v=\"129\", \"Not=A?Brand\";v=\"8\", \"Chromium\";v=\"129\"",
    #"sec-ch-ua-mobile": "?0",
    #"sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "cookie": "cto_bidid=YpcZ2l95ZnkxMTloJTJCMlptQXkzM09ZWnJzQUZjNnhNaEpoMjUwZm04ZVZROTlCVEI4ZEViNFRHRzFVam1JUjZTWDV5a1oxNzhTQzg5TW9BMHAxYzQxJTJCb0UzJTJCVUJYWmluY29PbE16MUJIWVV5JTJCSjFGTHplRWk5NXRkUFZ3T292eHBFb1B6; _csrf=4J8rodMvFtc_YcI-_7wgzSXs; bit_pc=1; bit_geo=%257B%2522latitude%2522%253A45.4722%252C%2522longitude%2522%253A9.1922%252C%2522name%2522%253A%2522Milan%252C%2520Italy%2522%257D; _ga_FVWZ0RM4DH=GS1.1.1727983632.1.0.1727983632.60.0.0; OptanonAlertBoxClosed=2024-10-03T19:27:15.445Z; OptanonConsent=isGpcEnabled=0&datestamp=Thu+Oct+03+2024+21%3A27%3A15+GMT%2B0200+(Ora+legale+dell%E2%80%99Europa+centrale)&version=202406.1.0&browserGpcFlag=0&isIABGlobal=false&hosts=&landingPath=NotLandingPage&groups=C0001%3A1%2CC0004%3A1%2CC0003%3A1%2CC0002%3A1%2CV2STACK42%3A1&geolocation=IT%3B25&AwaitingReconsent=false; eupubconsent-v2=CQF6ZfAQF6ZfAAcABBENBJFsAP_gAEPgACiQKftV_G__bWlr8X73aftkeY1P9_h77sQxBhfJE-4FzLvW_JwXx2ExNA36tqIKmRIAu3bBIQNlGJDUTVCgaogVryDMaE2coTNKJ6BkiFMRM2dYCF5vm4tj-QKY5vr991dx2B-t7dr83dzyz4VHn3a5_2a0WJCdA5-tDfv9bROb-9IOd_x8v4v8_F_rE2_eT1l_tWvp7D9-cts7_XW89_fff_9Ln_-uB_-_3_gp4ASYaFRAGWRISEGgYQQIAVBWEBFAgAAABIGiAgBMGBTsDAJdYSIAQAoABggBAACDIAEAAAkACEQAQAFAgAAgECgADAAgGAgAYGAAMAFgIBAACA6BimBBAoFgAkZkRCmBCEAkEBLZUIJAECCuEIRZ4FEAiJgoAAASACsAAQFgsDiSQErEggS4g2gAAIAEAggAKEUnZgCCAM2WovFk2jK0wLR8wXPaYBkgQAAA.f_wACHwAAAAA; cookie=627f66c8-02ef-4bda-97fb-153e424e47e7; cookie_cst=HSziLAwsTg%3D%3D; ccsid=89cbc030-e204-46c4-ae56-003decdf4e4a; _au_1d=AU1D-0100-001727983636-VJSG1DCV-CVLO; _au_last_seen_iab_tcf=1727983636507; lotame_domain_check=bandsintown.com; _cc_id=12e49cdacf035eb741fc38f74e9f69e1; panoramaId_expiry=1728588471504; panoramaId=1824a7993c31a1ff510cdc348d7e4945a7029be6f9be70b9d450219d66d0fbb4; panoramaIdType=panoIndiv; _ga=GA1.2.1945502946.1727944026; _gid=GA1.2.107389706.1727983637; ccuid=e9e2111f-cabb-457b-afdf-1233b184091c; __qca=P0-1295155093-1727983636129; __gads=ID=5d28bebfaa4f90a2:T=1727971455:RT=1727983672:S=ALNI_MaYX8mrGHlQAtSL1GiPLczPJaXZyA; __gpi=UID=00000f1f788d55e0:T=1727971455:RT=1727983672:S=ALNI_MZrVEOHL92V_k3LOYcHkwWniFH6-w; __eoi=ID=bca63a398a258927:T=1727971455:RT=1727983672:S=AA-AfjaitKB_ejY1FOiwM6VCecXY; _ga_7VSQQ2WNWN=GS1.1.1727983632.3.1.1727983637.0.0.0; cto_bundle=vOvdKV9IYzhDblJDc3hnRU44JTJCSTdabE82ZGJ6bTd5TEpzJTJGTlhrNThQY1g2U2doOVlVOEQxekZxSmpRME1QeiUyQiUyQmlIRFNuU0syaXklMkZKeDlWQ0F5ZTI2VmUwU0xWNmZPeUE4Y3olMkY5cnp6c3F6Rkt5V0dGRFF6ZTh6WVJtZTVrVW5wSU42bjdNQWJKUTZ6UEp0JTJCT1BHMDFpREJ1YyUyRlQzaTRWTUxacGFMbWJOZ01peGdzJTNE"
}


In [262]:
def dicts_from_soup(jsoups: list) -> list[dict]:
    """
    Extracts dictionaries from a nested list of JSON-like objects.

    Args:
        jsoups: A list containing nested lists and dictionaries.

    Returns:
        A list of dictionaries with the key '@type' equal to 'MusicEvent'.
    """

    artist_dicts = []
    for soup in jsoups:

        if isinstance(soup, list):
            for dictionary in soup:
                artist_dicts.append(dictionary)

        elif isinstance(soup, dict):
            artist_dicts.append(soup)
        else:
            print(f'ERROR: {type(soup)} not dict nor list')

    return [artist_dicts[i] for i in range(len(artist_dicts)) if artist_dicts[i]['@type'] == 'MusicEvent']

In [64]:
location_data_keys = ['addressCountry', 'addressLocality', 'streetAddress', 'postalCode']
artist_data_keys = ['name', 'url', 'startDate']

def keypop(dictionary: dict) -> dict:
    """
    Removes keys from a dictionary that are not in a list of keys.

    Args:
        diction: The dictionary to modify.

    Returns:
        A new dictionary with the specified keys removed.
    """

    #print('\n\'keypop\' function')

    dick = dictionary.copy()

    before_keys = dictionary.keys()
    for k in before_keys:
        if k not in artist_data_keys + location_data_keys + ['coordinates', 'artist']:
            dick.pop(k)

    return dick


def unwrap(diction: dict) -> dict:
    """
    Extracts specific keys from a dictionary and modifies the dictionary in-place.

    Args:
        diction: The dictionary to modify.

    Returns:
        The modified dictionary.
    """

    #print('\n\'unwrap\' function')

    dick = diction.copy()

    if not isinstance(dick, dict):
        print('Not a dict')
        return
    elif 'location' not in list(dick.keys()):
        print('No location key')
        return
    
    dick['artist'] = dick['performer']['name']

    location = dick['location']['address']
    for i in location.keys():
        if len(i) > 0 and i in location_data_keys:
            dick[i] = location[i]

    dick['coordinates'] = ( dick['location']['geo']['latitude'], dick['location']['geo']['longitude'] )

    return dick


def artist_event_df(event_dict_list: list[dict] | dict | int) -> pd.DataFrame:
    """
    Converts a list of dictionaries to a pandas DataFrame.

    Args:
        event_dict_list: A list of dictionaries or a single dictionary.
        It may be a int if response.status_code is not 200 in previous GET request

    Returns:
        A pandas DataFrame containing the data from the input dictionaries.
    """

    if isinstance(event_dict_list, int):
        print('\nWARNING: Last request terminated with status code:', event_dict_list)
        return

    ev_list = list(filter(lambda x: x is not None, event_dict_list if isinstance(event_dict_list, list) else [event_dict_list]))

    if len(ev_list) == 0:
        print(f'\nNo events for selected artist... Check method or change artist!')
        return
    #else:
        #print(f'\nWorking on {len(ev_list)} dict...')

    ev_df_list = []
    for ev in ev_list:
        ev = unwrap(ev)
        ev = keypop(ev)
        ev_df_list.append(ev)

    ev_df = pd.DataFrame.from_records(ev_df_list)
    print(f'Shape:', ev_df.shape)

    return ev_df

### AllEvents! (DataFrame)

In [None]:
import concurrent.futures
import requests
from threading import Lock

import pandas as pd
import os

lock = Lock()

data_df = pd.DataFrame()
datalist = []

url_series = bandsinspot['BiT_url']

In [1]:
def json_response(response_text):
    if response_text:
        zuppa = BeautifulSoup(response_text, 'html.parser')
        script_soup = zuppa.find_all('script', type='application/ld+json')
        
        jsoups = [json.loads(script_soup[i].text) for i in range(len(script_soup))]

        dict_events = dicts_from_soup(jsoups)
        
        return dict_events if dict_events else None
    
    else:
        raise requests.RequestException
    

def fetch_url(url):
    #with lock:
        #sleep(random.uniform(0.6, 1.7))  # sleep casuale tra le richieste
    
    bot_headers = headers_dict
    bot_headers['User-Agent'] = random.choice(user_agents_list)
    
    print('\nFetching:', url.split('/')[-1].split('-',1)[1])
    
    response = s.get(url, headers=bot_headers)
    #response.raise_for_status()
    if response.status_code == 200:
        return response.text
    
    else:
        e = response.status_code
        print(f"Error with {url} - {e}")
        return None


def process_response(response_text):
    if response_text:
        return json_response(response_text)
    else:
        return None

In [None]:
errors = 0
error_urls = []
if 'all_events_df.csv' not in os.listdir() or 'all_events_alldicts.csv' not in os.listdir():
    if data_df.empty:
        with concurrent.futures.ThreadPoolExecutor(max_workers=30) as executor:
            futures = {executor.submit(fetch_url, url): url for url in url_series}
            responses = []
            for future in concurrent.futures.as_completed(futures):
                url = futures[future]

                response_text = future.result()
                response_data = process_response(response_text)
                if response_data is not None:
                    datalist.extend(response_data)
                else:
                    errors += 1
                    error_urls.append(url)
                    print(f"Added in error list: {url.split('/')[-1].split('-',1)[1]}")

        data_df_alldicts = pd.DataFrame(datalist)
        data_df = artist_event_df(datalist)
        
        
        if errors == 0:
            data_df_alldicts.to_csv('all_events_alldicts.csv', mode='w', sep=';', index=False)
            data_df.to_csv('all_events_df.csv', mode='w', sep=';', index=False)
            print('dataframes saved! in folder', os.getcwd().split('GitHub\\')[1])
        else:
            print('DF wasn\'t saved in CSV file because of', errors, 'errors')
        

else:
    print('CSV already saved in path:', os.getcwd().split('GitHub\\')[1])
    data_df_alldicts = pd.read_csv('all_events_alldicts.csv', sep=';')
    data_df = pd.read_csv('all_events_df.csv', sep=';')
    

print(data_df.shape)
data_df.head()

In [None]:
print(errors)
print(*error_urls, sep='\n')

In [None]:
# Try again with error_urls as single sync request
if errors > 0:
    for url in error_urls:
        text = fetch_url(url)
        
        response_data = process_response(text)
        if response_data is not None:
            datalist.extend(response_data)
        else:
            errors -= 1
            print(f"Error again with {url} -> {errors} errors left")

if errors == 0 and 'all_events_df.csv' not in os.listdir() and 'all_events_alldicts.csv' not in os.listdir():
    data_df_alldicts.to_csv('all_events_alldicts.csv', mode='w', sep=';', index=False)
    data_df.to_csv('all_events_df.csv', mode='w', sep=';', index=False)
    print('dataframes saved! in folder', os.getcwd().split('GitHub\\')[1])
else:
    print('DF wasn\'t saved in CSV file because of', errors, 'errors left...')

In [None]:
def pdatalist2(list):
    xd = pd.DataFrame(list)
    print('Shape:', xd.shape)
    return xd

if 'all_events_df.csv' not in os.listdir():
    if errors == 0:
        df_datalist = pd.DataFrame(datalist)
        df_datalist.to_csv('all_events_alldicts.csv', mode='w', sep=';', index=False)
        artist_event_df(datalist).to_csv('all_events_df.csv', mode='w', sep=';', index=False)
        print('dataframes saved! in folder', os.getcwd().split('GitHub\\')[1])
    else:
        print('DF wasn\'t saved in CSV file because of', errors, 'errors')
else:
    print('Files already saved!')

# Querying (csv)

In [52]:
import numpy as np
from datetime import datetime
import geopy.distance

def coordinates(x):
    if 'coordinates' not in x.columns:
        print('no \'coordinates\' column in dataframe')
        return x
    
    if type(x.iloc[0]['coordinates']) != tuple:
        print(f'\'coordinates\' column will be converted from {type(x.iloc[0]['coordinates'])} to tuple')
        x['coordinates'] = x['coordinates'].apply(lambda y: tuple((float(y.split('(')[1].split(',')[0]), float(y.split(',')[1].split(')')[0]))))

    x['latitude'] = x['coordinates'].apply(lambda x: round(x[0], 2))
    x['longitude'] = x['coordinates'].apply(lambda x: round(x[1], 2))
    return x


def distance_from_sesto(home, lat, lon):
    place = (lat, lon)
    distance = geopy.distance.geodesic(home, place).km
    return round(distance, 1)


def date_format(date):
    date = datetime.strptime(date, '%Y-%m-%dT%H:%M:%S')
    date = date.strftime('%d/%m/%Y %H:%M')
    return date


def fill_empties(x):
    for c in x.columns:
        x[c] = x[c].apply(lambda y: '' if y in ['nan', 'NaN', np.nan] else y)
    return x

In [None]:
# SestoSG
# (lat,lon) in gradi decimali (DD) 
sestosg = (45.53, 9.23)

def complexity(df):
    x = df.copy()
    x = fill_empties(x)
    x = coordinates(x)
    x['Date'] = x['startDate'].apply(lambda y: date_format(y))
    x['distance_from_Sesto'] = x.apply(lambda y: distance_from_sesto(sestosg, y['latitude'], y['longitude']), axis=1)
    x['artist'] = x['artist'].str.lower()
    x = x.drop(columns=['url', 'postalCode', 'coordinates'], inplace=False)
    return x

if 'all_events_df.csv' not in os.listdir():
    print('no \'all_events_df.csv\' found in folder', os.getcwd().split('GitHub\\')[1])
else:
    all_events_df = pd.read_csv('all_events_df.csv', sep=';')
    #print(all_events_df.shape, all_events_df.columns)
    all_events_df = complexity(all_events_df)
    print('dataframe shape:', all_events_df.shape)
    
all_events_df.sort_values(by='startDate', ascending=True).head(5)

In [None]:
raga = all_events_df[ all_events_df['distance_from_Sesto'] < 10 ].sort_values(by='startDate', ascending=True)

raga

In [None]:
genres_map = bandsinspot.set_index('name')['genres'].to_dict()
popularity_map = bandsinspot.set_index('name')['popularity'].to_dict()

def mapper(df_from, keyword, column_returned):
    return df_from.set_index(keyword)[column_returned].to_dict()

all_events_df['genres'] = all_events_df['artist'].map(mapper(bandsinspot, 'name', 'genres'))
all_events_df['popularity'] = all_events_df['artist'].map(mapper(bandsinspot, 'name', 'popularity'))
raga = all_events_df[ all_events_df['distance_from_Sesto'] < 10 ].sort_values(by='startDate', ascending=True)

raga

# Europe (DataFrame)

In [None]:
display(bandsinspot.head())
display(short_df.head())

In [None]:
def is_europe(latitude, longitude):
    # Definizione dei limiti di latitudine e longitudine per l'Europa
    s = 34
    n = 72
    e = -25
    w = 45

    # Verifica se il punto rientra nei limiti definiti
    if s <= latitude <= n and e <= longitude <= w:
        return True
    return False

df_europe = j.copy()
df_europe = df_europe[ df_europe.apply(lambda x: is_europe(x['latitude'], x['longitude']), axis=1) ]
df_europe['month'] = df_europe['startDate'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S').month)
df_europe['year'] = df_europe['startDate'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S').year)

# Query for year=2025 and month=1
#df_europe = df_europe.query('year==2025 and month==5').sort_values(by='startDate', ascending=False)
# Column selection
#df_europe = df_europe[['artist', 'Date', 'addressLocality', 'addressCountry', 'distance_from_Sesto', 'genres', 'popularity']]
df_europe['specific_genre'] = df_europe['genres'].apply(lambda x: set(word for sublist in x for word in sublist.split() ))
df_europe