# HTML Parsing

Dobbiamo usare un parser HTML per estrarre le informazioni relative alle band seguite su Spotify.  
In particolare, dobbiamo estrarre il nome, che si trova internamente ai blocchi `<div>` `</div>`, ma solo se c'è una stringa di testo all'interno, altrimenti si continua la ricerca.  
Per fare ciò, possiamo usare il modulo `BeautifulSoup` di Python.

```python
    from bs4 import BeautifulSoup
```


In [3]:
import os
from bs4 import BeautifulSoup # type: ignore

# Saved HTML page that lists the followed artists (expected in the working directory).
html_file = 'bandsintown followed.html'

# Read the whole page as UTF-8 text.
with open(html_file, 'r', encoding='utf-8') as f:
    html = f.read()

# Parse the markup with Python's built-in HTML parser backend.
soup = BeautifulSoup(html, 'html.parser')

Senza apparente motivo, gli elementi `<div>` hanno come attributo `class` una stringa non comprensibile.

Tuttavia il nome della band si trova sempre all'interno di un blocco `<div>` con attributo `class` uguale a `hVlkNE3ZIIiARNi4swgW`, come qui:

```html
      <div class="kXGHOPGn59hvZ3KIEx8t">
        <div class="hVlkNE3ZIIiARNi4swgW">1349</div>
        <span class="rxFGJF_Hmm6LdBb5G2lK"></span>
        <div class="rCYfM3PKAnUG7__xeA5N"></div>
      </div>
```

In questo caso, il nome della band è `1349`, e si trova all'interno del blocco `<div>` con attributo `class="hVlkNE3ZIIiARNi4swgW"`.

In [None]:
# Collect the artist names found in <div class="hVlkNE3ZIIiARNi4swgW"> tags.
# str.strip() removes leading/trailing whitespace (newlines and tabs included);
# every name is lower-cased as well.
name_divs = soup.find_all('div', class_='hVlkNE3ZIIiARNi4swgW')
artists = [tag.get_text().strip().lower() for tag in name_divs]

# Print the total number of artists (uncomment to dump the whole list)
#print(artists)
print(len(artists), 'artisti seguiti\n')

# Short preview: first three names, an ellipsis, last three names
preview = artists[:3] + ['...'] + artists[-3:]
print(*preview, sep='\n')

In [None]:
# Save the list of followed artists to a text file, one name per line.
os.makedirs('parsing', exist_ok=True)

artists_followed = os.path.join('parsing', 'artists_followed_from_spotify.txt')
# An existing file is never overwritten.
if not os.path.exists(artists_followed):
    with open(artists_followed, 'w', encoding='utf-8') as f:
        f.writelines(artist + '\n' for artist in artists)

# Check the file on disk: a (closed) file object is always truthy, so testing
# the handle itself would report success unconditionally.
if os.path.isfile(artists_followed):
    print('File \'artists_followed_from_spotify.txt\' creato con successo!')
else:
    print('Errore durante la creazione del file artists_followed_from_spotify.txt')

# XML Maps

Per effettuare richieste HTTP, possiamo usare il modulo `requests` di Python.  

```python
    import requests
```

Questo script scarica un file gzip da [sitemap/sitemap.xml.gz](<https://www.bandsintown.com/sitemap/sitemap.xml.gz>), estrae il file XML e ricava dal suo interno gli URL di altri XML di interesse, restituendoli in una lista.

In [13]:
# HTTP headers sent with every request; the User-Agent and Accept values are
# the ones a desktop Chrome browser would send.
headers_dict = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    # Add any other required headers here
}

In [14]:
import requests # type: ignore
import gzip
import xml.etree.ElementTree as ET
from io import BytesIO

# URL of the site's gzip-compressed XML sitemap.
# (Named `sitemap_url` so the builtin `map` is not shadowed.)
domain = 'https://www.bandsintown.com'
sitemap_url = domain + "/sitemap/sitemap.xml.gz"

# Extra request headers (placeholder, currently unused)
headers_from_console = {}

# Defined up front so later cells can always iterate over it, even when the
# download fails.
artist_url_in_map = []

# Download the gzip file; the timeout prevents the cell from hanging forever
response = requests.get(sitemap_url, headers=headers_dict, timeout=30)
if response.status_code == 200:
    # Decompress the gzip payload and read the XML it contains
    with gzip.GzipFile(fileobj=BytesIO(response.content)) as gz:
        xml_content = gz.read()

    # Parse the XML
    root = ET.fromstring(xml_content)

    # Keep a copy of the sitemap on disk
    os.makedirs('parsing', exist_ok=True)
    with open(os.path.join('parsing', 'sitemap.xml'), 'wb') as f:
        f.write(xml_content)

    # Keep only the <loc> entries whose URL mentions 'artist';
    # empty <loc> elements (text is None) are ignored.
    for url in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
        if url.text and 'artist' in url.text:
            artist_url_in_map.append(url.text)
else:
    print(f"Errore durante il download del file: {response.status_code}")

Ciascuno di questi file XML contiene link a pagine web di artisti in bandsintown.

- [artists.xml.gz](<https://www.bandsintown.com/sitemap/artists.xml.gz>)
- [artists1.xml.gz](<https://www.bandsintown.com/sitemap/artists1.xml.gz>)
- [artists2.xml.gz](<https://www.bandsintown.com/sitemap/artists2.xml.gz>)
- [artists3.xml.gz](<https://www.bandsintown.com/sitemap/artists3.xml.gz>)
- [artists4.xml.gz](<https://www.bandsintown.com/sitemap/artists4.xml.gz>)
- [artists5.xml.gz](<https://www.bandsintown.com/sitemap/artists5.xml.gz>)
- [artists6.xml.gz](<https://www.bandsintown.com/sitemap/artists6.xml.gz>)
- [artists7.xml.gz](<https://www.bandsintown.com/sitemap/artists7.xml.gz>)
- [artists8.xml.gz](<https://www.bandsintown.com/sitemap/artists8.xml.gz>)
- [artists9.xml.gz](<https://www.bandsintown.com/sitemap/artists9.xml.gz>)
- [artists10.xml.gz](<https://www.bandsintown.com/sitemap/artists10.xml.gz>)
- [artists11.xml.gz](<https://www.bandsintown.com/sitemap/artists11.xml.gz>)

In [None]:
# Directory holding the decompressed artist sitemaps. It is created up front:
# os.listdir() raises FileNotFoundError on a missing directory.
xml_dir = 'artists_from_xml'
os.makedirs(xml_dir, exist_ok=True)

# xfiles always holds bare file names (relative to xml_dir), whether the files
# were just downloaded or were already on disk; sorted for a stable order.
xfiles = sorted(os.listdir(xml_dir))
if not xfiles:
    for index, xml_url in enumerate(artist_url_in_map, start=1):
        # Download the gzip file
        reply = requests.get(xml_url, headers=headers_dict, timeout=60)
        if reply.status_code != 200:
            print(f"Errore durante il download del file {xml_url}: {reply.status_code}")
            continue
        # Decompress the gzip payload
        with gzip.GzipFile(fileobj=BytesIO(reply.content)) as gz:
            xml_content = gz.read()
        # Save the XML under a zero-padded, 1-based name (01_artists.xml, ...)
        gz_name = f'{str(index).zfill(2)}_artists.xml'
        output_file = os.path.join(xml_dir, gz_name)
        with open(output_file, 'wb') as f:
            f.write(xml_content)
        xfiles.append(gz_name)

print('File XML scaricati:', *xfiles, sep='\n')


assert xfiles, 'Nessun file XML scaricato dalla sitemap'

## To-Text

In [None]:
# Consolidated text files, later loaded with pandas to run the queries
artists_url_txt = os.path.join('parsing', 'artist_urls.txt')
artists_name_txt = os.path.join('parsing', 'artist_names.txt')

# When both files already exist they are read back instead of being rebuilt
skip = os.path.exists(artists_url_txt) and os.path.exists(artists_name_txt)

artist_url_set = set()
artist_name_set = set()
k = 0  # number of <loc> entries from which no artist name could be extracted
if not skip:
    for j, xfile in enumerate(xfiles):
        # basename() makes this work for bare file names as well as for paths
        # that already include the directory.
        xfile = os.path.join('artists_from_xml', os.path.basename(xfile))
        # Progress line: file index and file size in MB
        print(f"File { str(j+1).zfill(2)}/{len(xfiles) } {xfile} -> Size: {os.path.getsize(xfile) / 1024**2:.2f} MB")
        root = ET.parse(xfile).getroot()
        for url in root.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
            if not url.text:
                # Empty <loc>: nothing to store
                k += 1
                continue
            artist_url_set.add(url.text)
            # Last path segment looks like '<id>-<name-with-dashes>'
            slug = url.text.split('/')[-1]
            _, dash, name = slug.partition('-')
            if not dash:
                # No dash (or empty segment): the URL carries no artist name
                k += 1
                continue
            artist_name_set.add(name.replace('-', ' ').lower())
    os.makedirs('parsing', exist_ok=True)
    # Sorted output keeps the files identical between runs
    with open(artists_url_txt, 'w', encoding='utf-8') as f:
        f.writelines(artist_url + '\n' for artist_url in sorted(artist_url_set))
    print(f'Scritti {len(artist_url_set)} URL artisti in file {artists_url_txt}')
    with open(artists_name_txt, 'w', encoding='utf-8') as f:
        f.writelines(artist_name + '\n' for artist_name in sorted(artist_name_set))
    print(f'Scritti {len(artist_name_set)} nomi in file {artists_name_txt}')

else:
    with open(artists_url_txt, 'r', encoding='utf-8') as f:
        for line in f:
            artist_url_set.add(line.strip())
    print(f'Letti {len(artist_url_set)} URL artisti da file {artists_url_txt}')
    with open(artists_name_txt, 'r', encoding='utf-8') as f:
        for line in f:
            artist_name_set.add(line.strip())
    print(f'Letti {len(artist_name_set)} nomi da file {artists_name_txt}')

print('Nomi artisti non validi:', k)

# Database vs Following Artists

Ora abbiamo due set ottenuti da bandsintown:

- `artist_url_set`: contiene gli URL alle pagine degli artisti
```plaintext  
    https://www.bandsintown.com/a/14870794-hend-lo
    https://www.bandsintown.com/a/8649612-equipo-latino-mundial
```
- `artist_name_set`: contiene i nomi degli artisti, estratti dagli URL
```plaintext  
    hend lo
    equipo latino mundial
```

## To-Dataframe

Si creano due dataframe: `df_urls`, con la colonna `url` letta da `artist_urls.txt`, e `df_followed`, con la colonna `name` letta da `artists_followed_from_spotify.txt`.

In [None]:
import pandas as pd # type: ignore
from tabulate import tabulate

# Load the Bandsintown artist URLs into a DataFrame
df_urls = pd.read_csv(artists_url_txt, header=None, names=['url']).dropna()

# The followed names are read line by line instead of through read_csv:
# a name containing a comma would be split into several fields, and names
# such as "null", "nan" or "n/a" would be parsed as missing values and dropped.
with open(artists_followed, 'r', encoding='utf-8') as fh:
    followed_names = [line.strip() for line in fh if line.strip()]
df_followed = pd.DataFrame({'name': followed_names})

# Print the first 5 URLs and the first 5 artist names
print(tabulate(df_urls.head(), headers='keys', tablefmt='pretty'))
print(tabulate(df_followed.head(), headers='keys', tablefmt='pretty'))

Bisogna eliminare dal df `df_urls` le righe che contengono URL senza il carattere `-` poiché non contengono un nome concreto di un artista.  
Si rimuovono tali righe dal df.

In [18]:
def nodash_dropper(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Return a copy of ``df`` without the rows whose ``column`` has no '-'.

    Missing values in ``column`` are treated as "no dash" and dropped too.
    The number of removed rows is printed with '.' as thousands separator.

    Args:
        df: Source DataFrame; it is not modified.
        column: Name of a string column to inspect.

    Returns:
        A new DataFrame holding only the rows whose ``column`` contains '-'.
    """
    # Literal match; na=False yields a clean boolean mask when values are missing
    has_dash = df[column].str.contains('-', regex=False, na=False)
    # Count rows (not cells): the number must not depend on how many columns df has
    removed = int((~has_dash).sum())
    kept = df[has_dash].copy()

    count = f'{removed:,}'
    print(f'Nomi artisti senza trattino rimossi: {count.replace(",", ".")}')
    return kept

In [None]:
# Print the (rows, columns) shape of both DataFrames
print(df_urls.shape, df_followed.shape)

In [None]:
# Drop the rows whose 'url' value does not contain the '-' character
df_urls = nodash_dropper(df_urls, 'url')

# Sanity check: collect any dash-less URL that survived the filter
dash_left = df_urls[~df_urls['url'].str.contains('-')]
# Print the results
print(f"Numero di righe senza '-': {dash_left.shape[0]}")
print(f"Indici delle righe senza '-': {dash_left.index.tolist()}")

# The message now states what is actually being checked (rows WITHOUT a dash)
assert dash_left.empty, 'Righe senza trattino rimaste'

## Inner Join

Qui creiamo la colonna `name_from_url` nel dataframe `df_urls`.  
Dopodiché nel dataframe `df_followed` creiamo la colonna `name_for_url`, ottenuta dalla colonna `name` sostituendo in ogni riga lo spazio con il trattino `.replace(' ', '-')`. 

```python
    df_urls['name_from_url'] = df_urls['url'].str.split('/').str[-1]
    df_followed['name_for_url'] = df_followed['name'].str.replace(' ', '-')
```

In [None]:
# Join key on the URL side: last path segment '<id>-<name>' -> '<name>', lower-cased.
# The vectorised .str chain yields NaN (instead of raising) for a slug without
# '-', so the assertion below can actually report the problem.
url_slugs = df_urls['url'].str.split('/').str[-1]
df_urls['name_from_url'] = url_slugs.str.split('-', n=1).str[1].str.lower()
# Join key on the followed side: spaces become dashes, as in the URL slugs
df_followed['name_for_url'] = df_followed['name'].str.replace(' ', '-', regex=False)

assert df_urls['name_from_url'].isna().sum() == 0, 'Ci sono valori mancanti nella colonna name_from_url'
# The message now matches the check: no space may be left in name_for_url
assert df_followed[df_followed['name_for_url'].str.contains(' ')].empty, 'Ci sono valori con il carattere " " nella colonna name_for_url'

print(tabulate(df_urls.head(), headers='keys', tablefmt='pretty'))
print(tabulate(df_followed.head(), headers='keys', tablefmt='pretty'))


### Merge

Ci appoggiamo a queste due colonne per effettuare il `join`.  

Manteniamo nel dataframe `df_urls` solo le righe che hanno un corrispondente nella colonna `name` del dataframe `df_followed`.  
In altre parole, manteniamo solo le righe del __primo df__ che contengono un URL di un artista che è presente nella lista di artisti del __secondo df__.  

Questa operazione si chiama `inner join`, e viene effettuata con il metodo `merge`

```python
    inner_df = df_followed.merge(df_urls, left_on='name_for_url', right_on='name_from_url', how='inner')
```

In [None]:
# Join execution keeping all columns from both DataFrames
inner_df = df_followed.merge(df_urls, left_on='name_for_url', right_on='name_from_url', how='inner')

inner_df_duplicates = inner_df[inner_df.duplicated(subset=['name_for_url'], keep=False)]
print('URL duplicati per uno stesso artista:', inner_df_duplicates.shape[0])
print(tabulate(inner_df_duplicates, headers='keys', tablefmt='pretty'))

inner_df.drop_duplicates(subset=['name_for_url'], keep='first', inplace=True)
print('Artista con URL univoco:', inner_df.shape[0])
print(tabulate(inner_df.head(), headers='keys', tablefmt='pretty'))
print(tabulate(inner_df.tail(), headers='keys', tablefmt='pretty'))

Double-Triple Check di artisti non trovati in url

In [None]:
# Left join with an indicator column: rows flagged 'left_only' are followed
# artists for which no URL was found.
left_joined = df_followed.merge(
    df_urls,
    how='left',
    left_on='name_for_url',
    right_on='name_from_url',
    indicator=True,
)
print(left_joined.columns)
unmatched = left_joined['_merge'] == 'left_only'
not_found = left_joined[unmatched][['name', 'name_for_url']]
not_found.reset_index(drop=True, inplace=True)

"""
for index, row in not_found.iterrows():
    if row['name_for_url'] not in df_urls['name_from_url'].values:
        print(f"{index+1}. {row['name_for_url']}")
"""


### Isin

Un metodo alternativo, ma equivalente, per ottenere lo stesso risultato è usare il metodo `isin`.  
In questo caso il df `df_map_two` è ottenuto in questo modo:

```python
    df_map_two = df_urls[df_urls['name_from_url'].isin(df_followed['name_for_url'])]
```

Non rimuoviamo i duplicati poiché per qualche ragione ci sono artisti a cui corrispondono più URL.

# URL Inspection

Puntiamo ad ognuno degli URL nel dataframe `inner_df` ed effettuiamo il parsing della pagina di atterraggio su bandsintown.  

Cominciamo con un URL di esempio chiamato `sample` riferito alla pagina di `Achille Lauro`:

```python
    sample = 'https://www.bandsintown.com/a/226317-achille-lauro'
```

## Web Soup

Con le librerie `requests` e `BeautifulSoup` è possibile effettuare il parsing di una pagina web e ottenere i dati necessari per l'analisi. In questo caso, si vuole ottenere una tabella a partire dagli eventi presenti nella pagina dell'artista.

### Single Link

In [None]:
#sample = 'https://www.bandsintown.com/a/226317-achille-lauro'

# Use the first row of inner_df as the sample artist
import random


sample = inner_df.iloc[[0]]

# came_from is a random integer between 100 and 999 (both included)
came_from = random.randint(100, 999)
events_suffix = f'/events?came_from={came_from}&utm_medium=web&utm_source=home&utm_campaign=search_bar'
sample_text = sample['url'].values[0] + events_suffix
sample_name = sample['name'].values[0]
print(f"URL di esempio per {sample_name}: {sample_text}")

# One HTTP session, carrying the shared headers, reused by the scraping calls
sesh = requests.Session()
sesh.headers.update(headers_dict)

Per generare la tabella possiamo estrarre per ciascun `artist` il contenuto di blocchi `<div>` con questa sottostruttura:

| class_= | Description |
|---------|-------------|
| __yO7_YNtrhrBbkSdcHF8Y__ | all concerts & live streams |
| __jnX2IOn9AGg9SfWK4eCL__ | month (mmm) |
| __vLfdQ0HSBUy47Eujeqkk__ | day (dd) |
| __TYzA8d85IfvLeyChcYJj__ | venue |
| __D9Nc3q2GrC4mEVUaPKoR__ | city, country |
| __HSi6JUw0uzbTa6L0Se_I__ | url to event |

Proviamo a inserire tutto in un df chiamato `df_events`:

```python
    df_events = pd.DataFrame(columns=['artist', 'datetime', 'venue', 'city', 'country', 'event_url'])
```

In [None]:
# Empty events table; the 'datetime' column is given a real datetime dtype
event_columns = ['artist', 'datetime', 'venue', 'city', 'country', 'event_url']
df_events = pd.DataFrame(columns=event_columns)
df_events['datetime'] = pd.to_datetime(df_events['datetime'])



print('Inizio scraping...')
print(df_events.dtypes)

In [None]:
def get_events(df_events: pd.DataFrame, name: str) -> pd.DataFrame:
    """Scrape the events page of one artist and add its events to ``df_events``.

    The artist URL is looked up in the global ``inner_df`` (columns ``name``
    and ``url``) and fetched through the global session ``sesh``. Each event
    block contributes one row: artist, date, venue, city, country, event URL.
    Events with a missing or unparsable field are reported and skipped.

    The page only shows month and day, so the year is inferred: a month
    earlier than the current one is assumed to belong to next year.

    Args:
        df_events: Table with columns ``artist``, ``datetime``, ``venue``,
            ``city``, ``country``, ``event_url``. It is modified in place.
        name: Artist name exactly as stored in ``inner_df['name']``.

    Returns:
        ``df_events`` itself, always (also when nothing could be downloaded).
        When rows are added the table is de-duplicated on ``event_url``,
        sorted by date and its ``datetime`` column is formatted as
        ``dd/mm/YYYY`` strings.
    """
    def _div_text(node, css_class: str) -> str:
        # Stripped text of the first matching <div>, '' when the tag is absent
        tag = node.find('div', class_=css_class)
        return tag.get_text().strip() if tag is not None else ''

    # Look the URL up without assuming the artist is present
    urls = inner_df.loc[inner_df['name'] == name, 'url']
    base_url = urls.iloc[0] if not urls.empty else None
    if not isinstance(base_url, str) or not base_url:
        print(f"\nNon è stato possibile trovare l'URL per {name}")
        return df_events
    if name in df_events['artist'].values:
        print(f"\nGli eventi per {name} sono già stati scaricati: ci sono {df_events[df_events['artist'] == name].shape[0]} eventi")
        return df_events

    came_from = random.randint(100, 999)
    artist_url = base_url + f'/events?came_from={came_from}&utm_medium=web&utm_source=home&utm_campaign=search_bar'
    print(f"\nScaricamento eventi per {name} all'url {artist_url}...")
    response = sesh.get(artist_url, headers=headers_dict, timeout=30)
    if response.status_code != 200:
        print(f"Errore durante il download di {name}: {response.status_code}")
        # Hand the table back so the caller never receives None
        return df_events

    soup = BeautifulSoup(response.text, 'html.parser')
    # Each <div class="yO7_YNtrhrBbkSdcHF8Y"> block describes one event
    events = soup.find_all('div', class_='yO7_YNtrhrBbkSdcHF8Y')
    if not events:
        print(f"Non ci sono eventi per {name}")
        return df_events

    # A previous call leaves 'datetime' as dd/mm/YYYY strings: turn it back
    # into timestamps so new rows can be sorted together with the old ones.
    df_events['datetime'] = pd.to_datetime(df_events['datetime'], format='%d/%m/%Y', errors='coerce')

    today = pd.Timestamp.today()
    first_new = df_events.shape[0]
    for i, event in enumerate(events):
        # Month (mmm) and day (dd) live in two separate <div> tags
        month = _div_text(event, 'jnX2IOn9AGg9SfWK4eCL').lower()
        day = _div_text(event, 'vLfdQ0HSBUy47Eujeqkk')
        parsed = pd.NaT
        if month and day:
            # Parsed against a leap year so that "feb 29" is accepted here
            parsed = pd.to_datetime(f'{month}-{day}-2000', format='%b-%d-%Y', errors='coerce')
        if pd.isna(parsed):
            print(f"Errore nella data dell'evento {i+1} di {name}")
            continue
        # Infer the year from the month
        year = today.year + 1 if parsed.month < today.month else today.year
        try:
            event_date = parsed.replace(year=year)
        except ValueError:
            # e.g. Feb 29 in a non-leap year
            print(f"Errore nella data dell'evento {i+1} di {name}")
            continue

        # Venue name
        venue = _div_text(event, 'TYzA8d85IfvLeyChcYJj')
        if not venue:
            print(f"Errore nel nome della location dell'evento {i+1} di {name}")
            continue

        # "City, Country": split on the LAST ", " so extra commas stay in the
        # city part; a location without a separator keeps country 'N/A'.
        location = _div_text(event, 'D9Nc3q2GrC4mEVUaPKoR')
        if not location:
            print(f"Errore nella città/nazione dell'evento {i+1} di {name}")
            continue
        city, separator, country = location.rpartition(', ')
        if not separator:
            city, country = location, 'N/A'

        # Event URL: href of the <a> inside <div class="HSi6JUw0uzbTa6L0Se_I">
        link_box = event.find('div', class_='HSi6JUw0uzbTa6L0Se_I')
        anchor = link_box.find('a') if link_box is not None else None
        event_url = anchor.get('href') if anchor is not None else None
        if not event_url:
            print(f"Errore nell'URL dell'evento {i+1} di {name}")
            continue

        # Add the event as a new row
        df_events.loc[first_new + i] = [name, event_date, venue, city, country, event_url]

    # One row per event URL, chronological order, dates as dd/mm/YYYY text
    df_events.drop_duplicates(subset=['event_url'], inplace=True)
    df_events.sort_values(by='datetime', inplace=True)
    df_events['datetime'] = pd.to_datetime(df_events['datetime']).dt.strftime('%d/%m/%Y')
    df_events.reset_index(drop=True, inplace=True)
    return df_events

# Get events for the sample artist. The current table is kept when the call
# hands back nothing, so a failed download cannot replace it with None.
updated_events = get_events(df_events, sample_name)
if updated_events is not None:
    df_events = updated_events

# Pretty-print the table when it holds at least one event
if not df_events.empty:
    print(f'\n{len(df_events)} eventi trovati')
    print(tabulate(df_events, headers='keys', tablefmt='pretty', stralign='left'))


### Links in df_urls

Applichiamo la funzione `get_events` a ciascun URL presente nel dataframe `df_map`:

```python
    for name in inner_df['name']:
        df_all_events = get_events(df_all_events, name)
```

In [None]:
# Empty df_events
all_event_columns = ['artist', 'datetime', 'venue', 'city', 'country', 'event_url']
df_all_events = pd.DataFrame(columns=all_event_columns)
# Last expression of the cell: displays the column index
df_all_events.columns

In [None]:
# NOTE: the loop below is disabled by wrapping it in a string literal; nothing
# in this cell is executed (the string is only evaluated as an expression).
"""# Cycle function for every url in inner_df['url'] given the name in inner_df['name']
import time

for index, row in inner_df.iterrows():
    try:
        time.sleep(1.5)
        df_all_events = get_events(df_all_events, row['name'])
    except Exception as e:
        print(f"Errore durante il download degli eventi per {row['name']}: {e}")"""

# From Soup

In [32]:
import os
import random
import requests
from bs4 import BeautifulSoup as BS

import pandas as pd
import json

from tabulate import tabulate

# Text files holding the URLs and the names of every artist found in the
# Bandsintown sitemap
artists_url_txt = os.path.join('parsing', 'artist_urls.txt')
url_series = pd.read_csv(artists_url_txt, header=None, names=['url']).dropna()

artists_name_txt = os.path.join('parsing', 'artist_names.txt')
name_series = pd.read_csv(artists_name_txt, header=None, names=['name']).dropna()

html_file = 'bandsintown followed.html'
with open(html_file, 'r', encoding='utf-8') as f:
    html = f.read()

zuppa = BS(html, 'html.parser')

# Artist names are the text of the <div class="hVlkNE3ZIIiARNi4swgW"> tags,
# stripped and lower-cased
artists = [
    div.get_text().strip().lower()
    for div in zuppa.find_all('div', class_='hVlkNE3ZIIiARNi4swgW')
]

# Write the followed artists once; an existing file is left untouched
artists_followed = os.path.join('parsing', 'artists_followed_from_spotify.txt')
if not os.path.exists(artists_followed):
    with open(artists_followed, 'w', encoding='utf-8') as f:
        f.writelines(artist + '\n' for artist in artists)

# Followed names are read line by line rather than through read_csv: a comma
# inside a name would be split into fields, and names such as "null" or "nan"
# would be parsed as missing values.
with open(artists_followed, 'r', encoding='utf-8') as f:
    followed_names = [line.strip() for line in f if line.strip()]
df_followed = pd.DataFrame({'name': followed_names})
df_followed['name_for_url'] = df_followed['name'].str.replace(' ', '-', regex=False)

# Same content as url_series, copied so the extra column does not touch it
df_urls = url_series.copy()
url_slugs = df_urls['url'].str.split('/').str[-1]
# '<id>-<name>' -> '<name>'; lower-cased because the followed names are lower-case
df_urls['name_from_url'] = url_slugs.apply(lambda x: x.split('-', 1)[1] if '-' in x else x).str.lower()

# Join execution keeping all columns from both DataFrames
inner_df = df_followed.merge(df_urls, left_on='name_for_url', right_on='name_from_url', how='inner')

# Check the loaded data itself (the length of a path string is never zero)
assert len(inner_df) > 0, 'Nessun artista trovato'
assert len(url_series) > 0, 'Nessun URL trovato'
assert len(name_series) > 0, 'Nessun nome trovato'
#assert len(url_series) == len(name_series), f'Numero di URL e nomi diversi: {len(url_series)} vs {len(name_series)}'

In [34]:
# Number of rows loaded from the URL file and from the name file
print(len(url_series), len(name_series))

565440 557057


In [2]:
# Punti estremi per delimitare l'Europa
europa = [
    ("N", 71.1708, 25.7834),    # Norvegia
    ("S", 36.0007, -5.6045),    # Spagna
    ("E", 54.9358, 59.9700),    # Russia
    ("W", 38.7804, -9.4989)     # Portogallo
]

headers_dict = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    # Altri headers necessari
}

# New session
session = requests.Session()
session.headers.update(headers_dict)

In [3]:
def employees(band: str):
    """Print the people listed as 'employees' in a band's JSON-LD metadata.

    The band URL is looked up in the global ``inner_df`` and its events page
    is fetched through the global ``session``. Every
    ``<script type="application/ld+json">`` block that decodes to a dict with
    a dict-valued ``employees`` entry is reported.

    Args:
        band: Band name exactly as stored in ``inner_df['name']``.

    Returns:
        None. Results and errors are printed.
    """
    if band not in inner_df['name'].values:
        print(f'Nessun {band} trovato in bandsintown')
        return

    url = inner_df[inner_df['name'] == band]['url'].values[0]
    came_from = random.randint(100, 999)
    url = url + f'/events?came_from={came_from}&utm_medium=web&utm_source=home&utm_campaign=search_bar'

    # `session` and `BS` are the names this section of the notebook defines
    response = session.get(url, headers=headers_dict, timeout=30)
    if response.status_code != 200:
        print(f'\nErrore durante il download della pagina di \'{band}\': {response.status_code}')
        return

    soup = BS(response.text, 'html.parser')
    for script in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(script.get_text())
        except json.JSONDecodeError:
            # A malformed block must not stop the remaining ones
            print('Errore durante il parsing del JSON')
            continue
        if not isinstance(data, dict):
            continue
        staff = data.get('employees')
        if not isinstance(staff, dict):
            continue
        if staff.get('@type') == 'Person':
            names = staff.get('name', '')
            # A single name is printed whole, not unpacked character by character
            if isinstance(names, str):
                names = [names]
            print(f'\nEquipaggio della banda \'{band}\':', *names, sep=' ')
        else:
            print(f'Nessun essere umano in \'{band}\', bensì {staff.get("@type")}')

In [16]:
# Store every JSON object as a list in a list
soup = BS(session.get(inner_df['url'].values[0], headers=headers_dict).text, 'html.parser')
scripts_soup = soup.find_all('script', type='application/ld+json')
scripts = []
for script in scripts_soup:
    try:
        scripts.append(json.loads(script.get_text()))
    except json.JSONDecodeError:
        print('Errore durante il parsing del JSON')
        continue

for i, script in enumerate(scripts):
    print(f'\nScript [{i+1}/{len(scripts)}] is type {type(script)}:')
    j = 0
    while j < 1:
        print(f'\tElements are type {type(script)} long {len(script)}')
        j+=1

assert scripts_soup, 'Nessun elemento...'
assert type(scripts) == list, 'Non è stata creata una lista... "application/ld+json"??'


Script [1/3] is type <class 'list'>:
	Elements are type <class 'list'> long 23

Script [2/3] is type <class 'dict'>:
	Elements are type <class 'dict'> long 9

Script [3/3] is type <class 'list'>:
	Elements are type <class 'list'> long 20


In [7]:
def artist_soup(artist_url: str) -> list:
    """Download an artist page and return its JSON-LD ``<script>`` tags.

    The page is fetched through the global ``session``.

    Args:
        artist_url: URL of the artist page.

    Returns:
        The ``<script type="application/ld+json">`` tags found in the page
        (a BeautifulSoup result list), or an empty list when the download
        does not answer with HTTP 200, so callers can always iterate over it.
    """
    response = session.get(artist_url, timeout=30)
    if response.status_code != 200:
        # Report the URL that was actually requested
        print(f"Errore durante il download della pagina {artist_url}: {response.status_code}")
        return []
    soup = BS(response.text, 'html.parser')
    return soup.find_all('script', type='application/ld+json')

def artist_json(artists_scripts_soup: list) -> list:
    """Decode the JSON text of each script tag.

    Args:
        artists_scripts_soup: Objects exposing ``get_text()`` that returns
            JSON text. ``None`` is accepted and treated as "no scripts".

    Returns:
        The decoded objects, in input order. Tags whose text is not valid
        JSON are reported and left out.
    """
    artists_json = []
    # `or []` lets a missing result (None) be handled like an empty page
    for script in artists_scripts_soup or []:
        try:
            artists_json.append(json.loads(script.get_text()))
        except json.JSONDecodeError:
            print('Errore durante il parsing del JSON')
            continue
    return artists_json

In [8]:
# Soup only L artists of the inner_df
# Number of artists (taken from the top of inner_df) whose page is downloaded
L = 5

first_urls = inner_df['url'].head(L)
soups = [artist_soup(page_url) for page_url in first_urls]

In [9]:
# Decode the JSON-LD scripts of every downloaded page
jsons = [artist_json(soup) for soup in soups]

# Per artist: type and length of each decoded script
for i in range(L):
    print(f'\nArtista {inner_df["name"][i]}:')
    artist_scripts = jsons[i]
    total = len(artist_scripts)
    for j, decoded in enumerate(artist_scripts):
        print(f'\tScript {j+1}/{total} is type {type(decoded)} long {len(decoded)}')

assert L == len(soups) == len(jsons), 'Non sono stati scaricati tutti i JSON'
assert all(type(s) is list for s in jsons), 'Non sono state create liste di JSON'
# Expected layout of each artist entry: [events list, overview dict, ratings list]
expected_layout = (
    (0, list, 'Problemi con events_list'),
    (1, dict, 'Problemi con overview_dict'),
    (2, list, 'Problemi con ratings_list'),
)
for position, expected_type, message in expected_layout:
    assert all(type(s[position]) is expected_type for s in jsons), message


Artista 1349:
	Script 1/3 is type <class 'list'> long 23
	Script 2/3 is type <class 'dict'> long 9
	Script 3/3 is type <class 'list'> long 20

Artista 65daysofstatic:
	Script 1/3 is type <class 'list'> long 0
	Script 2/3 is type <class 'dict'> long 8
	Script 3/3 is type <class 'list'> long 15

Artista alice in chains:
	Script 1/3 is type <class 'list'> long 0
	Script 2/3 is type <class 'dict'> long 9
	Script 3/3 is type <class 'list'> long 20

Artista altars:
	Script 1/3 is type <class 'list'> long 1
	Script 2/3 is type <class 'dict'> long 9
	Script 3/3 is type <class 'list'> long 0

Artista andy mckee:
	Script 1/3 is type <class 'list'> long 7
	Script 2/3 is type <class 'dict'> long 9
	Script 3/3 is type <class 'list'> long 20


In [10]:
all_events_list, all_overviews_dict, all_ratings_list = [], [], []

# Artists without events are skipped entirely; for the others the overview
# and the ratings are stored only when they are not empty.
for i in range(L):
    events = jsons[i][0]
    if not events:
        print(f'Skipping {inner_df["name"][i]}')
        continue

    print(f'For artist {inner_df["name"][i]}:')
    all_events_list.append(events)
    print(f'\tAdded {len(events)} events')
    #print(f'\tOverview: {len(jsons[i][1])}')
    overview = jsons[i][1]
    if overview:
        all_overviews_dict.append(overview)
        print(f'\tAdded {len(overview)} overviews')
    #print(f'\tRatings: {len(jsons[i][2])}')
    ratings = jsons[i][2]
    if ratings:
        all_ratings_list.append(ratings)
        print(f'\tAdded {len(ratings)} ratings')

For artist 1349:
	Added 23 events
	Added 9 overviews
	Added 20 ratings
Skipping 65daysofstatic
Skipping alice in chains
For artist altars:
	Added 1 events
	Added 9 overviews
For artist andy mckee:
	Added 7 events
	Added 9 overviews
	Added 20 ratings


In [19]:
# index is artist from 1 to range(L)
sample_event = all_events_list[0]
sample_overview = all_overviews_dict[0]
sample_rating = all_ratings_list[0]

print(f'\nEvents list is type {type(sample_event)} and has {len(sample_event)} elements of type {type(sample_event[0])} with keys:\n {sample_event[0].keys()}')
print(f'\nOverview dict is type {type(sample_overview)} and has {len(sample_overview)} keys:\n {sample_overview.keys()}')
print(f'\nRatings list is type {type(sample_rating)} and has {len(sample_rating)} elements of type {type(sample_rating[0])} with keys:\n {sample_rating[0].keys()}')

print(sample_overview['name'])

assert all([type(s) == list for s in all_events_list]), 'Non sono state create liste di eventi'
assert all([type(s) == dict for s in all_overviews_dict]), 'Non sono state create liste di overview'
assert all([type(s) == list for s in all_ratings_list]), 'Non sono state create liste di ratings'


Events list is type <class 'list'> and has 23 elements of type <class 'dict'> with keys:
 dict_keys(['@context', '@type', 'name', 'startDate', 'endDate', 'url', 'location', 'performer', 'description', 'image', 'eventAttendanceMode', 'eventStatus', 'offers', 'organizer'])

Overview dict is type <class 'dict'> and has 9 keys:
 dict_keys(['@context', '@type', 'name', 'location', 'genre', 'sameAs', 'employees', 'description', 'interactionCount'])

Ratings list is type <class 'list'> and has 20 elements of type <class 'dict'> with keys:
 dict_keys(['@context', '@type', 'itemReviewed', 'reviewBody', 'reviewRating', 'author'])
1349


In [18]:
# All events for all artists in all_events_list
for i in range(len(all_events_list)):
    print(f'\nArtist {i+1}/{len(all_events_list)} for {all_overviews_dict[i]['name']}:')
    for j in range(len(all_events_list[i])):
        print(f'Event {j+1}/{len(all_events_list[i])}:')
        for key, value in all_events_list[i][j].items():
            print(f'\t{key}: {value}')


Artist 1/3 for 1349:
Event 1/23:
	@context: http://schema.org
	@type: MusicEvent
	name: 1349 @ the ANCHOR e.V. Leipzig
	startDate: 2024-10-01T19:00:00
	endDate: 2024-10-01
	url: https://www.bandsintown.com/e/105332809-1349-at-the-anchor-e.v.-leipzig?came_from=209
	location: {'@type': 'Place', 'name': 'the ANCHOR e.V. Leipzig', 'address': {'@type': 'PostalAddress', 'addressCountry': 'Germany', 'addressRegion': '', 'addressLocality': 'Leipzig', 'streetAddress': 'Renftstraße 1', 'postalCode': '04159'}, 'geo': {'@type': 'GeoCoordinates', 'latitude': 51.36747, 'longitude': 12.3427701}}
	performer: {'@type': 'PerformingGroup', 'name': '1349'}
	description: 1349
	image: https://photos.bandsintown.com/thumb/16695067.jpeg
	eventAttendanceMode: http://schema.org/OfflineEventAttendanceMode
	eventStatus: http://schema.org/EventScheduled
	offers: {'@type': 'Offer', 'url': 'https://www.bandsintown.com/e/105332809-1349-at-the-anchor-e.v.-leipzig?came_from=209', 'availability': 'https://schema.org/In

In [None]:
# English -> Italian abbreviations of the week days
daysmap = {
    'Mon': 'lun',
    'Tue': 'mar',
    'Wed': 'mer',
    'Thu': 'gio',
    'Fri': 'ven',
    'Sat': 'sab',
    'Sun': 'dom'
}

# English month abbreviations -> Italian month names
monthsmap = {
    'Jan': 'gennaio',
    'Feb': 'febbraio',
    'Mar': 'marzo',
    'Apr': 'aprile',
    'May': 'maggio',
    'Jun': 'giugno',
    'Jul': 'luglio',
    'Aug': 'agosto',
    'Sep': 'settembre',
    'Oct': 'ottobre',
    'Nov': 'novembre',
    'Dec': 'dicembre'
}

# sample_event starts as the list of events of the first artist: keep its
# first event. The isinstance guard makes the cell safe to run again, when
# sample_event is already a single event dict.
if isinstance(sample_event, list):
    sample_event = sample_event[0]

start = pd.to_datetime(sample_event['startDate'])
str_start = start.strftime('%d/%m/%Y')
place = sample_event['location']
venue = place['name']
locality = place['address']['addressLocality']
country = place['address']['addressCountry']

print(f'\n{str_start} -> Evento il giorno presso "{venue}" in {locality}, {country}')

# Row in the column order: datetime, fdatetime, venue, city, country
item = [start, str_start, venue, locality, country]

# The band name comes from sample_overview, which belongs to the same artist
# and is defined in this section of the notebook.
items = {
    'band_name': sample_overview['name'],
    'datetime': start,
    'fdatetime': str_start,
    'venue': venue,
    'city': locality,
    'country': country,
}

#item_row = dict(zip(items.keys(), item))

In [None]:
# One-row table built from the sample event
item_columns = ['datetime', 'fdatetime', 'venue', 'city', 'country']
df_items = pd.DataFrame(columns=item_columns)
next_row = len(df_items)
df_items.loc[next_row] = item
print(df_items.dtypes)
print(tabulate(df_items, headers='keys', tablefmt='pretty'))

In [None]:
def artist_append(df: pd.DataFrame, event: dict) -> pd.DataFrame:
    """Append one event to ``df`` as a new last row and return ``df``.

    Args:
        df: Table with five columns, filled in the order: start timestamp,
            start formatted as dd/mm/YYYY, venue, city, country.
            It is modified in place.
        event: Event dict providing ``startDate`` and
            ``location`` -> ``name`` / ``address`` -> ``addressLocality``,
            ``addressCountry``.

    Returns:
        The same ``df`` object, one row longer.
    """
    when = pd.to_datetime(event['startDate'])
    place = event['location']
    venue_name = place['name']
    address = place['address']
    df.loc[len(df)] = [
        when,
        when.strftime('%d/%m/%Y'),
        venue_name,
        address['addressLocality'],
        address['addressCountry'],
    ]
    return df

df_items = pd.DataFrame(columns=['datetime', 'fdatetime', 'venue', 'city', 'country'])

# all_events_list holds one list of event dicts per artist, so two loops are
# needed to reach the individual events.
for artist_events in all_events_list:
    for event in artist_events:
        df_items = artist_append(df_items, event)
