# Metadata Scraping - Tracks

Os dados fornecidos pelo Spotify são bastante secos em termos de categorização das músicas presentes em cada uma das playlists. Em uma estratégia utilizando puramente técnicas de Filtro Colaborativo, que é nosso foco neste projeto, isso seria suficiente. Mas buscaremos observar também técnicas relacionadas a sistemas Baseados em Conteúdo e estratégias híbridas; além disso, será interessante analisar alguns padrões nas diversas listagens disponíveis.

Assim, no presente notebook, utilizamos o módulo [Spotipy](https://spotipy.readthedocs.io/en/latest/) para obter dados do Spotify e enriquecer nossa base de dados.

Dados de <b>FAIXAS</b>

<br><br>
<hr>
@author: [Pedro Correia](https://github.com/pfcor)

In [1]:
import math
import pprint
from collections import defaultdict
import pandas as pd
import numpy as np
import tqdm
import time
import datetime as dt
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

Vamos iterar faixa a faixa e obter informações relacionadas ao artista (estilos, popularidade, número de seguidores e artistas similares) e à música (níveis de ritmo, instrumentação, etc.).

In [38]:
def spotipy_auth():
    """Create a Spotify API client authenticated with SpotifyClientCredentials.

    The client id and secret are read from the local ``spotipy_credencials``
    module each time the function is called.

    Returns:
        A ``spotipy.Spotify`` instance bound to a fresh credentials manager.
    """
    # Imported inside the function, so the credentials module is only needed
    # when a client is actually requested.
    from spotipy_credencials import CLIENT_ID, CLIENT_SECRET

    credentials_manager = SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
    )
    return spotipy.Spotify(client_credentials_manager=credentials_manager)


def get_tracks_uri(ntracks=None):
    """Return the ``track_uri`` column of ``data/formatted/tracks.csv``.

    Args:
        ntracks: Maximum number of rows to read from the file; ``None`` reads
            every row.

    Returns:
        A numpy array holding one track URI per row read.
    """
    uri_column = "track_uri"
    # Only the URI column is parsed; every other column in the file is skipped.
    frame = pd.read_csv(
        "data/formatted/tracks.csv",
        usecols=[uri_column],
        nrows=ntracks,
    )
    return frame[uri_column].values


def get_top_n_related_artists(artist_uri, n=3, sort_by="popularity", include_name=False, client=None):
    """Return the first ``n`` artists related to ``artist_uri``.

    Args:
        artist_uri: Artist identifier passed to ``artist_related_artists``.
        n: Maximum number of related artists to return.
        sort_by: ``"popularity"`` orders the related artists by descending
            popularity (ties keep the order returned by the API);
            ``"similarity"`` keeps the order returned by the API unchanged.
        include_name: When true each result is a ``(uri, name)`` tuple,
            otherwise just the URI string.
        client: Object exposing ``artist_related_artists``. When omitted the
            module-level ``sp`` is used, which must then exist in the
            enclosing namespace.

    Returns:
        A list with at most ``n`` entries.

    Raises:
        AssertionError: If ``sort_by`` is not one of the two accepted values.
    """
    assert sort_by in ["popularity", "similarity"]

    if client is None:
        # Backward-compatible fallback: the original relied on a global client.
        client = sp

    related_artists = [
        (rartist["uri"], rartist["name"], rartist["popularity"])
        for rartist in client.artist_related_artists(artist_uri)["artists"]
    ]

    if sort_by == "popularity":
        # Stable sort on the negated popularity: most popular first.
        related_artists.sort(key=lambda entry: -entry[2])

    top_artists = related_artists[:n]
    if include_name:
        return [entry[:2] for entry in top_artists]
    return [entry[0] for entry in top_artists]

In [39]:
def scrape_track_metadata(batch_size=50, output_folder="data/metadata", start_batch=0):
    """Fetch Spotify metadata for every known track and write it to a CSV file.

    Track URIs come from ``get_tracks_uri()``. They are processed in
    consecutive batches; for each batch the general track data (``sp.tracks``)
    and the audio features (``sp.audio_features``) are requested and one row
    per returned entry is written to ``<output_folder>/tracks_metadata.csv``
    using ``;`` as separator. Entries the API returns as ``None`` become rows
    of NaN values.

    Args:
        batch_size: Number of track URIs sent per API request.
            NOTE(review): Spotify presumably caps the ids accepted per
            request - confirm before raising this value.
        output_folder: Directory of the output file; created when missing.
        start_batch: Index of the first batch to process. Batch 0 (re)creates
            the file and writes the header; any other batch appends rows
            without a header, so a run can be resumed.

    Returns:
        A list of ``(start, stop)`` index pairs into the track array, one for
        each batch that could not be written (timeout or failed write).
    """
    # Fields copied verbatim from each audio-features entry, in the column
    # order of the output file.
    audio_feature_names = (
        'acousticness',
        'danceability',
        'energy',
        'instrumentalness',
        'mode',
        'loudness',
        'speechiness',
        'tempo',
        'time_signature',
        'valence',
    )

    os.makedirs(output_folder, exist_ok=True)
    filename = output_folder + '/tracks_metadata.csv'
    failed_index = []

    print("|Initializing Spotify API...", end=" ", flush=True)
    sp = spotipy_auth()
    print("ok\n")

    print("|Loading tracks...", end=" ", flush=True)
    tracks = get_tracks_uri()
    n_tracks = len(tracks)
    print("ok")

    print(f"    > {n_tracks} tracks to be processed\n")

    print("|Gathering Spotify data")
    t0 = time.time()

    batches_count = math.ceil(n_tracks / batch_size)
    print(f"    > {batches_count} batches to be processed")

    for i in range(start_batch, batches_count):
        spotify_data = defaultdict(list)
        i_0 = i * batch_size
        i_f = i_0 + batch_size
        batch = tracks[i_0:i_f]

        try:
            # general data
            for track in sp.tracks(batch)["tracks"]:
                if track is not None:
                    spotify_data['track_uri'].append(track['uri'])
                    spotify_data['popularity'].append(track['popularity'])
                    # 'realease_year' (sic) is the column name used by the
                    # output file, so the spelling is kept as is.
                    spotify_data['realease_year'].append(int(track['album']['release_date'].split("-")[0]))
                else:
                    spotify_data['track_uri'].append(np.nan)
                    spotify_data['popularity'].append(np.nan)
                    spotify_data['realease_year'].append(np.nan)

            # audio data
            for features in sp.audio_features(batch):
                for name in audio_feature_names:
                    spotify_data[name].append(np.nan if features is None else features[name])

            try:
                spotify_data = pd.DataFrame(spotify_data)
                spotify_data.to_csv(filename, sep=";", index=False, header=(i==0), mode="w" if (i==0) else "a")
            except Exception as exc:
                # Best effort: keep scraping, but record the batch and show
                # the error plus the per-column lengths to help diagnose
                # mismatched API responses.
                failed_index.append((i_0, i_f))
                print(f"        ! batch {i} not written: {exc!r}")
                for feat, vals in spotify_data.items():
                    print(feat, len(vals))

            # Progress line every 100 batches and after the final batch.
            if ((i+1)%100 == 0) or ((i+1) == batches_count):
                print(f'        - {i+1:>8} batches completed | {batches_count-(i+1):>8} to go | elapsed time: {str(dt.timedelta(seconds=time.time()-t0))}')
        except TimeoutError:
            # NOTE(review): the HTTP layer may raise its own timeout type
            # rather than the builtin TimeoutError - confirm this handler
            # actually fires.
            failed_index.append((i_0, i_f))
            # Re-authenticate before moving on to the next batch.
            sp = spotipy_auth()

    return failed_index
        


In [40]:
# Resume the scrape at batch index 45200; earlier batches are skipped and,
# because the start batch is not 0, rows are appended to the existing CSV.
scrape_track_metadata(start_batch=45200)

|Initializing Spotify API... ok

|Loading tracks... ok
    > 2262292 tracks to be processed

|Gathering Spotify data
    > 45246 batches to be processed
        -    45246 batches completed |        0 to go | elapsed time: 0:00:34.580821


In [44]:
# Load the scraped metadata; the scraper writes the file with ';' as separator.
tracks_metadata = pd.read_csv("data/metadata/tracks_metadata.csv", sep=";")

In [45]:
# Row and column count of the loaded metadata.
# NOTE(review): the recorded output shows more rows (2278792) than the
# 2262292 tracks loaded by the scraper - presumably rows appended twice by
# resumed runs; verify and de-duplicate if so.
tracks_metadata.shape

(2278792, 13)

In [46]:
# Preview the first rows of the scraped metadata.
tracks_metadata.head()

Unnamed: 0,track_uri,popularity,realease_year,acousticness,danceability,energy,instrumentalness,mode,loudness,speechiness,tempo,time_signature,valence
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,63.0,2005.0,0.0311,0.904,0.813,0.00697,0.0,-7.105,0.121,125.461,4.0,0.81
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,77.0,2003.0,0.0249,0.774,0.838,0.025,0.0,-3.914,0.114,143.04,4.0,0.924
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,58.0,2003.0,0.00238,0.664,0.758,0.0,0.0,-6.583,0.21,99.259,4.0,0.701
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,71.0,2002.0,0.202,0.891,0.714,0.000234,0.0,-6.055,0.14,100.972,4.0,0.818
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,27.0,2000.0,0.0561,0.853,0.606,0.0,1.0,-4.596,0.0713,94.759,4.0,0.654


In [51]:
# Load the identifying columns of the track table so names can be joined
# onto the metadata through track_uri.
tracks = pd.read_csv("data/formatted/tracks.csv", usecols=["tid", "track_uri", "track_name", "artist_name"])

In [54]:
# Left-join the track columns onto the first metadata rows and transpose,
# so each track is shown as a column.
tracks_metadata.head().merge(tracks, on="track_uri", how="left").T

Unnamed: 0,0,1,2,3,4
track_uri,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:track:1lzr43nnXAijIGYnCT8M8H
popularity,63,77,58,71,27
realease_year,2005,2003,2003,2002,2000
acousticness,0.0311,0.0249,0.00238,0.202,0.0561
danceability,0.904,0.774,0.664,0.891,0.853
energy,0.813,0.838,0.758,0.714,0.606
instrumentalness,0.00697,0.025,0,0.000234,0
mode,0,0,0,0,1
loudness,-7.105,-3.914,-6.583,-6.055,-4.596
speechiness,0.121,0.114,0.21,0.14,0.0713


In [59]:
# Select the tracks whose energy is exactly 1 and attach their names via a
# left merge on track_uri.
tracks_metadata.loc[tracks_metadata["energy"]==1, ["track_uri", "energy"]].merge(tracks, on="track_uri", how="left")

Unnamed: 0,track_uri,energy,tid,track_name,artist_name
0,spotify:track:4YP5IwwbQEDUgDCG5jNmDq,1.0,7260,The Pipe,Ryan Miller
1,spotify:track:7vHaqMmt5DGacX6c6YCuCP,1.0,64049,Calm Tropical Rainforest Nature Sounds,Nature Sounds
2,spotify:track:3hdeTHLlna8eJwnOiBVoOi,1.0,93701,Symbols,Parser
3,spotify:track:1U7j5xpUSgX0gxk5lmhHUD,1.0,98944,Snakes in Tracksuits,Gopnik McBlyat
4,spotify:track:1z2Kf0DhuJqG7bwP02oqiP,1.0,114763,Kenn nicht deinen Namen (Scheissegal) - Party ...,DJ Hulpa
5,spotify:track:0ksVMStKRQ5mjRT1l8bAC5,1.0,114796,Licht am Fahrrad,Mike Der Bademeister
6,spotify:track:7d8xHVw0gLpdtQfI1LqHjU,1.0,117683,Everlasting Fire,Non
7,spotify:track:7ntx4iSw4VYZQ2bRQv092H,1.0,255988,Walk On Water - Ultra DJ's Remix,Basshunter
8,spotify:track:079CplzWlPmfJW20w3y5tD,1.0,262320,Black Rain Clouds,"Lighting, Thunderstorms & Rain Storm Sounds"
9,spotify:track:01DqBu9WECQ8XyMIAOd9VV,1.0,310237,Tropical Rain,Nature Sounds Nature Music


<hr>
