# Metadata Scraping - Artists

Os dados fornecidos pelo Spotify são bastante secos em termos de categorização das músicas presentes em cada uma das playlists. Em uma estratégia utilizando puramente técnicas de Filtro Colaborativo, que é nosso foco neste projeto, isso seria suficiente. Mas buscaremos observar também técnicas relacionadas a sistemas Baseados em Conteúdo e estratégias híbridas; além disso, será interessante analisar alguns padrões nas diversas listagens disponíveis.

Assim, no presente notebook, utilizamos o módulo [Spotipy](https://spotipy.readthedocs.io/en/latest/) para obter dados do Spotify e enriquecer nossa base de dados.

Dados de <b>ARTISTAS</b>
<br><br>
<hr>
@author: [Pedro Correia](https://github.com/pfcor)

In [1]:
import math
import pprint
from collections import defaultdict
import pandas as pd
import numpy as np
import tqdm
import time
import datetime as dt
import os

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

Vamos iterar faixa a faixa e obter informações relacionadas ao artista (estilos, popularidade, número de seguidores e artistas similares) e à música (níveis de ritmos, instrumentação, etc.).

In [2]:
def spotipy_auth():
    """Build a Spotify API client authenticated with client credentials.

    The client id and secret are taken from the local ``spotipy_credencials``
    module.

    Returns:
        A ``spotipy.Spotify`` instance using a ``SpotifyClientCredentials``
        manager built from those values.
    """
    # Function-scope import: the credentials module is only loaded when this
    # function actually runs.
    from spotipy_credencials import CLIENT_ID as client_id
    from spotipy_credencials import CLIENT_SECRET as client_secret

    credentials_manager = SpotifyClientCredentials(
        client_id=client_id,
        client_secret=client_secret,
    )
    return spotipy.Spotify(client_credentials_manager=credentials_manager)


def get_artists_uri():
    """Return the distinct artist URIs found in the tracks metadata table.

    Only the ``artist_uri`` column of ``tab_data/tracks_metadata.csv`` is
    loaded.

    Returns:
        Array holding each ``artist_uri`` value once.
    """
    tracks = pd.read_csv("tab_data/tracks_metadata.csv", usecols=["artist_uri"])
    artist_column = tracks["artist_uri"]
    return artist_column.unique()


def get_top_n_related_artists(artist_uri, n=3, sort_by="popularity", include_name=False, client=None):
    """Return up to ``n`` artists that the Spotify API lists as related to one artist.

    Args:
        artist_uri: URI of the artist whose related artists are requested.
        n: Maximum number of related artists to return.
        sort_by: ``"popularity"`` ranks the related artists by descending
            popularity (ties keep the order returned by the API);
            ``"similarity"`` keeps the API order untouched (presumably that
            order reflects similarity -- TODO confirm against the API docs).
        include_name: When true, each entry is a ``(uri, name)`` tuple;
            otherwise each entry is just the URI.
        client: Object exposing ``artist_related_artists(uri)``. When omitted,
            a module-level ``sp`` is used if one exists; otherwise a new
            client is created with ``spotipy_auth()``.

    Returns:
        List with at most ``n`` entries, shaped according to ``include_name``.

    Raises:
        ValueError: If ``sort_by`` is not ``"popularity"`` or ``"similarity"``.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if sort_by not in ("popularity", "similarity"):
        raise ValueError(
            f"sort_by must be 'popularity' or 'similarity', got {sort_by!r}"
        )

    if client is None:
        # The original implementation read a global ``sp`` that is never
        # defined at module level; honour it when present, otherwise
        # authenticate on demand instead of failing with NameError.
        client = globals().get("sp")
        if client is None:
            client = spotipy_auth()

    related = [
        (entry["uri"], entry["name"], entry["popularity"])
        for entry in client.artist_related_artists(artist_uri)["artists"]
    ]

    if sort_by == "popularity":
        # Stable sort: equally popular artists stay in API order.
        related.sort(key=lambda entry: entry[2], reverse=True)

    top = related[:n]
    return [entry[:2] if include_name else entry[0] for entry in top]

In [3]:
def scrape_artist_metadata(batch_size=50, output_folder="data/metadata"):
    """Fetch popularity and genres for every known artist and save them as CSV.

    Artist URIs come from ``get_artists_uri()`` and are sent to the Spotify
    API in batches through ``sp.artists``. One row per requested artist is
    collected with the columns ``artist_uri``, ``popularity`` and ``genres``,
    and the table is written to ``<output_folder>/artists_metadata.csv`` using
    ``;`` as separator. Progress is printed to stdout.

    Args:
        batch_size: Number of artist URIs sent per API request. Must be at
            least 1. NOTE(review): Spotify's several-artists endpoint is
            documented to accept at most 50 ids per call -- confirm before
            raising this value.
        output_folder: Directory receiving ``artists_metadata.csv``; created
            if it does not exist.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    print("|Initializing Spotify API...", end=" ", flush=True)
    sp = spotipy_auth()
    print("ok\n")

    print("|Loading artists...", end=" ", flush=True)
    artists = get_artists_uri()
    n_artists = len(artists)
    print("ok")

    print(f"    > {n_artists} artists to be processed\n")

    print("|Gathering Spotify data")
    t0 = time.time()

    spotify_data = defaultdict(list)

    batches_count = math.ceil(n_artists / batch_size)
    print(f"    > {batches_count} batches to be processed")

    for i in range(batches_count):

        i_0 = i * batch_size
        batch = artists[i_0:i_0 + batch_size]

        # general data
        artists_data = sp.artists(batch)["artists"]

        # Pair each response entry with the URI that was requested so that
        # artists the API could not resolve (``None`` entries) remain
        # identifiable in the output instead of becoming anonymous NaN rows.
        # Assumes the response preserves request order -- see the Spotify
        # "Get Several Artists" reference.
        for requested_uri, artist in zip(batch, artists_data):
            if artist is not None:
                spotify_data['artist_uri'].append(artist['uri'])
                spotify_data['popularity'].append(artist['popularity'])
                spotify_data['genres'].append(artist["genres"])
            else:
                spotify_data['artist_uri'].append(requested_uri)
                spotify_data['popularity'].append(np.nan)
                spotify_data['genres'].append(np.nan)

        # Report every 100 batches and once more after the final batch.
        if ((i+1)%100 == 0) or ((i+1) == batches_count):
            print(f'        - {i+1:>8} batches completed | {batches_count-(i+1):>8} to go | elapsed time: {str(dt.timedelta(seconds=time.time()-t0))}')

    os.makedirs(output_folder, exist_ok=True)
    filename = os.path.join(output_folder, 'artists_metadata.csv')
    print(f"    > Saving to file: {filename}")
    # Fixing the column list keeps the CSV header present even when no artist
    # was processed (an empty defaultdict would yield a column-less frame).
    spotify_data = pd.DataFrame(
        spotify_data,
        columns=['artist_uri', 'popularity', 'genres'],
    )
    spotify_data.to_csv(filename, index=False, sep=";")


In [4]:
scrape_artist_metadata()

|Initializing Spotify API... ok

|Loading artists... ok
    > 295860 artists to be processed

|Gathering Spotify data
    > 5918 batches to be processed
        -      100 batches completed |     5818 to go | elapsed time: 0:00:59.097345
        -      200 batches completed |     5718 to go | elapsed time: 0:01:48.905869
        -      300 batches completed |     5618 to go | elapsed time: 0:02:39.463112
        -      400 batches completed |     5518 to go | elapsed time: 0:03:34.307537
        -      500 batches completed |     5418 to go | elapsed time: 0:04:26.294466
        -      600 batches completed |     5318 to go | elapsed time: 0:05:33.564779
        -      700 batches completed |     5218 to go | elapsed time: 0:06:48.684547
        -      800 batches completed |     5118 to go | elapsed time: 0:08:37.356402
        -      900 batches completed |     5018 to go | elapsed time: 0:09:49.841457
        -     1000 batches completed |     4918 to go | elapsed time: 0:10:54.1272

<hr>

In [6]:
# Load the artist metadata CSV from disk; the file is read with ';' as separator.
artists_metadata = pd.read_csv("data/metadata/artists_metadata.csv", sep=";")

In [9]:
# Inspect the first genres value as loaded from the CSV (it comes back as a
# string holding the text form of a list, not as a list).
artists_metadata["genres"][0]

"['dance pop', 'hip hop', 'hip pop', 'pop', 'pop rap', 'r&b', 'rap', 'southern hip hop', 'urban contemporary']"

In [10]:
import ast

In [12]:
# Parse the stringified genres value back into a real Python list.
ast.literal_eval(artists_metadata["genres"][0])

['dance pop',
 'hip hop',
 'hip pop',
 'pop',
 'pop rap',
 'r&b',
 'rap',
 'southern hip hop',
 'urban contemporary']

In [13]:
from collections import Counter

In [20]:
# Start with an empty tally of genre occurrences.
genders_counter = Counter()

In [23]:
# Count how many artists carry each genre tag.
# Each genres cell is the text form of a list, so it is parsed with
# ast.literal_eval before being tallied. Missing values (NaN) are skipped
# first: literal_eval raises on them instead of yielding a list.
# NOTE: the variable holds genre counts; the name is kept unchanged because
# other cells reference it.
genders_counter = Counter()
for genres in artists_metadata["genres"].dropna().values:
    genders_counter.update(ast.literal_eval(genres))

In [24]:
# Display the full genre tally.
genders_counter

Counter({'dance pop': 599,
         'hip hop': 501,
         'hip pop': 270,
         'pop': 571,
         'pop rap': 454,
         'r&b': 339,
         'rap': 474,
         'southern hip hop': 368,
         'urban contemporary': 379,
         'post-teen pop': 351,
         'reggae fusion': 82,
         'australian pop': 183,
         'girl group': 60,
         'dirty south rap': 275,
         'canadian pop': 107,
         'pop rock': 364,
         'neo mellow': 274,
         'viral pop': 197,
         'deep pop r&b': 237,
         'candy pop': 100,
         'folk': 359,
         'folk-pop': 500,
         'lilith': 247,
         'mellow gold': 330,
         'new wave pop': 340,
         'permanent wave': 110,
         'rock': 568,
         'comic': 224,
         'pop punk': 294,
         'post-grunge': 294,
         'boy band': 92,
         'emo': 342,
         'pixie': 138,
         'pop emo': 186,
         'trap queen': 114,
         'bubblegum dance': 194,
         'eurodance': 170,

In [26]:
# Show the 30 most frequent genres together with their counts.
genders_counter.most_common(30)

[('dance pop', 599),
 ('pop', 571),
 ('rock', 568),
 ('modern rock', 527),
 ('hip hop', 501),
 ('folk-pop', 500),
 ('electro house', 485),
 ('latin', 477),
 ('indietronica', 476),
 ('rap', 474),
 ('pop rap', 454),
 ('edm', 452),
 ('indie rock', 423),
 ('indie poptimism', 411),
 ('tropical', 406),
 ('vapor soul', 393),
 ('indie r&b', 384),
 ('funk', 381),
 ('urban contemporary', 379),
 ('latin rock', 378),
 ('underground hip hop', 374),
 ('christmas', 373),
 ('neo-psychedelic', 370),
 ('southern hip hop', 368),
 ('folk rock', 368),
 ('chamber pop', 368),
 ('pop rock', 364),
 ('indie pop', 363),
 ('garage rock', 360),
 ('folk', 359)]