In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from collections import defaultdict
from spotipy import SpotifyClientCredentials, Spotify
from queue import Queue
import string
import pickle
import os

## Collecting artist IDs from daily charts and playlists

In [2]:
# Get daily charts
streams = pd.read_csv('../data/daily_charts.csv')

# Read every regular file in the playlist folder, then concatenate once.
# Growing a DataFrame with pd.concat inside the loop re-copies all rows read
# so far on every iteration; collecting the frames first avoids that.
# The `with` block closes the directory iterator as soon as we are done.
with os.scandir('../data/playlists') as playlist_entries:
    playlist_frames = [
        pd.read_csv(entry.path)
        for entry in playlist_entries
        if entry.is_file()
    ]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# frame when the folder contains no files.
if playlist_frames:
    playlists_data = pd.concat(playlist_frames, ignore_index=True)
else:
    playlists_data = pd.DataFrame()

playlists_data.head()

Unnamed: 0,id,name,neighbours,out_degree,genres
0,73rPcaYEhBd0UuVZBqqyQJ,The Happy Fits,"['6wnSPJpmVKFcn1TpRl5Uli', '1MIe1z4RdqLqHSJsb7...",20,"['indie pop', 'modern folk rock', 'modern rock..."
1,2garjIdgTW5i89s4Z3UT72,Andrew Garfield,"['6mXcTwoata1hkaDMbkzw32', '6RKS67sIPOW16NcGWk...",20,"['hollywood', 'pop']"
2,6RKS67sIPOW16NcGWkgdzF,Alexandra Shipp,"['6mXcTwoata1hkaDMbkzw32', '7i3os1iEWOn5UpaylR...",20,"['hollywood', 'pop']"
3,0HthCchcL0kVLHTr113Vk1,mxmtoon,"['21TinSsF5ytwsfdyz5VSVS', '2hR4h1Cao2ueuI7Cx9...",20,"['lo-fi beats', 'lo-fi chill', 'sad lo-fi']"
4,4ZeB1hzT2mSZrf7wszOqHs,Mating Ritual,"['3cohAS2UQTaOo80kCn8qjT', '7FhRUp59cBzPaxobsR...",20,"['indie poptimism', 'modern alternative rock',..."


In [3]:
from functools import reduce
import operator

# Get unique artist ids

# Each 'artists' cell is parsed with literal_eval and indexed by 'id'
# (presumably the string repr of a single artist dict -- verify against the CSV).
artists_from_streams = streams['artists'].apply(lambda x: literal_eval(x)['id']).tolist()
main_artists_from_playlists = playlists_data['id'].tolist()

# Each 'neighbours' cell is the string repr of a list of artist ids: parse it
# and flatten all lists into one. A comprehension is linear in the number of
# ids and also works when there are no playlist rows, whereas
# reduce(operator.concat, ...) without an initial value raises TypeError on
# an empty sequence and re-copies the accumulated list at every step.
neighbors_artists = [
    neighbor_id
    for neighbor_ids in playlists_data['neighbours'].apply(literal_eval)
    for neighbor_id in neighbor_ids
]

# Only the playlist neighbours are used as seed artists; the wider union is
# kept below for reference.
# artists_ids = set(artists_from_streams) | set(main_artists_from_playlists) | set(neighbors_artists)
artists_ids = set(neighbors_artists)
print('Unique artists: %d' % len(artists_ids))

Unique artists: 4755


In [4]:
def __extract_info_from_artist_entity(artist_entity: dict):
    """Reduce an artist entity to the handful of fields this notebook keeps.

    'id', 'name', 'genres' and 'popularity' are copied from the top level of
    ``artist_entity``; 'followers' is taken from
    ``artist_entity['followers']['total']``.

    Returns a new dict with the keys in the order
    id, name, followers, genres, popularity.
    Raises KeyError when any of the accessed keys is missing.
    """
    info = {field: artist_entity[field] for field in ('id', 'name')}
    # Only the total is kept from the nested 'followers' mapping.
    info['followers'] = artist_entity['followers']['total']
    info['genres'] = artist_entity['genres']
    info['popularity'] = artist_entity['popularity']
    return info


def spotify_client(client_id, client_secret):
    """Build a Spotify API client authenticated with client credentials.

    ``client_id`` and ``client_secret`` are handed to
    ``SpotifyClientCredentials``; the resulting ``Spotify`` client is created
    with a 3-second request timeout and 2 retries.
    """
    return Spotify(
        client_credentials_manager=SpotifyClientCredentials(client_id, client_secret),
        requests_timeout=3,
        retries=2,
    )


def related_artists(spotify_client: Spotify, artist_id: str) -> list:
    """Fetch the artists related to ``artist_id`` in trimmed-down form.

    Calls ``spotify_client.artist_related_artists(artist_id)`` and converts
    every entry of the response's 'artists' list with
    ``__extract_info_from_artist_entity``.

    Parameters:
        spotify_client: client used to query the API.
        artist_id: id of the artist whose related artists are requested.
            (Annotated as ``str``; the previous ``string`` annotation referred
            to the standard-library ``string`` module, not a type.)

    Returns:
        A list of dicts with the keys id, name, followers, genres, popularity.
    """
    response = spotify_client.artist_related_artists(artist_id)
    return [__extract_info_from_artist_entity(artist) for artist in response['artists']]


def artist_features(spotify_client: Spotify, artist_id: str) -> dict:
    """Fetch an artist's basic info together with its related artists.

    Issues two lookups: ``spotify_client.artist(artist_id)`` for the artist
    itself and ``related_artists(spotify_client, artist_id)`` for its
    related artists.

    Parameters:
        spotify_client: client used to query the API.
        artist_id: id of the artist to look up. (Annotated as ``str``; the
            previous ``string`` annotation referred to the standard-library
            ``string`` module, not a type.)

    Returns:
        The dict produced by ``__extract_info_from_artist_entity`` for the
        artist, extended with a 'related_artists' key holding the list
        returned by ``related_artists``.
    """
    artist = spotify_client.artist(artist_id)
    rel_artists = related_artists(spotify_client, artist_id)

    features = __extract_info_from_artist_entity(artist)
    features['related_artists'] = rel_artists

    return features

In [5]:
# Credentials are read from the environment instead of being hard-coded in the
# notebook, so they never end up in version control or in shared copies.
# Set SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError here rather than failing later on a request.
# NOTE(review): any key pair that was previously committed in this notebook
# should be considered leaked and rotated in the Spotify developer dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
sp = spotify_client(client_id, client_secret)

# Smoke test: fetch the related artists of one known artist id.
print(related_artists(sp, '7dGJo4pcD2V6oG8kP0tJRR'))

[{'id': '5Qi4Bb7a8C0a00NZcA77L0', 'name': 'D12', 'followers': 2371682, 'genres': ['detroit hip hop', 'gangster rap', 'hip hop', 'pop rap', 'rap'], 'popularity': 64}, {'id': '77IURH5NC56Jn09QHi76is', 'name': 'Bad Meets Evil', 'followers': 1425465, 'genres': ['detroit hip hop', 'gangster rap', 'hip hop', 'pop rap', 'rap'], 'popularity': 59}, {'id': '0NbfKEOTQCcwd6o7wSDOHI', 'name': 'The Game', 'followers': 4010884, 'genres': ['gangster rap', 'hip hop', 'pop rap', 'rap', 'southern hip hop', 'trap'], 'popularity': 71}, {'id': '2XnnxQzxFZG8qEPjakokPM', 'name': 'Obie Trice', 'followers': 787578, 'genres': ['detroit hip hop', 'gangster rap', 'hardcore hip hop', 'hip hop', 'pop rap', 'rap', 'southern hip hop'], 'popularity': 60}, {'id': '1HwM5zlC5qNWhJtM00yXzG', 'name': 'DMX', 'followers': 3473369, 'genres': ['east coast hip hop', 'gangster rap', 'hardcore hip hop', 'hip hop', 'pop rap', 'rap', 'southern hip hop'], 'popularity': 69}, {'id': '6DPYiyq5kWVQS4RGwxzPC7', 'name': 'Dr. Dre', 'followe

In [11]:
def dict_to_df(dictionary):
    """Turn a flat mapping into a one-row DataFrame.

    The mapping's keys become the column names (in iteration order) and its
    values become the cells of the single row.
    """
    column_names = [*dictionary]
    single_row = [dictionary[name] for name in column_names]
    return pd.DataFrame([single_row], columns=column_names)


# Credentials are read from the environment instead of being hard-coded in the
# notebook. Set SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET before starting the
# kernel; a missing variable raises KeyError here.
# NOTE(review): any key pair that was previously committed in this notebook
# should be considered leaked and rotated.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
sp = spotify_client(client_id, client_secret)

# We need to set apart the artists we haven't encountered before to fetch their data
artists_with_data = set()

# Store the links between artists
artist_links = defaultdict(set)

# We want to gather the related artists for a maximum of 10k artists (to reduce calls to the Spotify API)
artist_queue_limit = 10000
artist_count = len(artists_ids)
artist_queue = Queue()

# Checkpoint files used to resume an interrupted crawl without re-querying the API
features_checkpoint_path = '../data/tmp/af.pickle'
related_checkpoint_path = '../data/tmp/ar.pickle'
os.makedirs(os.path.dirname(features_checkpoint_path), exist_ok=True)


def _load_checkpoint(path):
    """Return the dict pickled at ``path``, or an empty dict if there is none."""
    # A missing or zero-byte file means there is no saved progress yet.
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return {}
    # NOTE: pickle.load can execute arbitrary code -- only load checkpoint
    # files that were written by this notebook.
    with open(path, 'rb') as checkpoint_file:
        return pickle.load(checkpoint_file)


def _save_checkpoint(obj, path):
    """Overwrite the checkpoint at ``path`` with a fresh pickle of ``obj``."""
    # Write to a sibling file first and swap it in, so an interruption while
    # dumping cannot leave a truncated checkpoint behind.
    tmp_path = path + '.tmp'
    with open(tmp_path, 'wb') as checkpoint_file:
        pickle.dump(obj, checkpoint_file)
    os.replace(tmp_path, path)


# Keep dictionaries to save progress.
# (Opening the files with 'wb' before loading would truncate them and make
# pickle.load fail, so they are read here and rewritten at each checkpoint.)
artists_features = _load_checkpoint(features_checkpoint_path)
artists_related_artists = _load_checkpoint(related_checkpoint_path)

# Filling the queue with the seed artist ids collected earlier in the notebook
for seed_artist_id in artists_ids:
    artist_queue.put(seed_artist_id)

count = 0
# One dict per artist; turned into a DataFrame once at the end, because
# concatenating a one-row frame per artist re-copies the whole table each time.
artist_records = []


while not artist_queue.empty():
    count += 1
    if count % 100 == 0:
        print('Dequeued artist %d' % count)
        _save_checkpoint(artists_features, features_checkpoint_path)
        _save_checkpoint(artists_related_artists, related_checkpoint_path)

    artist_id = artist_queue.get()

    # Save information for current artist
    if artist_id not in artists_features:
        artist_data = artist_features(sp, artist_id)
        artists_features[artist_id] = artist_data
    else:
        artist_data = artists_features[artist_id]

    if artist_id not in artists_with_data:
        artist_records.append(artist_data)
        artists_with_data.add(artist_id)

    # Go through related artists
    if artist_id not in artists_related_artists:
        # artist_features() already fetched the related artists and stored
        # them under 'related_artists'; reuse them instead of repeating the
        # same API call, and only query when that key is absent.
        rel_artists = artist_data.get('related_artists')
        if rel_artists is None:
            rel_artists = related_artists(sp, artist_id)
        artists_related_artists[artist_id] = rel_artists
    else:
        rel_artists = artists_related_artists[artist_id]

    for related_artist in rel_artists:
        # Add the related artist in the links of the current artist
        artist_links[artist_id].add(related_artist['id'])

        if related_artist['id'] in artists_with_data:
            continue

        # Save information for related artist
        artist_records.append(related_artist)
        artists_with_data.add(related_artist['id'])

        # Newly discovered artists are only expanded while the budget allows;
        # the counter keeps increasing so the limit stays closed once reached.
        if artist_count < artist_queue_limit:
            artist_queue.put(related_artist['id'])
        artist_count += 1

# Final checkpoint, so progress made since the last multiple of 100 is kept too
_save_checkpoint(artists_features, features_checkpoint_path)
_save_checkpoint(artists_related_artists, related_checkpoint_path)

artists_info = pd.DataFrame(artist_records)
# With no seed artists the frame has no columns, so there is nothing to cast.
if not artists_info.empty:
    artists_info['followers'] = artists_info['followers'].astype(int)
    artists_info['popularity'] = artists_info['popularity'].astype(int)
artists_info.to_csv('../data/artist_info.csv', index=False)

with open("../data/artist_links.pickle", "wb") as artist_links_file:
    pickle.dump(artist_links, artist_links_file)

print('Total number of artists: %d' % len(artists_with_data))

Dequeued artist 100
Dequeued artist 200
Dequeued artist 300
Dequeued artist 400
Dequeued artist 500
Dequeued artist 600
Dequeued artist 700
Dequeued artist 800
Dequeued artist 900
Dequeued artist 1000
Dequeued artist 1100
Dequeued artist 1200
Dequeued artist 1300
Dequeued artist 1400
Dequeued artist 1500
Dequeued artist 1600
Dequeued artist 1700
Dequeued artist 1800
Dequeued artist 1900
Dequeued artist 2000
Dequeued artist 2100
Dequeued artist 2200
Dequeued artist 2300
Dequeued artist 2400
Dequeued artist 2500
Dequeued artist 2600
Dequeued artist 2700
Dequeued artist 2800
Dequeued artist 2900
Dequeued artist 3000
Dequeued artist 3100
Dequeued artist 3200
Dequeued artist 3300
Dequeued artist 3400
Dequeued artist 3500
Dequeued artist 3600
Dequeued artist 3700
Dequeued artist 3800
Dequeued artist 3900
Dequeued artist 4000
Dequeued artist 4100
Dequeued artist 4200
Dequeued artist 4300
Dequeued artist 4400
Dequeued artist 4500
Dequeued artist 4600
Dequeued artist 4700
Dequeued artist 4800
D