In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
from utils.artist_features import spotify_client, artist_features, related_artists
from collections import defaultdict
from queue import Queue
import pickle
import os

KeyboardInterrupt: 

## Collecting artist IDs from daily charts and playlists

In [None]:
# Get daily charts
streams = pd.read_csv('../data/daily_charts.csv')

# Read every regular file in the playlist folder.
# The frames are collected in a list and concatenated once: concatenating inside
# the loop would re-copy all previously loaded rows on every iteration.
# Entries are sorted by name so the row order does not depend on the filesystem,
# and the `with` block closes the directory iterator deterministically.
with os.scandir('../data/playlists') as playlist_entries:
    playlist_frames = [
        pd.read_csv(entry.path)
        for entry in sorted(playlist_entries, key=lambda entry: entry.name)
        if entry.is_file()
    ]

# pd.concat raises ValueError on an empty list, so keep the empty-frame result
# for a folder that contains no files.
if playlist_frames:
    playlists_data = pd.concat(playlist_frames, ignore_index=True)
else:
    playlists_data = pd.DataFrame()

playlists_data.head()

In [None]:
from functools import reduce
import operator

# Get unique artist ids

# Each cell of streams['artists'] is a Python-literal string; its 'id' entry is extracted.
artists_from_streams = streams['artists'].apply(lambda x: literal_eval(x)['id']).tolist()
main_artists_from_playlists = playlists_data['id'].tolist()

# Each cell of playlists_data['neighbours'] is a Python-literal string that evaluates
# to a sequence of ids. The sequences are flattened into one list.
# operator.iconcat extends the accumulator in place (linear overall), and the explicit
# [] initial value means an empty playlist table yields [] rather than a TypeError.
neighbors_artists = playlists_data['neighbours'].apply(literal_eval).tolist()
neighbors_artists = reduce(operator.iconcat, neighbors_artists, [])

# NOTE(review): only the neighbour ids are used at the moment. The full union below is
# commented out, so artists_from_streams and main_artists_from_playlists are computed
# but unused here.
# artists_ids = set(artists_from_streams) | set(main_artists_from_playlists) | set(neighbors_artists)
artists_ids = set(neighbors_artists)
print('Unique artists: %d' % len(artists_ids))

In [None]:
def dict_to_df(dictionary):
    """Turn a flat mapping into a one-row DataFrame.

    The mapping's keys become the column names (in the mapping's own order) and
    its values become the single row.
    """
    column_names = list(dictionary)
    single_row = [dictionary[name] for name in column_names]
    return pd.DataFrame([single_row], columns=column_names)

# Spotify API credentials are read from the environment so they are never stored in
# the notebook.
# NOTE(review): a client id/secret pair used to be hard-coded in this cell; it should
# be rotated, since it is part of the notebook's history.
client_id = os.environ.get('SPOTIFY_CLIENT_ID')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET')
if not client_id or not client_secret:
    raise RuntimeError(
        'Set the SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET environment variables '
        'before running this cell'
    )
sp = spotify_client(client_id, client_secret)

# We need to set apart the artists we haven't encountered before to fetch their data
artists_with_data = set()

# Store the links between artists
artist_links = defaultdict(set)

# TODO change to 10k
# We want to gather the related artists for a maximum of 10k artists (to reduce calls to the Spotify API)
artist_queue_limit = 5000
artist_count = 0
artist_queue = Queue()

# How many of the collected artist ids seed the crawl. The default of 1 reproduces the
# earlier behaviour, where the seeding loop stopped after its first id.
# Set to None to seed with every id in `artists_ids`.
seed_artist_limit = 1

# Checkpoint files that let an interrupted crawl resume without re-querying the API
features_cache_path = '../data/tmp/af.pickle'
related_cache_path = '../data/tmp/ar.pickle'


def _load_cache(path):
    """Return the object pickled at `path`, or an empty dict if the file does not exist yet."""
    if not os.path.exists(path):
        return {}
    # NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
    with open(path, 'rb') as cache_file:
        return pickle.load(cache_file)


def _save_cache(cache, path):
    """Pickle `cache` to `path` via a temporary file, so an interrupted write
    cannot truncate the previous checkpoint."""
    os.makedirs(os.path.dirname(path), exist_ok=True)
    tmp_path = path + '.tmp'
    with open(tmp_path, 'wb') as cache_file:
        pickle.dump(cache, cache_file)
    os.replace(tmp_path, path)


# Keep dictionaries to save progress
artists_features = _load_cache(features_cache_path)
artists_related_artists = _load_cache(related_cache_path)

# Seed the queue with the collected artist ids. Sorting (by string form, so mixed id
# types cannot break the comparison) makes the chosen seed(s) reproducible; plain set
# iteration order can change between kernel restarts.
for seed_id in sorted(artists_ids, key=str)[:seed_artist_limit]:
    artist_queue.put(seed_id)
    artist_count += 1

count = 0
# One record per artist; the DataFrame is built once after the crawl instead of being
# re-copied by pd.concat for every single artist.
artist_records = []
try:
    while not artist_queue.empty():
        count += 1
        if count % 100 == 0:
            print('Dequeued artist %d' % count)
            _save_cache(artists_features, features_cache_path)
            _save_cache(artists_related_artists, related_cache_path)
        artist_id = artist_queue.get()

        # Save information for current artist (hit the API only on a cache miss)
        if artist_id not in artists_features:
            artists_features[artist_id] = artist_features(sp, artist_id)
        artist_data = artists_features[artist_id]

        if artist_id not in artists_with_data:
            artist_records.append(dict(artist_data))
            artists_with_data.add(artist_id)

        # Go through related artists (hit the API only on a cache miss)
        if artist_id not in artists_related_artists:
            artists_related_artists[artist_id] = related_artists(sp, artist_id)
        rel_artists = artists_related_artists[artist_id]

        for related_artist in rel_artists:
            related_id = related_artist['id']

            # Add the related artist in the links of the current artist
            artist_links[artist_id].add(related_id)

            if related_id in artists_with_data:
                continue

            # Save information for related artist
            artist_records.append(dict(related_artist))
            artists_with_data.add(related_id)

            # Only enqueue while under the limit; the counter still advances for every
            # newly seen artist, so the queue stops growing once the limit is reached.
            if artist_count < artist_queue_limit:
                artist_queue.put(related_id)
            artist_count += 1
finally:
    # Persist everything fetched so far, including after a KeyboardInterrupt or an API
    # error, and for the final artists that fall between two periodic checkpoints.
    _save_cache(artists_features, features_cache_path)
    _save_cache(artists_related_artists, related_cache_path)

artists_info = pd.DataFrame(artist_records)
artists_info['followers'] = artists_info['followers'].astype(int)
artists_info['popularity'] = artists_info['popularity'].astype(int)
artists_info.to_csv('../data/artist_info.csv', index=False)

with open("../data/artist_links.pickle", "wb") as artist_links_file:
    pickle.dump(artist_links, artist_links_file)

print('Total number of artists: %d' % len(artists_with_data))

## Building the Adjacency Matrix

In [39]:
# Load artist links
# NOTE: pickle.load can execute arbitrary code; only load files written by this project.
# The `with` block closes the handle even if unpickling fails.
with open("../data/artist_links.pickle", "rb") as artist_links_file:
    artist_links = pickle.load(artist_links_file)

# Combine all artists
artists = pd.read_csv('../data/artist_info.csv')

adjacency = np.zeros((artists.shape[0], artists.shape[0]))

# Since the artist links actually use the Spotify ID and not the index in the adjacency matrix,
# we need to map the spotify id to the index.
# read_csv yields a default 0..n-1 index, so enumerating the 'id' column gives the same
# positions as iterating the rows, without building a Series per row.
idToIndexMap = {artist_id: index for index, artist_id in enumerate(artists['id'])}

# Loop through all artists and fill the adjacency matrix with a one for every relation.
# An id missing from artist_info.csv raises KeyError here, which signals that the links
# file and the artist table are out of sync.
for artist_id, rel_artists in artist_links.items():
    artist_index = idToIndexMap[artist_id]
    for related_artist in rel_artists:
        related_artist_index = idToIndexMap[related_artist]
        adjacency[artist_index, related_artist_index] = 1

# Save id to index map for later
with open("../data/id_to_index.pickle", "wb") as id_to_index_file:
    pickle.dump(idToIndexMap, id_to_index_file)

print('Size of adjacency matrix: %d x %d' % adjacency.shape)
print(adjacency)

Size of adjacency matrix: 684 x 684
[[0. 1. 1. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


## Convert to sparse matrix

In [40]:
import scipy.sparse as sparse

# Store the dense adjacency array in compressed-sparse-column form and report how many
# entries are non-zero.
sparse_adj = sparse.csc_matrix(adjacency)
print(
    'The adjacency matrix has %d non-zero elements, this is equivalent to the number of edges'
    % (sparse_adj.nnz,)
)

The adjacency matrix has 2000 non-zero elements, this is equivalent to the number of edges
