In [1]:
    import pandas as pd
import numpy as np
from scipy.spatial.distance import euclidean
from scipy.special import softmax

In [2]:
# Load the per-artist cluster table. The 'artist' column becomes the index so that
# artists can be looked up by name with clusters.loc[...] in the cells below.
clusters = pd.read_csv('clusters.csv', index_col = 'artist')
clusters.head()

Unnamed: 0_level_0,tsne_0,tsne_1,cluster,cluster_name,followers
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Travis Scott,-49.622486,-63.16502,0,Hip hop & rap,7705985
Martin Garrix,23.056034,46.40569,1,EDM & house,12574151
Los Fabulosos Cadillacs,-27.807325,33.983185,2-0,Argentinian rock,1672897
Brockhampton,-33.395466,-67.09489,0,Hip hop & rap,1186410
DUKI,84.602455,-40.784866,3,Argentinian trap,2715207


In [3]:
# Load the per-artist audio features, indexed by artist name.
# distances_to_features() reads rows of this table for the columns listed in `features`.
artists_audio_feats = pd.read_csv('artists_audio_feats.csv', index_col='artist')
artists_audio_feats.head()

Unnamed: 0_level_0,energy,danceability,valence,instrumentalness,acousticness,speechiness,tempo,loudness,day
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A Day to Remember,0.9079,0.4463,0.4526,1.1e-05,0.012092,0.09965,0.654206,0.933772,1
AJR,0.6378,0.5945,0.5554,0.0,0.16627,0.07905,0.555772,0.911137,1
Ainda,0.2828,0.5976,0.39895,0.002467,0.7541,0.04014,0.563969,0.808978,2
Airbag,0.7816,0.4371,0.4093,0.003491,0.033316,0.04284,0.521145,0.921582,3
Alan Walker,0.6822,0.5669,0.3462,0.00024,0.20684,0.05304,0.590076,0.907678,3


Intra-cluster weights: each artist's share of its cluster's total followers

In [4]:
# Total followers per cluster (Series indexed by cluster_name).
cluster_followers = clusters.groupby('cluster_name')['followers'].sum()

# Broadcast each cluster's total back onto its artists...
clusters['cluster_followers'] = clusters['cluster_name'].map(cluster_followers)
# ...so that an artist's weight is its share of the cluster's followers.
clusters['weight'] = clusters['followers'].div(clusters['cluster_followers'])
clusters.head()

Unnamed: 0_level_0,tsne_0,tsne_1,cluster,cluster_name,followers,cluster_followers,weight
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Travis Scott,-49.622486,-63.16502,0,Hip hop & rap,7705985,11014805,0.699602
Martin Garrix,23.056034,46.40569,1,EDM & house,12574151,37888307,0.331874
Los Fabulosos Cadillacs,-27.807325,33.983185,2-0,Argentinian rock,1672897,2482014,0.674008
Brockhampton,-33.395466,-67.09489,0,Hip hop & rap,1186410,11014805,0.10771
DUKI,84.602455,-40.784866,3,Argentinian trap,2715207,5079022,0.534592


Compute weighted centroid of each cluster

In [5]:
def _weighted_centroid(group):
    '''Return the weight-averaged (tsne_0, tsne_1) position of one cluster's artists.'''
    return pd.Series({
        'tsne_0': np.average(group['tsne_0'], weights=group['weight']),
        'tsne_1': np.average(group['tsne_1'], weights=group['weight']),
    })


# One pass over the groups, restricted to the columns actually needed: applying over
# the whole frame (grouping column included) is deprecated in recent pandas versions.
cluster_centroids = (
    clusters
    .groupby('cluster_name')[['tsne_0', 'tsne_1', 'weight']]
    .apply(_weighted_centroid)
)
cluster_centroids

Unnamed: 0_level_0,tsne_0,tsne_1
cluster_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Argentinian indie,-69.431438,21.030936
Argentinian pop,-47.506736,77.361236
Argentinian rock,-31.01493,36.950956
Argentinian trap,86.034017,-35.481313
Art pop & R&B,60.223004,17.694987
EDM & house,24.138138,41.212343
Hip hop & rap,-46.332805,-64.247461
International indie & rock,-0.881426,-43.08744
International pop,44.987376,-3.960067
Latin trap,41.282845,-67.798617


Manually assign non-clustered artists to their corresponding clusters based on domain knowledge

In [6]:
# Number of manually assigned artists per target cluster. The order of this dict
# must match the order of the artists in manual_tsne_index below.
sample_sizes = {'EDM & house':1,
                'International indie & rock':1,
                'Latin trap':3,
                'International pop':1,
                'Argentinian indie':1}

manual_tsne_index = pd.Index(['Goldfish', 'A Day to Remember', 'Paloma Mami', 'Cimafunk', 'Girl Ultra', 'Pabllo Vittar', 'Elsa y Elmar'])

# Exactly one sampled point is needed per manually assigned artist.
assert sum(sample_sizes.values()) == len(manual_tsne_index), \
    'sample_sizes must add up to the number of manually assigned artists'

tsne_cols = ['tsne_0', 'tsne_1']

# Per-cluster mean and covariance of the coordinates, computed once. Selecting the
# numeric columns first keeps the string column `cluster` out of mean(), which
# pandas >= 2.0 refuses to average.
tsne_by_cluster = clusters.groupby('cluster_name')[tsne_cols]
cluster_means = tsne_by_cluster.mean()
cluster_covs = tsne_by_cluster.cov()

# Take random samples from a multivariate gaussian distribution -without weights-
samples = []
for k, v in sample_sizes.items():
    # Re-seeding for every cluster is kept on purpose: each cluster's draw then depends
    # only on that cluster's mean/covariance and sample size, as in the recorded output.
    np.random.seed(123)
    samples.append(np.random.multivariate_normal(cluster_means.loc[k].values,
                                                 cluster_covs.loc[k].values,
                                                 size=v))

# Build the frame once instead of growing it with concat inside the loop.
manual_tsne = pd.DataFrame(np.vstack(samples), index=manual_tsne_index, columns=tsne_cols)
manual_tsne

Unnamed: 0,tsne_0,tsne_1
Goldfish,47.064902,71.35307
A Day to Remember,5.571709,-55.16159
Paloma Mami,49.564996,-55.094376
Cimafunk,46.168287,-78.756887
Girl Ultra,42.273349,-58.939207
Pabllo Vittar,56.4635,-14.738487
Elsa y Elmar,-60.769914,10.494199


In [7]:
# Overwrite the tsne coordinates of the manually handled artists with the sampled ones,
# then display those rows to confirm the assignment.
clusters.loc[manual_tsne_index, ['tsne_0', 'tsne_1']] = manual_tsne
clusters.loc[manual_tsne_index, ['tsne_0', 'tsne_1']]

Unnamed: 0,tsne_0,tsne_1
Goldfish,47.064902,71.35307
A Day to Remember,5.571709,-55.16159
Paloma Mami,49.564996,-55.094376
Cimafunk,46.168287,-78.756887
Girl Ultra,42.273349,-58.939207
Pabllo Vittar,56.4635,-14.738487
Elsa y Elmar,-60.769914,10.494199


In [8]:
# Cluster picked by hand for each previously non-clustered artist.
# Only `cluster_name` is updated here; the follower weights and cluster centroids
# computed in the earlier cells are not recomputed.
manual_cluster_names = {
    'Goldfish': 'EDM & house',
    'A Day to Remember': 'International indie & rock',
    'Paloma Mami': 'Latin trap',
    'Cimafunk': 'Latin trap',
    'Girl Ultra': 'Latin trap',
    'Pabllo Vittar': 'International pop',
    'Elsa y Elmar': 'Argentinian indie',
}
clusters.loc[list(manual_cluster_names.keys()), 'cluster_name'] = list(manual_cluster_names.values())

Lineup grid

In [9]:
# Lineup grid: one row per time slot, first column is the day, then one column per stage.
grid = pd.read_csv('horarios.csv')
# `choice` is filled later with artist names (strings), so create it as an object
# column: writing strings into a float NaN column is deprecated in recent pandas.
grid['choice'] = pd.Series(np.nan, index=grid.index, dtype=object)
grid

Unnamed: 0,Day,Main stage 1,Main stage 2,Alternative,Perry’s,choice
0,1,Cimafunk,,Alejo y Valentin,,
1,1,,Fuego,,Axel Fiks,
2,1,Maye,,Lucia Tacchetti,,
3,1,,J mena,,DJ Sky,
4,1,A Day to Remember,,Feli Colina,Nathy Peluso,
5,1,,La Delio Valdez,,,
6,1,WOS,,King Princess,Boombox Cartel,
7,1,,Rita Ora,Louta,Dani,
8,1,Yungblud,,,Kaydy Cain,
9,1,,DUKI,Two Feet,Bizarrap,


Define utility functions to generate roadmaps

In [10]:
def check_artist_name(artist):
    '''
    Validates that an artist exists in the index of the `clusters` table.

    Returns True when the artist is known.
    Raises ValueError (with a single, readable message) when it is not.
    '''
    if artist not in clusters.index:
        # One formatted string: passing the name as a second positional argument
        # makes the exception render as a tuple instead of a message.
        raise ValueError(f'Artist not found: {artist!r}')
    return True

In [11]:
def closest_to_centroids(options, shuffleness=0):
    '''
    Picks, among `options`, the artist whose (tsne_0, tsne_1) position is nearest
    to a reference point.

    shuffleness=0: the reference points are the centroids (from `cluster_centroids`)
                   of every cluster listed in `chosen_clusters`; the option with the
                   smallest distance to any of them is returned.
    shuffleness=1: the single reference point is the mean position of the
                   `chosen_artists` (the "user centroid").
    Any other value returns None.
    '''
    coords = ['tsne_0', 'tsne_1']

    # Build the list of reference points for the requested mode.
    if shuffleness == 0:
        targets = [cluster_centroids.loc[name] for name in chosen_clusters.index]
    elif shuffleness == 1:
        targets = [clusters.loc[chosen_artists, coords].mean()]
    else:
        return None

    # One (option, distance) entry per option/reference pair, in option-major order.
    labels, dists = [], []
    for opt in options:
        for target in targets:
            point = clusters.loc[opt, coords]
            dists.append(euclidean(point, target))
            labels.append(opt)

    return labels[np.argmin(dists)]

In [12]:
def distances_to_features(options, verbose=False):
    '''
    Measures how far each option is from each chosen artist in audio-feature space.

    For every (option, chosen artist) pair, computes the weighted euclidean distance
    between their rows of `artists_audio_feats`, restricted to the columns listed in
    the global `features`. Features are weighted by position: 1, 1/2, 1/4, ... so the
    first feature in the list matters most.

    Returns two parallel lists `(artists, distances)`: `artists[i]` is the option that
    `distances[i]` was measured for (each option appears once per chosen artist).
    `verbose` is accepted for signature symmetry with the other helpers; it is unused.
    '''
    # One weight per feature. For four features this is exactly [1., 0.5, 0.25, 0.125];
    # deriving it from len(features) keeps the call valid for any number of features.
    weights = 0.5 ** np.arange(len(features))

    artists = []
    distances = []

    for opt in options:
        # The option's feature vector does not depend on the chosen artist.
        u = artists_audio_feats.loc[opt, features]
        for artist in chosen_artists:
            v = artists_audio_feats.loc[artist, features]
            distances.append(euclidean(u, v, w=weights))
            artists.append(opt)

    return artists, distances

In [13]:
def choose_among_options(options, shuffleness=0, verbose=False):
    '''
    Chooses one artist among `options` according to the shuffleness level.

    shuffleness=2: uniform random pick among the options.
    Otherwise the chosen artists "vote" for their clusters: every option whose cluster
    appears in `chosen_clusters` receives that cluster's weight.
      - A single top-voted option wins outright.
      - On a tie (or when no option received votes) the candidates are compared with
        the chosen artists through distances_to_features():
          shuffleness=0 -> the candidate with the smallest distance is returned;
          any other    -> a candidate is sampled with probabilities
                          softmax(1 - distance), i.e. closer candidates are likelier.

    When `verbose` is true, progress messages are printed.
    '''
    if shuffleness == 2:
        if verbose:
            print('Choosing randomly.')
        return np.random.choice(options)

    if verbose:
        print('\tVoting based on chosen clusters...')

    # Keep only the options whose cluster was chosen, heaviest cluster first.
    # The result is indexed by artist name (the left side's index is preserved).
    votes = pd.merge(clusters.loc[options, 'cluster_name'],
                     chosen_clusters,
                     left_on='cluster_name',
                     right_index=True,
                     ).sort_values('cluster_weight', ascending=False)

    if len(votes) > 0:
        top_weight = votes['cluster_weight'].iloc[0]
        winners = votes.index[votes['cluster_weight'] == top_weight]
        if len(winners) == 1:
            if verbose:
                print('\tThere is a winner.')
            return winners[0]
        candidates, situation = winners, 'There is a tie.'
    else:
        candidates, situation = options, 'There are no votes.'

    # Tie or no votes: fall back to audio-feature distances to the chosen artists.
    artists, distances = distances_to_features(candidates, verbose=verbose)

    if shuffleness == 0:
        if verbose:
            print(f'\t{situation} Choosing closest distance.')
        return artists[np.argmin(distances)]

    # The message now says what this branch really does (it used to claim
    # "closest distance" even though the pick is random).
    if verbose:
        print(f'\t{situation} Sampling randomly, weighted by closeness.')
    return np.random.choice(artists, p=softmax(1 - np.array(distances)))

In [14]:
def fill_slot(slot, shuffleness=0, verbose=False):
    '''
    Decides which artist to see in one row (`slot`) of the lineup `grid`.

    The candidates are the row's stage columns (everything between the first and the
    last column). Depending on how many of them are in `chosen_artists`:
      - exactly one: that artist is returned;
      - none: choose_among_options() decides among the non-empty candidates;
      - several: choose_among_options() breaks the tie among the chosen ones.
    With no chosen artists at all, shuffleness is forced to 2 (random pick).
    '''
    if len(chosen_artists) == 0:
        shuffleness = 2

    if verbose:
        print('Filling slot ', slot)
    options = grid.iloc[slot, 1:-1]
    if verbose:
        print('Choosing between ', list(options))

    # Which candidates did the user explicitly choose, and how many?
    is_chosen = options.isin(chosen_artists)
    n_chosen = is_chosen.sum()

    # Exactly one chosen artist plays in this slot: go see them.
    if n_chosen == 1:
        chosen = options[is_chosen].values[0]
        if verbose:
            print(chosen, ' is among the chosen artists.')
        return chosen

    # Nobody chosen plays in this slot: pick among every scheduled artist.
    if n_chosen == 0:
        if verbose:
            print('There are no chosen artists between the options. Choosing among options...')
        chosen = choose_among_options(options.dropna(), verbose=verbose, shuffleness=shuffleness)
        if verbose:
            print('\tChosen: ', chosen)
        return chosen

    # Several chosen artists overlap: break the tie among them only.
    if verbose:
        print('There are more than one artist among the chosen ones. Breaking tie...')
    chosen = choose_among_options(options[is_chosen], verbose=verbose, shuffleness=shuffleness)
    if verbose:
        print('\tTie broken. Winner: ', chosen)
    return chosen

### Testing

Generate multiple roadmaps to evaluate results

In [15]:
def test(chosen, features, verbose=True):
    '''
    Generates one roadmap per shuffleness level and saves them to an Excel workbook.

    For each level (0, 1, 2) every row of `grid` is filled through fill_slot() into
    the grid's last column, and the resulting grid is written to its own sheet of
    'tests/Roadmap {chosen}-{features}.xlsx'.

    `chosen` and `features` are only used to name the output file: the selection
    helpers read the notebook-level `chosen_artists`, `chosen_clusters` and `features`
    variables, so those must be set before calling this function.
    Raises ValueError (from check_artist_name) if a chosen artist is unknown.
    Note that `grid` is modified in place.
    '''
    import os

    for artist in chosen_artists:
        check_artist_name(artist)

    sheet_names = {0:'Nada shuffle', 1:'Algo shuffle', 2:'Muy shuffle'}

    # Make sure the output folder exists before opening the workbook.
    os.makedirs('tests', exist_ok=True)

    # The context manager saves and closes the workbook, even if a slot fails
    # (ExcelWriter.save() is no longer available in current pandas).
    with pd.ExcelWriter(f'tests/Roadmap {chosen}-{features}.xlsx') as writer:
        for level, sheet_name in sheet_names.items():
            for index in range(len(grid)):
                grid.iloc[index, -1] = fill_slot(index, shuffleness=level, verbose=verbose)
            grid.to_excel(writer, sheet_name=sheet_name, index=False)

In [None]:
# Pool of artist names to draw test scenarios from.
# NOTE(review): not referenced by any other cell visible in this notebook — confirm it is still needed.
art_opt = ['Travis Scott', 'Martin Garrix', 'Los Fabulosos Cadillacs', 'DUKI', 'The Strokes',
           'Gwen Stefani', 'Louta', 'Guns N’ Roses', 'Lana del Rey', 'James Blake']

Test 1: three artists, two different clusters (argentinian rock and international indie & rock)

In [16]:
# Scenario inputs; the helper functions read these as notebook-level globals.
chosen_artists = ['Los Fabulosos Cadillacs', 'The Strokes', 'Guns N’ Roses']
# Order matters: distances_to_features() weights the features by their position in this list.
features = ['energy', 'valence', 'acousticness', 'danceability']
# Number of chosen artists per cluster, used as each cluster's vote weight.
chosen_clusters = clusters.loc[chosen_artists, 'cluster_name'].value_counts().rename('cluster_weight')

In [17]:
# Write the three roadmaps (one sheet per shuffleness level) for this scenario.
test(chosen_artists, features=features, verbose=False)

In [18]:
# Same artists, different feature priority: only the order of the list changes,
# which changes the positional weights used in distances_to_features().
features = ['valence', 'danceability', 'energy', 'acousticness']
test(chosen_artists, features=features, verbose=False)

Test 2: three artists, three different clusters (argentinian trap, hip hop and electronic music)

In [19]:
# Scenario inputs; the helper functions read these as notebook-level globals.
chosen_artists = ['DUKI', 'Travis Scott', 'Martin Garrix']
# Order matters: distances_to_features() weights the features by their position in this list.
features = ['energy', 'danceability', 'valence', 'acousticness']
# Number of chosen artists per cluster, used as each cluster's vote weight.
chosen_clusters = clusters.loc[chosen_artists, 'cluster_name'].value_counts().rename('cluster_weight')

In [20]:
# Write the three roadmaps (one sheet per shuffleness level) for this scenario.
test(chosen_artists, features, verbose=False)

In [21]:
# Same artists, different feature priority (the list order sets the positional weights).
features = ['valence', 'danceability', 'energy', 'acousticness']
test(chosen_artists, features, verbose=False)

Test 3: three artists, three clusters (argentinian pop, international pop and argentinian rock)

In [22]:
# Order matters: distances_to_features() weights the features by their position in this list.
features = ['danceability', 'valence', 'energy', 'acousticness']
# Scenario inputs; the helper functions read these as notebook-level globals.
chosen_artists = ['Lana del Rey', 'Gwen Stefani', 'Los Fabulosos Cadillacs']
# Number of chosen artists per cluster, used as each cluster's vote weight.
chosen_clusters = clusters.loc[chosen_artists, 'cluster_name'].value_counts().rename('cluster_weight')

In [23]:
# Write the three roadmaps (one sheet per shuffleness level) for this scenario.
# NOTE(review): the trace recorded under this cell contains messages that fill_slot() does not
# print and appears although verbose=False — it looks like stale output from an earlier run.
test(chosen_artists, features=features, verbose=False)

Filling slot  0
Choosing between  ['Cimafunk', nan, 'Alejo y Valentin', nan]
There are no chosen artists among the options. Breaking tie...
	Voting based on chosen clusters...
	There is a tie. Choosing closest distance.
	Tie broken. Winner:  Cimafunk
Filling slot  1
Choosing between  [nan, 'Fuego', nan, 'Axel Fiks']
There are no chosen artists among the options. Breaking tie...
	Voting based on chosen clusters...
	There is a tie. Choosing closest distance.
	Tie broken. Winner:  Fuego
Filling slot  2
Choosing between  ['Maye', nan, 'Lucia Tacchetti', nan]
There are no chosen artists among the options. Breaking tie...
	Voting based on chosen clusters...
	There is a tie. Choosing closest distance.
	Tie broken. Winner:  Maye
Filling slot  3
Choosing between  [nan, 'J mena', nan, 'DJ Sky']
There are no chosen artists among the options. Breaking tie...
	Voting based on chosen clusters...
	There is a tie. Choosing closest distance.
	Tie broken. Winner:  DJ Sky
Filling slot  4
Choosing between

In [24]:
# Same artists, different feature priority (the list order sets the positional weights).
features = ['acousticness', 'energy', 'valence', 'danceability']
test(chosen_artists, features=features, verbose=False)

Filling slot  0
Choosing between  ['Cimafunk', nan, 'Alejo y Valentin', nan]
There are no chosen artists among the options. Breaking tie...
	Voting based on chosen clusters...
	There is a tie. Choosing closest distance.
	Tie broken. Winner:  Cimafunk
Filling slot  1
Choosing between  [nan, 'Fuego', nan, 'Axel Fiks']
There are no chosen artists among the options. Breaking tie...
	Voting based on chosen clusters...
	There is a tie. Choosing closest distance.
	Tie broken. Winner:  Fuego
Filling slot  2
Choosing between  ['Maye', nan, 'Lucia Tacchetti', nan]
There are no chosen artists among the options. Breaking tie...
	Voting based on chosen clusters...
	There is a tie. Choosing closest distance.
	Tie broken. Winner:  Lucia Tacchetti
Filling slot  3
Choosing between  [nan, 'J mena', nan, 'DJ Sky']
There are no chosen artists among the options. Breaking tie...
	Voting based on chosen clusters...
	There is a tie. Choosing closest distance.
	Tie broken. Winner:  DJ Sky
Filling slot  4
Choos

Test 4 - mix

In [None]:
# NOTE(review): `features` is not set in this cell, so it keeps whatever value the
# most recently executed cell left behind — confirm which feature order is intended.
chosen_artists = ['The Strokes', 'DUKI', 'Martin Garrix']
# Number of chosen artists per cluster, used as each cluster's vote weight.
chosen_clusters = clusters.loc[chosen_artists, 'cluster_name'].value_counts().rename('cluster_weight')
test(chosen_artists, features, verbose=False)

Test 5: electronic music

In [None]:
# Single chosen artist, compared on two audio features only.
chosen_artists = ['Martin Garrix']
features = ['danceability', 'energy']
# Number of chosen artists per cluster, used as each cluster's vote weight.
chosen_clusters = clusters.loc[chosen_artists, 'cluster_name'].value_counts().rename('cluster_weight')
test(chosen_artists, features, verbose=False)

Test 6: no information - chooses randomly

In [None]:
# With no chosen artists, fill_slot() forces shuffleness to 2, so all three sheets
# are filled with random picks regardless of the requested level.
chosen_artists = []
chosen_clusters = clusters.loc[chosen_artists, 'cluster_name'].value_counts().rename('cluster_weight')
test([], features, verbose=False)

In [2]:
# NOTE(review): leftover scratch cell — splitting a single space on ',' and stripping
# yields ['']; its execution count is out of sequence and nothing else uses the result.
[artist.strip() for artist in " ".split(',')]

['']