In [1]:
import pandas as pd
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as ntx
import re
# Reference docs: https://github.com/eliorc/node2vec and https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
from node2vec import Node2Vec 
from sklearn.model_selection import train_test_split
from music_utils import *
from tqdm.auto import tqdm  # import tqdm for progress bar

# Creating edge list and feature matrix

## Importing data

In [132]:
# Load the raw Spotify track features from the local data folder
spotify = pd.read_csv('./data/SpotifyFeatures.csv')

In [133]:
# Display the loaded dataframe to check its columns and size
spotify

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.61100,0.389,99373,0.910,0.000000,C#,0.3460,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.24600,0.590,137373,0.737,0.000000,F#,0.1510,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.95200,0.663,170267,0.131,0.000000,C,0.1030,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.70300,0.240,152427,0.326,0.000000,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95000,0.331,82625,0.225,0.123000,F,0.2020,-21.150,Major,0.0456,140.576,4/4,0.390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
232720,Soul,Slave,Son Of Slide,2XGLdVl7lGeq8ksM6Al7jT,39,0.00384,0.687,326240,0.714,0.544000,D,0.0845,-10.626,Major,0.0316,115.542,4/4,0.962
232721,Soul,Jr Thomas & The Volcanos,Burning Fire,1qWZdkBl4UVPj9lK6HuuFM,38,0.03290,0.785,282447,0.683,0.000880,E,0.2370,-6.944,Minor,0.0337,113.830,4/4,0.969
232722,Soul,Muddy Waters,(I'm Your) Hoochie Coochie Man,2ziWXUmQLrXTiYjCg2fZ2t,47,0.90100,0.517,166960,0.419,0.000000,D,0.0945,-8.282,Major,0.1480,84.135,4/4,0.813
232723,Soul,R.LUM.R,With My Words,6EFsue2YbIG4Qkq8Zr9Rir,44,0.26200,0.745,222442,0.704,0.000000,A,0.3330,-7.137,Major,0.1460,100.031,4/4,0.489


In [134]:
# Create a dictionary of the form {artist_name: artist_id}, with the names in the order they first appear in the dataframe.
# Only artists that are the main artist of at least one song get an id; ids are the strings '100', '101', ...
# enumerate() gives every unique name an id. The former zip() with range(nunique()) silently dropped the
# last artist whenever the column holds a missing value, because unique() counts NaN and nunique() does not.
artist_dic = {name: str(artist_id) for artist_id, name in enumerate(spotify.artist_name.unique(), start=100)}

I did not see the added value of subsetting the columns of the dataframe (working on the whole dataframe is not too slow). This way, we also keep all the features available immediately.

One way to capture more featurings is to replace alternative announcements such as 'with' and 'par' by 'feat.'.
Keep in mind that capturing more featurings only makes the graph larger, so it is not necessarily a huge problem if we fail to capture all of them.

Here we make sure the different ways to announce a featuring are replaced by the standard (feat.) announcement in the track name. Then we split on the feat. announcement to obtain all artists that collaborate in a list. 

## Making a column with all artist id's and the number of artists per song

In [135]:
# Extract the featuring text from the track name to find all artists collaborating on the song

# Standardise the '(with ' announcement to '(feat. ' directly in the track name
spotify['track_name'] = [name.replace('(with ', '(feat. ') for name in spotify.track_name]
# Keep what follows the first '(feat. ' marker, or an empty string when there is no featuring
spotify['artists'] = [
    name.split('(feat. ')[1] if '(feat. ' in name else ''
    for name in spotify.track_name
]


# Replace the artist names with artist_id's

def replace_artist_names(row, artist_dic):
    '''
    Map the main artist and the featured artists of one song to artist ids.

    Parameters: 
        row: a row with all the characteristics of a song; only `row.artist_name`
            (the main artist) and `row.artists` (the featuring text taken from the
            track name, '' when the song announces no featuring) are read
        artist_dic: {artist_name: artist_id}
    Returns:
        for each song, a list of artist_id's that collaborated for the song:
        the main artist first, followed by every featured artist that is a key of
        artist_dic, in the iteration order of artist_dic. Featured names that are
        not in artist_dic are dropped, and the main artist appears twice when it is
        also named in the featuring text.
    Raises:
        KeyError: if row.artist_name is not a key of artist_dic
    '''
    main_artist = [artist_dic[row.artist_name]]

    # No featuring text: only the main artist is on the song. The former check
    # compared the already split *list* with '' and therefore never fired.
    if row.artists == '':
        return main_artist

    # Normalise the separators so that "A, B & C)" becomes "A,B,C"
    names = row.artists.replace(', ', ',').replace(' & ', ',').replace(')', '')
    featured = {name for name in names.split(',') if name in artist_dic}
    if not featured:
        return main_artist

    # Only songs with at least one known featured artist pay for a scan of
    # artist_dic; the scan keeps the ids in dictionary order, as before.
    l_artists = [artist_id for artist_name, artist_id in artist_dic.items() if artist_name in featured]
    return main_artist + l_artists

# Replace the featuring text by the list of collaborating artist ids, then count the artists per song.
spotify['artists'] = spotify.apply(replace_artist_names, axis=1, args=(artist_dic,))
spotify['num_artists'] = spotify['artists'].map(len)

Perform some checks: here we see that Sia is both the main artist and a featured artist, so this song would produce a self-loop.

In [139]:
# The main artist is also named in the featuring text, so the same id appears twice in `artists`
spotify.loc[spotify['track_id'].eq('2c7GlMNmF7pbohjykutmLP')]

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artists,num_artists
9072,Dance,Sia,"Thunderclouds (feat. Sia, Diplo & Labrinth)",2c7GlMNmF7pbohjykutmLP,86,0.0085,0.691,187027,0.716,9e-06,G,0.263,-5.985,Major,0.0351,112.035,4/4,0.507,"[1908, 217, 1908, 8444]",4
108040,Pop,Sia,"Thunderclouds (feat. Sia, Diplo & Labrinth)",2c7GlMNmF7pbohjykutmLP,86,0.0085,0.691,187027,0.716,9e-06,G,0.263,-5.985,Major,0.0351,112.035,4/4,0.507,"[1908, 217, 1908, 8444]",4


In [140]:
# Not captured: the collaborator is announced as "(& ...)" instead of "(feat. ...)", so no featuring text is extracted
spotify.loc[spotify['track_id'].eq('7sO5G9EABYOXQKNPNiE9NR')]

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artists,num_artists
66646,Hip-Hop,Offset,Ric Flair Drip (& Metro Boomin),7sO5G9EABYOXQKNPNiE9NR,86,0.149,0.88,172800,0.428,5.1e-05,A,0.114,-8.28,Major,0.206,100.007,4/4,0.333,[6405],1
86994,Rap,Offset,Ric Flair Drip (& Metro Boomin),7sO5G9EABYOXQKNPNiE9NR,86,0.149,0.88,172800,0.428,5.1e-05,A,0.114,-8.28,Major,0.206,100.007,4/4,0.333,[6405],1
107857,Pop,Offset,Ric Flair Drip (& Metro Boomin),7sO5G9EABYOXQKNPNiE9NR,86,0.149,0.88,172800,0.428,5.1e-05,A,0.114,-8.28,Major,0.206,100.007,4/4,0.333,[6405],1


In [141]:
# Not captured: the second "feat." stays inside the extracted text, which then matches no artist name
# (we could change the second feat. into a comma)
spotify.loc[spotify['track_id'].eq('4mGdjNMo0RonTlOEb7cYg4')]

Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence,artists,num_artists
77053,Children’s Music,YUNGBLUD,11 Minutes (feat. Halsey feat. Travis Barker),4mGdjNMo0RonTlOEb7cYg4,86,0.0116,0.464,239507,0.852,0.0,B,0.108,-3.804,Major,0.067,160.075,4/4,0.233,[7039],1
92825,Indie,YUNGBLUD,11 Minutes (feat. Halsey feat. Travis Barker),4mGdjNMo0RonTlOEb7cYg4,87,0.0116,0.464,239507,0.852,0.0,B,0.108,-3.804,Major,0.067,160.075,4/4,0.233,[7039],1
166060,Rock,YUNGBLUD,11 Minutes (feat. Halsey feat. Travis Barker),4mGdjNMo0RonTlOEb7cYg4,87,0.0116,0.464,239507,0.852,0.0,B,0.108,-3.804,Major,0.067,160.075,4/4,0.233,[7039],1


In [292]:
### DRAFT 1 FOR FEATURING DECOMPOSITION
# Disabled experiment: the branch below never runs (`if False`) and its content is only a string literal.
# It sketches an alternative that keeps the featured artist *names* and skips track names that contain
# more than one '(feat. ' marker.
if False:
    """
    def full_artists(x):
        #we don't consider the first part which is the title
        if x.len_split == 1: 
            return [x.artist_name]
        else:
            featuring_list = x.feat_split[1]
            #split pbb of the split when ' & ' IS IN THE NAME OF AN ARTIST. Same for ', '
            featuring_list = re.split(', | & ', featuring_list)
            #remove some weird shit 
            featuring_list = [elt.replace(') - Remix', '').strip() for elt in featuring_list]
            featuring_list = [elt.replace(')', '').strip() for elt in featuring_list]

            return [x.artist_name] + featuring_list

    spotify_featurings = spotify[['artist_name', 'track_name', 'track_id']].copy()
    #split the track name to access the featurings
    spotify_featurings['feat_split'] = spotify_featurings.track_name.apply(lambda x:x.split('(feat. '))
    #count the number of split to remove some weird stuff
    spotify_featurings['len_split'] = spotify_featurings.feat_split.apply(lambda x:len(x))
    #only 13 tracks with such a configuration and it is a pain in the ass
    spotify_featurings = spotify_featurings[spotify_featurings.len_split <= 2].copy()
    """

## Creating an edge list

In [117]:
def nodes_featuring(df_song):
    '''
    Build the weighted edge list of collaborations between artists.

    Parameters:
        df_song: dataframe with an 'artists' column holding, per song, the list
            of artist_id's that collaborated on the song
    Returns:
        a dataframe with the columns 'artist_1', 'artist_2' and 'num_feats',
        one row per pair of collaborating artists. Each pair is stored once,
        with artist_1 <= artist_2, and num_feats counts how often the pair
        occurs over all songs. An id listed twice for the same song yields a
        self-loop (artist_1 == artist_2).
    '''
    from itertools import combinations

    # Collect all edges first and build the dataframe once: appending row by
    # row with .loc re-allocates the frame for every edge.
    records = [
        # edges are always sorted: an edge between x and y if x < y, never between y and x
        sorted(pair) + [1]
        for artists in df_song['artists']
        for pair in combinations(artists, 2)
    ]
    prep_df = pd.DataFrame(records, columns=['artist_1', 'artist_2', 'number'])
    # keep the counter numeric even when there is no edge at all
    prep_df = prep_df.astype({'number': 'int64'})

    # store the number of features between two artists
    nodes_df = prep_df.groupby(['artist_1', 'artist_2']).agg(num_feats=('number', 'sum')).reset_index()
    return nodes_df

# Only songs with more than one artist are kept: they are the only ones that create edges
spotify = spotify.loc[spotify['num_artists'].gt(1)].copy()

nodes_df = nodes_featuring(spotify)

## Saving the feature matrix per song and the edge list

In [129]:
# Saving the per-song feature matrix is currently disabled
#spotify.to_csv('./data/song_features.csv', index = True, header = True)
# Write the edge list (artist_1, artist_2, num_feats) to disk
nodes_df.to_csv('./data/edge_list.csv')

In [112]:
# Inspect the aggregated edge list
nodes_df

Unnamed: 0,artist_1,artist_2,num_feats
0,10012,9807,1
1,10032,8900,1
2,10032,8946,1
3,1006,13932,1
4,1006,13943,2
...,...,...,...
4422,9709,9716,1
4423,9709,9783,1
4424,9737,9737,1
4425,9805,9918,1


From this dataframe, it is easy to create an edge_list that can be passed to networkx

In [130]:
# networkx only needs the artist pairs: keep the first two columns of every row as a plain tuple
edge_list = list(nodes_df.iloc[:, :2].itertuples(index=False, name=None))

In [131]:
# Build an undirected graph from the collaboration edges
G = ntx.Graph()
G.add_edges_from(edge_list)

# Keep the connected component with the most nodes, as a subgraph of G
largest_cc = G.subgraph(max(ntx.connected_components(G), key=len))
print(f'The largest connected component has {len(largest_cc.nodes)} nodes.')
print(f'The largest connected component has {len(largest_cc.edges)} edges.')

The largest connected component has 1608 nodes.
The largest connected component has 4081 edges.


In [None]:
# Draw the largest connected component without node labels.
# NOTE: this cell didn't work on my local computer, but did work on Colab; the suspected cause is an
# issue with the locally installed networkx version.
fig, ax = plt.subplots(figsize=(15, 15))
ntx.draw_networkx(largest_cc, with_labels=False, node_size = 20, ax = ax)

We currently have: 

1) an edge list with which we can create a graph. 

2) a dataframe with per song all the features (genre,...)

The next step is to

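1) possibly fine-tune the way in which we make the edge list. For example, we can also capture featurings that are announced as `(& ...)` or with a repeated `feat.` marker (see the two uncaptured tracks above). 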

2) from the dataframe with all the features per song, make features for artists. 