### We have different dataframes: 

1. **spotify_600**: all tracks and the features associated with them; the non-music tracks are removed from this dataframe
2. **artists_600**: all artists and a small number of features related to the artists
3. **feat_track_600**: all songs with more than one artist, released after 1999 (i.e. from 2000 onwards); passed on to create nodes_600
4. **nodes_600**: edge lists from source to target, including number of features and songs in which they collaborated
5. **artists_600_features**: artist features, popularity and song features for all songs
6. **genres**: all genres with the number of artists having that genre

In [1]:
import pandas as pd
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as ntx
import re
import altair as alt

# ref doc https://github.com/eliorc/node2vec and https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
from node2vec import Node2Vec 
from sklearn.model_selection import train_test_split
from music_utils import *
from tqdm.auto import tqdm  # import tqdm for progress bar
tqdm.pandas()

In [2]:
# `local` selects which relative data directory the rest of the notebook reads from.
local = True
DATA_PATH = '../data/' if local else './data/'

In [3]:
def graph_largest_cc(nodes_df, plot_graph=False, write_graphml=None):
    """Build a graph from an edge-list dataframe and extract its largest connected component.

    Parameters
    ----------
    nodes_df : DataFrame-like
        Edge list; the first two values of every row are used as the two
        endpoints of an edge. Any further columns are ignored.
    plot_graph : bool, default False
        If True, draw the largest connected component with matplotlib.
    write_graphml : str or None, default None
        If given, the *full* graph (not only the largest component) is written
        to ``write_graphml + '.graphml'``.

    Returns
    -------
    tuple
        ``(G, largest_cc)``: the full graph and the subgraph induced by its
        largest connected component.

    Raises
    ------
    ValueError
        If ``nodes_df`` contains no edges, so no connected component exists.
    """
    edge_list = [tuple(row[:2]) for row in nodes_df.values.tolist()]
    G = ntx.from_edgelist(edge_list)

    # `default=None` turns the cryptic "max() arg is an empty sequence" into an explicit error below.
    largest_nodes = max(ntx.connected_components(G), key=len, default=None)
    if largest_nodes is None:
        raise ValueError(
            'Cannot extract the largest connected component: nodes_df contains no edges.'
        )
    largest_cc = G.subgraph(largest_nodes)

    print(f'The largest connected component has {len(largest_cc.nodes)} nodes.')
    print(f'The largest connected component has {len(largest_cc.edges)} edges.')

    if plot_graph:
        fig, ax = plt.subplots(figsize=(15, 15))
        ntx.draw_networkx(largest_cc, with_labels=False, node_size=20, ax=ax)
        plt.show()

    if write_graphml is not None:
        # The whole graph is exported, not just the largest component.
        ntx.write_graphml_lxml(G, write_graphml + '.graphml')
    return G, largest_cc

This was run once on the CSVs from the Kaggle website; the result is now saved locally in pickle files.

In [4]:
# Load the track dataframe and the artist dataframe through the project helper.
# NOTE(review): `read=True` presumably loads the locally saved pickles instead of
# re-parsing the Kaggle CSVs — confirm against read_spotify_600 in music_utils.
spotify_600, artists_600 = read_spotify_600(DATA_PATH=DATA_PATH, read=True)

We filter out the non-music tracks, then we create the edge lists and the features for artists. 

In [5]:
# Filter out non-music tracks based on their speechiness score.
SPEECHINESS_THRESHOLD = 0.7  # only tracks strictly below this value are kept

# Build the keep-mask once so that the reported count and the actual filter can
# never disagree: rows with speechiness == threshold or NaN are dropped by the
# `<` comparison and are therefore also counted as deleted.
is_music = spotify_600["speechiness"] < SPEECHINESS_THRESHOLD
n_non_music = int((~is_music).sum())
print(f"We are deleting {n_non_music} songs from the dataframe")
spotify_600 = spotify_600[is_music]

We are deleting 22100 songs from the dataframe


In [6]:
# Nodes creation: keep only collaborations (more than one artist) released after 1999,
# then hand them to the project helper that builds the edge list.
feat_track_600 = spotify_600.loc[
    (spotify_600["num_artists"] > 1)
    & (spotify_600["release_date"].dt.year > 1999)
].copy()
nodes_600 = nodes_featuring(
    feat_track_600,
    DATA_PATH=DATA_PATH,
    read=False,
    path='edge_list_600k_over1999.pkl',
)

In [16]:
# Artist feature creation: build the per-artist feature table from the artist
# dataframe and the track dataframe via the project helper.
# NOTE(review): `read=False` presumably recomputes the features instead of loading
# a cached result — confirm against artists_features_creation in music_utils.
artists_600_features = artists_features_creation(artists_600, spotify_600, DATA_PATH, read=False)

In [19]:
# Genres: one row per genre together with the number of artists tagged with it.
# The output columns are named explicitly (rename_axis / reset_index(name=...))
# instead of renaming the defaults of value_counts().reset_index(): those
# defaults are ('index', 'genres') on pandas 1.x but ('genres', 'count') on
# pandas 2.x, so a rename mapping keyed on the 1.x names mislabels the columns
# on newer pandas versions.
genres = (
    artists_600_features.genres
    .explode()
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='number_of_artists')
    .dropna()
)

In [9]:
# Build the collaboration graph from the first two columns of the edge list.
# (Same steps as graph_largest_cc, kept inline so `edge_list` stays available.)
edge_list = [tuple(row[:2]) for row in nodes_600.values.tolist()]
G = ntx.from_edgelist(edge_list)

# Restrict the graph to the nodes of its biggest connected component.
largest_cc = G.subgraph(
    max(ntx.connected_components(G), key=len)
)

print(f'The largest connected component has {len(largest_cc.nodes)} nodes.')
print(f'The largest connected component has {len(largest_cc.edges)} edges.')

The largest connected component has 20500 nodes.
The largest connected component has 62429 edges.
