Set up code referenced from: https://github.com/rodolfostark/spotify-network-analysis/blob/main/spotify_network_analysis.ipynb

In [23]:
import json
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt

In [24]:
#Constants
SAMPLE_SIZE = 100   # size of how many playlists we want to look at
RANDOM_STATE = 15   # seed for random sampling

Generate the dataframe for the artist and playlist data from the sampled csv files in data_CSV

In [25]:
df1 = pd.read_csv("data_CSV/sample1.csv")
df2 = pd.read_csv("data_CSV/sample2.csv")
df3 = pd.read_csv("data_CSV/sample3.csv")
df4 = pd.read_csv("data_CSV/sample4.csv")

Final dataframe has 269580 total song entries, with 22626 different artists across 4000 playlists.  ```pid``` being the playlist the artist belongs to

In [26]:
df = pd.concat([df1,df2,df3,df4])
print(df.info())
print(df.nunique())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 269580 entries, 0 to 67023
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   artist_name  269580 non-null  object
 1   pid          269580 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 6.2+ MB
None
artist_name    22626
pid             4000
dtype: int64


Unnamed: 0,artist_name,pid
0,Shawn Mendes,161000
1,Cheat Codes,161000
2,DJ Khaled,161000
3,Natasha Bedingfield,161000
4,Tove Lo,161000


In [27]:
# replace $ with S to avoid different entries for the same artitst that uses the two characters interchangeably 
df['artist_name'] = df['artist_name'].apply(lambda x: x.replace('$', 'S'))

Modify df to a series where each entry is a playlist. The index is the playlist id and the values are a list of the artists that show up in it (repeats are counted)

In [28]:
# Series where each row is a index=PID and value=list of all artists (repeated)
artist_df_minimal = df.groupby('pid')['artist_name'].apply(list)
print(artist_df_minimal.head())
print(artist_df_minimal.info())

pid
161000    [Shawn Mendes, Cheat Codes, DJ Khaled, Natasha...
161001    [Counting Crows, Sister Hazel, Gin Blossoms, N...
161002    [George Strait, George Strait, George Strait, ...
161003    [Gipsy Kings, Gipsy Kings, Texas Tornados, Fla...
161004    [N2DEEP, Warren G, Westside Connection, Digabl...
Name: artist_name, dtype: object
<class 'pandas.core.series.Series'>
Index: 4000 entries, 161000 to 915999
Series name: artist_name
Non-Null Count  Dtype 
--------------  ----- 
4000 non-null   object
dtypes: object(1)
memory usage: 62.5+ KB
None


Reduce the data to a sample of ```SAMPLE_SIZE``` playlists, for more convenient calculations

In [29]:
artist_df_minimal = artist_df_minimal.sample(SAMPLE_SIZE, random_state=RANDOM_STATE)
print(artist_df_minimal.head())
print(artist_df_minimal.info())

pid
414016    [Foreigner, Queen, N.W.A., N.W.A., N.W.A., Wu-...
785253    [Kendrick Lamar, Kendrick Lamar, Childish Gamb...
161243    [Conor Maynard, A Boogie Wit da Hoodie, Lukas ...
785397    [Smokey Robinson & The Miracles, Smokey Robins...
915666    [Bahamas, Amy Winehouse, Palmistry, Ed Sheeran...
Name: artist_name, dtype: object
<class 'pandas.core.series.Series'>
Index: 100 entries, 414016 to 785123
Series name: artist_name
Non-Null Count  Dtype 
--------------  ----- 
100 non-null    object
dtypes: object(1)
memory usage: 1.6+ KB
None


Covert list values to a dictionary where the key is the artist name and value is the number of occurances.

In [30]:
for PID, artists in artist_df_minimal.items():
    # count each artist in the list as a Series and then reatribute to the corresponding PID
    artist_df_minimal[PID] = pd.Series(artists).value_counts().to_dict()
print(artist_df_minimal.head())
print(artist_df_minimal.info())

pid
414016    {'AC/DC': 9, 'Billy Joel': 5, 'Guns N' Roses':...
785253    {'Future': 11, 'Kanye West': 9, 'Kendrick Lama...
161243    {'Melanie Martinez': 3, 'Enrique Iglesias': 3,...
785397    {'Michael Jackson': 7, 'Eagles': 7, 'Elvis Pre...
915666    {'Bahamas': 1, 'Amy Winehouse': 1, 'Palmistry'...
Name: artist_name, dtype: object
<class 'pandas.core.series.Series'>
Index: 100 entries, 414016 to 785123
Series name: artist_name
Non-Null Count  Dtype 
--------------  ----- 
100 non-null    object
dtypes: object(1)
memory usage: 5.6+ KB
None


In [31]:
from collections import deque

Convert Series to a NetworkX Graph.

In [32]:
ARTISTS_GRAPH = nx.Graph()

# Iterate over all playlist IDs and create a node for each artist found in the playlist and add an edge between them
for PID, artist_dict in artist_df_minimal.items():
    # Create a node for each artist in the playlist PID
    artist_array = [(artist, frequency) for artist, frequency in artist_dict.items()]
    for node_name, node_frequecy in artist_array:
        if not ARTISTS_GRAPH.has_node(node_name):
            ARTISTS_GRAPH.add_node(node_name)
            
    # Add edges between each artist in the playlist PID
    # Doesn't double count cooccurances since each artist is popped off in each iteration. 
    artist_deque = deque(artist_array)
    while artist_deque:
        leftier_artist = artist_deque.popleft()
        leftier_artist_name, leftier_artist_frequency = leftier_artist
        for artist in artist_deque:
            artist_name, artist_frequency = artist
            if ARTISTS_GRAPH.has_edge(leftier_artist_name, artist_name):
                ARTISTS_GRAPH[leftier_artist_name][artist_name]['weight'] += leftier_artist_frequency * artist_frequency
            else:
                ARTISTS_GRAPH.add_edge(leftier_artist_name, artist_name, weight=leftier_artist_frequency * artist_frequency)

In [33]:
nx.is_connected(ARTISTS_GRAPH)

False

In [34]:
print(ARTISTS_GRAPH)

Graph with 1888 nodes and 82546 edges


In [35]:
nx.write_graphml(ARTISTS_GRAPH, 'graphs/artists_graph_unconnected.graphml')

In [36]:
# Get a connected graph

artists_subgraph = nx.subgraph(ARTISTS_GRAPH, sorted(nx.connected_components(ARTISTS_GRAPH), key=len, reverse=True)[0])
print(artists_subgraph)


Graph with 1860 nodes and 82404 edges


In [37]:
nx.write_graphml(artists_subgraph, 'graphs/artists_graph_connected.graphml')