In [18]:
import networkx as nx
from collections import defaultdict
from itertools import combinations
import json
from pathlib import Path

# number of playlist to process out of the slice files
NUM_PLAYLISTS = 1000

# Path to your original slice files
folder_path = Path(r"/Users/noa/Desktop/02805 - Social Graphs/playlist_data/")

# folder with artist/lyrics files
artist_folder = Path(r"/Users/noa/Desktop/02805 - Social Graphs/artist_lyrics_cleaned")

In [19]:
# Load all mpd slice JSON files in the folder and merge their playlists
file_list = sorted(folder_path.glob("mpd.slice.*.json"))
playlists = []
for fp in file_list:
    with open(fp, 'r', encoding='utf-8') as f:
        data = json.load(f)
        playlists.extend(data.get("playlists", []))

print(f"Loaded {len(file_list)} files, total playlists merged: {len(playlists[:])}")

Loaded 1 files, total playlists merged: 1000


# add playlists to dictionaries

In [20]:
# Accumulators
artist_songs = defaultdict(set)
artist_playlists = defaultdict(set)
artist_albums = defaultdict(set)
artist_durations = defaultdict(list)
edge_playlists = defaultdict(set)
lyrics_dict = {}

# Load lyrics for artists
for txt_file in artist_folder.glob("*.txt"):
    artist_name = txt_file.stem  # filename without extension
    with open(txt_file, 'r', encoding='utf-8') as f:
        lyrics_dict[artist_name.lower()] = f.read()

def normalize_artist_name(name):
    if not name:
        return None
    return name.replace(' ', '_').strip()

included_playlists = 0

for pl in playlists[:NUM_PLAYLISTS]:
    pid = pl.get("pid")
    tracks = pl.get("tracks", [])
    # filter playlists by track count and unique artist count
    if not (20 <= len(tracks) <= 100):
        continue

    # build normalized set of unique artists for this playlist
    unique_artists = {normalize_artist_name(t["artist_name"]) for t in tracks if t.get("artist_name")}
    unique_artists = {a for a in unique_artists if a}  # drop Nones/empty
    if len(unique_artists) < 6:
        continue

    included_playlists += 1

    # collect songs, albums, durations, playlist membership per artist
    for t in tracks:
        raw_artist = t.get("artist_name")
        artist = normalize_artist_name(raw_artist)
        if not artist:
            continue
        track_name = t.get("track_name")
        album_name = t.get("album_name")
        duration = t.get("duration_ms")

        if track_name:
            artist_songs[artist].add(track_name)
        if album_name:
            artist_albums[artist].add(album_name)
        if duration:
            artist_durations[artist].append(duration)
        artist_playlists[artist].add(pid)

    # increment edge counters for every pair of (normalized) artists in this playlist
    for a, b in combinations(sorted(unique_artists), 2):
        edge_playlists[(a, b)].add(pid)

print(f"Included playlists: {included_playlists}")


Included playlists: 583


# Constructing the Graph

**Nodes** represent artists.
Node attributes:
- songs: set of track names
- playlists: set of playlist IDs
- num_playlists: count of playlists
- num_songs: count of songs
- avg_song_duration: average track duration
- albums: set of album names
- lyrics: if available (for artists like 2Pac)

**Edges** represent co-occurrence in playlists.
Edge attributes:
- shared_playlists: set of playlist IDs
- weight: number of shared playlists
- co_occurrence_count: number of times they appear together

In [21]:
# Build the graph
G = nx.Graph()

# Add nodes with attributes
for artist in artist_songs.keys():
    num_playlists = len(artist_playlists[artist])
    num_songs = len(artist_songs[artist])
    avg_duration = sum(artist_durations[artist]) / len(artist_durations[artist]) if artist_durations[artist] else None
    lyrics = lyrics_dict.get(artist.lower(), None)

    G.add_node(artist,
               songs=list(artist_songs[artist]),
               albums=list(artist_albums[artist]),
               playlists=list(artist_playlists[artist]),
               num_playlists=num_playlists,
               num_songs=num_songs,
               avg_song_duration=avg_duration,
               lyrics=lyrics)

# Add edges with attributes
for (a, b), pls in edge_playlists.items():
    G.add_edge(a, b,
               shared_playlists=list(pls),
               weight=len(pls),
               co_occurrence_count=len(pls))

print(f"Graph built: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

Graph built: 6187 nodes, 297928 edges


In [22]:
# remove nodes where lyrics attribute is missing or empty
nodes_to_remove = [n for n, attrs in G.nodes(data=True) if not attrs.get('lyrics')]

# remove nodes from the graph where num_songs is less than 4 
nodes_to_remove += [n for n, attrs in G.nodes(data=True) if attrs.get('num_songs', 0) < 4]
G.remove_nodes_from(nodes_to_remove) 

print(f"After removing artists with no lyrics: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

After removing artists with no lyrics: 1024 nodes, 63851 edges


In [25]:
# save Graph G

# ensure attributes are GraphML-serializable:
for n, attrs in G.nodes(data=True):
    for k, v in list(attrs.items()):
        if isinstance(v, set):
            attrs[k] = list(v)
        if v is None:
            attrs[k] = ""
        # GraphML doesn't support lists/dicts natively; json-serialize them
        if isinstance(attrs[k], (list, dict)):
            attrs[k] = json.dumps(attrs[k], ensure_ascii=False)

for u, v, attrs in G.edges(data=True):
    for k, val in list(attrs.items()):
        if isinstance(val, set):
            attrs[k] = list(val)
        if val is None:
            attrs[k] = ""
        if isinstance(attrs[k], (list, dict)):
            attrs[k] = json.dumps(attrs[k], ensure_ascii=False)

# Save Graph in GraphML format (lists/dicts are JSON-strings)
nx.write_graphml(G, "tmp/artist_network.graphml")


In [26]:
# display a few nodes and edges with attributes
for i, (n, attrs) in enumerate(G.nodes(data=True)):
    if i >= 10:
        break
    print(f"Node: {n}, Attributes: {attrs}\n")

Node: Missy_Elliott, Attributes: {'songs': '["Pep Rally", "WTF (Where They From) [feat. Pharrell Williams]", "Get Ur Freak On", "Gossip Folks (feat. Ludacris)", "Lose Control (feat. Ciara & Fat Man Scoop)", "One Minute Man (feat. Ludacris)", "Work It"]', 'albums': '["Miss E...So Addictive", "Pep Rally", "WTF (Where They From) [feat. Pharrell Williams]", "Under Construction", "The Cookbook"]', 'playlists': '[0, 203, 779, 112, 912, 218, 127]', 'num_playlists': 7, 'num_songs': 7, 'avg_song_duration': 236137.08333333334, 'lyrics': '\n\nIs it worth it? Let me work it\nI put my thing down, flip it and reverse it\n\u200bti esrever dna ti pilf ,nwod gniht ym tuP\n\u200bti esrever dna ti pilf ,nwod gniht ym tuP\nIf you got a big— let me search ya\nAnd find out how hard I gotta work ya\n\u200bti esrever dna ti pilf ,nwod gniht ym tuP\n\u200bti esrever dna ti pilf ,nwod gniht ym tuP (C\'mon)\n\nI\'d like to get to know ya so I could show ya\nPut the pussy on ya like I told ya\nGive me all your nu

Make a network going through 6000 playlists (mpd_slice)

Nodes are artists occuring in the songs of the networks

Add song titles from playlists to the nodes, disregarding duplicates.

add playlist id as attribute to artist if they have a song on playlist

Edges are between artists if they share a playlist

weight on edge is the number of playlist they share

dont include playlist having less then 40 songs and more then 100

dont include playlists having less than 6 unique artists. 

Steps to Implement:

- Filter playlists: If any song in a playlist cannot be found on Genius, discard the entire playlist.
- Fetch lyrics: Use the Genius API to get lyrics for each unique song per artist.
- Concatenate lyrics per artist: Combine lyrics for all unique songs by that artist into one text block.
- Save to .txt files: Store each artist’s lyrics in a separate file so you can load them later without re-scraping.
- Attach lyrics as an attribute in the graph: When building the NetworkX graph, add the lyrics as a node attribute.

Considerations
- Avoid duplicates: Only fetch lyrics for unique songs per artist.
- Rate limits: Genius API has limits, so you may need to add delays or caching.
- Error handling: If any song in a playlist fails, skip that playlist entirely.
- File structure: Use a folder like artist_lyrics/ to store .txt files named after the artist.