In [1]:
import json
import os
from pathlib import Path

# Number of playlists (taken from the start of the merged slice files) to process
NUM_PLAYLISTS = 1000

# Folder that holds the "mpd.slice.*.json" files. Set the MPD_DATA_DIR environment
# variable to point at your own copy of the data; when it is unset or empty, the
# original author's local folder is used so existing setups keep working.
folder_path = Path(
    os.environ.get("MPD_DATA_DIR")
    or r"/Users/noa/Desktop/02805 - Social Graphs/playlist_data/"
)

In [2]:

# Collect every MPD slice file in the data folder (sorted for a stable order)
# and pool the playlists they contain into one list
file_list = sorted(folder_path.glob("mpd.slice.*.json"))

playlists = []
for slice_path in file_list:
    # Parse one slice; the file is closed before its playlists are appended
    with slice_path.open('r', encoding='utf-8') as handle:
        slice_payload = json.load(handle)
    playlists += slice_payload.get("playlists", [])

# Wrap the merged list in a dict that also records how many files were merged
mpd_slice = {}
mpd_slice["info"] = {"merged_from_files": len(file_list)}
mpd_slice["playlists"] = playlists

print(f"Loaded {len(file_list)} files, total playlists merged: {len(mpd_slice['playlists'])}")

# Report the size of the merged slice
print(f"Total playlists in this slice: {len(mpd_slice['playlists'])}")


Loaded 6 files, total playlists merged: 6000
Total playlists in this slice: 6000


Make a network going through 6000 playlists (mpd_slice)

Nodes are artists occurring in the songs of the playlists

Add song titles from playlists to the nodes, disregarding duplicates.

add playlist id as attribute to artist if they have a song on playlist

Edges are between artists if they share a playlist

The weight of an edge is the number of playlists the two artists share

Don't include playlists having fewer than 40 songs or more than 100

Don't include playlists having fewer than 6 unique artists.

In [3]:
import networkx as nx
from collections import defaultdict
from itertools import combinations

# Gather the artist network data from the merged playlists, limited to the
# first NUM_PLAYLISTS entries of mpd_slice
playlists = mpd_slice.get("playlists", [])[:NUM_PLAYLISTS]
print(f"\nBuilding artist network from {len(playlists)} playlists")

# Accumulators: every value is a set, so repeated entries collapse automatically
artist_songs = defaultdict(set)        # artist name -> titles of that artist's tracks
artist_playlists = defaultdict(set)    # artist name -> ids of playlists containing the artist
edge_playlists = defaultdict(set)      # (artist_a, artist_b) -> ids of playlists containing both

included_playlists = 0

for pl in playlists:
    tracks = pl.get("tracks", [])
    pid = pl.get("pid")

    # Size filter: keep only playlists with 40 to 100 tracks (inclusive)
    if len(tracks) < 40 or len(tracks) > 100:
        continue

    # Diversity filter: keep only playlists with at least 6 distinct artists
    unique_artists = set()
    for track in tracks:
        unique_artists.add(track["artist_name"])
    if len(unique_artists) < 6:
        continue

    included_playlists = included_playlists + 1

    # Record, per artist, the playlist membership and the track titles seen
    for track in tracks:
        artist_name = track["artist_name"]
        title = track.get("track_name")
        if title:
            artist_songs[artist_name].add(title)
        artist_playlists[artist_name].add(pid)

    # Every pair of artists in this playlist shares it; sorting the names first
    # means each pair is always stored under the same (a, b) key
    ordered_artists = sorted(unique_artists)
    for pair in combinations(ordered_artists, 2):
        edge_playlists[pair].add(pid)



Building artist network from 1000 playlists


Steps to Implement:

- Filter playlists: If any song in a playlist cannot be found on Genius, discard the entire playlist.
- Fetch lyrics: Use the Genius API to get lyrics for each unique song per artist.
- Concatenate lyrics per artist: Combine lyrics for all unique songs by that artist into one text block.
- Save to .txt files: Store each artist's lyrics in a separate file so you can load them later without re-scraping.
- Attach lyrics as an attribute in the graph: When building the NetworkX graph, add the lyrics as a node attribute.

Considerations
- Avoid duplicates: Only fetch lyrics for unique songs per artist.
- Rate limits: Genius API has limits, so you may need to add delays or caching.
- Error handling: If any song in a playlist fails, skip that playlist entirely.
- File structure: Use a folder like artist_lyrics/ to store .txt files named after the artist.

In [4]:
import lyricsgenius
import os
import re
from pathlib import Path

# --- SETUP GENIUS API ---
# The access token is read from the environment so the secret never lives in the
# notebook. Export GENIUS_ACCESS_TOKEN before starting Jupyter.
# NOTE(review): a real token was previously pasted into this cell; it should be
# treated as leaked and regenerated in the Genius API client settings.
GENIUS_ACCESS_TOKEN = os.environ.get("GENIUS_ACCESS_TOKEN")
if not GENIUS_ACCESS_TOKEN:
    raise RuntimeError(
        "GENIUS_ACCESS_TOKEN is not set; export your Genius API token "
        "before running this cell."
    )

# Initialize Genius client
genius = lyricsgenius.Genius(
    GENIUS_ACCESS_TOKEN,
    remove_section_headers=True,   # cleans up [Verse], etc.
    timeout=15,
    retries=3
)

In [5]:
import time
from collections import defaultdict

def _artist_file_stem(artist):
    """Return a filesystem-safe file stem (no extension) for ``artist``.

    Characters other than word characters, whitespace and hyphens are dropped,
    surrounding whitespace is stripped and spaces become underscores. If nothing
    is left (the name consists only of punctuation), the hex encoding of the raw
    name is used instead so the result is never empty.
    """
    stem = re.sub(r'[^\w\s-]', '', artist).strip().replace(' ', '_')
    return stem or "artist_" + artist.encode("utf-8").hex()


# Folder to save lyrics; the files written here double as a cache, so an
# interrupted or repeated run does not re-scrape artists that are already saved.
# Delete an artist's file to force its lyrics to be fetched again.
lyrics_folder = Path("artist_lyrics")
lyrics_folder.mkdir(exist_ok=True)

artist_lyrics = defaultdict(str)
claimed_stems = {}   # case-folded file stem -> artist that owns it in this run

for artist, songs in artist_songs.items():
    stem = _artist_file_stem(artist)
    # Different artists can sanitize to the same stem (or to stems that differ
    # only by case, which is the same file on case-insensitive filesystems).
    # The first artist keeps the plain stem; later ones get a unique suffix so
    # they neither overwrite nor read each other's lyrics.
    if claimed_stems.setdefault(stem.casefold(), artist) != artist:
        stem = f"{stem}_{artist.encode('utf-8').hex()}"
    file_path = lyrics_folder / f"{stem}.txt"

    # Cache hit: reuse lyrics saved by an earlier run instead of calling the API
    if file_path.exists():
        cached_lyrics = file_path.read_text(encoding="utf-8")
        if cached_lyrics:
            artist_lyrics[artist] = cached_lyrics
            continue

    fetched = []
    # sorted() makes the concatenation order reproducible; iterating the set
    # directly can give a different order on every kernel start
    for track_name in sorted(songs):
        try:
            song = genius.search_song(track_name, artist)
            if song and song.lyrics:
                fetched.append(song.lyrics)
        except Exception as e:
            print(f"Error retrieving {track_name} by {artist}: {e}")
        time.sleep(1)  # Avoid hitting rate limits

    # Save right away so progress survives an interruption later in the loop.
    # Each song's lyrics are preceded by a newline, as in the original format.
    if fetched:
        artist_lyrics[artist] = "".join("\n" + text for text in fetched)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(artist_lyrics[artist])

Searching for "Get Ur Freak On" by Missy Elliott...


KeyboardInterrupt: 