# Dev notebook for DJANGLER

# Get top songs from Spotify

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from typing import List, Dict, Set
import json

In [None]:
# Popular playlist IDs (from Spotify).
# Used as the default `playlist_ids` of get_songs_from_playlists; the trailing
# comment on each entry is the playlist name as noted by the author.
DEFAULT_PLAYLISTS = [
    "37i9dQZF1DXcBWIGoYBM5M",  # Today's Top Hits
    "37i9dQZF1DX0XUsuxWHRQd",  # RapCaviar
    "37i9dQZF1DWXRqgc9PhiG5",  # Rock Classics
    "37i9dQZF1DX4dyzvuaRJ0n",  # mint (indie/alternative)
    "37i9dQZF1DX1lVhptIYRda",  # Hot Country
    "37i9dQZF1DX4SBhb3fqCJd",  # Are & Be (R&B)
    "37i9dQZF1DX0kbJZpiYdZl",  # Hot Hits USA
    "37i9dQZF1DX4UtSsGT1Sbe",  # All Out 80s
    "37i9dQZF1DX4o1oenSJRJd",  # All Out 90s
    "37i9dQZF1DX3rxVfibe1L0",  # Mood Booster
]

import os

# Spotify API credentials are read from the environment so that secrets are
# never stored in (or committed with) the notebook. Export SPOTIFY_CLIENT_ID
# and SPOTIFY_CLIENT_SECRET before starting the kernel; both fall back to an
# empty string when unset. Any credential that was previously hard-coded here
# should be treated as leaked and rotated.
SPOTIFY_CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID", "")
SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET", "")


In [None]:
# Default cap on the number of unique songs to collect
# (default value of `target_count` in get_songs_from_playlists).
NUMBER_OF_SONGS = 10000


def setup_spotify(client_id=SPOTIFY_CLIENT_ID, client_secret=SPOTIFY_CLIENT_SECRET) -> spotipy.Spotify:
    """
    Build a Spotify client authenticated with SpotifyClientCredentials

    Args:
        client_id: Spotify API client ID (defaults to SPOTIFY_CLIENT_ID)
        client_secret: Spotify API client secret (defaults to SPOTIFY_CLIENT_SECRET)

    Returns:
        A spotipy.Spotify instance using the credentials as its auth manager
    """
    return spotipy.Spotify(
        auth_manager=SpotifyClientCredentials(
            client_id=client_id,
            client_secret=client_secret,
        )
    )


def get_playlist_tracks(sp: spotipy.Spotify, playlist_id: str) -> List[Dict]:
    """
    Get all tracks from a Spotify playlist, following pagination

    Items with no track object or no track name are skipped. Missing optional
    data (artists, album, release date, popularity) falls back to a default
    instead of raising, so one unusual item does not abort the whole playlist.

    Args:
        sp: Spotify client
        playlist_id: Spotify playlist ID

    Returns:
        List of track dicts with title, artist (first listed artist, '' if
        none), spotify_id, album, release_date and popularity
    """
    tracks = []
    page = sp.playlist_tracks(playlist_id)

    while page:
        for item in page.get('items') or []:
            track = (item or {}).get('track')
            if not track or not track.get('name'):
                continue

            # Not every playlist item carries artists or an album, so never
            # index into them unguarded.
            artists = track.get('artists') or []
            album = track.get('album') or {}

            tracks.append({
                'title': track['name'],
                'artist': (artists[0].get('name') or '') if artists else '',
                'spotify_id': track.get('id'),
                'album': album.get('name', ''),
                'release_date': album.get('release_date', ''),
                'popularity': track.get('popularity', 0),
            })

        # Pagination: keep requesting pages while the current one links to a next page
        page = sp.next(page) if page.get('next') else None

    return tracks

def get_songs_from_playlists(client_id: str,
                             client_secret: str,
                             playlist_ids: List[str] = DEFAULT_PLAYLISTS,
                             target_count: int = NUMBER_OF_SONGS) -> List[Dict]:
    """
    Collect unique songs across several playlists, stopping at a target count

    Playlists are processed in order. A playlist that raises any exception is
    reported on stdout and skipped. Songs are de-duplicated by 'spotify_id'.

    Args:
        client_id: Spotify API client ID
        client_secret: Spotify API client secret
        playlist_ids: List of Spotify playlist IDs
        target_count: Target number of unique songs

    Returns:
        List of unique song dicts, at most target_count long
    """
    client = setup_spotify(client_id, client_secret)

    unique_tracks = []
    known_ids = set()

    for pid in playlist_ids:
        print(f"Fetching playlist: {pid}")

        try:
            info = client.playlist(pid)
            print(f"  - {info['name']} ({info['tracks']['total']} tracks)")

            # Keep only tracks whose ID has not been seen in an earlier playlist
            for candidate in get_playlist_tracks(client, pid):
                track_id = candidate['spotify_id']
                if track_id in known_ids:
                    continue
                known_ids.add(track_id)
                unique_tracks.append(candidate)

            print(f"  - Total unique songs: {len(unique_tracks)}")

            # Enough songs collected; remaining playlists are not fetched
            if len(unique_tracks) >= target_count:
                break

        except Exception as e:
            print(f"  - Error: {e}")
            continue

    return unique_tracks[:target_count]


def save_song_list(tracks: List[Dict], output_path: str = "song_list.json"):
    """
    Write the song list to a UTF-8 JSON file and report how many were saved

    Args:
        tracks: List of track dicts
        output_path: Output file path
    """
    # ensure_ascii=False keeps non-ASCII titles/artists readable in the file
    with open(output_path, 'w', encoding='utf-8') as out_file:
        json.dump(tracks, out_file, indent=2, ensure_ascii=False)

    saved_count = len(tracks)
    print(f"\n✓ Saved {saved_count} songs to {output_path}")


def load_song_list(input_path: str = "song_list.json") -> List[Dict]:
    """
    Read a song list back from a UTF-8 JSON file

    Args:
        input_path: Input file path

    Returns:
        The parsed JSON content (a list of track dicts)
    """
    with open(input_path, 'r', encoding='utf-8') as in_file:
        return json.load(in_file)

In [None]:
# analyze_collection.py
from collections import Counter
import json

def search_playlists(sp: spotipy.Spotify, query: str, limit: int = 5) -> List[str]:
    """Search for playlists matching `query` and return their IDs.

    Entries that come back as None in the result list are skipped.
    """
    response = sp.search(q=query, type='playlist', limit=limit)

    found_ids = []
    for playlist in response['playlists']['items']:
        if playlist is None:
            continue
        found_ids.append(playlist['id'])
    return found_ids

# Get artist genres from Spotify
def get_artist_genres(sp, artist_name):
    """Return the genres of the top artist search hit for `artist_name`.

    Runs a single-result artist search and returns that hit's 'genres' list,
    or an empty list when the search returns no artists.
    """
    matches = sp.search(q=f"artist:{artist_name}", type='artist', limit=1)['artists']['items']
    if not matches:
        return []
    return matches[0]['genres']


def analyze_coverage(songs_file="song_list.json"):
    """
    Print a summary of a saved song list: size, top artists, decades, audio features

    Args:
        songs_file: Path to a JSON file holding a list of song dicts with at
            least an 'artist' key; 'release_date', 'energy' and 'valence' are
            optional

    Returns:
        None. The summary is printed to stdout.
    """
    # Song lists are saved as UTF-8 with ensure_ascii=False, so read them the
    # same way instead of relying on the platform default encoding.
    with open(songs_file, encoding='utf-8') as f:
        songs = json.load(f)

    print(f"Total songs: {len(songs)}\n")

    # Top artists
    artists = Counter(s['artist'] for s in songs)
    print("Top 20 artists:")
    for artist, count in artists.most_common(20):
        print(f"  {artist}: {count}")

    # Year/decade distribution: the first four characters of release_date are the year
    year_dist = Counter(s['release_date'][:4] for s in songs if s.get('release_date'))
    print("\nDecade distribution:")
    for decade in ['1960', '1970', '1980', '1990', '2000', '2010', '2020']:
        # A year belongs to a decade when its first three digits match
        count = sum(c for y, c in year_dist.items() if y.startswith(decade[:3]))
        print(f"  {decade}s: {count}")

    # Energy/valence if available. Guard against an empty list, and test for
    # presence rather than truthiness so a first energy of 0.0 still counts.
    if songs and songs[0].get('energy') is not None:
        avg_energy = sum(s.get('energy', 0) for s in songs) / len(songs)
        avg_valence = sum(s.get('valence', 0) for s in songs) / len(songs)
        print(f"\nAvg energy: {avg_energy:.2f}, valence: {avg_valence:.2f}")



In [None]:
# Example: create the Spotify client (`sp`) that the search and analysis cells below use
sp = setup_spotify(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)


In [None]:
# Search terms used to discover playlists via search_playlists.
# Earlier, broader query set, kept for reference:
# queries = ["folk rock",
#     "indie rock", 
#     "bedroom pop",
#     "alt-folk",
#     "dream pop",
#     "soul classics",
#     "soul",
#     "r&b",
#     "songwriter",
#     "acoustic",
#     "shoegaze",
#     "alternative r&b",
#     "top hits 2024", 
#     "rock classics", 
#     "hip hop hits", 
#     "country top", 
#     "indie favorites", 
#     "90s hits"
#           ]

# Query set currently in use
queries = ["folk rock",
           "alt country",
           "josh ritter"
          ]




In [None]:
# Collect up to 3 playlist IDs per query. IDs are not de-duplicated here, so
# the same playlist can appear more than once if several queries return it.
playlist_ids = []
for q in queries:
    playlist_ids.extend(search_playlists(sp, q, limit=3))

In [None]:
# Report how many playlists each query returns. Note this re-runs every search
# with limit=5, whereas playlist_ids was built with limit=3.
for q in queries:
    ids = search_playlists(sp, q, limit=5)
    print(f"{q}: {len(ids)} playlists")

In [None]:
# Then use these IDs: fetch and de-duplicate songs from the discovered playlists
# (fetching stops early once target_count unique songs have been collected)
songs = get_songs_from_playlists(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET, playlist_ids=playlist_ids, target_count=10000)

In [None]:
songs

In [None]:
len(songs)

In [None]:
save_song_list(songs, output_path="songs_list_ritter.json")

## Analyze distribution of fetched songs

In [None]:
# Song lists are saved as UTF-8, so read them with an explicit encoding
with open('songs_list_4000.json', encoding='utf-8') as f:
    songs = json.load(f)


# Analyze current collection.
# Each genre lookup is a Spotify search request and many songs share an artist,
# so look every artist up only once and reuse the result for their other songs.
# Genres are still counted once per song, exactly as before.
genres_by_artist = {}
all_genres = []
for song in songs:
    artist = song['artist']
    if artist not in genres_by_artist:
        genres_by_artist[artist] = get_artist_genres(sp, artist)
    all_genres.extend(genres_by_artist[artist])

genre_counts = Counter(all_genres)
print("Top genres:", genre_counts.most_common(20))
# Genres that occur fewer than 10 times across the collection
print("\nUnderrepresented:", [g for g, c in genre_counts.items() if c < 10])


In [None]:
import json

# Load Spotify data
with open('songs_list_4000.json') as f:
    songs = json.load(f)

# Check first song's fields (raises IndexError if the file holds an empty list)
print("Available fields:")
print(songs[0].keys())
print("\nSample song:")
for key, value in songs[0].items():
    print(f"{key}: {value}")

# Check if audio features present: True when any song has a non-None 'energy' value
has_audio = any(s.get('energy') is not None for s in songs)
print(f"\nHas audio features: {has_audio}")

# Lyric fetching & cleaning

In [None]:
import os
import re
import json
import time
from typing import Dict, List, Optional
import requests
from bs4 import BeautifulSoup

In [None]:
# Test songs: (title, artist) pairs used to try out the lyric-fetching
# functions below by hand
test_songs = [
    ("Blinding Lights", "The Weeknd"),
    ("Bohemian Rhapsody", "Queen"),
    ("Old Town Road", "Lil Nas X"),
    ("Respect", "Aretha Franklin"),
    ("Smells Like Teen Spirit", "Nirvana"),
    ("God's Plan", "Drake"),
    ("Rolling in the Deep", "Adele"),
    ("Sweet Child O' Mine", "Guns N' Roses"),
    ("Levitating", "Dua Lipa"),
    ("Lose Yourself", "Eminem")
]

# The Genius API token is read from the environment so that the secret is never
# stored in (or committed with) the notebook. Export GENIUS_ACCESS_TOKEN before
# starting the kernel; it falls back to an empty string when unset. A token
# that was previously hard-coded here should be treated as leaked and rotated.
GENIUS_ACCESS_TOKEN = os.environ.get("GENIUS_ACCESS_TOKEN", "")


In [None]:
def fetch_lyrics(song_title: str, artist: str, access_token: str,
                 timeout: float = 10.0) -> Optional[Dict]:
    """
    Fetch raw lyrics for a song via the Genius search API plus a page scrape

    The first search hit is taken as the match. Its song page is then
    downloaded and the lyrics are read from the divs marked with
    data-lyrics-container="true".

    Args:
        song_title: Title of the song
        artist: Artist name
        access_token: Genius API access token
        timeout: Seconds to wait on each HTTP request before giving up

    Returns:
        Dict with 'lyrics' (raw text) and 'metadata' (title, artist, url,
        release_date, genius_id).
        None if the song is not found, the lyrics cannot be extracted, or any
        error occurs (the reason is printed).
    """
    base_url = "https://api.genius.com"
    headers = {"Authorization": f"Bearer {access_token}"}

    # Search for song
    search_url = f"{base_url}/search"
    params = {"q": f"{song_title} {artist}"}

    try:
        # Without a timeout a stalled connection would block a long scraping run forever
        response = requests.get(search_url, headers=headers, params=params, timeout=timeout)
        response.raise_for_status()
        hits = response.json()['response']['hits']

        if not hits:
            print(f"No results found for '{song_title}' by {artist}")
            return None

        # Get first result
        song_info = hits[0]['result']
        song_url = song_info['url']

        # Scrape lyrics from song page
        page = requests.get(song_url, timeout=timeout)
        # Surface HTTP errors instead of trying to parse an error page as lyrics
        page.raise_for_status()
        html = BeautifulSoup(page.text, 'html.parser')

        # Genius stores lyrics in div with specific data attribute
        lyrics_divs = html.find_all('div', {'data-lyrics-container': 'true'})

        if not lyrics_divs:
            print(f"Could not extract lyrics for '{song_title}'")
            return None

        # Combine all lyric divs and preserve line breaks
        raw_lyrics = '\n'.join(div.get_text(separator='\n') for div in lyrics_divs)

        metadata = {
            'title': song_info['title'],
            'artist': song_info['primary_artist']['name'],
            'url': song_url,
            'release_date': song_info.get('release_date_for_display'),
            'genius_id': song_info['id']
        }

        return {
            'lyrics': raw_lyrics,
            'metadata': metadata
        }

    except Exception as e:
        # Best-effort by design: report the failure and let the caller move on
        print(f"Error fetching lyrics for '{song_title}': {str(e)}")
        return None

def parse_sections(raw_lyrics: str) -> List[Dict]:
    """
    Split lyrics into sections (verse, chorus, bridge, etc.)

    Section headers are bracketed labels such as [Verse 1], [Chorus] or
    [Verse 2: Artist Name]. Anything after the first colon in a header is
    ignored when deriving the section type and number. Text that appears
    before the first header is discarded.

    Args:
        raw_lyrics: Raw lyrics text with section headers like [Verse 1], [Chorus]

    Returns:
        List of dicts: [{'section_type': 'verse', 'section_number': 1, 'text': '...'}]
        section_number is None when the header carries no trailing number.
        If no sections are found, the whole (non-blank) text is returned as a
        single section of type 'full'.
    """
    sections = []

    # Splitting on a pattern with a capture group interleaves the captures:
    # ['', 'Intro', 'lyrics...', 'Verse 1', 'lyrics...', 'Chorus', 'lyrics...']
    parts = re.split(r'\[([^\]]+)\]', raw_lyrics)
    current_section = None

    for i, part in enumerate(parts):
        part = part.strip()
        if not part:
            continue

        # Odd indices are the captured header texts, even indices are lyrics
        if i % 2 == 1:
            current_section = part
            continue

        if not current_section:
            continue

        # Keep only the label before any ": artist" suffix; fall back to the
        # full header if nothing precedes the colon.
        label = current_section.split(':', 1)[0].strip() or current_section

        # fullmatch anchors the pattern at both ends. With an unanchored match
        # the lazy group would stop after a single character and the number
        # would never be captured.
        header_match = re.fullmatch(r'(.+?)\s*(\d+)?', label, re.DOTALL)
        if header_match:
            number_text = header_match.group(2)
            sections.append({
                'section_type': header_match.group(1).lower().strip(),
                'section_number': int(number_text) if number_text else None,
                'text': part
            })

    # If no sections found, treat entire lyrics as one section
    if not sections and raw_lyrics.strip():
        sections.append({
            'section_type': 'full',
            'section_number': None,
            'text': raw_lyrics.strip()
        })

    return sections

def clean_lyrics(text: str) -> str:
    """
    Clean lyrics text: remove annotations, extra whitespace, normalize quotes

    Args:
        text: Raw lyrics text

    Returns:
        Cleaned lyrics text on a single line. All whitespace runs, including
        line breaks, are collapsed to one space.
    """
    # Remove text in parentheses (often production notes)
    text = re.sub(r'\([^)]*\)', '', text)

    # Collapse every run of whitespace (including newlines) into one space,
    # then trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    # Normalize typographic quotes to their ASCII equivalents. The code points
    # are written as escapes so the mapping cannot silently degrade into a
    # no-op if an editor or export step converts curly quotes to straight ones.
    quote_table = str.maketrans({
        '\u201c': '"',  # left double quotation mark
        '\u201d': '"',  # right double quotation mark
        '\u2018': "'",  # left single quotation mark
        '\u2019': "'",  # right single quotation mark / apostrophe
    })
    return text.translate(quote_table)

def load_songs_as_tuples(json_path: str = "song_list.json") -> List[tuple]:
    """Read a saved song list and return one (title, artist) tuple per song"""
    with open(json_path, 'r', encoding='utf-8') as in_file:
        records = json.load(in_file)

    pairs = []
    for record in records:
        pairs.append((record['title'], record['artist']))
    return pairs



## Testing each function

### Fetch Lyrics

In [None]:
test_songs

In [None]:
SAVED_SONGS_FN = "songs_list_2000.json"
loaded_songs = load_songs_as_tuples(json_path=SAVED_SONGS_FN)


In [None]:
# Pick one (title, artist) pair from test_songs by index and fetch its lyrics.
# `lyrics` is None when the lookup fails.
song_idx = 0
lyrics = fetch_lyrics(test_songs[song_idx][0], test_songs[song_idx][1], GENIUS_ACCESS_TOKEN)

In [None]:
lyrics

### Lyrics parsing

In [None]:
# fetch_lyrics returns None on failure, in which case these subscripts raise
# TypeError; re-run the fetch cell successfully before running this one.
raw_lyrics = lyrics['lyrics']
metadata = lyrics['metadata']

In [None]:
raw_lyrics

In [None]:
metadata

In [None]:
sections = parse_sections(raw_lyrics)

In [None]:
# Clean each section's text in place. clean_lyrics collapses all whitespace,
# so line breaks inside a section are lost after this step.
for section in sections:
    section['text'] = clean_lyrics(section['text'])


# Chroma embeddings

In [2]:
import json
import chromadb
from chromadb.utils import embedding_functions
from typing import List, Dict


In [3]:

def setup_chroma(db_path: str = "./lyrics_db") -> chromadb.PersistentClient:
    """
    Open a persistent Chroma client and announce where it lives

    Args:
        db_path: Path to store Chroma database

    Returns:
        The client created by chromadb.PersistentClient for db_path
    """
    chroma_client = chromadb.PersistentClient(path=db_path)
    print(f"Initialized Chroma database at {db_path}")
    return chroma_client


def create_collections(client: chromadb.PersistentClient,
                      model_name: str = "all-MiniLM-L6-v2") -> tuple:
    """
    Get (or create) the "songs" and "sections" collections

    Both collections share one sentence-transformer embedding function.

    Args:
        client: Chroma client
        model_name: Sentence transformer model name

    Returns:
        Tuple of (songs_collection, sections_collection)
    """
    embedder = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=model_name
    )

    # (collection name, description) pairs, in the order they are returned
    collection_specs = [
        ("songs", "Full song lyrics embeddings"),
        ("sections", "Song section embeddings (verse, chorus, etc.)"),
    ]
    songs_collection, sections_collection = [
        client.get_or_create_collection(
            name=collection_name,
            embedding_function=embedder,
            metadata={"description": description},
        )
        for collection_name, description in collection_specs
    ]

    print(f"Collections ready: songs ({songs_collection.count()} docs), sections ({sections_collection.count()} docs)")

    return songs_collection, sections_collection

def embed_song(song_data: Dict, songs_collection, sections_collection):
    """
    Add one song to the songs collection and its sections to the sections collection

    Any exception is caught: the song is reported as skipped on stdout and
    nothing is raised. The full song is added before its sections, so a
    failure while handling sections leaves the full-song entry in place.

    Args:
        song_data: Song dict with 'metadata', 'full_lyrics' and 'sections' keys
        songs_collection: Chroma collection for full songs
        sections_collection: Chroma collection for sections
    """
    try:
        meta = song_data['metadata']
        song_id = str(meta['genius_id'])

        # Metadata values are coerced to str, with placeholders replacing
        # None/empty values
        title = str(meta.get('title') or 'Unknown')
        artist = str(meta.get('artist') or 'Unknown')
        song_meta = {
            'title': title,
            'artist': artist,
            'url': str(meta.get('url') or ''),
            'release_date': str(meta.get('release_date') or 'Unknown'),
        }

        # 1. Full song, keyed by its Genius ID
        songs_collection.add(
            documents=[song_data['full_lyrics']],
            metadatas=[song_meta],
            ids=[song_id],
        )

        # 2. Sections, keyed by "<song id>_section_<index>"
        section_docs, section_metas, section_ids = [], [], []
        for idx, section in enumerate(song_data['sections']):
            section_docs.append(section['text'])
            section_metas.append({
                'song_id': song_id,
                'title': title,
                'artist': artist,
                'section_type': str(section.get('section_type') or 'unknown'),
                # A missing section number is stored as 0
                'section_number': int(section.get('section_number') or 0),
            })
            section_ids.append(f"{song_id}_section_{idx}")

        if section_docs:
            sections_collection.add(
                documents=section_docs,
                metadatas=section_metas,
                ids=section_ids,
            )

        print(f"✓ Embedded: '{song_meta['title']}' by {song_meta['artist']} ({len(section_docs)} sections)")

    except Exception as e:
        title = song_data.get('metadata', {}).get('title', 'Unknown')
        print(f"⚠️  Skipping '{title}': {str(e)}")
        print(f"    Metadata: {song_data.get('metadata', {})}")


def reset_collections(db_path: str = "./lyrics_db"):
    """
    Delete the "songs" and "sections" collections (useful for testing)

    Each collection is deleted independently, so a failure on one (for example
    because it does not exist yet) does not stop the other from being removed.
    Failures are reported on stdout rather than raised.

    Args:
        db_path: Path to Chroma database
    """
    client = chromadb.PersistentClient(path=db_path)

    deleted = []
    for name in ("songs", "sections"):
        try:
            client.delete_collection(name)
        except Exception as e:
            # Best-effort reset: report why this collection could not be
            # removed and carry on with the next one.
            print(f"Could not delete collection '{name}': {e}")
        else:
            deleted.append(name)

    if deleted:
        print(f"✓ Collections deleted: {', '.join(deleted)}")
    else:
        print("No collections were deleted")


def load_and_embed_all(json_path: str = "songs_data.json",
                       db_path: str = "./lyrics_db",
                       model_name: str = "all-MiniLM-L6-v2"):
    """
    Run the whole pipeline: read the songs JSON and embed every song

    Args:
        json_path: Path to songs JSON file
        db_path: Path to Chroma database
        model_name: Sentence transformer model
    """
    with open(json_path, 'r', encoding='utf-8') as in_file:
        songs = json.load(in_file)

    total = len(songs)
    print(f"Loaded {total} songs from {json_path}")

    # Open the database and fetch both target collections
    songs_collection, sections_collection = create_collections(
        setup_chroma(db_path), model_name
    )

    # embed_song reports per-song failures itself and does not raise
    for song_data in songs:
        embed_song(song_data, songs_collection, sections_collection)

    print(f"\n✓ Embedded all {total} songs!")
    print(f"  - Songs collection: {songs_collection.count()} documents")
    print(f"  - Sections collection: {sections_collection.count()} documents")


In [4]:
# Settings for the embedding run in the cells below
songs_path = "songs_data.json"  # JSON file of scraped songs to embed
db_path = "./lyrics_db"  # location of the persistent Chroma database
# Sentence-transformer model passed to create_collections; the MiniLM
# alternative is kept for reference.
# model_name = "all-MiniLM-L6-v2"
model_name = "all-mpnet-base-v2" 


In [5]:
with open(songs_path, 'r', encoding='utf-8') as f:
    songs = json.load(f)

print(len(songs))

4144


### Optionally clear existing vector DBs

In [None]:
reset_collections(db_path)

### Continue populating vectorDBs

In [6]:
client = setup_chroma(db_path)

Initialized Chroma database at ./lyrics_db


In [7]:
songs_collection, sections_collection = create_collections(client, model_name)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Collections ready: songs (4091 docs), sections (25711 docs)


In [8]:
len(songs)

4144

In [9]:
for song_data in songs:
        embed_song(song_data, songs_collection, sections_collection)

✓ Embedded: 'Free Fallin’' by Tom Petty (9 sections)
✓ Embedded: 'Have You Ever Seen the Rain?' by Creedence Clearwater Revival (4 sections)
✓ Embedded: 'American Pie' by Don McLean (14 sections)
✓ Embedded: 'A Horse With No Name' by America (7 sections)
✓ Embedded: 'Sweet Home Alabama' by Lynyrd Skynyrd (10 sections)
✓ Embedded: 'For What It’s Worth' by Buffalo Springfield (8 sections)
✓ Embedded: 'Wild World' by Cat Stevens (7 sections)
✓ Embedded: 'Like a Rolling Stone' by Bob Dylan (8 sections)
✓ Embedded: 'Listen to the Music' by The Doobie Brothers (8 sections)
✓ Embedded: 'You’re So Vain' by Carly Simon (13 sections)
✓ Embedded: 'Sweet Caroline' by Neil Diamond (7 sections)
✓ Embedded: 'Maggie May' by Rod Stewart (13 sections)
✓ Embedded: 'Knockin’ on Heaven’s Door' by Bob Dylan (6 sections)
✓ Embedded: 'Me and Julio Down by the Schoolyard' by Paul Simon (6 sections)
✓ Embedded: 'Cecilia' by Simon & Garfunkel (5 sections)
✓ Embedded: 'The Boxer' by Simon & Garfunkel (8 sections)

# Examine Embeddings

In [10]:
# 1. Check collection stats
print(f"Songs: {songs_collection.count()}")
print(f"Sections: {sections_collection.count()}")



Songs: 4095
Sections: 25734


In [11]:
# 2. Peek at a document
result = songs_collection.peek(limit=1)
print(result.keys())
print(result['documents'][0][:200])  # First 200 chars
print(result['metadatas'][0])



dict_keys(['ids', 'embeddings', 'documents', 'uris', 'included', 'data', 'metadatas'])
88 Contributors Translations Deutsch Free Fallin’ Lyrics “Free Fallin',” one of Tom Petty’s biggest hits, is an escapist’s ode to Los Angeles, California. The song, which peaked at #7 on the Billboard
{'release_date': 'April 24, 1989', 'artist': 'Tom Petty', 'url': 'https://genius.com/Tom-petty-free-fallin-lyrics', 'title': 'Free Fallin’'}


In [None]:
# Free-text query shared by the song-level and section-level searches
QUERY_TEXTS = ["heartbreak and longing"]

# 3. Test semantic search: the 3 nearest full-song documents for the query
results = songs_collection.query(
    query_texts=QUERY_TEXTS,
    n_results=3
)
# Index [0] selects the results for the first (and only) query text
for i, (doc, meta, dist) in enumerate(zip(results['documents'][0], 
                                           results['metadatas'][0],
                                           results['distances'][0])):
    print(f"{i+1}. {meta['title']} - {meta['artist']} (distance: {dist:.3f})")
    print(f"   {doc[:100]}...\n")


In [None]:
# Same query against the sections collection: the 5 nearest individual sections
results = sections_collection.query(
    query_texts=QUERY_TEXTS,
    n_results=5
)

# Index [0] selects the results for the first query text
for i, (doc, meta, dist) in enumerate(zip(results['documents'][0], 
                                           results['metadatas'][0],
                                           results['distances'][0])):
    print(f"{i+1}. {meta['title']} - {meta['artist']} (distance: {dist:.3f})")
    print(f"   {doc[:100]}...\n")

In [None]:

# 4. Check embedding dimensions
# The expected size depends on the model: 384 for all-MiniLM-L6-v2, 768 for
# all-mpnet-base-v2 (the model_name this notebook currently sets).
result = songs_collection.get(limit=1, include=['embeddings'])
print(f"Embedding dims: {len(result['embeddings'][0])}")  # 384 for MiniLM, 768 for mpnet

# Find types of tracks in scraped content to filter for songs

In [12]:
import json
from collections import Counter

# The scraped data contains non-ASCII titles, so read it with an explicit
# encoding rather than the platform default
with open('songs_data.json', 'r', encoding='utf-8') as f:
    songs = json.load(f)

# Analyze titles for non-song keywords (case-insensitive substring match)
suspicious_words = ['transcript', 'audiobook', 'interview', 'spoken', 'commentary', 'dialogue']

# Collect the original titles of every entry whose title contains a keyword
flagged = [
    song['metadata']['title']
    for song in songs
    if any(word in song['metadata']['title'].lower() for word in suspicious_words)
]

print(f"Flagged {len(flagged)} potential non-songs:")
for t in flagged[:20]:
    print(f"  - {t}")

Flagged 1 potential non-songs:
  - cubicle dialogue


# Scratch

In [13]:
import json

# Load the enriched sample file
with open('songs_data_25_e.json') as f:
    songs = json.load(f)

# Check first song: list its metadata keys, then pretty-print the metadata
print(songs[0]['metadata'].keys())
print(json.dumps(songs[0]['metadata'], indent=2))

# Count enriched songs: those whose metadata has a non-empty 'genres' value
enriched = sum(1 for s in songs if s.get('metadata', {}).get('genres'))
print(f"\n{enriched}/{len(songs)} songs have genres")

dict_keys(['title', 'artist', 'url', 'release_date', 'genius_id', 'genres', 'artist_popularity'])
{
  "title": "Free Fallin\u2019",
  "artist": "Tom Petty",
  "url": "https://genius.com/Tom-petty-free-fallin-lyrics",
  "release_date": "April 24, 1989",
  "genius_id": 113622,
  "genres": [
    "classic rock"
  ],
  "artist_popularity": 68
}

21/25 songs have genres
