# DJANGLER Development Notebook

Clean notebook using imported library functions

## Setup & Imports

In [None]:
import json
from collections import Counter

# Import DJANGLER libraries
from spotifylib import (
    setup_spotify,
    get_songs_from_playlists,
    save_song_list,
    load_song_list
)
from fetchlib import (
    fetch_lyrics,
    process_song,
    load_songs_as_tuples
)
from chromalib import (
    setup_chroma,
    create_collections,
    embed_song,
    load_and_embed_all,
    reset_collections
)
from chromasearchlib import (
    search_songs,
    search_sections_only,
    print_results
)

## Configuration

In [None]:
import os

# API credentials
# SECURITY: credentials are read from the environment rather than being
# hard-coded, so they never end up in version control or in a shared copy
# of this notebook. Any keys that were previously committed in this cell
# should be treated as leaked and rotated in the Spotify / Genius dashboards.
#
# Set these before starting the kernel, e.g.:
#   export SPOTIFY_CLIENT_ID=...
#   export SPOTIFY_CLIENT_SECRET=...
#   export GENIUS_ACCESS_TOKEN=...
SPOTIFY_CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID", "")
SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET", "")
GENIUS_ACCESS_TOKEN = os.environ.get("GENIUS_ACCESS_TOKEN", "")

# Warn early (instead of failing deep inside an API call) when a credential
# is missing. Cells that only touch local files still work without them.
_missing_credentials = [
    name
    for name, value in (
        ("SPOTIFY_CLIENT_ID", SPOTIFY_CLIENT_ID),
        ("SPOTIFY_CLIENT_SECRET", SPOTIFY_CLIENT_SECRET),
        ("GENIUS_ACCESS_TOKEN", GENIUS_ACCESS_TOKEN),
    )
    if not value
]
if _missing_credentials:
    print(
        "WARNING: missing environment variables: "
        + ", ".join(_missing_credentials)
    )

# File paths
SONGS_DATA = "songs_data.json"  # master dataset read/written by later cells
DB_PATH = "./lyrics_db"  # database directory passed to the chroma helpers

---
## 1. Spotify Playlist Scraping

### Search for playlists by genre/theme

In [None]:
# Create the object returned by `setup_spotify` once for this notebook;
# `search_playlists` below reads it as the module-level name `sp`.
sp = setup_spotify(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)

def search_playlists(query: str, limit: int = 5):
    """Return the IDs of playlists that a Spotify search finds for ``query``.

    Relies on the module-level ``sp`` object for the search call.

    Args:
        query: Free-text search string (e.g. a genre or theme).
        limit: Forwarded unchanged as the ``limit`` argument of ``sp.search``.

    Returns:
        A list with the ``'id'`` of every non-``None`` entry under
        ``results['playlists']['items']``.
    """
    response = sp.search(q=query, type='playlist', limit=limit)

    found_ids = []
    for item in response['playlists']['items']:
        # The result list can contain None placeholders; skip them.
        if item is not None:
            found_ids.append(item['id'])
    return found_ids

# Example: collect playlist IDs for a handful of genre/theme searches.
queries = ["indie folk", "classic rock", "alt country"]
playlist_ids = []

for term in queries:
    # Up to 3 playlists per search term.
    found = search_playlists(term, limit=3)
    playlist_ids += found
    print(f"{term}: {len(found)} playlists")

# Note: IDs are simply concatenated; no de-duplication happens here.
print(f"\nTotal playlists: {len(playlist_ids)}")

### Fetch songs from playlists

In [None]:
# Pull tracks from the playlists collected in `playlist_ids`, asking the
# helper for a target of 1000 songs.
songs = get_songs_from_playlists(
    SPOTIFY_CLIENT_ID,
    SPOTIFY_CLIENT_SECRET,
    playlist_ids=playlist_ids,
    target_count=1000,
)

print(f"\nFetched {len(songs)} unique songs")

# Show the first three entries as a quick sanity check.
print("\nSample:")
preview = songs[:3]
for track in preview:
    print(f"  - {track['title']} by {track['artist']}")

### Save to temporary list

In [None]:
# Persist the fetched `songs` list as a batch file; the lyrics-fetching step
# later loads this same path ("songs_list_batch1.json").
save_song_list(songs, output_path="songs_list_batch1.json")

---
## 2. Lyrics Fetching

### Process song list and append to master dataset

In [None]:
import time

# Load the (title, artist) pairs of the batch saved by the Spotify step.
song_tuples = load_songs_as_tuples("songs_list_batch1.json")
print(f"Processing {len(song_tuples)} songs...\n")

# Process each song (appends to songs_data.json)
# A truthy return value from `process_song` is counted as processed,
# anything falsy as skipped.
processed = 0
skipped = 0

for i, (title, artist) in enumerate(song_tuples):
    # Progress line every 50 songs (but not before the very first one).
    if i % 50 == 0 and i > 0:
        print(f"Progress: {i}/{len(song_tuples)} (processed: {processed}, skipped: {skipped})")

    result = process_song(title, artist, GENIUS_ACCESS_TOKEN, output_path=SONGS_DATA)
    if result:
        processed += 1
    else:
        skipped += 1

    time.sleep(0.2)  # Rate limiting between consecutive requests

# The check mark was previously stored as mojibake ("âœ“"), i.e. the UTF-8
# bytes of "✓" decoded as cp1252; restored to the intended character.
print(f"\n✓ Complete! Processed: {processed}, Skipped: {skipped}")

### Inspect master dataset

In [None]:
# Load the master dataset and show how the first entry is structured.
with open(SONGS_DATA) as f:
    songs_data = json.load(f)

print(f"Total songs in master dataset: {len(songs_data)}")

# Guard against an empty dataset: indexing [0] unconditionally would raise
# IndexError before any lyrics have been fetched.
if songs_data:
    first_song = songs_data[0]
    print("\nSample song structure:")
    print(json.dumps(first_song['metadata'], indent=2))
    print(f"\nNumber of sections: {len(first_song['sections'])}")
else:
    print("\nMaster dataset is empty - nothing to inspect yet.")

---
## 3. Genre Enrichment (Optional)

### Add genre metadata from Spotify

In [None]:
from enrich_songs_data import enrich_songs_data, analyze_genre_gaps

# Add genres to songs_data.json
# Input and output are the same path, so the master dataset is rewritten.
# NOTE(review): consider backing up SONGS_DATA first; whether the helper
# writes atomically is not visible from this notebook.
enrich_songs_data(
    input_path=SONGS_DATA,
    output_path=SONGS_DATA,  # Overwrites in place
    save_frequency=100  # Checkpoint every 100 songs
)

# Analyze genre coverage of the (now enriched) master dataset
analyze_genre_gaps(SONGS_DATA)

---
## 4. Vector Embeddings

### Option A: Embed all songs from master dataset

In [None]:
# Option A: hand the whole master dataset to `load_and_embed_all`, pointing
# it at the database directory and the "all-mpnet-base-v2" model name.
# NOTE(review): whether this replaces or extends existing collections
# depends on the helper's implementation - confirm before re-running.
load_and_embed_all(
    json_path=SONGS_DATA,
    db_path=DB_PATH,
    model_name="all-mpnet-base-v2"
)

### Option B: Incremental embedding (add new songs only)

In [None]:
# Open the database at DB_PATH and obtain the songs/sections collections.
client = setup_chroma(DB_PATH)
songs_coll, sections_coll = create_collections(client)

# Load songs from the master dataset
with open(SONGS_DATA) as f:
    songs_data = json.load(f)

# `embed_song` is called for every song in the dataset.
# NOTE(review): this is only "incremental" if `embed_song` itself skips
# songs that are already stored - confirm against chromalib.
for song in songs_data:
    embed_song(song, songs_coll, sections_coll)

# The check mark was previously stored as mojibake ("âœ“"); restored to "✓".
print("\n✓ Final counts:")
print(f"  Songs: {songs_coll.count()}")
print(f"  Sections: {sections_coll.count()}")

### Inspect collections

In [None]:
# Collection sizes (uses `songs_coll` / `sections_coll` from an earlier cell)
n_songs = songs_coll.count()
print(f"Songs: {n_songs}")
n_sections = sections_coll.count()
print(f"Sections: {n_sections}")

# Look at a single stored entry: its metadata and the start of its document
result = songs_coll.peek(limit=1)
print("\nSample metadata:")
print(result['metadatas'][0])
print("\nLyrics preview:")
first_document = result['documents'][0]
print(first_document[:200])  # first 200 characters only

# Embeddings are requested explicitly via `include` to read the vector length
result = songs_coll.get(limit=1, include=['embeddings'])
first_embedding = result['embeddings'][0]
print(f"\nEmbedding dimensions: {len(first_embedding)}")

---
## 5. Semantic Search

### Basic search

In [None]:
# Free-text query for the basic semantic search demo
query = "indie songs about feeling lost and searching for purpose"

# NOTE(review): genre_boost=10 is much larger than the 1.5 / 2.0 values used
# in the genre-boost cells of this notebook - confirm it is intentional.
search_options = {
    "n_results": 10,
    "db_path": DB_PATH,
    "genre_boost": 10,
}
results = search_songs(query=query, **search_options)

print(f"Query: '{query}'\n")
print_results(results, show_sections=True)

In [None]:
# Check ChromaDB metadata
# Re-opens the database so this cell can be run on its own; only the first
# collection returned by `create_collections` (songs) is needed here.
client = setup_chroma(DB_PATH)
songs_coll, _ = create_collections(client)

# Print the metadata of up to 5 stored entries, e.g. to verify that genre
# fields made it into the collection.
result = songs_coll.peek(limit=5)
print(result['metadatas'])

In [None]:
import json

# Load into `songs_data` rather than `songs`, so the Spotify track list held
# in `songs` by the playlist-fetch cell is not overwritten when cells are
# re-run out of order.
with open('songs_data.json') as f:
    songs_data = json.load(f)

# Check first few songs
# Slicing (instead of indexing 0..4) avoids an IndexError when the dataset
# holds fewer than five songs.
for song in songs_data[:5]:
    meta = song['metadata']
    print(f"{meta['title']}: {meta.get('genres', 'NO GENRES')}")

### Section-only search

In [None]:
# Search individual sections rather than whole songs
sections = search_sections_only(
    query="summer nights and freedom",
    n_results=10,
    db_path=DB_PATH,
)

print("Top matching sections:\n")
for rank, hit in enumerate(sections, start=1):
    print(f"{rank}. {hit['title']} - {hit['artist']}")
    # Section label in title case, followed by the first 100 characters
    heading = hit['section_type'].title()
    snippet = hit['text'][:100]
    print(f"   [{heading}] {snippet}...\n")

### Test multiple queries

In [None]:
# A few contrasting queries to eyeball search quality
test_queries = [
    "heartbreak and longing",
    "dancing all night",
    "my heart is empty",
    "childhood memories",
    "fighting for what's right",
]

divider = '=' * 60  # banner line printed above and below each query

for query in test_queries:
    # Top 3 hits per query
    results = search_songs(query, n_results=3, db_path=DB_PATH)
    print(f"\n{divider}")
    print(f"Query: '{query}'")
    print(divider)
    for rank, hit in enumerate(results, start=1):
        print(f"{rank}. {hit['title']} - {hit['artist']} (score: {hit['score']:.2f})")

### Genre-boosted search

In [None]:
# Genre-boosted search; 1.5 is described in this notebook as the default boost
query = "rap folk songs about loss"

boost_factor = 1.5
results = search_songs(query=query, n_results=10, genre_boost=boost_factor, db_path=DB_PATH)

print(f"Query: '{query}'\n")
print_results(results, show_sections=True)

In [None]:
# Compare with and without genre boosting
query = "rock songs about rebellion"

print("WITH genre boosting (2.0x):")
results_boosted = search_songs(query, n_results=5, genre_boost=2.0, db_path=DB_PATH)
for i, r in enumerate(results_boosted, 1):
    # Mark hits flagged as genre-boosted. The marker was previously stored
    # as mojibake ("ðŸŽµ", the UTF-8 bytes of the note emoji decoded as
    # cp1252); restored to the intended character.
    boost = "🎵" if r.get('genre_boosted') else ""
    print(f"{i}. {r['title']} - {r['artist']} {boost} (score: {r['score']:.2f})")

# genre_boost=0 is used here as the "no boosting" baseline.
print("\nWITHOUT genre boosting:")
results_no_boost = search_songs(query, n_results=5, genre_boost=0, db_path=DB_PATH)
for i, r in enumerate(results_no_boost, 1):
    print(f"{i}. {r['title']} - {r['artist']} (score: {r['score']:.2f})")

---
## 6. Analysis & Utilities

### Dataset statistics

In [None]:
# Reload the master dataset so this cell can be run on its own
with open(SONGS_DATA) as f:
    songs_data = json.load(f)

# Basic stats: dataset size and the ten most frequent artists
print(f"Total songs: {len(songs_data)}")
print("\nArtist distribution:")
artists = Counter(entry['metadata']['artist'] for entry in songs_data)
for name, n_songs in artists.most_common(10):
    print(f"  {name}: {n_songs}")

# Genre distribution - only shown when at least one song carries a
# non-empty 'genres' entry (i.e. the enrichment step has been run)
has_genres = any(entry['metadata'].get('genres') for entry in songs_data)
if has_genres:
    all_genres = [
        genre_name
        for entry in songs_data
        for genre_name in entry['metadata'].get('genres', [])
    ]

    print("\nTop genres:")
    genre_counts = Counter(all_genres)
    for genre_name, n_songs in genre_counts.most_common(15):
        print(f"  {genre_name}: {n_songs}")

### Reset collections (if needed)

In [None]:
# WARNING: This deletes all embeddings!
# This cell runs unconditionally, so "Run All" will execute it as the last
# step of the notebook. Run it by hand only when a rebuild is intended.
# NOTE(review): consider guarding this call behind an explicit opt-in flag.
reset_collections(DB_PATH)