## Get Album from Spotify

In [1]:
# pip install spotipy
import os, re
from datetime import datetime
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd

In [8]:
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.getenv("SPOTIFY_CLIENT_ID"),
        client_secret=os.getenv("SPOTIFY_CLIENT_SECRET"),
    )
)


def _norm_title(s: str) -> str:
    original = s.strip()
    s = s.lower().strip()
    s = re.sub(r"\s*\([^)]*\)", "", s)  # drop (...) like "(Remastered 2009)"
    s = re.sub(r"\s*\[[^\]]*\]", "", s)  # drop [...]
    s = re.sub(r"\s+feat\..*$", "", s)  # drop "feat. ..."
    s = re.sub(r"\s+ft\..*$", "", s)  # drop "ft. ..."
    s = re.sub(r"\s*-\s*(live.*|acoustic.*|remaster.*|mono|stereo|radio edit.*|version.*)$", "", s)
    s = re.sub(r"\s+", " ", s)
    s = s.strip()

    # Restore original casing by mapping positions
    result = []
    orig_idx = 0
    for char in s:
        while orig_idx < len(original) and original[orig_idx].lower() != char:
            orig_idx += 1
        if orig_idx < len(original):
            result.append(original[orig_idx])
            orig_idx += 1
        else:
            result.append(char)

    return "".join(result)


_BAD_ALBUM_TERMS = [
    "deluxe",
    "expanded",
    "remaster",
    "remastered",
    "live",
    "anniversary",
    "special edition",
    "super deluxe",
    "bonus track",
]


def _bad_album_name(name: str) -> bool:
    n = name.lower()
    return any(k in n for k in _BAD_ALBUM_TERMS)


def _parse_release_date(d: str) -> datetime:
    # Spotify gives YYYY or YYYY-MM or YYYY-MM-DD
    if not d:
        return datetime(3000, 1, 1)
    try:
        if len(d) == 4:
            return datetime(int(d), 1, 1)
        if len(d) == 7:
            return datetime(*map(int, (d[:4], d[5:7], "1")))
        y, m, day = map(int, d.split("-"))
        return datetime(y, m, day)
    except Exception:
        return datetime(3000, 1, 1)


def get_official_album_name(artist: str, title: str, verbose: bool = False) -> str | None:
    """
    Returns the most likely 'official' album name for an (artist, title) pair.
    Heuristics:
      1) Exact normalized title match & artist match.
      2) Prefer album_type == 'album' over 'single'/'compilation'.
      3) Avoid 'deluxe/remaster/live/anniversary...' album names.
      4) Prefer the earliest release_date (original LP).
      5) Break ties by higher track popularity.
    """
    q = f'artist:"{artist}" track:"{title}"'
    results = sp.search(q=q, type="track", limit=50)
    items = results.get("tracks", {}).get("items", [])
    if not items:
        return None
    if verbose:
        print(f"Found {len(items)} candidates for '{artist}' - '{title}'")
        print([f"{t['name']} ({t['album']['name']})" for t in items])

    norm_title = _norm_title(title)
    norm_artist = artist.lower()

    def artist_matches(t) -> bool:
        names = [a["name"].lower() for a in t.get("artists", [])]
        return any(norm_artist in n or n in norm_artist for n in names)

    # rank by our heuristics
    def rank_key(t):
        alb = t["album"]
        return (
            alb.get("album_type") != "album",  # prefer full albums
            -int(t.get("popularity", 0)),  # more popular first
            _parse_release_date(alb.get("release_date")),  # earlier is better
            _bad_album_name(alb.get("name", "")),  # avoid deluxe/remaster/live
        )

    pool_sorted = sorted(items, key=rank_key)
    if verbose:
        print("\nRanked candidates:")
        for t in pool_sorted:
            alb = t["album"]
            print(
                f"\t{alb['name']} - {alb.get('album_type')} - {alb.get('release_date')} - pop={t.get('popularity')}"
            )
    best = pool_sorted[0]
    return _norm_title(best["album"]["name"])

In [9]:
name = get_official_album_name("Metallica", "Fade to Black")
print(name)

Ride The Lightning


In [10]:
get_official_album_name("Aether Realm", "The Sun, the Moon, the Star")

'Tarot'

In [11]:
from tqdm.notebook import tqdm

melodeath = "NeuralForge/playlists/Melodic Death Metal.csv"
classic_metal = "NeuralForge/playlists/Classic Metal.csv"
power_metal = "NeuralForge/playlists/Power Metal.csv"
folk_metal = "NeuralForge/playlists/Folk Metal.csv"
prog_metal = "NeuralForge/playlists/Prog Metal.csv"

# List of all playlist CSVs
playlist_files = [
    melodeath,
    classic_metal,
    power_metal,
    folk_metal,
    prog_metal,
]

# Concatenate all playlists into one DataFrame
df_all = pd.concat([pd.read_csv(f) for f in playlist_files], ignore_index=True)

correct = 0
total = 0

for i, row in tqdm(df_all.iterrows(), total=len(df_all)):
    artist = row["Artist"]
    title = row["Title"]
    official_album = row["Album"]
    predicted_album = get_official_album_name(artist, title)
    total += 1
    if predicted_album and str(predicted_album).lower() == str(official_album).lower():
        correct += 1
    else:
        print(f"\nMismatch for '{artist}' - '{title}':")
        print(f"\tOfficial: '{official_album}'")
        print(f"\tPredicted: '{predicted_album}'")

print(f"Correct: {correct}/{total} ({correct / total:.2%})")

  0%|          | 0/716 [00:00<?, ?it/s]


Mismatch for 'Aephanemer' - 'Path of the Wolf':
	Official: 'Know Thyself'
	Predicted: 'Prokopton'

Mismatch for 'Insomnium' - 'Through the Shadows':
	Official: 'One for Sorrow'
	Predicted: 'None'

Mismatch for 'Mors Principium Est' - 'Life in Black':
	Official: 'Inhumanity'
	Predicted: 'Liberate The Unborn Inhumanity'

Mismatch for 'Wintersun' - 'Time':
	Official: 'Time I'
	Predicted: 'Wintersun'

Mismatch for 'Black Sabbath' - 'Heaven and Hell':
	Official: 'Heaven and Hell'
	Predicted: 'Heaven & Hell'

Mismatch for 'Megadeth' - 'In My Darkest Hour':
	Official: 'So Far, So Good... So What!'
	Predicted: 'So Far, So Good...So What!'

Mismatch for 'Megadeth' - 'Peace Sells':
	Official: 'Peace Sells... but Who's Buying?'
	Predicted: 'Peace Sells...But Who's Buying'

Mismatch for 'Megadeth' - 'The Conjuring':
	Official: 'Peace Sells... but Who's Buying?'
	Predicted: 'Peace Sells...But Who's Buying?'

Mismatch for 'Megadeth' - 'Wake Up Dead':
	Official: 'Peace Sells... but Who's Buying?'
	P


Mismatch for 'Insomnium' - 'Through the Shadows':
	Official: 'One for Sorrow'
	Predicted: 'None'

Mismatch for 'Wintersun' - 'Time':
	Official: 'Time I'
	Predicted: 'Wintersun'
Correct: 126/128 (98.44%)


In [53]:
get_official_album_name("Aephanemer", "Path of the Wolf", True)

Found 3 candidates for 'Aephanemer' - 'Path of the Wolf'
['Path of the Wolf (Prokopton)', 'Path of the Wolf (Know Thyself)', 'Path of the Wolf (Path of the Wolf)']

Ranked candidates:
	Know Thyself - album - 2014-01-05 - pop=14
	Prokopton - album - 2019-10-25 - pop=31
	Path of the Wolf - single - 2017-10-13 - pop=5


'Know Thyself'