## Get Album from Spotify

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
from album_lookup import (
    get_official_album_name,
    guess_album,
    album_candidates,
    lookup_album_via_openai,
)

In [2]:
# Smoke test: resolve one well-known track and verify the album that comes back.
artist, title = "Iron Maiden", "The Trooper"
expected_album = "Piece of Mind"

# Both lookups are run: the first yields the album name checked below, the
# second yields the match object that is printed for inspection.
predicted_album = get_official_album_name(artist, title)
match = guess_album(artist, title)

report_lines = [
    f"Match details: {match}",
    f"Artist: {artist}",
    f"Title: {title}",
    f"Expected Album: {expected_album}",
    f"Predicted Album: {predicted_album}",
]
print("\n".join(report_lines))

# Substring, case-insensitive comparison: the expected name only has to be
# contained in the predicted one.
assert predicted_album is not None, "No album found"
assert expected_album.lower() in predicted_album.lower(), "Album lookup failed"

Match details: AlbumMatch(album='Piece of Mind', source='spotify', confidence=0.577777777777778, album_type='album', raw_album='Piece of Mind (2015 Remaster)', release_date=datetime.datetime(1983, 1, 1, 0, 0), track_id='4OROzZUy6gOWN4UGQVaZMF', track_name='The Trooper - 2015 Remaster', title_score=0.6111111111111112, artist_score=1.0, album_artist_score=1.0, popularity=74, flags=('reissue',))
Artist: Iron Maiden
Title: The Trooper
Expected Album: Piece of Mind
Predicted Album: Piece of Mind


In [3]:
# Spot check: look up a single track and display the resulting match.
ensiferum_match = guess_album("Ensiferum", "In My Sword I Trust")
ensiferum_match

AlbumMatch(album='Unsung Heroes', source='spotify', confidence=1.0, album_type='album', raw_album='Unsung Heroes (Japan Version)', release_date=datetime.datetime(2012, 1, 1, 0, 0), track_id='7aYO9WJaSC8m7YpT0bM5Ke', track_name='In My Sword I Trust', title_score=1.0, artist_score=1.0, album_artist_score=1.0, popularity=46, flags=())

In [4]:
# Fetch the album candidates for a track title that is also an album name,
# then display them as the cell's last expression.
candidates = album_candidates("Iron Maiden", "Powerslave")
candidates

[AlbumMatch(album='Powerslave', source='spotify', confidence=0.5617647058823529, album_type='album', raw_album='Powerslave (2015 Remaster)', release_date=datetime.datetime(1984, 1, 1, 0, 0), track_id='70wXUzmg3tk3Ci2Ixg1YwO', track_name='Powerslave - 2015 Remaster', title_score=0.5882352941176471, artist_score=1.0, album_artist_score=1.0, popularity=58, flags=('reissue',)),
 AlbumMatch(album='Live After Death', source='spotify', confidence=0.55, album_type='album', raw_album='Live After Death (1998 Remaster)', release_date=datetime.datetime(1985, 1, 1, 0, 0), track_id='5iKxoiMFKa8GxDaXGuwApw', track_name='Powerslave - Live at Long Beach Arena; 1998 Remaster', title_score=1.0, artist_score=1.0, album_artist_score=1.0, popularity=34, flags=('reissue', 'live_track')),
 AlbumMatch(album='Flight 666: The Original Soundtrack', source='spotify', confidence=0.7000000000000001, album_type='album', raw_album='Flight 666: The Original Soundtrack', release_date=datetime.datetime(2009, 5, 25, 0, 0)

In [6]:
# Spot check: look up a single track and display the resulting match.
megadeth_match = guess_album("Megadeth", "Symphony of Destruction")
megadeth_match

AlbumMatch(album='Countdown To Extinction', source='spotify', confidence=0.8500000000000001, album_type='album', raw_album='Countdown To Extinction (Deluxe Edition - Remastered)', release_date=datetime.datetime(1992, 6, 14, 0, 0), track_id='5mR858YsHYG761aUqZoGkD', track_name='Symphony Of Destruction - Remastered 2012', title_score=1.0, artist_score=1.0, album_artist_score=1.0, popularity=69, flags=('reissue',))

## Full comparison using MusicBrainz

In [10]:
import Levenshtein

# Evaluate `get_official_album_name` against the album recorded for every row
# of the selected playlist CSVs. A prediction counts as correct when, after
# stripping and lower-casing, it equals the recorded album or lies within
# MAX_EDIT_DISTANCE edits of it. Every other row is printed as a mismatch.

melodeath = "NeuralForge/playlists/Melodic Death Metal.csv"
classic_metal = "NeuralForge/playlists/Classic Metal.csv"
power_metal = "NeuralForge/playlists/Power Metal.csv"
folk_metal = "NeuralForge/playlists/Folk Metal.csv"
prog_metal = "NeuralForge/playlists/Prog Metal.csv"

# List of all playlist CSVs (uncomment the ones to evaluate)
playlist_files = [
    # melodeath,
    # classic_metal,
    power_metal,
    # folk_metal,
    # prog_metal,
]

# Largest Levenshtein distance between the normalised names that still counts
# as a match. NOTE(review): for very short album names this tolerance is
# generous; consider a length-relative threshold.
MAX_EDIT_DISTANCE = 3

frames = []
for path in playlist_files:
    playlist_df = pd.read_csv(path)
    # Remember which CSV each row came from.
    playlist_df["source_file"] = path
    frames.append(playlist_df)

if not frames:
    raise ValueError("playlist_files is empty. Please add at least one CSV path.")

df_all = pd.concat(frames, ignore_index=True)

correct = 0
total = len(df_all)

for idx, row in tqdm(df_all.iterrows(), total=total):
    artist = row["Artist"]
    title = row["Title"]
    official_album = row["Album"]
    predicted_album = get_official_album_name(artist, title)

    # A missing reference album (NaN in the CSV) must not be stringified to
    # "nan" and fuzzy-matched; such rows are always reported as mismatches.
    if predicted_album and pd.notna(official_album):
        pa = str(predicted_album).strip().lower()
        oa = str(official_album).strip().lower()

        # Exact match first (cheap), then python-Levenshtein for near matches.
        if pa == oa or Levenshtein.distance(pa, oa) <= MAX_EDIT_DISTANCE:
            correct += 1
            continue

    print(f"\nMismatch for '{artist}' - '{title}':")
    print(f"\tOfficial: '{official_album}'")
    print(f"\tPredicted: '{predicted_album}'")

# Guard against CSVs that contain a header but no rows.
accuracy = correct / total if total else 0.0
print(f"Correct: {correct}/{total} ({accuracy:.2%})")

  0%|          | 0/230 [00:00<?, ?it/s]


Mismatch for 'Angra' - 'Angels Cry':
	Official: 'Angels Cry'
	Predicted: 'Reaching Horizons'

Mismatch for 'Edguy' - 'Lavatory Love Machine':
	Official: 'Hellfire Club'
	Predicted: 'The Singles'

Mismatch for 'Freedom Call' - 'Warriors':
	Official: 'Eternity'
	Predicted: 'Eternity - 666 Weeks Beyond Eternity'

Mismatch for 'Gamma Ray' - 'Heaven Can Wait':
	Official: 'Heading for Tomorrow'
	Predicted: 'The Best (Of)'

Mismatch for 'Gamma Ray' - 'Land of the Free':
	Official: 'Land of the Free'
	Predicted: 'The Best (Of)'

Mismatch for 'Gamma Ray' - 'Man on a Mission':
	Official: 'Land of the Free'
	Predicted: 'The Best (Of)'

Mismatch for 'Gamma Ray' - 'Rebellion in Dreamland':
	Official: 'Land of the Free'
	Predicted: 'The Best (Of)'

Mismatch for 'Pathfinder' - 'The Lord of Wolves':
	Official: 'Beyond the Space, Beyond the Time'
	Predicted: 'Pathfinder'

Mismatch for 'Powerwolf' - 'Demons Are a Girl's Best Friend':
	Official: 'The Sacrament of Sin'
	Predicted: 'The Symphony of Sin'



## Full comparison using GPT search

In [5]:
import Levenshtein

# Evaluate `lookup_album_via_openai` against the album recorded for every row
# of the selected playlist CSVs. A prediction counts as correct when, after
# stripping and lower-casing, it equals the recorded album or lies within
# MAX_EDIT_DISTANCE edits of it. Every other row is printed as a mismatch.
# An interactive "approve and write back" workflow is kept below, disabled.

melodeath = "NeuralForge/playlists/Melodic Death Metal.csv"
classic_metal = "NeuralForge/playlists/Classic Metal.csv"
power_metal = "NeuralForge/playlists/Power Metal.csv"
folk_metal = "NeuralForge/playlists/Folk Metal.csv"
prog_metal = "NeuralForge/playlists/Prog Metal.csv"

# List of all playlist CSVs (uncomment the ones to evaluate)
playlist_files = [
    # melodeath,
    classic_metal,
    # power_metal,
    # folk_metal,
    # prog_metal,
]

# Largest Levenshtein distance between the normalised names that still counts
# as a match. NOTE(review): for very short album names this tolerance is
# generous; consider a length-relative threshold.
MAX_EDIT_DISTANCE = 3

frames = []
# Original CSV column order, needed by the disabled write-back step so the
# helper column "source_file" is not saved.
column_order = None
for path in playlist_files:
    playlist_df = pd.read_csv(path)
    if column_order is None:
        column_order = playlist_df.columns.tolist()
    # Remember which CSV each row came from.
    playlist_df["source_file"] = path
    frames.append(playlist_df)

if not frames:
    raise ValueError("playlist_files is empty. Please add at least one CSV path.")

df_all = pd.concat(frames, ignore_index=True)

correct = 0
total = len(df_all)
# Row indices approved through the (currently disabled) interactive prompt.
approved_rows = []

for idx, row in tqdm(df_all.iterrows(), total=total):
    artist = row["Artist"]
    title = row["Title"]
    official_album = row["Album"]
    predicted_album = lookup_album_via_openai(artist, title)

    # A missing reference album (NaN in the CSV) must not be stringified to
    # "nan" and fuzzy-matched; such rows are always reported as mismatches.
    if predicted_album and pd.notna(official_album):
        pa = str(predicted_album).strip().lower()
        oa = str(official_album).strip().lower()

        # Exact match first (cheap), then python-Levenshtein for near matches.
        if pa == oa or Levenshtein.distance(pa, oa) <= MAX_EDIT_DISTANCE:
            correct += 1
            continue

    print(f"\nMismatch for '{artist}' - '{title}':")
    print(f"\tOfficial: '{official_album}'")
    print(f"\tPredicted: '{predicted_album}'")

    # Disabled interactive approval step: only offered when a prediction exists.
    # if predicted_album:
    #     response = input("Approve predicted album? [y/N]: ").strip().lower()
    #     if response in {"y", "yes"}:
    #         df_all.at[idx, "Album"] = predicted_album
    #         approved_rows.append(idx)
    #         correct += 1

# Guard against CSVs that contain a header but no rows.
accuracy = correct / total if total else 0.0
print(f"Correct: {correct}/{total} ({accuracy:.2%})")
print(f"Approved updates: {len(approved_rows)}")

# Disabled write-back step: saves approved album changes to their source CSVs.
# if approved_rows:
#     updated_files = set(df_all.loc[approved_rows, "source_file"])
#     for path in updated_files:
#         mask = df_all["source_file"] == path
#         df_all.loc[mask, column_order].to_csv(path, index=False)
#         print(f"Updated album assignments saved to '{path}'.")
# else:
#     print("No CSV files updated.")

  0%|          | 0/156 [00:00<?, ?it/s]

KeyboardInterrupt: 