In [None]:
import os
import pandas as pd
import pickle
import requests
import spotipy
import time
import logging
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
from tqdm import tqdm  # For progress tracking

# ------------------------------------------------------------------------------
# 1. Configure Logging
# ------------------------------------------------------------------------------
# Send INFO-and-above records, prefixed with a timestamp and level,
# to the 'spotify_extraction.log' file.
_log_options = {
    "format": "%(asctime)s - %(levelname)s - %(message)s",
    "level": logging.INFO,
    "filename": "spotify_extraction.log",
}
logging.basicConfig(**_log_options)

# ------------------------------------------------------------------------------
# 2. Load Environment Variables & Authenticate with Spotify
# ------------------------------------------------------------------------------
load_dotenv()
# Credentials come from the environment (load_dotenv() above also reads a local
# .env file). Keep them out of the source code and out of the notebook output.
SPOTIPY_CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
SPOTIPY_CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")
if not SPOTIPY_CLIENT_ID or not SPOTIPY_CLIENT_SECRET:
    raise RuntimeError("Spotify credentials are missing. Check your .env file.")
# Confirm that the credentials were found WITHOUT echoing their values:
# printed secrets end up in saved notebook output and version control.
print("Spotify credentials loaded from environment.")

# Client-credentials flow: app-level access, no user login involved.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
))

# ------------------------------------------------------------------------------
# 3. Load Dataset and Extract Unique Values
# ------------------------------------------------------------------------------
# Point this at your local copy of the dataset before running.
df = pd.read_parquet(
    r"/Users/xavierhua/Documents/GitHub/bt4222grp9/preliminary datasets/all_songs_with_or_without_lyrics.parquet"
)
# De-duplicated, NaN-free lists of the values that will be looked up on Spotify.
track_uris, album_names = (
    df[column].dropna().unique().tolist()
    for column in ("track_uri", "album_name")
)

# ------------------------------------------------------------------------------
# 4. Define Paths for Caching Progress
# ------------------------------------------------------------------------------
# Pickle file holding the track popularity results gathered so far
# (relative path, so it is resolved against the current working directory).
TRACK_PROGRESS_FILE = "track_popularity_progress.pkl"
# Pickle file holding the album release-date results gathered so far
# (relative path, so it is resolved against the current working directory).
ALBUM_PROGRESS_FILE = "album_release_progress.pkl"

# ------------------------------------------------------------------------------
# 5. Helper Functions for Caching and URI Conversion
# ------------------------------------------------------------------------------
def load_progress(file_path):
    """Return the progress object previously pickled at ``file_path``.

    A missing or zero-byte file is treated as "no progress yet" and yields an
    empty dict, so a first run and a resumed run share the same code path.

    NOTE: the file is read with ``pickle``; only point this at cache files
    produced by this script, never at untrusted data.
    """
    no_saved_progress = (
        not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    )
    if no_saved_progress:
        return {}

    with open(file_path, "rb") as cache_file:
        saved = pickle.load(cache_file)
    return saved

def extract_track_id(uri):
    """Extract the track ID from a Spotify URI (e.g. 'spotify:track:TRACK_ID').

    Args:
        uri: A Spotify track URI, a bare track ID, or a missing value
            (``None`` / NaN) coming from a DataFrame column.

    Returns:
        The third ':'-separated component when ``uri`` has exactly three
        components; otherwise ``uri`` unchanged. Non-string inputs are returned
        as-is instead of raising, so the function can be applied to a column
        that still contains missing values.
    """
    if not isinstance(uri, str):
        # None / NaN have no .split(); pass them through untouched.
        return uri
    parts = uri.split(":")
    return parts[2] if len(parts) == 3 else uri

# Reduce every track URI to its bare ID; the popularity cache is keyed by ID
track_ids = list(map(extract_track_id, track_uris))

# ------------------------------------------------------------------------------
# 6. Load Previous Progress (if available)
# ------------------------------------------------------------------------------
# Each cache is a dict (empty on a first run) that the sections below extend.
album_release_dict = load_progress(ALBUM_PROGRESS_FILE)
track_popularity_dict = load_progress(TRACK_PROGRESS_FILE)

# Record the starting point of this run in the log file.
logging.info(f"Total unique tracks: {len(track_ids)} | Already processed: {len(track_popularity_dict)}")
logging.info(f"Total unique albums: {len(album_names)} | Already processed: {len(album_release_dict)}")

# ------------------------------------------------------------------------------
# 7. Define Functions to Fetch Data from Spotify with Rate Limit Handling
# ------------------------------------------------------------------------------
def fetch_track_popularity(track_batch):
    """Fetch popularity scores from Spotify for one batch of track IDs.

    Args:
        track_batch: List of Spotify track IDs sent in a single ``sp.tracks`` call.

    Returns:
        dict mapping track ID -> popularity for every track Spotify returned.
        Requested IDs for which Spotify returned a null entry are mapped to
        ``None`` so they count as "processed" and are not re-requested on
        every resume. An empty dict is returned when the request fails with
        anything other than a rate limit (HTTP 429), which is retried after
        the advertised delay.
    """
    while True:
        try:
            response = sp.tracks(track_batch)
        except spotipy.SpotifyException as e:
            if e.http_status == 429:
                # e.headers can be None, so fall back to an empty mapping
                # before reading the server-advertised delay.
                retry_after = int((e.headers or {}).get("Retry-After", 5))
                logging.warning(f"Rate limit hit. Retrying track popularity after {retry_after} seconds...")
                time.sleep(retry_after)
                continue
            logging.error(f"Error fetching track popularity: {e}")
            return {}
        except requests.exceptions.RequestException as e:
            # Timeouts / connection errors: skip this batch instead of aborting
            # the whole run (same policy as fetch_album_release). The batch is
            # picked up again on the next resume because nothing is recorded.
            logging.error(f"Network error fetching track popularity: {e}")
            return {}

        tracks = response["tracks"]
        popularity = {track["id"]: track["popularity"] for track in tracks if track}
        # Entries come back in request order with a null placeholder for
        # unknown IDs; record those under the requested ID.
        for requested_id, track in zip(track_batch, tracks):
            if not track:
                popularity.setdefault(requested_id, None)
        return popularity

def fetch_album_release(album_batch):
    """
    Fetches album release dates from Spotify for a list of album names.

    For each album name:
      1. Search Spotify for the name (limit=1).
      2. If a hit is found, take its album ID and call sp.album(album_id) to
         read the release_date from the full album details.
      3. Store the release date under the *queried* album name.

    Args:
        album_batch: Iterable of album names (as they appear in the dataset).

    Returns:
        dict {album_name: release_date or None}. Every name in the batch gets
        an entry; None means "no search hit" or "request failed and was
        skipped". Keys are the input names (not Spotify's canonical titles) so
        the result can be matched back to the dataset and to the resume cache.

    Rate limits (HTTP 429) are retried after the advertised delay; every other
    error is logged and the album is skipped.

    NOTE: Searching by album name alone can be ambiguous. If you have the album ID or artist info,
          you should refine the search or directly call sp.album(album_id).
    """
    results = {}
    for album_name in album_batch:
        while True:
            try:
                # Step 1: Search by album name
                search_results = sp.search(q=f"album:{album_name}", type="album", limit=1)
                items = search_results["albums"]["items"]
                if items:
                    # Step 2: Fetch full album details for the first hit
                    album_data = sp.album(items[0]["id"])

                    # Step 3: Key by the name we were asked about. Spotify's
                    # own title (album_data["name"]) may differ from the
                    # dataset's spelling, which would make the entry unfindable.
                    results[album_name] = album_data["release_date"]
                else:
                    # No search results for this name
                    results[album_name] = None

                # Done with this album_name; leave the retry loop
                break

            except spotipy.SpotifyException as e:
                if e.http_status == 429:
                    # Rate limited: wait as long as the server asks, then retry.
                    # e.headers can be None, hence the fallback mapping.
                    retry_after = int((e.headers or {}).get("Retry-After", 5))
                    logging.warning(f"Rate limit hit. Retrying after {retry_after} seconds...")
                    time.sleep(retry_after)
                else:
                    # Log any other SpotifyException, skip this album
                    logging.error(f"SpotifyException fetching release date for '{album_name}': {e}")
                    results[album_name] = None
                    break

            except requests.exceptions.ReadTimeout:
                # If there's a timeout, log and skip this album
                logging.error(f"ReadTimeout fetching release date for '{album_name}'. Skipping.")
                results[album_name] = None
                break

            except requests.exceptions.RequestException as re:
                # Other network-related errors
                logging.error(f"RequestException fetching release date for '{album_name}': {re}")
                results[album_name] = None
                break

            except Exception as ex:
                # Deliberate best-effort catch-all: one bad album must not
                # abort a long-running extraction.
                logging.error(f"Unexpected error for album '{album_name}': {ex}")
                results[album_name] = None
                break

    return results

# ------------------------------------------------------------------------------
# 8. Process Track Popularity in Batches (Resuming from Last Progress)
# ------------------------------------------------------------------------------
# Create batches of track IDs (50 per batch)
track_batches = [track_ids[i:i+50] for i in range(0, len(track_ids), 50)]
# Only process batches that are not fully in the saved dictionary
remaining_track_batches = [
    batch for batch in track_batches
    if not all(track_id in track_popularity_dict for track_id in batch)
]


def _save_track_progress():
    """Write the track popularity cache to disk without risking a torn file.

    The data is pickled to a temporary sibling file first and then moved over
    the real cache with os.replace, so an interruption in the middle of a write
    cannot leave a truncated TRACK_PROGRESS_FILE behind.
    """
    tmp_path = f"{TRACK_PROGRESS_FILE}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(track_popularity_dict, f)
    os.replace(tmp_path, TRACK_PROGRESS_FILE)


logging.info(f"Starting track popularity extraction | Remaining batches: {len(remaining_track_batches)}")
try:
    for batch_number, batch in enumerate(
        tqdm(remaining_track_batches, desc="Fetching Track Popularity"), start=1
    ):
        track_popularity_dict.update(fetch_track_popularity(batch))
        # Checkpoint every 10 batches (10 x 50 IDs = about 500 records). Counting
        # batches is reliable even when a batch returns fewer records than
        # requested, unlike testing the dictionary size.
        if batch_number % 10 == 0:
            _save_track_progress()
            logging.info(f"Saved track popularity progress at {len(track_popularity_dict)} records.")
finally:
    # Final save for track popularity progress. Runs on normal completion AND
    # when the loop is interrupted (e.g. KeyboardInterrupt), so fetched
    # results since the last checkpoint are not lost.
    _save_track_progress()
    logging.info(f"Final track popularity progress saved: {len(track_popularity_dict)} records.")

# ------------------------------------------------------------------------------
# 9. Process Album Release Dates in Batches (Resuming from Last Progress)
# ------------------------------------------------------------------------------
# Create batches of album names (20 per batch)
album_batches = [album_names[i:i+20] for i in range(0, len(album_names), 20)]
# Only process batches that are not fully in the saved dictionary
remaining_album_batches = [
    batch for batch in album_batches
    if not all(album_name in album_release_dict for album_name in batch)
]


def _save_album_progress():
    """Write the album release-date cache to disk without risking a torn file.

    The data is pickled to a temporary sibling file first and then moved over
    the real cache with os.replace, so an interruption in the middle of a write
    cannot leave a truncated ALBUM_PROGRESS_FILE behind.
    """
    tmp_path = f"{ALBUM_PROGRESS_FILE}.tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(album_release_dict, f)
    os.replace(tmp_path, ALBUM_PROGRESS_FILE)


logging.info(f"Starting album release extraction | Remaining batches: {len(remaining_album_batches)}")
try:
    for batch_number, batch in enumerate(
        tqdm(remaining_album_batches, desc="Fetching Album Release Dates"), start=1
    ):
        album_release_dict.update(fetch_album_release(batch))
        # Checkpoint every 5 batches (5 x 20 names = about 100 records). Counting
        # batches is reliable regardless of how many new keys a batch adds,
        # unlike testing the dictionary size.
        if batch_number % 5 == 0:
            _save_album_progress()
            logging.info(f"Saved album release progress at {len(album_release_dict)} records.")
finally:
    # Final save for album release dates progress. Runs on normal completion
    # AND when the loop is interrupted (e.g. KeyboardInterrupt), so fetched
    # results since the last checkpoint are not lost.
    _save_album_progress()
    logging.info(f"Final album release progress saved: {len(album_release_dict)} records.")

# ------------------------------------------------------------------------------
# 10. Merge Fetched Data into Dataset and Export CSV Files
# ------------------------------------------------------------------------------
# Map track popularity by matching track IDs to the dataset's track_uri (converted to IDs).
# Rows whose track_uri is missing (None / NaN) get no popularity instead of
# crashing the URI parsing; they are dropped from the CSV below anyway.
df["track_popularity"] = df["track_uri"].apply(
    lambda uri: track_popularity_dict.get(extract_track_id(uri)) if isinstance(uri, str) else None
)
# Album release dates are looked up directly by the dataset's album name.
df["album_release_date"] = df["album_name"].map(album_release_dict)

# Save CSV files (dropping rows with missing data in the respective columns)
df[["track_uri", "track_popularity"]].dropna().to_csv("track_popularity.csv", index=False)
df[["album_name", "album_release_date"]].dropna().to_csv("album_release_dates.csv", index=False)

logging.info("Extraction complete! Two CSV files saved: 'track_popularity.csv' and 'album_release_dates.csv'")
print("Extraction complete! Check logs for detailed progress.")


SPOTIPY_CLIENT_ID: fe85ab3c7e504bbf92ebb2bf84003d9a
SPOTIPY_CLIENT_SECRET: 9862ea******
Loaded dataset from: /Users/xavierhua/Documents/GitHub/bt4222grp9/preliminary datasets/all_songs_with_or_without_lyrics.parquet
Unique track URIs: 252236 | Unique album names: 105931
Previously processed tracks: 203541
Previously processed albums: 0
Starting track popularity extraction | Remaining track batches: 983


Remaining track batches: 983


Fetching Track Popularity:   0%|          | 0/983 [00:00<?, ?it/s]Fetching track popularity for batch (size=50): ['4BOffopj7gh2lTmJK0i6pH', '4rx79VDJizteqSOT7PQaJl', '32KK5ssfN5vuKRaCsfa63R', '1jPTgvMhaSoRAOD9COQm8l', '25XUnsWG9Bv08BVQP3Sze1', '2vGkhLDyTI3RVu1eKIQVeh', '3AlLUDEBD7teu96ozLarml', '7HbJiOw9TePjIcvvHCeWRc', '3PaUIDgMkbY8otG3XUBQek', '4Vwj4sbwakE8oCmqJ3ufki', '7q0ZcgGz9DSjadnEOe5Nsk', '60kbEtzqPb2l5tXwKFWwYQ', '722etYXeNBNmfsR69z27al', '00WFoR7tPga5Wj92MSR3eD', '2eq9aw5ZZIuRswW9fy5PKE', '6BNymc6C7PlRwBzgZBm5dk', '4h4tD5u0YMrHRh47sIy1Wl', '4itZtZUPOT2WkXO6vpeYqD', '5iAQ8dHSRTxZBJvjQvcR94', '3sI59rkOqPPZju0sh5K8il', '0qeG7GDVR7DXOgQVJkIJUB', '41JWvOS6vhn2uANnUsKVEG', '7gznAeluIdC6GGRSP16dil', '7HS8WT7Qf7OUKcQhW6zF4h', '21JiHTl5br1RJ3QRFaF4pa', '13D9T7wQr4m2XFA2OxQr7x', '2sQF0ZgDVbStYLQ1nKl4WL', '6Tllev6fw9tXOodmoAsIZC', '6d4PPwLXfs7dxpOt7NiP7K', '2MvBQVGPEEMQixRYu3HWea', '0t8OGYVNHQmO4VbJS3L64L', '6T6prTkN0PgnGPdW7lZ7Jv', '3RqMqrZ4dKSiGX7vBiS7L6', '2mNJADsO7l51mVhoCEESz7', '5

KeyboardInterrupt: 