In [None]:
import pandas as pd
# Parquet file holding the track table.
# NOTE(review): this is an absolute path on one machine; anyone else running
# the notebook will need to point it at their own copy of the file.
TRACKS_PARQUET_PATH = "/Users/xavierhua/Documents/GitHub/bt4222grp9/parquet datasets/tracks_XavierHua.parquet"

tracks_df = pd.read_parquet(TRACKS_PARQUET_PATH)
tracks_df

In [None]:
import os
import pandas as pd
import pickle
from dotenv import load_dotenv
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# Read the Spotify API credentials from the environment; load_dotenv() is
# called first so values can also come from a local .env file.
load_dotenv()
SPOTIPY_CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
SPOTIPY_CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")

# Build the client-credentials auth manager, then hand it to the API client.
credentials_manager = SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
)
sp = spotipy.Spotify(auth_manager=credentials_manager)

# Helper: Batch iterator
def batch(iterable, n=1):
    """Lazily split a sliceable sequence into consecutive chunks.

    Args:
        iterable: A sequence supporting ``len()`` and slicing (e.g. a list).
        n: Maximum number of items per chunk.

    Yields:
        Slices of ``iterable`` in order, each holding at most ``n`` items;
        only the final slice can be shorter.
    """
    size = len(iterable)
    for start in range(0, size, n):
        # Slicing clamps at the end of the sequence, so the last chunk is
        # simply shorter when ``size`` is not a multiple of ``n``.
        yield iterable[start:start + n]

# `tracks_df` is expected to be in memory already. If it is not, load it
# first, for example:
# tracks_df = pd.read_csv("your_tracks_file.csv")

# Pull the two URI columns out of the DataFrame as plain Python lists.
track_uris, album_uris = (
    tracks_df[column].tolist() for column in ("track_uri", "album_uri")
)

##############################################
# PART 1: Process Tracks with Resume Support #
##############################################

# Tunables for the track download.
TRACK_BATCH_SIZE = 50   # URIs per sp.tracks() call (Spotify API supports up to 50 per request)
TRACK_SAVE_EVERY = 10   # write a checkpoint after this many batches
TRACK_PROGRESS_FILE = "track_details_progress.pkl"

# Split the track URIs into request-sized batches.
track_batches = list(batch(track_uris, n=TRACK_BATCH_SIZE))
total_track_batches = len(track_batches)

# Resume from an existing checkpoint if there is one.
if os.path.exists(TRACK_PROGRESS_FILE):
    # NOTE(security): pickle.load can execute arbitrary code. Only resume from
    # a checkpoint file that this notebook wrote itself.
    with open(TRACK_PROGRESS_FILE, "rb") as f:
        all_track_details = pickle.load(f)
    # Convert the number of stored entries into a number of finished batches.
    # Round UP (ceiling division): the final batch is usually shorter than
    # TRACK_BATCH_SIZE, and rounding down would treat it as unfinished, so a
    # re-run after completion would fetch it again and append duplicates.
    # Cap at the batch count in case the checkpoint holds more entries than
    # the current URI list.
    # Assumes each response carries one entry per requested URI -- TODO confirm.
    processed_track_batches = min(
        -(-len(all_track_details) // TRACK_BATCH_SIZE),
        total_track_batches,
    )
    print(f"Resuming track processing from batch {processed_track_batches + 1} out of {total_track_batches}...")
else:
    all_track_details = []
    processed_track_batches = 0
    print(f"Starting track processing from batch 1 out of {total_track_batches}...")

# Fetch the batches that are not in the checkpoint yet.
for i, track_batch in enumerate(track_batches[processed_track_batches:], start=processed_track_batches + 1):
    print(f"Processing track batch {i}/{total_track_batches}...")
    response = sp.tracks(track_batch)
    all_track_details.extend(response["tracks"])
    # Periodic checkpoint so an interrupted run loses at most
    # TRACK_SAVE_EVERY - 1 batches of work.
    if i % TRACK_SAVE_EVERY == 0:
        with open(TRACK_PROGRESS_FILE, "wb") as f:
            pickle.dump(all_track_details, f)
        print("Saved track details progress.")

# Final checkpoint covering any batches fetched since the last periodic save.
with open(TRACK_PROGRESS_FILE, "wb") as f:
    pickle.dump(all_track_details, f)
print("Track processing completed.\n")

##############################################
# PART 2: Process Albums with Resume Support #
##############################################

# Tunables for the album download.
ALBUM_BATCH_SIZE = 20   # URIs per sp.albums() call
ALBUM_SAVE_EVERY = 5    # write a checkpoint after this many batches
ALBUM_PROGRESS_FILE = "album_details_progress.pkl"

# For album details, we only need unique album URIs. They are sorted so the
# order is the same on every run: iteration order of a set of strings can
# differ between interpreter sessions, which makes "skip the first K batches"
# unreliable after a kernel restart. Non-string entries (presumably missing
# values) are dropped so the sort cannot fail on mixed types.
unique_album_uris = sorted({uri for uri in album_uris if isinstance(uri, str)})
album_batches = list(batch(unique_album_uris, n=ALBUM_BATCH_SIZE))
total_album_batches = len(album_batches)

# Resume from an existing checkpoint if there is one.
if os.path.exists(ALBUM_PROGRESS_FILE):
    # NOTE(security): pickle.load can execute arbitrary code. Only resume from
    # a checkpoint file that this notebook wrote itself.
    with open(ALBUM_PROGRESS_FILE, "rb") as f:
        album_details_dict = pickle.load(f)
else:
    album_details_dict = {}

# Resume by URI rather than by batch count. Albums that came back as None are
# never stored, so the size of the dict does not say how many batches were
# completed; comparing URIs guarantees no album is skipped. URIs that were
# requested before but not stored are simply requested again.
pending_album_uris = [uri for uri in unique_album_uris if uri not in album_details_dict]
pending_album_batches = list(batch(pending_album_uris, n=ALBUM_BATCH_SIZE))

# Express the remaining work on the original 1..total scale so the progress
# messages keep counting up to total_album_batches.
processed_album_batches = total_album_batches - len(pending_album_batches)
if album_details_dict:
    print(f"Resuming album processing from batch {processed_album_batches + 1} out of {total_album_batches}...")
else:
    print(f"Starting album processing from batch 1 out of {total_album_batches}...")

# Fetch the albums that are not in the checkpoint yet.
for i, album_batch in enumerate(pending_album_batches, start=processed_album_batches + 1):
    print(f"Processing album batch {i}/{total_album_batches}...")
    response = sp.albums(album_batch)
    for album in response["albums"]:
        # Skip if album is None or its URI is missing
        if album is None or album.get("uri") is None:
            continue
        album_details_dict[album["uri"]] = album
    # Periodic checkpoint so an interrupted run loses at most
    # ALBUM_SAVE_EVERY - 1 batches of work.
    if i % ALBUM_SAVE_EVERY == 0:
        with open(ALBUM_PROGRESS_FILE, "wb") as f:
            pickle.dump(album_details_dict, f)
        print("Saved album details progress.")

# Final checkpoint covering any batches fetched since the last periodic save.
with open(ALBUM_PROGRESS_FILE, "wb") as f:
    pickle.dump(album_details_dict, f)
print("Album processing completed.\n")


In [None]:
###########################################
# PART 3: Combine Track and Album Metadata #
###########################################

# Build one flat record per usable track, joining in the album's name and
# release date from album_details_dict when that album was fetched.
track_metadata = []
for track in all_track_details:
    # A track is usable only if it exists and carries an album URI.
    has_album_uri = track is not None and track["album"].get("uri") is not None
    if not has_album_uri:
        continue

    album_uri = track["album"]["uri"]
    # Fall back to an empty dict so the album fields become None when the
    # album is not in album_details_dict.
    album_info = album_details_dict.get(album_uri, {})

    row = {
        "track_uri": track["uri"],
        "track_name": track["name"],
        "track_popularity": track["popularity"],
        "album_name": album_info.get("name"),
        "album_release_date": album_info.get("release_date"),
    }
    track_metadata.append(row)

# Write the combined records to CSV, without the index column.
metadata_df = pd.DataFrame(track_metadata)
metadata_df.to_csv("track_metadata.csv", index=False)

print("Metadata processing completed and saved to 'track_metadata.csv'.")

In [None]:
# Turn the per-track records (a list of dicts) into a DataFrame and preview
# the first five rows.
track_metadata_df = pd.DataFrame(data=track_metadata)
track_metadata_df.head(5)

In [None]:
# NOTE: Everything in this cell is commented out, so running it does nothing.
# The disabled draft below would (1) add a standardized "track_popularity_scaled"
# column and (2) add an "album_era" decade label derived from "album_release_date".
# from sklearn.preprocessing import StandardScaler
# import pandas as pd

# # Assume track_metadata_df already has "track_popularity" and "album_release_date"

# # 1. Standardize track popularity
# scaler = StandardScaler()
# track_metadata_df["track_popularity_scaled"] = scaler.fit_transform(track_metadata_df[["track_popularity"]])

# # 2. Define a function to convert release dates to eras (e.g., 70s, 80s, etc.)
# def release_date_to_era(release_date):
#     try:
#         # Convert release_date to datetime; this handles formats like "YYYY", "YYYY-MM-DD", etc.
#         year = pd.to_datetime(release_date, errors='coerce').year
#         if pd.isnull(year):
#             return None
#         if 1970 <= year < 1980:
#             return "70s"
#         elif 1980 <= year < 1990:
#             return "80s"
#         elif 1990 <= year < 2000:
#             return "90s"
#         elif 2000 <= year < 2010:
#             return "00s"
#         elif 2010 <= year < 2020:
#             return "10s"
#         else:
#             return "Other"
#     except Exception as e:
#         return None

# # Apply the function to create a new column for era
# track_metadata_df["album_era"] = track_metadata_df["album_release_date"].apply(release_date_to_era)
# track_metadata_df.head()