In [None]:
import pandas as pd
# Absolute, machine-specific location of the preliminary songs dataset (parquet).
tracks_parquet_path = r"C:\Users\tkeit\OneDrive\Documents\GitHub\bt4222grp9\preliminary datasets\all_songs_with_or_without_lyrics.parquet"

# Read the dataset and end the cell with the frame so the notebook renders it.
tracks_df = pd.read_parquet(tracks_parquet_path)
tracks_df

In [None]:
import os
import pandas as pd
import pickle
import time
import spotipy
import concurrent.futures
from dotenv import load_dotenv
from spotipy.oauth2 import SpotifyClientCredentials
from requests.exceptions import ReadTimeout

# Load Spotify API credentials from the environment (shell variables or a .env file).
# SECURITY: never hardcode the client id/secret in the notebook. Any secret that has
# ever been committed must be treated as leaked and rotated in the Spotify dashboard.
load_dotenv()
SPOTIPY_CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
SPOTIPY_CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")

# Fail early with a clear message instead of a confusing auth error on the first API call.
if not SPOTIPY_CLIENT_ID or not SPOTIPY_CLIENT_SECRET:
    raise RuntimeError(
        "Spotify credentials not found. Set SPOTIPY_CLIENT_ID and "
        "SPOTIPY_CLIENT_SECRET in the environment or in a .env file."
    )

# Authenticate with Spotify (client-credentials flow)
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
))

# Helper: Batch iterator
def batch(iterable, n=50):
    """Lazily split a sliceable sequence into consecutive chunks of at most ``n`` items.

    Chunks come out in order; the final chunk may be shorter than ``n``.
    The input must support ``len()`` and slicing (e.g. a list).
    """
    chunk_starts = range(0, len(iterable), n)
    yield from (iterable[start:start + n] for start in chunk_starts)

# Build the identifier lists that drive the Spotify lookups below.
# `tracks_df` must already be loaded by an earlier cell.
#tracks_df = pd.read_csv("your_tracks_file.csv")  # Update with actual file path
track_uris = tracks_df["track_uri"].tolist()

# sp.albums() expects Spotify album IDs/URIs/URLs, not album titles, so prefer a real
# `album_uri` column when the dataset has one. The fallback keeps the notebook running
# on datasets that only carry `album_name`, but warns loudly because title-based
# lookups are not expected to resolve.
if "album_uri" in tracks_df.columns:
    album_uris = tracks_df["album_uri"].tolist()
else:
    print(
        "Warning: no 'album_uri' column found; falling back to 'album_name'. "
        "Album titles are not valid Spotify album IDs, so album lookups are expected to fail."
    )
    album_uris = tracks_df["album_name"].tolist()

##############################################
# PART 1: Process Tracks with Resume Support #
##############################################

# Pickle files used to checkpoint the track-fetching step.
TRACK_PROGRESS_FILE = "track_details_progressAll.pkl"
PROCESSED_TRACKS_FILE = "processed_tracksALL.pkl"

# Default to an empty state; it is replaced below when a usable checkpoint exists.
all_track_details = []
processed_track_uris = set()

if not (os.path.exists(TRACK_PROGRESS_FILE) and os.path.getsize(TRACK_PROGRESS_FILE) > 0):
    print("No valid progress file found. Starting track processing from scratch...")
else:
    try:
        # NOTE: pickle can execute arbitrary code on load; only open checkpoints
        # written by this notebook.
        with open(TRACK_PROGRESS_FILE, "rb") as f:
            all_track_details = pickle.load(f)
        # Already-fetched tracks are recognised by the "id" of each saved track dict;
        # None placeholders and non-dict entries are ignored.
        processed_track_uris = {
            entry["id"]
            for entry in all_track_details
            if isinstance(entry, dict) and "id" in entry
        }
        print(f"Resuming track processing... {len(processed_track_uris)} tracks already processed.")
    except (EOFError, pickle.UnpicklingError):
        print("Error: Pickle file is empty or corrupted. Starting fresh...")
        all_track_details = []
        processed_track_uris = set()

# Function to fetch track details with error handling
def fetch_track_details(track_batch, max_retries=5, retry_delay=5):
    """Fetch full track objects for one batch of track IDs/URIs.

    Args:
        track_batch: list of track identifiers passed straight to ``sp.tracks``.
        max_retries: maximum number of attempts when the request times out.
        retry_delay: seconds to wait between timed-out attempts.

    Returns:
        The ``"tracks"`` list from the API response, or an empty list when the
        request fails or keeps timing out.
    """
    # A bounded loop replaces unbounded recursion: a persistent outage can no
    # longer retry forever or end in a RecursionError.
    for attempt in range(1, max_retries + 1):
        try:
            response = sp.tracks(track_batch)
            return response["tracks"]
        except ReadTimeout:
            if attempt == max_retries:
                break
            print(f"Timeout error, retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        except Exception as e:
            # Best effort: report and let the caller carry on with the next batch.
            print(f"Error fetching tracks: {e}")
            return []

    print(f"Error fetching tracks: giving up after {max_retries} timed-out attempts.")
    return []

# Fetch track details batch by batch, checkpointing progress to disk.
# Each batch is fetched with a direct call: every result is consumed before the next
# request (to extend the list and checkpoint), so submitting to a thread pool and
# immediately blocking on .result() would not add any concurrency.
# NOTE(review): on resume, processed_track_uris is rebuilt from each saved track's "id";
# the skip check below only matches if the track_uri column holds those same bare IDs --
# confirm against the dataset.
track_batches = list(batch(track_uris, 50))

for i, track_batch in enumerate(track_batches):
    if set(track_batch).issubset(processed_track_uris):
        continue  # Skip already processed tracks

    print(f"Processing batch {i+1}/{len(track_batches)}...")
    track_results = fetch_track_details(track_batch)

    # An empty result means the fetch failed; leave the batch unmarked so a later
    # run retries it.
    if track_results:
        all_track_details.extend(track_results)
        processed_track_uris.update(track_batch)

    # Save progress every 10 batches
    if (i + 1) % 10 == 0:
        with open(TRACK_PROGRESS_FILE, "wb") as f:
            pickle.dump(all_track_details, f)
        print("Track details progress saved.")

# Final save after completion
with open(TRACK_PROGRESS_FILE, "wb") as f:
    pickle.dump(all_track_details, f)

print("Track processing completed.")

##############################################
# PART 2: Process Albums with Resume Support #
##############################################

# Pickle files used to checkpoint the album-fetching step.
ALBUM_PROGRESS_FILE = "album_details_progressALL.pkl"
PROCESSED_ALBUMS_FILE = "processed_albumsALL.pkl"

# Load existing album progress if available. An empty or corrupted checkpoint is
# handled the same way as for tracks: report it and start from scratch.
album_details_dict = {}
processed_album_uris = set()

if os.path.exists(ALBUM_PROGRESS_FILE) and os.path.getsize(ALBUM_PROGRESS_FILE) > 0:
    try:
        # NOTE: pickle can execute arbitrary code on load; only open checkpoints
        # written by this notebook.
        with open(ALBUM_PROGRESS_FILE, "rb") as f:
            album_details_dict = pickle.load(f)
        processed_album_uris = set(album_details_dict.keys())
        print(f"Resuming album processing... {len(processed_album_uris)} albums already processed.")
    except (EOFError, pickle.UnpicklingError):
        print("Error: Album pickle file is empty or corrupted. Starting fresh...")
        album_details_dict = {}
        processed_album_uris = set()
else:
    print("Starting album processing from scratch...")

# Function to fetch album details with error handling
def fetch_album_details(album_batch, max_retries=5, retry_delay=5):
    """Fetch full album objects for one batch of album IDs/URIs.

    Args:
        album_batch: list of album identifiers passed straight to ``sp.albums``.
        max_retries: maximum number of attempts when the request times out.
        retry_delay: seconds to wait between timed-out attempts.

    Returns:
        Dict mapping each returned album's ``"uri"`` to its album object (empty
        entries in the response are skipped), or an empty dict when the request
        fails or keeps timing out.
    """
    # A bounded loop replaces unbounded recursion: a persistent outage can no
    # longer retry forever or end in a RecursionError.
    for attempt in range(1, max_retries + 1):
        try:
            response = sp.albums(album_batch)
            return {album["uri"]: album for album in response["albums"] if album}
        except ReadTimeout:
            if attempt == max_retries:
                break
            print(f"Timeout error, retrying in {retry_delay} seconds...")
            time.sleep(retry_delay)
        except Exception as e:
            # Best effort: report and let the caller carry on with the next batch.
            print(f"Error fetching albums: {e}")
            return {}

    print(f"Error fetching albums: giving up after {max_retries} timed-out attempts.")
    return {}

# Fetch album details batch by batch, checkpointing progress to disk.
# dict.fromkeys de-duplicates while keeping first-seen order, so batch composition is
# the same on every run (a plain set() would reshuffle string items between kernel
# restarts).
unique_album_uris = list(dict.fromkeys(album_uris))
album_batches = list(batch(unique_album_uris, 20))

# Each batch is fetched with a direct call: every result is consumed before the next
# request, so submitting to a thread pool and immediately blocking on .result() would
# not add any concurrency.
for i, album_batch in enumerate(album_batches):
    if set(album_batch).issubset(processed_album_uris):
        continue  # Skip already processed albums

    print(f"Processing album batch {i+1}/{len(album_batches)}...")
    album_results = fetch_album_details(album_batch)

    # An empty result means the fetch failed; leave the batch unmarked so a later
    # run retries it.
    if album_results:
        album_details_dict.update(album_results)
        processed_album_uris.update(album_batch)

    # Save progress every 5 batches
    if (i + 1) % 5 == 0:
        with open(ALBUM_PROGRESS_FILE, "wb") as f:
            pickle.dump(album_details_dict, f)
        print("Album details progress saved.")

# Final save after completion
with open(ALBUM_PROGRESS_FILE, "wb") as f:
    pickle.dump(album_details_dict, f)

print("Album processing completed.")


In [None]:
###########################################
# PART 3: Combine Track and Album Metadata #
###########################################

# Checkpoint files to combine. These must be the same files the track/album fetching
# step writes ("...progressAll.pkl" / "...progressALL.pkl"); pointing at differently
# named files makes a fresh Restart & Run All fail here or silently load stale data.
TRACK_PROGRESS_FILE = "track_details_progressAll.pkl"
ALBUM_PROGRESS_FILE = "album_details_progressALL.pkl"

# Load saved track data
# NOTE: pickle can execute arbitrary code on load; only open checkpoints written by
# this notebook.
if os.path.exists(TRACK_PROGRESS_FILE):
    with open(TRACK_PROGRESS_FILE, "rb") as f:
        all_track_details = pickle.load(f)
    # Remove any None or corrupted data
    all_track_details = [track for track in all_track_details if track and isinstance(track, dict)]
else:
    raise FileNotFoundError("Track data file not found. Ensure track processing has been completed.")

# Load saved album data
if os.path.exists(ALBUM_PROGRESS_FILE):
    with open(ALBUM_PROGRESS_FILE, "rb") as f:
        album_details_dict = pickle.load(f)
else:
    raise FileNotFoundError("Album data file not found. Ensure album processing has been completed.")

# Function to process and merge track & album metadata
def process_track_metadata(track, album_lookup=None):
    """Build one flat metadata record for a track by merging in its album details.

    Args:
        track: a track dict as returned by the Spotify API (needs an ``"album"``
            dict carrying a ``"uri"``).
        album_lookup: optional dict mapping album URI -> album dict. Defaults to
            the notebook-level ``album_details_dict``.

    Returns:
        A dict with track_uri, track_name, track_popularity, artist_name,
        album_name, album_release_date and release_date_precision, or ``None``
        when the track is unusable (not a dict, no album URI) or processing fails.
        Album fields are ``None`` when the album is not in the lookup.
    """
    if album_lookup is None:
        album_lookup = album_details_dict

    try:
        # Guard the shapes explicitly so malformed entries are skipped quietly
        # instead of raising inside the error handler below.
        if not isinstance(track, dict) or not track:
            return None
        album = track.get("album")
        if not isinstance(album, dict) or album.get("uri") is None:
            return None  # Skip if no album data

        album_info = album_lookup.get(album["uri"]) or {}
        artists = track.get("artists")

        return {
            # The column is named track_uri but carries the track's "id" field.
            "track_uri": track.get("id"),
            "track_name": track.get("name"),
            "track_popularity": track.get("popularity"),
            # Only the first listed artist is kept.
            "artist_name": artists[0]["name"] if artists else None,
            "album_name": album_info.get("name", None),
            "album_release_date": album_info.get("release_date", None),
            "release_date_precision": album_info.get("release_date_precision", None),
        }

    except Exception as e:
        track_id = track.get("id", "Unknown") if isinstance(track, dict) else "Unknown"
        print(f"Error processing track {track_id}: {e}")
        return None  # Avoid crashes on errors

# Map every fetched track to its merged metadata record on a 10-worker thread pool.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(process_track_metadata, all_track_details))

# Tracks that could not be processed come back as None; keep only real records.
track_metadata = list(filter(lambda record: record is not None, results))

# One row per record.
metadata_df = pd.DataFrame(track_metadata)

# Persist the combined metadata as CSV (without the index column).
metadata_df.to_csv("track_metadata2.csv", index=False)

print(f"Metadata processing completed and saved to 'track_metadata2.csv'. Total records: {len(metadata_df)}")


In [None]:
import os
import pandas as pd
import pickle
import concurrent.futures
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from dotenv import load_dotenv
from tqdm import tqdm  # For progress tracking

# ✅ Load Spotify API credentials from the environment (shell variables or a .env file).
# SECURITY: never hardcode the client id/secret in the notebook. Any secret that has
# ever been committed must be treated as leaked and rotated in the Spotify dashboard.
load_dotenv()
SPOTIPY_CLIENT_ID = os.getenv("SPOTIPY_CLIENT_ID")
SPOTIPY_CLIENT_SECRET = os.getenv("SPOTIPY_CLIENT_SECRET")

# Fail early with a clear message instead of a confusing auth error on the first API call.
if not SPOTIPY_CLIENT_ID or not SPOTIPY_CLIENT_SECRET:
    raise RuntimeError(
        "Spotify credentials not found. Set SPOTIPY_CLIENT_ID and "
        "SPOTIPY_CLIENT_SECRET in the environment or in a .env file."
    )

# ✅ Authenticate with Spotify API (client-credentials flow)
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    client_id=SPOTIPY_CLIENT_ID,
    client_secret=SPOTIPY_CLIENT_SECRET,
))

# ✅ Read the existing songs dataset; the columns track_uri, album_name and
# artist_name are used below.
df = pd.read_parquet(r"C:\Users\tkeit\OneDrive\Documents\GitHub\bt4222grp9\preliminary datasets\all_songs_with_or_without_lyrics.parquet")

# ✅ Distinct, non-null lookup keys for each of the three Spotify queries
track_uris, album_names, artist_names = (
    df[column].dropna().unique().tolist()
    for column in ("track_uri", "album_name", "artist_name")
)

# ✅ Pickle files that cache progress for each lookup
TRACK_PROGRESS_FILE, ALBUM_PROGRESS_FILE, ARTIST_PROGRESS_FILE = (
    "track_popularity_progress.pkl",
    "album_release_progress.pkl",
    "artist_genre_progress.pkl",
)

# ✅ Load previous progress to prevent duplicate API calls
def load_progress(file_path):
    """Load a previously pickled progress object, or return an empty dict.

    Args:
        file_path: path of the pickle checkpoint to read.

    Returns:
        The unpickled object, or ``{}`` when the file is missing, empty, or
        cannot be read as a pickle (e.g. truncated by an interrupted save).

    Note:
        ``pickle.load`` can execute arbitrary code; only load checkpoints that
        this notebook wrote itself.
    """
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        return {}
    try:
        with open(file_path, "rb") as f:
            return pickle.load(f)
    except (EOFError, pickle.UnpicklingError) as e:
        # A damaged checkpoint should cost a re-fetch, not crash the notebook.
        print(f"Warning: could not read progress file '{file_path}' ({e}). Starting fresh...")
        return {}

# ✅ Load previously processed data
track_popularity_dict = load_progress(TRACK_PROGRESS_FILE)
album_release_dict = load_progress(ALBUM_PROGRESS_FILE)
artist_genre_dict = load_progress(ARTIST_PROGRESS_FILE)

# ✅ Fetch track popularity
def fetch_track_popularity(track_batch):
    """Look up popularity scores for one batch of tracks via ``sp.tracks``.

    Returns a dict mapping each returned track's ``"id"`` to its ``"popularity"``;
    empty entries in the response are skipped. Any failure is printed and yields
    an empty dict.
    """
    popularity_by_id = {}
    try:
        for item in sp.tracks(track_batch)["tracks"]:
            if item:
                popularity_by_id[item["id"]] = item["popularity"]
    except Exception as e:
        print(f"Error fetching track popularity: {e}")
        return {}
    return popularity_by_id

# ✅ Fetch album release date
def fetch_album_release(album_batch, client=None):
    """Look up release dates for album names via Spotify search.

    One search request is issued per album name. Interpolating the whole batch
    into a single query would send the list's repr to Spotify and resolve at
    most one album per batch.

    Args:
        album_batch: a list of album names, or a single album name string.
        client: optional Spotify client exposing ``search``; defaults to the
            notebook-level ``sp``.

    Returns:
        Dict mapping each *queried* album name to the release date of the top
        search hit. Keys are the input names so the result can be mapped back
        onto the dataset's album_name column. Names with no hit, or whose
        request fails, are omitted.
    """
    if client is None:
        client = sp

    names = [album_batch] if isinstance(album_batch, str) else list(album_batch)
    release_by_album = {}
    for name in names:
        try:
            response = client.search(q=f"album:{name}", type="album", limit=1)
            items = response["albums"]["items"]
            if items:
                release_by_album[name] = items[0]["release_date"]
        except Exception as e:
            # One failing lookup should not discard the rest of the batch.
            print(f"Error fetching album release dates: {e}")
    return release_by_album

# ✅ Fetch artist genres
def fetch_artist_genre(artist_batch, client=None):
    """Look up genres for artist names via Spotify search.

    One search request is issued per artist name. Interpolating the whole batch
    into a single query would send the list's repr to Spotify and resolve at
    most one artist per batch.

    Args:
        artist_batch: a list of artist names, or a single artist name string.
        client: optional Spotify client exposing ``search``; defaults to the
            notebook-level ``sp``.

    Returns:
        Dict mapping each *queried* artist name to a comma-separated string of
        the top search hit's genres. Keys are the input names so the result can
        be mapped back onto the dataset's artist_name column. Names with no hit,
        or whose request fails, are omitted.
    """
    if client is None:
        client = sp

    names = [artist_batch] if isinstance(artist_batch, str) else list(artist_batch)
    genres_by_artist = {}
    for name in names:
        try:
            response = client.search(q=f"artist:{name}", type="artist", limit=1)
            items = response["artists"]["items"]
            if items:
                genres_by_artist[name] = ", ".join(items[0]["genres"])  # Convert list to string
        except Exception as e:
            # One failing lookup should not discard the rest of the batch.
            print(f"Error fetching artist genres: {e}")
    return genres_by_artist

def _fetch_in_batches(items, batch_size, cache, fetch_fn, progress_file, desc, save_every=10):
    """Fill ``cache`` by calling ``fetch_fn`` on fixed-size chunks of ``items``.

    Args:
        items: list of lookup keys.
        batch_size: number of keys per chunk.
        cache: dict updated in place with each chunk's results.
        fetch_fn: callable taking one chunk and returning a dict of results.
        progress_file: pickle path the cache is checkpointed to.
        desc: label shown on the tqdm progress bar.
        save_every: checkpoint after this many fetched chunks.

    Returns:
        The list of chunks that was iterated.
    """
    def _save():
        with open(progress_file, "wb") as f:
            pickle.dump(cache, f)

    chunks = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
    fetched_since_save = 0
    for chunk in tqdm(chunks, desc=desc):
        if set(chunk).issubset(cache.keys()):
            continue  # Skip chunks whose keys are all cached already
        cache.update(fetch_fn(chunk))

        # Checkpoint by chunk count. A test such as len(cache) % 500 == 0 only fires
        # when the cache size happens to land exactly on a multiple, i.e. almost never.
        fetched_since_save += 1
        if fetched_since_save >= save_every:
            _save()
            fetched_since_save = 0

    # Always persist the final state so a later run can resume from it.
    _save()
    return chunks


# ✅ Process track popularity in batches
track_batches = _fetch_in_batches(
    track_uris, 50, track_popularity_dict, fetch_track_popularity,
    TRACK_PROGRESS_FILE, "Fetching Track Popularity",
)

# ✅ Process album release dates in batches
album_batches = _fetch_in_batches(
    album_names, 20, album_release_dict, fetch_album_release,
    ALBUM_PROGRESS_FILE, "Fetching Album Release Dates",
)

# ✅ Process artist genres in batches
artist_batches = _fetch_in_batches(
    artist_names, 10, artist_genre_dict, fetch_artist_genre,
    ARTIST_PROGRESS_FILE, "Fetching Artist Genres",
)

# ✅ Attach each fetched lookup to the dataset: (new column, key column, lookup dict).
# Keys missing from a lookup become NaN in the new column.
for new_column, key_column, lookup in (
    ("track_popularity", "track_uri", track_popularity_dict),
    ("album_release_date", "album_name", album_release_dict),
    ("artist_genre", "artist_name", artist_genre_dict),
):
    df[new_column] = df[key_column].map(lookup)

# ✅ Write the enriched dataset to parquet (index dropped)
df.to_parquet("your_dataset_with_popularity_and_genre.parquet", index=False)
print("✅ Updated dataset saved as 'your_dataset_with_popularity_and_genre.parquet'")


In [None]:
# Sanity check: reload the CSV written by the metadata-merge step.
df_new = pd.read_csv("track_metadata2.csv")

# Show the first rows of the reloaded file.
preview = df_new.head()
print(preview)

# Report how many records the file holds.
record_count = len(df_new)
print(f"Total records in new file: {record_count}")


In [None]:
# Stop early when the merge step produced no records at all.
if not track_metadata:
    raise ValueError("Error: track_metadata list is empty. Check the track processing step.")

# Build the working DataFrame: one row per metadata record.
track_metadata_df = pd.DataFrame(track_metadata)

# Stop as well if the conversion somehow left an empty frame.
if track_metadata_df.empty:
    raise ValueError("Error: track_metadata_df is empty after conversion. Verify data extraction.")

# Confirmation message followed by a preview of the first rows.
print("Successfully converted track metadata to DataFrame:", track_metadata_df.head(), sep="\n")

# Optional: write the frame to CSV for manual verification.
verification_csv = "track_metadata_df.csv"
track_metadata_df.to_csv(verification_csv, index=False)
print("Track metadata DataFrame saved as 'track_metadata_df.csv'.")


In [None]:
from sklearn.preprocessing import StandardScaler

######################################
# PART 1: Standardizing Track Popularity #
######################################

# Scaling needs the raw popularity column; bail out if it is absent.
if "track_popularity" not in track_metadata_df.columns:
    raise KeyError("Error: 'track_popularity' column missing from DataFrame.")

# Fit a StandardScaler on the popularity column (passed as a one-column frame) and
# store the transformed values alongside the original.
scaler = StandardScaler()
popularity_values = track_metadata_df[["track_popularity"]]
track_metadata_df["track_popularity_scaled"] = scaler.fit_transform(popularity_values)

######################################
# PART 2: Categorizing Release Date into Eras #
######################################

def release_date_to_era(release_date):
    """
    Map a release date onto a named music era.

    Arguments:
    - release_date (str): Release date as provided by the Spotify API,
      in YYYY-MM-DD, YYYY-MM, or YYYY form.

    Returns:
    - str: The era label for the date's year, or None when the date is
      missing, unparseable, or any error occurs while handling it.
    """
    # Labels for the decades 1950-2029, keyed by the first year of the decade.
    decade_labels = {
        1950: "1950s Rock & Roll",
        1960: "1960s Classic Rock & Motown",
        1970: "1970s Disco & Hard Rock",
        1980: "1980s Pop & New Wave",
        1990: "1990s Alternative & Hip-Hop Boom",
        2000: "2000s Digital Pop & Emo",
        2010: "2010s Streaming & Trap",
        2020: "2020s Present Era",
    }

    try:
        # Unparseable input is coerced to NaT, whose year is null.
        year = pd.to_datetime(release_date, errors="coerce").year
        if pd.isnull(year):
            return None

        # Years outside the labelled decades fall into the two open-ended buckets.
        if year < 1950:
            return "Pre-1950s"
        if year >= 2030:
            return "Future Music"

        return decade_labels[year - year % 10]

    except Exception:
        return None  # Any unexpected failure is treated as "no era"

# The era column is derived from album_release_date, so that column must exist.
if "album_release_date" not in track_metadata_df.columns:
    raise KeyError("Error: 'album_release_date' column missing from DataFrame.")

# Label every row with the era of its album release date.
release_dates = track_metadata_df["album_release_date"]
track_metadata_df["album_era"] = release_dates.apply(release_date_to_era)

# Preview the date -> era mapping for the first rows (optional check).
era_preview = track_metadata_df[["album_release_date", "album_era"]].head()
print(era_preview)

# Write the era-annotated metadata to its own CSV for later analysis.
track_metadata_df.to_csv("track_metadata_with_eras.csv", index=False)
print("Updated track metadata saved to 'track_metadata_with_eras.csv'.")


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The plot needs the raw popularity column; bail out if it is absent.
if "track_popularity" not in track_metadata_df.columns:
    raise KeyError("Error: 'track_popularity' column missing from dataset.")

# New 14x7 figure for the combined plot.
plt.figure(figsize=(14, 7))

# Both layers share the same axes mapping: era on x, popularity on y.
era_axes = dict(x="album_era", y="track_popularity", data=track_metadata_df)

# Layer 1: violins with quartile lines, showing the distribution per era.
sns.violinplot(inner="quartile", palette="coolwarm", **era_axes)

# Layer 2: narrow, unfilled, cap-less boxplots drawn over the violins.
sns.boxplot(width=0.3, boxprops={'facecolor':'None'}, showcaps=False, **era_axes)

# Axis labels, title and a light horizontal grid.
plt.xticks(rotation=45, ha="right")
plt.xlabel("Music Era", fontsize=12)
plt.ylabel("Popularity Score", fontsize=12)
plt.title("🔥 Popularity Distribution Across Music Eras", fontsize=14)
plt.grid(axis="y", linestyle="--", alpha=0.5)

# Render the figure.
plt.show()


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Ensure 'track_popularity' column exists
if "track_popularity" not in track_metadata_df.columns:
    raise KeyError("Error: 'track_popularity' column missing from dataset.")

# Sort the DataFrame by era for better visualization
track_metadata_df = track_metadata_df.sort_values("album_era")

# Set plot style
sns.set(style="whitegrid")

# Overlaid, filled density curves of popularity, one per era
plt.figure(figsize=(12, 8))
ax = sns.kdeplot(
    data=track_metadata_df,
    x="track_popularity",
    hue="album_era",
    fill=True,
    palette="coolwarm",
    alpha=0.7,
    linewidth=1.5
)

# Formatting
plt.xlabel("Popularity Score", fontsize=12)
plt.ylabel("Density", fontsize=12)
plt.title("Popularity Evolution Over Time - Ridgeline Density Plot", fontsize=14)
# The hue legend seaborn draws for kdeplot has no labelled artists, so calling
# plt.legend(...) here would replace it with an empty legend. sns.move_legend
# repositions and retitles the existing legend instead. The guard covers the case
# where no legend was drawn at all.
if ax.get_legend() is not None:
    sns.move_legend(ax, "upper left", bbox_to_anchor=(1.05, 1), title="Music Era")
plt.grid(axis="x", linestyle="--", alpha=0.5)

# Show plot
plt.show()


In [None]:
# Temporal analysis
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

# When the frame carries no retrieval date, stamp every row with today's date.
if "retrieval_date" not in track_metadata_df.columns:
    track_metadata_df["retrieval_date"] = datetime.datetime.today().strftime('%Y-%m-%d')

# Parse retrieval_date into real datetimes so the year can be extracted below.
track_metadata_df["retrieval_date"] = pd.to_datetime(track_metadata_df["retrieval_date"])

# Every column used by the aggregation and the plot must be present; the first
# missing one is reported.
required_columns = ["album_era", "track_popularity", "retrieval_date"]
missing_columns = [name for name in required_columns if name not in track_metadata_df.columns]
if missing_columns:
    raise KeyError(f"Error: '{missing_columns[0]}' column missing from dataset.")

# Mean popularity per (retrieval year, era); the year column is renamed for plotting.
retrieval_year = track_metadata_df["retrieval_date"].dt.year
mean_popularity = track_metadata_df.groupby([retrieval_year, "album_era"])["track_popularity"].mean()
popularity_over_time = mean_popularity.reset_index().rename(columns={"retrieval_date": "year"})

# New 12x6 figure.
plt.figure(figsize=(12, 6))

# One line per era, with a marker at each year.
sns.lineplot(
    x="year",
    y="track_popularity",
    hue="album_era",
    data=popularity_over_time,
    marker="o",
    palette="coolwarm",
)

# Axis labels, title, legend outside the plot area, light horizontal grid.
plt.xlabel("Year", fontsize=12)
plt.ylabel("Average Popularity Score", fontsize=12)
plt.title("Evolution of Popularity Over Time by Song Era", fontsize=14)
plt.xticks(rotation=45)
plt.legend(title="Original Song Era", bbox_to_anchor=(1.05, 1), loc="upper left")
plt.grid(axis="y", linestyle="--", alpha=0.5)

# Render the figure.
plt.show()
