Extract Preview URL

In [20]:
import subprocess
import json
import time
import pandas as pd
import math
import os
import re
import glob

def get_spotify_previews_batch(df_batch):
    """Run the Node.js preview script on one batch of tracks.

    The batch is serialised as a JSON array of records and piped to
    ``node spotify_preview.js`` on stdin. Whatever the script prints on
    stdout is parsed as JSON and returned.

    Args:
        df_batch: pandas DataFrame holding the rows of this batch.

    Returns:
        The decoded JSON document printed by the script (presumably a list
        with one entry per track; verify against ``spotify_preview.js``).

    Raises:
        ValueError: If the script's stdout is not valid JSON.
    """
    data_json = df_batch.to_json(orient="records")
    result = subprocess.run(
        ["node", "spotify_preview.js"],
        input=data_json,
        stdout=subprocess.PIPE,
        # Keep stderr out of stdout: warnings or stack traces mixed into the
        # JSON payload would make an otherwise valid response unparseable.
        stderr=subprocess.PIPE,
        text=True,
        encoding="utf-8",
    )

    # Print all output so any warnings or messages are still visible.
    print(result.stdout)
    if result.stderr:
        print(result.stderr)

    try:
        return json.loads(result.stdout)
    except json.JSONDecodeError as err:
        raise ValueError(f"Invalid JSON output: {result.stdout}") from err

def get_last_processed_batch(directory):
    """Find the last batch number from saved CSV files in the specified directory.

    Only files named exactly ``batch_<number>.csv`` are counted. If the
    directory does not exist it is created.

    Args:
        directory: Path of the folder that holds the batch CSV files.

    Returns:
        The highest batch number found, or 0 when the directory was just
        created or contains no batch files.
    """
    if not os.path.exists(directory):
        os.makedirs(directory, exist_ok=True)
        return 0  # No batches processed yet

    batch_numbers = []
    for filename in os.listdir(directory):
        # fullmatch so that look-alike names such as "batch_old_batch_9.csv"
        # are not mistaken for real batch files.
        match = re.fullmatch(r"batch_(\d+)\.csv", filename)
        if match:
            batch_numbers.append(int(match.group(1)))
    return max(batch_numbers, default=0)  # Return the last processed batch number

def get_new_merged_filepath(directory, base_name='merged_file.csv'):
    """
    Return a file path for the merged file that does not exist yet.

    If ``directory/base_name`` is free it is returned unchanged. Otherwise a
    counter is inserted before the extension -- ``merged_file(1).csv``,
    ``merged_file(2).csv``, ... -- and the first unused path is returned.

    Args:
        directory: Folder in which the merged file will be written.
        base_name: Preferred file name; numbered variants are derived from it.

    Returns:
        A path inside ``directory`` that did not exist at the time of the call.
    """
    merged_filepath = os.path.join(directory, base_name)
    if not os.path.exists(merged_filepath):
        return merged_filepath

    # Derive numbered names from base_name so a custom name is honoured
    # instead of silently falling back to "merged_file(n).csv".
    stem, ext = os.path.splitext(base_name)
    i = 1
    while True:
        new_filepath = os.path.join(directory, f"{stem}({i}){ext}")
        if not os.path.exists(new_filepath):
            return new_filepath
        i += 1

def merge_preview_batches(directory):
    """
    Merge all CSV files in ``directory`` into one de-duplicated preview table.

    Every ``*.csv`` file in the directory is read (this includes previously
    merged files, so earlier results are carried forward). Rows are
    de-duplicated by 'track_uri', preferring a row that has a 'previewUrl'.
    The result is written to a new merged file (a numbered name is used if a
    merged file already exists) and the individual ``batch_*.csv`` files are
    then deleted.

    Args:
        directory: Folder holding the preview batch CSVs.

    Returns:
        The merged DataFrame, or an empty DataFrame when the directory holds
        no CSV files or none of them contains any data (in which case nothing
        is written or deleted).
    """
    csv_files = glob.glob(os.path.join(directory, '*.csv'))
    if not csv_files:
        print(f"No CSV files found in {directory}.")
        return pd.DataFrame()

    dfs = []
    for file in csv_files:
        try:
            dfs.append(pd.read_csv(file))
        except pd.errors.EmptyDataError:
            # pandas raises this for a file with no columns (e.g. a batch
            # that produced no rows); one such file must not abort the merge.
            print(f"Skipping empty CSV file {file}.")
    if not dfs:
        print(f"No readable CSV data found in {directory}.")
        return pd.DataFrame()

    previews = pd.concat(dfs, ignore_index=True)
    # Sort so that rows with non-NaN previewUrl come first (NaN sorts last),
    # then drop duplicates by 'track_uri' keeping the first occurrence.
    previews = previews.sort_values(
        by=['previewUrl'], ascending=False, na_position='last'
    ).drop_duplicates(subset=['track_uri'], keep='first')

    merged_filepath = get_new_merged_filepath(directory)
    previews.to_csv(merged_filepath, index=False)
    print(f"Merged file saved to {merged_filepath}")

    # Delete individual batch CSV files (but not any previously merged files)
    batch_files = glob.glob(os.path.join(directory, 'batch_*.csv'))
    for file in batch_files:
        try:
            os.remove(file)
            print(f"Deleted {file}")
        except OSError as e:
            # Best effort: a file that cannot be removed is reported, not fatal.
            print(f"Could not delete {file}: {e}")
    return previews

def get_spotify_previews_in_batches(dataframe, batch_size=50, delay=2, output_dir=""):
    """Fetch preview URLs batch by batch and save each batch as a CSV file.

    Processing resumes after the highest ``batch_<n>.csv`` already present in
    ``output_dir``: the first ``n * batch_size`` rows of ``dataframe`` are
    skipped. Each remaining batch is sent to ``get_spotify_previews_batch``
    and its results are written to ``batch_<index>.csv`` with the columns
    'track_name', 'track_uri' and 'previewUrl'.

    Args:
        dataframe: Tracks to process, one row per track.
        batch_size: Number of rows sent to the preview script per batch.
        delay: Seconds to sleep between consecutive batches.
        output_dir: Folder for the batch CSV files; the current working
            directory is used when empty.

    Returns:
        None. Results are only written to disk.
    """
    # Use current working directory if output_dir is empty
    if not output_dir:
        output_dir = os.getcwd()

    last_batch = get_last_processed_batch(output_dir)
    print(f"Resuming from batch {last_batch + 1} in directory: {output_dir}...")

    # Skip already processed batches based on batch count
    dataframe = dataframe.iloc[last_batch * batch_size:]
    num_songs = len(dataframe)
    total_batches = math.ceil(num_songs / batch_size)

    # Fixed column set so that a batch with no rows still gets a CSV header
    # and stays readable when the batches are merged later.
    result_columns = ["track_name", "track_uri", "previewUrl"]

    for i in range(0, num_songs, batch_size):
        batch_df = dataframe.iloc[i:i+batch_size]
        batch_index = last_batch + (i // batch_size) + 1
        print(f"Processing batch {batch_index} of {total_batches + last_batch}...")

        batch_results = get_spotify_previews_batch(batch_df)
        batch_data = []

        for entry in batch_results:
            if entry is None:
                continue
            preview_url = None
            # Only trust previewUrls when the entry reports no error.
            if not entry.get("error"):
                preview_urls = entry.get("previewUrls", [])
                preview_url = preview_urls[0] if preview_urls else None
            batch_data.append({
                "track_name": entry.get("track_name"),
                "track_uri": entry.get("track_uri"),
                "previewUrl": preview_url,
            })

        batch_df_results = pd.DataFrame(batch_data, columns=result_columns)
        csv_filename = f"batch_{batch_index}.csv"
        batch_filepath = os.path.join(output_dir, csv_filename)
        batch_df_results.to_csv(batch_filepath, index=False)
        print(f"Batch saved to {batch_filepath}")

        # No need to wait after the final batch.
        if i + batch_size < num_songs:
            time.sleep(delay)

if __name__ == "__main__":
    # Script flow: merge earlier results, work out which tracks still lack a
    # preview URL, fetch those in batches, then merge again.

    # Default folder containing the preview batch CSV files.
    default_batches_dir = "preview_batches_Xav"

    # Ask user for the directory where batch CSVs are saved.
    output_dir = input(
        f"Enter the directory where batch CSVs are saved (default: {default_batches_dir}): "
    ).strip()
    if not output_dir:
        output_dir = default_batches_dir

    # Batches must be written to and merged from the same folder; otherwise
    # freshly fetched batches are never merged and get fetched again next run.
    preview_batches_dir = output_dir

    # Merge any existing preview batch CSV files and clean up batch files.
    merged_previews = merge_preview_batches(preview_batches_dir)

    # Load the full tracks dataset (adjust path if needed)
    df = pd.read_parquet('../../parquet datasets/tracks_XavierHua.parquet')
    df = df[['track_name', 'track_uri']]

    # If there is a merged file, skip tracks that already have a valid previewUrl.
    # Tracks recorded without a previewUrl are deliberately retried.
    if not merged_previews.empty:
        processed_track_uris = merged_previews[~merged_previews['previewUrl'].isna()]['track_uri'].unique()
        df_to_process = df[~df['track_uri'].isin(processed_track_uris)]
        print(f"Found {len(processed_track_uris)} processed tracks. {len(df_to_process)} tracks remain to be processed.")
    else:
        df_to_process = df
        print("No previously processed tracks found, processing all tracks.")

    # Process the remaining tracks in batches (by default 50 per batch)
    get_spotify_previews_in_batches(df_to_process, output_dir=output_dir)
    merge_preview_batches(preview_batches_dir)


Merged file saved to preview_batches_Xav/merged_file(2).csv
Deleted preview_batches_Xav/batch_1.csv
Deleted preview_batches_Xav/batch_2.csv
Deleted preview_batches_Xav/batch_3.csv
Deleted preview_batches_Xav/batch_4.csv
Deleted preview_batches_Xav/batch_5.csv
Found 48550 processed tracks. 1897 tracks remain to be processed.
Resuming from batch 1 in directory: /Users/xavierhua/Documents/GitHub/bt4222grp9/phase2_feature_engineering/spotify_previewurl_extraction/preview_batches_Xav...
Processing batch 1 of 38...
[{"track_name":"What I Might Do - Kilter Remix","track_uri":"spotify:track:6codEgrEs41B6j01nkSIqe","previewUrls":[],"error":"No preview URLs found"},{"track_name":"Post Malates","track_uri":"spotify:track:7wLB0OVFKakmbkUFmh9C58","previewUrls":[],"error":"No preview URLs found"},{"track_name":"Lovin U (feat. Xiomara)","track_uri":"spotify:track:2YCkPwsWHsTPSTgEwqhxQl","previewUrls":[],"error":"No preview URLs found"},{"track_name":"Somewhere In My Car","track_uri":"spotify:track:3M