# Workshop #2 :
## Reccobeats API - Extraction

------------------------------------------------------------

https://reccobeats.com/docs/apis/extract-audio-features

Extract acoustic features for nominated songs using the ReccoBeats API.  

In [53]:
import os
import sys
import re
import subprocess
import requests
import pandas as pd
import logging
import json
from tqdm import tqdm
import yt_dlp
import time

sys.path.append(os.path.abspath('../'))
from src.params import Params
from src.client import DatabaseClient
from src.logging_config import setup_logging

In [54]:
# Initialise logging through the project helper imported from src.logging_config.
setup_logging()

# Load relevant songs from clean Grammys CSV


In [None]:
df = pd.read_csv("../data/intermediate/grammys.csv")

# Restrict the table to the two award categories of interest.
target_categories = ['Song Of The Year', 'Record Of The Year']
in_target_category = df['normalized_category'].isin(target_categories)
filtered_df = df.loc[in_target_category]

# Report how many rows survived the category filter.
total_songs = filtered_df.shape[0]
logging.info(f"Total relevant songs: {total_songs}")


2025-04-10 16:35:26,545 - INFO - root - Total relevant songs: 124


In [None]:
# Print the column names, dtypes and non-null counts of the filtered rows.
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124 entries, 0 to 4404
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   year                 124 non-null    int64 
 1   title                124 non-null    object
 2   nominee              124 non-null    object
 3   artist               124 non-null    object
 4   winner               124 non-null    bool  
 5   normalized_category  124 non-null    object
dtypes: bool(1), int64(1), object(4)
memory usage: 5.9+ KB


We drop duplicate songs, keeping the first record for each nominee.

In [None]:
# Keep one row per distinct "nominee" value; the first occurrence wins.
# NOTE(review): rows are matched on "nominee" alone, so rows that share a
# nominee value but differ in "artist" are collapsed into one — confirm intended.
filtered_df = filtered_df.drop_duplicates(subset="nominee", keep="first")

Define the directory where the downloaded audio files are stored for processing.

In [None]:
# Directory holding the downloaded MP3 files (relative path, resolved against
# the current working directory). Used as the default download target and as
# the location checked for already-downloaded files.
AUDIO_DIR = "../data/audio_files"

In [None]:
def safe_filename(title):
    """
    Turn an arbitrary string into one that is safe to use as a filename.

    Every character other than word characters (letters, digits, underscore),
    hyphens, parentheses and whitespace is dropped; each space character is
    then swapped for an underscore.

    Args:
        title (str): Text to convert (for example a song title).

    Returns:
        str: The sanitized text.
    """
    # Step 1: strip everything outside the allowed character set.
    kept = re.sub(r'[^\w\-_\(\)\s]', '', title)
    # Step 2: only literal spaces become underscores; other whitespace
    # (tabs, newlines) is allowed by the pattern and passes through unchanged.
    return "_".join(kept.split(" "))

 ## Function to download audio from YouTube as MP3

In [None]:
def download_audio(query, output_dir=AUDIO_DIR):
    """
    Search for `query` with yt-dlp and save the first result's audio as an MP3.

    The query is sent to yt-dlp with the `ytsearch1:` prefix, the best available
    audio stream is downloaded, and the `FFmpegExtractAudio` postprocessor
    converts it to MP3 (192 quality setting). The file is written to
    `output_dir` as `<safe_filename(query)>.mp3`.

    Args:
        query (str): Search term used to locate the audio (e.g. a song title).
        output_dir (str, optional): Directory where the MP3 is saved. It is
            created if it does not exist. Defaults to AUDIO_DIR.

    Returns:
        str | None: Path to the MP3 file if it exists after the download,
        otherwise None (download error, or the expected file was not produced).

    Notes:
        - No exception propagates to the caller: any error raised by yt-dlp is
          logged with `logging.error` and turned into a None return value.
        - The filename is derived from `query` via `safe_filename`, so callers
          can predict the output path before downloading.
    """
    os.makedirs(output_dir, exist_ok=True)
    safe_name = safe_filename(query)
    # yt-dlp substitutes %(ext)s itself; after the MP3 postprocessor runs the
    # final file is expected at "<safe_name>.mp3".
    output_path = os.path.join(output_dir, f"{safe_name}.%(ext)s")

    ydl_opts = {
        'format': 'bestaudio/best',
        'noplaylist': True,
        'quiet': True,
        'outtmpl': output_path,
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.download([f"ytsearch1:{query}"])
    except Exception as e:
        # Best-effort download: report through logging (as the other helpers
        # in this notebook do) and let the caller carry on with the next song.
        logging.error(f"Error while downloading {query} with yt_dlp: {e}")
        return None

    final_path = os.path.join(output_dir, f"{safe_name}.mp3")
    if os.path.exists(final_path):
        return final_path

    # yt-dlp finished without raising but the expected MP3 is missing.
    logging.warning(f"Download of {query} finished but {final_path} was not created")
    return None

## Function to trim audio to 30 seconds

In [None]:
def trim_audio(audio_path, output_dir="../data/audio_files/trimmed", duration=30):
    """
    Cut an audio file down to its first `duration` seconds using FFmpeg.

    FFmpeg is invoked through `subprocess.run` with `-acodec copy`, i.e. the
    audio stream is copied rather than re-encoded. The result is written to
    `output_dir` as `<original stem>_trimmed.mp3`, overwriting any existing
    file of that name (`-y`).

    Args:
        audio_path (str): Path to the input audio file.
        output_dir (str, optional): Directory for the trimmed file; created if
            missing. Defaults to "../data/audio_files/trimmed".
        duration (int | float, optional): Length of the clip in seconds.
            Defaults to 30.

    Returns:
        str | None: Path to the trimmed file if it exists and is non-empty,
        otherwise None.

    Notes:
        - No exception propagates to the caller: failures (FFmpeg missing,
          non-zero exit status, unreadable input) are logged with
          `logging.error` and reported as None.
        - FFmpeg's stderr is captured so the last line of its output can be
          included in the log message when it exits with an error.
    """
    os.makedirs(output_dir, exist_ok=True)
    stem = os.path.splitext(os.path.basename(audio_path))[0]
    trimmed_path = os.path.join(output_dir, f"{stem}_trimmed.mp3")

    command = [
        "ffmpeg", "-y", "-i", audio_path,
        "-t", str(duration), "-acodec", "copy", trimmed_path,
    ]

    try:
        subprocess.run(
            command,
            check=True,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        # The exception text only carries the exit status; the last stderr
        # line is where FFmpeg states the actual reason.
        stderr_text = (e.stderr or b"").decode("utf-8", errors="replace").strip()
        reason = stderr_text.splitlines()[-1] if stderr_text else "no stderr output"
        logging.error(f"Error while trimming {audio_path}: {e} ({reason})")
        return None
    except Exception as e:
        # E.g. the ffmpeg executable is not installed / not on PATH.
        logging.error(f"Error while trimming {audio_path}: {e}")
        return None

    if os.path.exists(trimmed_path) and os.path.getsize(trimmed_path) > 0:
        return trimmed_path

    logging.error(f"Error while trimming {audio_path}: output file is missing or empty")
    return None

## Function to use ReccoBeats API for audio feature analysis

In [None]:
def analyze_with_reccobeats(trimmed_path, timeout=60):
    """
    Upload an audio file to the ReccoBeats API and return its audio features.

    The file is sent as multipart/form-data under the `audioFile` field (with
    its basename and content type `audio/mpeg`) in a POST request to the
    ReccoBeats audio-features endpoint, asking for a JSON response.

    Args:
        trimmed_path (str): Path to the (trimmed) audio file to analyze.
        timeout (int | float | tuple, optional): Timeout in seconds passed to
            `requests.post`, so a stalled connection cannot block the whole
            batch indefinitely. Defaults to 60.

    Returns:
        tuple: `(features, error)` where exactly one element is None:
            - `(dict, None)` — parsed JSON body when the API answers 200.
            - `(None, str)` — "<status code> <reason>" for any other status,
              or the exception message if opening the file, sending the
              request or parsing the response fails.

    Notes:
        - No exception propagates to the caller and nothing is logged here;
          errors are reported only through the second tuple element.
    """
    try:
        with open(trimmed_path, 'rb') as file:
            files = {
                'audioFile': (os.path.basename(trimmed_path), file, 'audio/mpeg')
            }

            headers = {
                'Accept': 'application/json'
            }

            response = requests.post(
                "https://api.reccobeats.com/v1/analysis/audio-features",
                files=files,
                headers=headers,
                timeout=timeout,
            )

        if response.status_code == 200:
            return response.json(), None
        return None, f"{response.status_code} {response.reason}"

    except Exception as e:
        # Covers a missing file, network errors/timeouts and invalid JSON.
        return None, str(e)

## Processing

Sending the trimmed audio files to the ReccoBeats API for audio feature analysis.

In [63]:
results = []

nominees = filtered_df["nominee"]
for nominee in tqdm(nominees, total=len(filtered_df), desc="Analizyng with ReccoBeats"):
    # Reuse an MP3 from a previous run when present; otherwise download it.
    audio_path = os.path.join(AUDIO_DIR, safe_filename(nominee) + ".mp3")
    if not os.path.exists(audio_path):
        audio_path = download_audio(nominee)

    if not (audio_path and os.path.exists(audio_path)):
        features, error = None, "Invalid file or not found"
    else:
        trimmed = trim_audio(audio_path)
        if not trimmed:
            features, error = None, "Trimmed audio not found"
        else:
            features, error = analyze_with_reccobeats(trimmed)
            # The trimmed clip is only needed for the upload; remove it afterwards.
            os.remove(trimmed)

    # One record per nominee, whether the analysis succeeded or not.
    results.append(dict(nominee=nominee, features=features, error=error))
    # Pause between songs.
    time.sleep(3)


Analizyng with ReccoBeats: 100%|██████████| 92/92 [09:03<00:00,  5.91s/it]


# Save results

Processes and saves extracted features data in both JSON and CSV formats.

This cell handles the saving of results data in two formats:
1. JSON: A raw representation of the data structure.
2. CSV: A tabular format that expands nested 'features' dictionaries into separate columns.

The JSON file is saved for general data storage, while the CSV file ensures a structured
representation for analysis, especially if the 'features' column is present in the data.

Steps:
1. Saves the full 'results' data as JSON in the raw data directory.
2. Converts the data into a pandas DataFrame for further processing.
3. Filters rows with valid 'features' data and expands nested dictionaries.
4. Saves the expanded DataFrame as a CSV in the external data directory.

Args:
    json_path (str): File path to save the JSON output.
    csv_path (str): File path to save the CSV output.

Returns:
    None

Notes:
    - The CSV file includes expanded 'features' as separate columns for easier analysis.
    - Logging messages confirm the successful saving of both JSON and CSV files.


In [None]:
# Save the raw results (one record per nominee, including errors) as JSON.
json_path = "../data/raw/reccobeats_features.json"
# Create the target directory first so a fresh checkout without data/raw
# does not fail with FileNotFoundError.
os.makedirs(os.path.dirname(json_path), exist_ok=True)
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
logging.info(f"Results saved in JSON format: {json_path}")

# Save as CSV (only rows that actually have features)
df_results = pd.DataFrame(results)

# Expand the 'features' dictionaries into separate columns, one per key,
# and put the nominee name in front of them.
features_df = df_results.dropna(subset=["features"]).copy()
features_expanded = features_df["features"].apply(pd.Series)
features_combined = pd.concat([features_df[["nominee"]], features_expanded], axis=1)

csv_path = "../data/external/reccobeats_features.csv"
# Same guard as for the JSON file: make sure data/external exists.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
features_combined.to_csv(csv_path, index=False)
logging.info(f"Results saved in CSV format: {csv_path}")


2025-04-10 16:44:30,062 - INFO - root - Results saved in JSON format: ../data/raw/reccobeats_features.json
2025-04-10 16:44:30,095 - INFO - root - Results saved in CSV format: ../data/external/reccobeats_features.csv
