Install dependencies

In [None]:
!pip install opensmile -q
!pip install boxsdk -q
!pip install dotenv -q
!pip install numpy -q
!pip install torch -q

Box Authorization

In [None]:
from dotenv import load_dotenv
from boxsdk import OAuth2, Client
import os

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Every credential is read from the environment; a variable that is not set
# is passed through as None.
auth = OAuth2(
    client_id=os.environ.get('BOX_CLIENT_ID'),
    client_secret=os.environ.get('BOX_CLIENT_SECRET'),
    access_token=os.environ.get('BOX_ACCESS_TOKEN'),
)

# Shared Box client used by the cells that follow.
client = Client(auth)

Stream data

In [None]:
from pathlib import Path
from io import BytesIO
import soundfile as sf

def stream_data(filename: str, client: Client, folder_id: int):
    """
    Find a .wav file in Box by exact name and decode it in memory.

    filename: name of the .wav file (including extension) to look for under folder_id
    client: authenticated boxsdk.Client
    folder_id: Box folder ID that scopes the search

    The Box Search API is queried with folder_id as the ancestor folder. The first
    search hit that is a file whose name equals filename exactly is downloaded and
    decoded with soundfile.

    Returns: (waveform, sampling_rate) as produced by soundfile.read
    Raises FileNotFoundError if no search hit matches.

    """
    search_hits = client.search().query(
        query=filename,
        file_extensions=['wav'],
        ancestor_folder_ids=[str(folder_id)],
        limit=100,
    )

    def _is_exact_file(hit) -> bool:
        # Search results may include non-file items and partial name matches.
        return getattr(hit, 'type', None) == 'file' and getattr(hit, 'name', None) == filename

    # Lazily consume the results and stop at the first exact match.
    match = next((hit for hit in search_hits if _is_exact_file(hit)), None)
    if not match:
        raise FileNotFoundError(f'File not found: {filename}')

    # The whole file is held in memory; for large files prefer download_to with a stream to disk.
    raw_audio = BytesIO(client.file(match.id).content())

    samples, rate = sf.read(raw_audio)
    return samples, rate

openSMILE Feature Extractor

In [None]:
import opensmile

# Module-level extractor shared by extract_features.
# Feature set in use: eGeMAPSv02 (alternatives noted by the author: ComParE_2016, emobase).
smile = opensmile.Smile(
    feature_level=opensmile.FeatureLevel.Functionals,
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
)

def extract_features(waveform, sampling_rate):
    """
    Run the module-level openSMILE extractor (`smile`) on an in-memory signal.

    waveform: audio samples decoded from a .wav file
    sampling_rate: sampling rate of that audio

    Returns whatever smile.process_signal produces for the signal.

    """
    extracted = smile.process_signal(waveform, sampling_rate)
    return extracted

In [None]:
Extract and Save Embeddings

In [None]:
import os
import numpy as np
from pathlib import Path
from typing import Union
import csv

def save_embeddings(out_path: Path, features: np.ndarray) -> None:
    """
    Write a feature matrix to a .npy file.

    out_path: path of the .npy file to write
    features: 1-D array (single row) or 2-D array (one or more rows)

    The file always stores a 2-D array (rows x features): a 1-D input is promoted
    to a single row before saving. An existing file at out_path is overwritten;
    nothing is appended. numpy adds a '.npy' suffix if out_path lacks one.

    """
    # Promote a single feature vector to shape (1, n) so the stored array is always 2-D.
    matrix = np.atleast_2d(features)
    np.save(out_path, matrix)
    
def extract_and_save_embeddings(csv_path: Path, out_path: Path, client: Client, folder_id: int) -> None:
    """
    Extract features for every audio clip listed in a CSV and write them to one .npy file.

    csv_path: CSV file with an 'id' column; each id maps to the Box file name '<id>.wav'
    out_path: destination .npy file; an existing file at this path is removed before processing
    client: authenticated boxsdk.Client used to fetch the audio
    folder_id: Box folder ID that scopes the file search

    Each row's audio is fetched with stream_data and passed through extract_features.
    The successfully extracted features are stacked row-wise, cast to float32 and
    written once, at the end, via save_embeddings.

    Rows whose audio is missing or fails to process are reported and skipped, so the
    saved matrix can have fewer rows than the CSV. Callers that pair the rows with
    labels from the CSV must account for the skipped ids.

    Raises ValueError if no row produced features (nothing is written in that case).
    Raises KeyError if a row has no 'id' column.
    """
    # Accept plain strings as well as Path objects.
    csv_path = Path(csv_path)
    out_path = Path(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)

    # Remove any previous output up front so a failed run cannot be mistaken for a fresh one.
    if out_path.exists():
        try:
            os.remove(out_path)
        except OSError as e:
            print(f"Warning: failed to remove existing out_path {out_path}: {e}")

    combined = []
    skipped = []
    with open(csv_path, 'r', newline='') as csvfile:
        for row in csv.DictReader(csvfile):
            filename = row['id'] + '.wav'
            try:
                waveform, sr = stream_data(filename, client, folder_id)
                combined.append(extract_features(waveform, sr))
                print(f"Processed features for {filename}")
            except FileNotFoundError:
                skipped.append(filename)
                print(f"File not found: {filename}, skipping.")
            except Exception as e:
                # Best effort: one bad file must not abort the whole run.
                skipped.append(filename)
                print(f"Error processing {filename}: {e}")

    # np.vstack([]) would fail with an unhelpful message; raise the same exception type explicitly.
    if not combined:
        raise ValueError(f"No features were extracted from {csv_path}; nothing written to {out_path}")

    if skipped:
        total = len(combined) + len(skipped)
        print(f"Warning: skipped {len(skipped)} of {total} files; saved rows will not align 1:1 with {csv_path}")

    combined_np = np.vstack(combined).astype(np.float32)
    save_embeddings(out_path, combined_np)

In [None]:
# Box folder that is searched for the audio files.
FOLDER_ID = 279758036893

# CSV manifests for each split.
DATA_DIR = Path('../../data/iemocap_4way_data')
TRAIN_PATH = DATA_DIR.joinpath('train_4way_with_minus_one.csv')
VAL_PATH = DATA_DIR.joinpath('val_4way_with_minus_one.csv')
TEST_PATH = DATA_DIR.joinpath('test_4way_with_minus_one.csv')

# Output locations for the extracted embeddings, one .npy file per split.
SAVE_DIR = Path('../../data/embeddings/4way_opensmile/eGeMAPSv02')
SAVE_TRAIN_PATH = SAVE_DIR.joinpath('train.npy')
SAVE_VAL_PATH = SAVE_DIR.joinpath('val.npy')
SAVE_TEST_PATH = SAVE_DIR.joinpath('test.npy')

# Only the training split is processed in this cell; the val/test paths are defined but not used here.
extract_and_save_embeddings(
    csv_path=TRAIN_PATH,
    out_path=SAVE_TRAIN_PATH,
    client=client,
    folder_id=FOLDER_ID,
)