In [17]:
from pathlib import Path
import pandas as pd
import numpy as np

In [18]:
# Base directory from which every data path below is resolved
BASE_DIR = Path('../../../')

# Per-split CSV files
CSV_DIR = BASE_DIR.joinpath('data', 'iemocap_4way_data')
TRAIN_CSV_PATH = CSV_DIR.joinpath('train_4way_with_minus_one.csv')
VAL_CSV_PATH = CSV_DIR.joinpath('val_4way_with_minus_one.csv')
TEST_CSV_PATH = CSV_DIR.joinpath('test_4way_with_minus_one.csv')

# Per-split openSMILE (eGeMAPSv02) embedding files
EMBEDDING_DIR = BASE_DIR.joinpath('data', 'embeddings', '4way_opensmile', 'eGeMAPSv02')
TRAIN_EMBEDDING_PATH = EMBEDDING_DIR.joinpath('train.npy')
VAL_EMBEDDING_PATH = EMBEDDING_DIR.joinpath('val.npy')
TEST_EMBEDDING_PATH = EMBEDDING_DIR.joinpath('test.npy')

### Filter openSMILE Embeddings

In [19]:
def filter_embeddings(csv_path, embedding_path, output_path):
    """Align an embeddings file with a CSV and save the embeddings that match.

    Each row of the embeddings file is read as ``(embedding, id)``: element 0 is
    the embedding and element 1 is the id it belongs to. For every value of the
    CSV's ``'id'`` column, in CSV order, the matching embedding is collected.
    Ids that have no embedding are skipped (and counted in the printed summary).
    Only the embeddings themselves, not the ids, are written to ``output_path``.

    Args:
        csv_path: Path to a CSV file that contains an ``'id'`` column.
        embedding_path: Path to the ``.npy`` file holding ``(embedding, id)`` rows.
        output_path: Destination of the filtered embeddings (written with ``np.save``).

    Returns:
        list[int]: Positional indices (ascending) of the CSV rows for which an
        embedding was found. Row ``k`` of the saved array belongs to CSV row
        ``valid_indices[k]``.
    """
    df = pd.read_csv(csv_path)
    # NOTE(review): allow_pickle=True unpickles arbitrary objects; only load
    # embedding files that come from a trusted source.
    embeddings = np.load(embedding_path, allow_pickle=True)

    # Lookup table id -> embedding. If an id occurs more than once in the
    # embeddings file, the last occurrence wins.
    id_to_embedding = {row[1]: row[0] for row in embeddings}

    # Walk the CSV in order so the output rows follow the CSV row order.
    filtered_embeddings = []
    valid_indices = []
    for i, filename in enumerate(df['id']):
        if filename in id_to_embedding:
            filtered_embeddings.append(id_to_embedding[filename])
            valid_indices.append(i)
    filtered_embeddings = np.array(filtered_embeddings)

    # Save filtered embeddings
    np.save(output_path, filtered_embeddings)

    # Report what was dropped instead of discarding missing ids silently.
    n_missing = len(df) - len(valid_indices)
    print(
        f"Filtered embeddings saved to {output_path} with shape {filtered_embeddings.shape} "
        f"({n_missing} of {len(df)} CSV ids had no embedding)"
    )

    return valid_indices

#### Filter Training, Validation, and Testing Data Embeddings

In [22]:
# Filter each split's openSMILE embeddings; keep the matched CSV row indices for later cells
valid_train = filter_embeddings(
    csv_path=TRAIN_CSV_PATH,
    embedding_path=TRAIN_EMBEDDING_PATH,
    output_path=EMBEDDING_DIR / 'train_filtered.npy',
)
valid_val = filter_embeddings(
    csv_path=VAL_CSV_PATH,
    embedding_path=VAL_EMBEDDING_PATH,
    output_path=EMBEDDING_DIR / 'val_filtered.npy',
)
valid_test = filter_embeddings(
    csv_path=TEST_CSV_PATH,
    embedding_path=TEST_EMBEDDING_PATH,
    output_path=EMBEDDING_DIR / 'test_filtered.npy',
)

### Filter SRoBERTa Embeddings

In [23]:
def filter_sroberta_embeddings(npz_path, valid_indices, output_path):
    """Select rows of an ``.npz`` embeddings archive by index and save them.

    Reads the ``'embeddings'`` array from ``npz_path``, keeps the rows listed in
    ``valid_indices`` (in that order) and writes them to ``output_path`` under
    the same ``'embeddings'`` key. A one-line summary is printed.

    Args:
        npz_path: Path to an ``.npz`` archive containing an ``'embeddings'`` array.
        valid_indices: Sequence of integer row indices to keep.
        output_path: Destination ``.npz`` file (written with ``np.savez``).

    Returns:
        None
    """
    # Open the archive in a context manager so its file handle is closed as
    # soon as the array has been read.
    with np.load(npz_path) as data:
        embeddings = data['embeddings']

    # Ensure indices are an integer NumPy array (also handles an empty list)
    valid_indices = np.asarray(valid_indices, dtype=int)

    # Filter the embeddings
    filtered_embeddings = embeddings[valid_indices]

    # Save to output path
    np.savez(output_path, embeddings=filtered_embeddings)

    print(f"Filtered embeddings saved to {output_path} with shape {filtered_embeddings.shape}")


In [26]:
# Per-split SRoBERTa embedding archives
SROBERTA_DIR = BASE_DIR.joinpath('data', 'embeddings', '4way-best-sroberta', 'avg_last4_wmean_pos_rev')
SROBERTA_TRAIN_PATH = SROBERTA_DIR.joinpath('train.npz')
SROBERTA_VAL_PATH = SROBERTA_DIR.joinpath('val.npz')
SROBERTA_TEST_PATH = SROBERTA_DIR.joinpath('test.npz')

In [27]:
# Keep only the training rows that also have an openSMILE embedding
filter_sroberta_embeddings(
    npz_path=SROBERTA_TRAIN_PATH,
    valid_indices=valid_train,
    output_path=SROBERTA_DIR / 'train_filtered.npz',
)

Filtered embeddings saved to ../../../data/embeddings/4way-best-sroberta/avg_last4_wmean_pos_rev/train_filtered.npz with shape (5766, 768)


In [28]:
# Keep only the validation rows that also have an openSMILE embedding
filter_sroberta_embeddings(
    npz_path=SROBERTA_VAL_PATH,
    valid_indices=valid_val,
    output_path=SROBERTA_DIR / 'val_filtered.npz',
)

Filtered embeddings saved to ../../../data/embeddings/4way-best-sroberta/avg_last4_wmean_pos_rev/val_filtered.npz with shape (2103, 768)


In [29]:
# Keep only the test rows that also have an openSMILE embedding
filter_sroberta_embeddings(
    npz_path=SROBERTA_TEST_PATH,
    valid_indices=valid_test,
    output_path=SROBERTA_DIR / 'test_filtered.npz',
)

Filtered embeddings saved to ../../../data/embeddings/4way-best-sroberta/avg_last4_wmean_pos_rev/test_filtered.npz with shape (2170, 768)


### Filter CSV Files

In [None]:
def filter_csv_with_indices(csv_path, valid_indices, output_path):
    """Write a copy of a CSV that contains only the rows at the given positions.

    Args:
        csv_path: Path of the CSV file to read.
        valid_indices: Sequence of integer row positions to keep; the output
            rows appear in the order the positions are listed.
        output_path: Path of the CSV file to write (without an index column).

    Returns:
        None
    """
    source_df = pd.read_csv(csv_path)
    row_positions = np.asarray(valid_indices, dtype=int)

    # Positional selection, then renumber the index from zero
    selected_rows = source_df.iloc[row_positions]
    selected_rows = selected_rows.reset_index(drop=True)

    selected_rows.to_csv(output_path, index=False)
    print(f"Saved filtered CSV to {output_path} with {len(selected_rows)} rows")

# Write the filtered CSVs, reusing the row indices that filter_embeddings returned above
filter_csv_with_indices(
    csv_path=TRAIN_CSV_PATH,
    valid_indices=valid_train,
    output_path=CSV_DIR / 'train_filtered.csv',
)
filter_csv_with_indices(
    csv_path=VAL_CSV_PATH,
    valid_indices=valid_val,
    output_path=CSV_DIR / 'val_filtered.csv',
)
filter_csv_with_indices(
    csv_path=TEST_CSV_PATH,
    valid_indices=valid_test,
    output_path=CSV_DIR / 'test_filtered.csv',
)