In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Root that every data path below hangs off. It is a relative path, so it is
# resolved against the kernel's current working directory.
# NOTE(review): presumably this points at the repository root -- confirm.
BASE_DIR = Path('../../../')

# --- Split CSVs (one file per train / val / test split) ---
CSV_DIR = BASE_DIR.joinpath('data', 'iemocap_4way_data')
TRAIN_CSV_PATH = CSV_DIR.joinpath('train_4way_with_minus_one.csv')
VAL_CSV_PATH = CSV_DIR.joinpath('val_4way_with_minus_one.csv')
TEST_CSV_PATH = CSV_DIR.joinpath('test_4way_with_minus_one.csv')

# --- Saved embedding arrays (one .npy file per split) ---
EMBEDDING_DIR = BASE_DIR.joinpath('data', 'embeddings', '4way_opensmile', 'eGeMAPSv02')
TRAIN_EMBEDDING_PATH = EMBEDDING_DIR.joinpath('train.npy')
VAL_EMBEDDING_PATH = EMBEDDING_DIR.joinpath('val.npy')
TEST_EMBEDDING_PATH = EMBEDDING_DIR.joinpath('test.npy')

### Filter Embeddings

In [5]:
def filter_embeddings(csv_path, embedding_path, output_path):
    """Reorder saved embeddings to follow a CSV's ``id`` column and save them.

    For every value in the CSV's ``id`` column, in CSV order, the matching
    embedding is looked up and stacked into one array, which is then written
    to ``output_path``. Only the embedding part of each stored row is kept;
    the id part is dropped from the output.

    Parameters
    ----------
    csv_path : str or path-like
        CSV file containing an ``id`` column. Its row order defines the row
        order of the saved array.
    embedding_path : str or path-like
        ``.npy`` file whose rows are indexed as ``row[0]`` (the embedding) and
        ``row[1]`` (the id). If an id occurs in more than one row, the last
        occurrence is used.
    output_path : str or path-like
        Destination handed to ``np.save``.

    Returns
    -------
    None
        The result is written to disk only.

    Raises
    ------
    KeyError
        If the CSV has no ``id`` column, or if one or more CSV ids have no
        row in the embedding file (the message names the missing ids).
    """
    df = pd.read_csv(csv_path)

    # NOTE(security): allow_pickle=True lets np.load unpickle arbitrary Python
    # objects, which can execute code. Only point this at embedding files you
    # produced yourself or otherwise trust.
    embeddings = np.load(embedding_path, allow_pickle=True)

    # Map id (second element of each row) -> embedding (first element).
    id_to_embedding = {row[1]: row[0] for row in embeddings}

    # Check every id before building the output so that a mismatch between the
    # CSV and the embedding file is reported in full, not one id at a time.
    missing_ids = [sample_id for sample_id in df['id'] if sample_id not in id_to_embedding]
    if missing_ids:
        preview = ', '.join(repr(sample_id) for sample_id in missing_ids[:10])
        suffix = ', ...' if len(missing_ids) > 10 else ''
        raise KeyError(
            f"{len(missing_ids)} id(s) from {csv_path} have no embedding in "
            f"{embedding_path}: {preview}{suffix}"
        )

    # Row i of the output corresponds to row i of the CSV.
    filtered_embeddings = np.array([id_to_embedding[sample_id] for sample_id in df['id']])

    np.save(output_path, filtered_embeddings)

#### Filter Training, Validation, and Testing Data Embeddings

In [6]:
# Write an aligned '<split>_filtered.npy' next to each split's embedding file.
filter_embeddings(
    csv_path=TRAIN_CSV_PATH,
    embedding_path=TRAIN_EMBEDDING_PATH,
    output_path=EMBEDDING_DIR / 'train_filtered.npy',
)
filter_embeddings(
    csv_path=VAL_CSV_PATH,
    embedding_path=VAL_EMBEDDING_PATH,
    output_path=EMBEDDING_DIR / 'val_filtered.npy',
)
filter_embeddings(
    csv_path=TEST_CSV_PATH,
    embedding_path=TEST_EMBEDDING_PATH,
    output_path=EMBEDDING_DIR / 'test_filtered.npy',
)