In [None]:
%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# --- Experiment configuration -------------------------------------------------
n_classes = 2
modality = 'music'  # 'music', 'speech', or 'video'
which = 'openl3'  # 'mfcc', 'msd' or 'openl3' for music, 'slow_fast' for video, 'hubert' for speech
voice = True

# File-name suffix of the stored .npy embedding, per modality and extractor.
fn_suffix = {
    'music': {
        'mfcc': '',
        'msd': '_backend',
        'openl3': '_env',  # '_music' or '_env'
    },
    'video': {
        'slow_fast': '_slow',  # '_slow' or '_fast'
    },
    'speech': {
        'hubert': '_wave_encoder',  # '_wave_encoder' or '_transformer'
    },
}

# Two extractors change their output width with the selected suffix, so their
# dimensionality is derived from fn_suffix instead of being hard-coded.
if fn_suffix['video']['slow_fast'] == '_slow':
    slow_fast_dim = 2048
else:
    slow_fast_dim = 256

if fn_suffix['speech']['hubert'] == '_transformer':
    hubert_dim = 1024
else:
    hubert_dim = 512

# Width of one embedding vector, per modality and extractor.
embedding_dimensions = {
    'video': {
        'slow_fast': slow_fast_dim,
    },
    'music': {
        'mfcc': 60,
        'msd': 256,
        'openl3': 512,
    },
    'speech': {
        'hubert': hubert_dim,
    },
}

## Load ground truth

In [None]:
# Read the merged annotations and key every row by its stimulus identifier.
groundtruth_df = pd.read_csv("groundtruth_merged.csv").set_index("stimulus_id")
groundtruth_df.head()

In [None]:
# Sanity check: every stimulus in the ground truth must have an embedding file
# in the directory the loading cell reads from. That directory carries a
# "_novoice" suffix when `voice` is False, so the check has to apply the same
# rule; otherwise it would validate a different directory than the one loaded.
embedding_dir = f"{modality}/embeddings_{which}{'' if voice else '_novoice'}"
file_suffix = fn_suffix[modality][which]

missing_ids = [
    stimulus_id
    for stimulus_id in groundtruth_df.index
    if not os.path.exists(f"{embedding_dir}/{stimulus_id}{file_suffix}.npy")
]
for stimulus_id in missing_ids:
    print(f"Embedding for {stimulus_id} not found")

# `not_found` is kept as the count of missing files; the assertion message
# names the directory that was actually inspected.
not_found = len(missing_ids)
assert not_found == 0, f"{not_found} embedding file(s) missing in {embedding_dir}"

## Load embeddings

In [None]:
# Build one fixed-size feature vector per stimulus by averaging its stored
# embedding over axis 0 (presumably the time/frame axis -- confirm against the
# extractor that wrote the .npy files).
embedding_dim = embedding_dimensions[modality][which]
n_stimuli = groundtruth_df.shape[0]

# Path pieces that do not change from one stimulus to the next.
source_dir = f"{modality}/embeddings_{which}{'' if voice else '_novoice'}/"
file_suffix = fn_suffix[modality][which]

all_embeddings = np.empty((n_stimuli, embedding_dim))
for row, stimulus_id in enumerate(groundtruth_df.index):
    frames = np.load(source_dir + f"{stimulus_id}{file_suffix}.npy")
    all_embeddings[row] = frames.mean(axis=0)

all_embeddings.shape

In [None]:
# k-fold cross-validation with xgboost
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

# Shuffled 5-fold splitter with a fixed seed so the folds are repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Keep only the stimuli whose target is one of the classes under study.
if n_classes == 2:
    classes = ["Girls/women", "Boys/men"]
else:
    classes = ["Girls/women", "Mixed", "Boys/men"]
mask = groundtruth_df.target.isin(classes)

X = all_embeddings[mask]

# convert the string labels to integers
le = LabelEncoder()
y = le.fit_transform(groundtruth_df.target[mask].values)

# Booster settings are identical for every fold, so define them once.
param = {'max_depth': 3, 'eta': 0.3, 'objective': 'multi:softmax', 'num_class': len(classes)}
num_round = 30

accuracies, f1s = [], []
for train_index, test_index in kf.split(X):
    y_test = y[test_index]

    dtrain = xgb.DMatrix(X[train_index], label=y[train_index])
    dtest = xgb.DMatrix(X[test_index], label=y_test)

    bst = xgb.train(param, dtrain, num_round)
    y_pred = bst.predict(dtest)

    # Per-fold scores; they are aggregated in the next cell.
    accuracies.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='weighted'))



In [None]:
# Summarise the per-fold scores as mean ± standard deviation.
acc_mean, acc_std = np.mean(accuracies), np.std(accuracies)
f1_mean, f1_std = np.mean(f1s), np.std(f1s)
print(f"Accuracy: {acc_mean:.2f} ± {acc_std:.2f}")
print(f"F1: {f1_mean:.2f} ± {f1_std:.2f}")

In [None]:
# k-fold cross-validation with SVM
from sklearn.svm import SVC

# Same splitter settings as the xgboost cell: 5 shuffled folds, fixed seed.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracies, f1s = [], []
for train_index, test_index in kf.split(X):
    y_test = y[test_index]

    # fit() returns the estimator itself, so construction and training chain.
    clf = SVC(gamma='auto').fit(X[train_index], y[train_index])
    y_pred = clf.predict(X[test_index])

    accuracies.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='weighted'))

# Summarise the per-fold scores as mean ± standard deviation.
acc_mean, acc_std = np.mean(accuracies), np.std(accuracies)
f1_mean, f1_std = np.mean(f1s), np.std(f1s)
print(f"Accuracy: {acc_mean:.2f} ± {acc_std:.2f}")
print(f"F1: {f1_mean:.2f} ± {f1_std:.2f}")


In [None]:
# k-fold cross-validation with pytorch-lightning softmax classifier
#
# This cell relies on names created by earlier cells: KFold and the metric
# functions (imported in the xgboost cell), plus X, y, classes and
# embedding_dim.
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.nn import functional as F
from pytorch_lightning import Trainer
from pytorch_lightning.core import LightningModule

# Same splitter settings as the xgboost and SVM cells (5 shuffled folds,
# random_state=42), so all three classifiers are scored with identically
# configured folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

class EmbeddingsDataset(Dataset):
    """Map-style dataset pairing feature rows with integer class labels.

    Both NumPy arrays are converted to tensors once, at construction time:
    the features as float32 and the labels as int64.
    """

    def __init__(self, X, y):
        features = torch.from_numpy(X)
        labels = torch.from_numpy(y)
        self.X = features.to(torch.float32)
        self.y = labels.to(torch.int64)

    def __len__(self):
        # Number of samples equals the size of the first feature dimension.
        return self.X.size(0)

    def __getitem__(self, idx):
        sample = (self.X[idx], self.y[idx])
        return sample
    
class SoftmaxClassifier(LightningModule):
    """Three-layer fully connected classifier trained with cross-entropy.

    Layer widths are input_dim -> 128 -> 64 -> output_dim, with a ReLU after
    each of the first two layers. ``forward`` returns raw logits; the step
    methods feed them to ``F.cross_entropy``.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)

    def forward(self, x):
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return self.fc3(hidden)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3, weight_decay=1e-5)
        return optimizer

    def _loss_and_log(self, batch, metric_name):
        """Compute the cross-entropy loss for one batch and log it under ``metric_name``."""
        inputs, targets = batch
        loss = F.cross_entropy(self(inputs), targets)
        self.log(metric_name, loss)
        return loss

    # The three Lightning hooks differ only in the name the loss is logged under.
    def training_step(self, batch, batch_idx):
        return self._loss_and_log(batch, 'train_loss')

    def validation_step(self, batch, batch_idx):
        return self._loss_and_log(batch, 'val_loss')

    def test_step(self, batch, batch_idx):
        return self._loss_and_log(batch, 'test_loss')
    
# The folds are seeded through KFold(random_state=42), but weight
# initialisation and DataLoader shuffling were not, so the reported scores
# changed from run to run. Seed Python, NumPy and torch (and the DataLoader
# workers) once before training.
from pytorch_lightning import seed_everything

seed_everything(42, workers=True)

accuracies = []
f1s = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    train_dataset = EmbeddingsDataset(X_train, y_train)
    test_dataset = EmbeddingsDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=4)

    # A fresh model and trainer per fold so no weights leak between folds.
    model = SoftmaxClassifier(embedding_dim, len(classes))
    trainer = Trainer(max_epochs=10)

    trainer.fit(model, train_loader)

    # Logs the test loss for this fold; the metrics below are computed separately.
    trainer.test(model, test_loader)

    # Pure inference: switch to eval mode and skip autograd bookkeeping. The
    # inputs are moved to whatever device the model is on, and the predictions
    # are brought back to the CPU before converting to NumPy.
    model.eval()
    with torch.no_grad():
        test_inputs = torch.from_numpy(X_test).float().to(model.device)
        logits = model(test_inputs)
    y_pred = torch.argmax(logits, dim=1).cpu().numpy()

    accuracies.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='weighted'))


In [None]:
# Summarise the per-fold scores as mean ± standard deviation.
acc_mean, acc_std = np.mean(accuracies), np.std(accuracies)
f1_mean, f1_std = np.mean(f1s), np.std(f1s)
print(f"Accuracy: {acc_mean:.2f} ± {acc_std:.2f}")
print(f"F1: {f1_mean:.2f} ± {f1_std:.2f}")