# **Saving processed features**

Una vez obtenidos los archivos `.npy`, se procesan para construir los features que luego se usarán en la **reducción de dimensionalidad** y la **clasificación**.

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import seaborn as sns
import os

## **Load datasets**

In [3]:
# Metadata tables of both corpora; get_X_y_id reads their 'filename' and
# 'emotion_id' columns.
ravdess_df, cremad_df = (
    pd.read_csv(csv_name)
    for csv_name in ('ravdess_dataset.csv', 'crema_dataset.csv')
)

## **Useful functions**

In [121]:
def get_X_y_id(paths: list[str], df:pd.DataFrame, is_train:bool=True, type:str="mean", name_df = "r"):

    """
    Description:
        Build one pooled feature vector per ``.npy`` file found in ``paths`` and
        return it together with the id of the file and, optionally, its label.
        Row i of every returned DataFrame refers to the same file.

        Files are skipped (with a printed notice) when their id is not listed in
        ``df['filename']``, when the id cannot be extracted from the file name,
        or when the stored array is not at least 2-D with at least one row.

    Args:
        paths (list[str]): directories searched (non-recursively) for ``*.npy`` files.
        df (pd.DataFrame): metadata table with a 'filename' column and, when
            ``is_train`` is True, an 'emotion_id' column.
        is_train (bool): if True, labels are looked up in ``df`` and returned too.
        type (str): pooling applied over axis 0 of each array, "mean" or "max".
        name_df (str): how the id is derived from the file name. "r" takes the
            third '_'-separated token; any other value strips '_vggish.npy'.

    Returns:
        (feature_vectors, labels, ids) if ``is_train`` is True, otherwise
        (feature_vectors, ids). All of them are DataFrames; ``ids`` has a single
        'filename' column.

    Raises:
        ValueError: if ``type`` is not "mean"/"max", or if no usable file is found.
    """

    # Validate up front: an unknown pooling type used to append ids/labels
    # without a matching feature row.
    reducers = {"mean": np.mean, "max": np.max}
    if type not in reducers:
        raise ValueError(f"type must be one of {sorted(reducers)}, got {type!r}")
    reduce_fn = reducers[type]

    # Map every filename to the first row where it appears, so that each file
    # costs one dict lookup instead of two scans over the whole DataFrame.
    first_row = {}
    for row, name in enumerate(df['filename'].to_numpy()):
        first_row.setdefault(name, row)
    emotion_ids = df['emotion_id'].to_numpy() if is_train else None

    feature_vectors = []
    labels = []
    ids = []

    for path in paths:

        # Path.glob does not guarantee any ordering; sorting keeps the row order
        # reproducible across machines and runs.
        for video in sorted(Path(path).glob('*.npy')):

            file_name = video.name

            if name_df == "r":
                parts = file_name.split('_')
                if len(parts) < 3:
                    # Name does not follow the expected '<a>_<b>_<id>_...' layout.
                    print("Video corrupto:", file_name)
                    continue
                video_id = parts[2]
            else:
                video_id = file_name.replace('_vggish.npy', '')

            row = first_row.get(video_id)
            if row is None:
                print("Video corrupto:", video_id)
                continue

            current_video = np.load(video)

            # Pooling needs a (rows, features) array with at least one row:
            # np.max fails on zero rows and np.mean would yield NaN.
            if current_video.ndim <= 1 or current_video.shape[0] == 0:
                print("Video corrupto:", video_id)
                continue

            feature_vectors.append(reduce_fn(current_video, axis=0))
            ids.append(video_id)

            if is_train:
                labels.append(emotion_ids[row])

    if not feature_vectors:
        raise ValueError("No usable .npy feature files were found in the given paths")

    feature_vectors = pd.DataFrame(np.vstack(feature_vectors))
    ids = pd.DataFrame({'filename': ids})

    if is_train:
        labels = pd.DataFrame(np.vstack(labels))
        return feature_vectors, labels, ids
    else:
        return feature_vectors, ids

## **RAVDESS**


In [124]:
# get_X_y_id iterates over a list of directories, so the single RAVDESS
# folder is wrapped in a list.
ravdess_path = ['extraction/ravdess']

In [125]:
# Max-pool every RAVDESS array over axis 0; name_df keeps its default ("r").
X_ravdess, y_ravdess, ids_ravdess = get_X_y_id(
    ravdess_path,
    ravdess_df,
    is_train=True,
    type="max",
)

In [126]:
# Preview the first rows of the pooled RAVDESS feature matrix.
X_ravdess.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.642407,0.0,0.0,0.388049,0.0,0.244497,0.0,0.074573,0.0,0.035013,...,0.0,0.701898,0.0,0.187065,0.0,0.500393,0.553233,0.883846,0.376374,0.0
1,0.678407,0.007144,0.049436,0.473569,0.046587,0.116013,0.0,0.060779,0.0,0.0,...,0.0,0.597775,0.0,0.322823,0.0,0.227596,0.342369,0.938277,0.159414,0.0
2,0.357666,0.0,0.0,0.305949,0.0,0.445085,0.0,0.0,0.055653,0.032519,...,0.0,0.603902,0.0,0.55581,0.0,0.148221,0.210598,0.840518,0.418516,0.173115
3,0.303738,0.064187,0.0,0.370461,0.005149,0.304514,0.0,0.0,0.0,0.012435,...,0.0,0.619586,0.0,0.315052,0.0,0.13845,0.201639,0.736059,0.342278,0.165868
4,0.344171,0.0,0.0,0.302962,0.009363,0.16495,0.0,0.059925,0.0,0.0,...,0.0,0.889876,0.0,0.19847,0.0,0.343841,0.723522,0.902789,0.227773,0.174657


In [127]:
# Preview the emotion_id labels, row-aligned with X_ravdess.
y_ravdess.head()

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,2


In [128]:
# Preview the file ids, row-aligned with X_ravdess and y_ravdess.
ids_ravdess.head()

Unnamed: 0,filename
0,03-01-01-01-01-01-01
1,03-01-01-01-01-02-01
2,03-01-01-01-02-01-01
3,03-01-01-01-02-02-01
4,03-01-02-01-01-01-01


De esta manera, la fila *i* de `X`, `y` e `ids` corresponde siempre al mismo archivo: la fila 0 de `X` va con la fila 0 de `y` y de `ids`, la fila 1 con la fila 1, y así sucesivamente.

In [129]:
# Single print call; sep="\n" reproduces the three original lines and the
# trailing blank line.
print(
    f"Shape de la matriz de ravdess: {X_ravdess.shape}",
    f"Size de los labels de ravdess: {y_ravdess.shape}",
    f"Size de los ids de ravdess: {ids_ravdess.shape}\n",
    sep="\n",
)

Shape de la matriz de ravdess: (1440, 128)
Size de los labels de ravdess: (1440, 1)
Size de los ids de ravdess: (1440, 1)



## **CREMA-D**

In [108]:
# One directory per author: Author_1 ... Author_91 (the end of range is exclusive).
cremad_path = [f'./extraction/crema-d/Author_{i}' for i in range(1, 92)]

In [None]:
# Max-pool every CREMA-D array over axis 0. Any name_df other than "r" makes
# the id the file name without its '_vggish.npy' suffix.
X_cremad, y_cremad, ids_cremad = get_X_y_id(
    cremad_path,
    cremad_df,
    is_train=True,
    type="max",
    name_df="cremad",
)

In [130]:
# Preview the first rows of the pooled CREMA-D feature matrix.
X_cremad.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,118,119,120,121,122,123,124,125,126,127
0,0.705473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.309565,...,0.0,0.235982,0.0,0.100697,0.0,0.255692,0.122422,0.725615,0.057861,0.0
1,0.463978,0.0,0.047867,0.0,0.0,0.0,0.090284,0.0,0.0,0.079699,...,0.0,0.201771,0.154929,0.314134,0.247927,0.414657,0.0,0.453375,0.610512,0.172905
2,0.575726,0.0,0.052069,0.0,0.0,0.0,0.0,0.0,0.0,0.235755,...,0.0,0.254564,0.079369,0.118244,0.0,0.360906,0.09278,0.377095,0.171009,0.0
3,0.330921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.187882,0.070401,0.130754,0.0,0.0,0.0,0.524978,0.0,0.0
4,0.764689,0.0,0.080885,0.0,0.0,0.0,0.0,0.0,0.0,0.160881,...,0.0,0.284488,0.0,0.225813,0.0,0.336033,0.196115,0.688541,0.0,0.0


In [131]:
# Preview the emotion_id labels, row-aligned with X_cremad.
y_cremad.head()

Unnamed: 0,0
0,1
1,2
2,3
3,4
4,5


In [132]:
# Preview the file ids, row-aligned with X_cremad and y_cremad.
ids_cremad.head()

Unnamed: 0,filename
0,1001_DFA_ANG_XX
1,1001_DFA_DIS_XX
2,1001_DFA_FEA_XX
3,1001_DFA_HAP_XX
4,1001_DFA_NEU_XX


De esta manera, la fila *i* de `X`, `y` e `ids` corresponde siempre al mismo archivo: la fila 0 de `X` va con la fila 0 de `y` y de `ids`, la fila 1 con la fila 1, y así sucesivamente.

In [133]:
# Single print call; sep="\n" reproduces the three original lines and the
# trailing blank line.
print(
    f"Shape de la matriz de crema-d: {X_cremad.shape}",
    f"Size de los labels de crema-d: {y_cremad.shape}",
    f"Size de los ids de crema-d: {ids_cremad.shape}\n",
    sep="\n",
)

Shape de la matriz de crema-d: (7442, 128)
Size de los labels de crema-d: (7442, 1)
Size de los ids de crema-d: (7442, 1)



## **Saving features**

Se guardarán los features, junto con sus ids y labels, en archivos `.npz` (un archivo por dataset).

In [None]:
def save_df_as_npz(directory:str, filename:str, features:pd.DataFrame, ids, labels=None):
    """
    Description:
        Save feature vectors, ids and (optionally) labels into a single ``.npz``
        file under the keys 'features', 'ids' and 'labels'.
        E.g. a test.npz written without labels holds only the feature matrix and
        the ids of its rows.

    Args:
        directory (str): directory where the npz file is written; it is created
            if it does not exist yet.
        filename (str): name of the npz file.
        features (pd.DataFrame): the feature vectors (any array-like is accepted).
        ids: the ids of the rows of ``features`` (DataFrame or array-like).
        labels (pd.DataFrame | None): the labels; when None, no 'labels' entry
            is written.

    """

    arrays = {"features": np.asarray(features), "ids": np.asarray(ids)}

    # String ids come out of pandas as an object array, which np.savez can only
    # store through pickle. A fixed-width unicode array holds the same values
    # and can be read back with allow_pickle=False.
    ids_arr = arrays["ids"]
    if ids_arr.dtype == object and all(isinstance(v, str) for v in ids_arr.ravel()):
        arrays["ids"] = ids_arr.astype(str)

    if labels is not None:
        arrays["labels"] = np.asarray(labels)

    # np.savez does not create missing directories. An empty string means the
    # current directory, which os.makedirs rejects.
    if directory:
        os.makedirs(directory, exist_ok=True)

    np.savez(os.path.join(directory, filename), **arrays)

def load_features_ids_labels(filename:str, has_labels=True):
    """
    Description:
        Load the 'features', 'ids' and (optionally) 'labels' arrays stored in an
        npz file. For test data, which has no labels, pass has_labels=False.

    Args:
        filename (str): path to the npz file.
        has_labels (bool): whether a 'labels' entry must be read as well.

    Returns:
        (features, ids, labels) if ``has_labels`` is True, otherwise
        (features, ids). All of them are numpy arrays.

    Raises:
        KeyError: if an expected entry is missing from the file.
    """

    # SECURITY: allow_pickle=True is needed for ids stored as object arrays,
    # but unpickling can run arbitrary code. Only load files you created.
    # The context manager closes the file handle that np.load keeps open for
    # .npz archives.
    with np.load(filename, allow_pickle=True) as contents:
        features = contents['features']
        ids = contents['ids']

        if has_labels:
            labels = contents['labels']
            return features, ids, labels

    return features, ids

In [None]:
# Output folder for the .npz files; creating it again on a re-run is a no-op.
folder_name = "features_npz"
Path(folder_name).mkdir(parents=True, exist_ok=True)

In [None]:
# One .npz per dataset, each holding features, ids and labels.
save_df_as_npz(
    'features_npz',
    'features_ravdess.npz',
    features=X_ravdess,
    ids=ids_ravdess,
    labels=y_ravdess,
)
save_df_as_npz(
    'features_npz',
    'features_cremad.npz',
    features=X_cremad,
    ids=ids_cremad,
    labels=y_cremad,
)

In [142]:
# Read the files back to check the round trip. Note the return order:
# features, ids, labels.
X_ravdess_np, ids_ravdess_np, y_ravdess_np = load_features_ids_labels(
    'features_npz/features_ravdess.npz',
    has_labels=True,
)
X_cremad_np, ids_cremad_np, y_cremad_np = load_features_ids_labels(
    'features_npz/features_cremad.npz',
    has_labels=True,
)

In [140]:
# Shapes of the reloaded RAVDESS arrays, printed in a single call with the
# same output as three separate prints.
print(
    f"Shape de la matriz de ravdess: {X_ravdess_np.shape}",
    f"Size de los labels de ravdess: {y_ravdess_np.shape}",
    f"Size de los ids de ravdess: {ids_ravdess_np.shape}\n",
    sep="\n",
)

Shape de la matriz de ravdess: (1440, 128)
Size de los labels de ravdess: (1440, 1)
Size de los ids de ravdess: (1440, 1)



In [143]:
# Shapes of the reloaded CREMA-D arrays, printed in a single call with the
# same output as three separate prints.
print(
    f"Shape de la matriz de crema-d: {X_cremad_np.shape}",
    f"Size de los labels de crema-d: {y_cremad_np.shape}",
    f"Size de los ids de crema-d: {ids_cremad_np.shape}\n",
    sep="\n",
)

Shape de la matriz de crema-d: (7442, 128)
Size de los labels de crema-d: (7442, 1)
Size de los ids de crema-d: (7442, 1)

