# Audio and Video Fusion

## Setup

In [1]:
import os
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd

### Paths

In [2]:
# Input directories: audio (eGeMAPS) features and video features
audio_input_path = "../out/audio/egemaps/"
video_input_path = "../out/video/"

# Directory that receives the CSV files written by this notebook
output_path = "../out/fusion/"

# Python script holding the global configuration
glob_conf_path = "../../config/global_config_paper.py"

### Load global variables

In [3]:
# Run the global configuration script inside this notebook's namespace so the
# names it defines become notebook globals. Later cells use
# `emotion_id_to_emotion`, `emotion_id_to_valence` and `actor_id_to_sex`, which
# are not defined in this notebook and are presumably provided by this script
# (TODO confirm against the config file).
#
# NOTE(review): exec() runs arbitrary code from `glob_conf_path`; only point it
# at a trusted, version-controlled file.
#
# The context manager closes the file handle deterministically instead of
# leaving it to the garbage collector.
with open(glob_conf_path) as config_file:
    exec(config_file.read())

## Load and merge data (Unsupervised Learning)

In [4]:
from sklearn import preprocessing

# Columns that identify a sample; they are kept out of the min-max scaling.
id_columns = ['emotion_id', 'actor_id', 'file_id']

# Load audio data and discard the columns that are not used for the fusion
audio_df = pd.read_csv(os.path.join(audio_input_path, 'audio_data_step_1_egemaps_data_cleaning.csv'))
audio_df = audio_df.drop(columns=['frameIndex', 'frameTime', 'emotion', 'file'])

# Normalize audio data
X = audio_df.drop(columns=id_columns)                   # Get features from dataset
min_max_scaler_egemaps = preprocessing.MinMaxScaler()   # eGeMAPS min max scaler
X_scaled = min_max_scaler_egemaps.fit_transform(X)      # Fit and transform features
# Reuse X's index: pd.concat(axis=1) aligns rows by index label, so the scaled
# features must carry the same labels as the metadata they are joined with.
X_scaled_audio_df = pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)
metadata_df = audio_df[id_columns]                      # Get metadata
audio_df = pd.concat([X_scaled_audio_df, metadata_df], axis=1)

# Load video data
video_df = pd.read_csv(os.path.join(video_input_path, 'video_data_step_3_mean_intensity.csv'))

# Normalize video data
X = video_df.drop(columns=id_columns)                   # Get features from dataset
min_max_scaler_video = preprocessing.MinMaxScaler()     # Video min max scaler
X_scaled = min_max_scaler_video.fit_transform(X)        # Fit and transform features
X_scaled_video_df = pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)
# actor_id is intentionally left out here: the merged frame takes it from the audio side.
metadata_df = video_df[['emotion_id', 'file_id']]
video_df = pd.concat([X_scaled_video_df, metadata_df], axis=1)

# Merge audio and video data on file_id; emotion_id is taken from the video side
# so the column is not duplicated in the merged frame.
multi_df = pd.merge(audio_df.drop(columns='emotion_id'), video_df, how='inner', on='file_id')
multi_df = multi_df.drop(columns='file_id')

# Reorder columns so that the features come first and the metadata last
features = list(multi_df.columns.drop(['actor_id', 'emotion_id']))
multi_metadata_df = multi_df[features + ['actor_id', 'emotion_id']]
multi_df = multi_df[features + ['emotion_id']]

# Make sure the destination exists so a fresh checkout can run this cell
os.makedirs(output_path, exist_ok=True)
multi_df.to_csv(
    os.path.join(output_path, 'audio_video_dataset_unsupervised_learning.csv'),
    index=False, header=True)
multi_metadata_df.to_csv(
    os.path.join(output_path, 'audio_video_dataset_unsupervised_learning_with_metadata.csv'),
    index=False, header=True)

In [5]:
# Display the min-max-scaled audio features together with emotion_id, actor_id and file_id.
audio_df

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,emotion_id,actor_id,file_id
0,0.579782,0.276111,0.533631,0.596311,0.632179,0.209500,0.178389,0.032262,0.261604,0.070276,...,0.718996,0.330578,0.037114,0.117179,0.047826,0.088380,0.731259,amu,1,0
1,0.393945,0.343674,0.384970,0.423466,0.446001,0.147274,0.282919,0.175538,0.267522,0.068505,...,0.792587,0.264558,0.055237,0.089090,0.057971,0.000000,0.725443,amu,1,1
2,0.487920,0.288303,0.466380,0.452395,0.584362,0.226165,0.321936,0.164298,0.311106,0.203101,...,0.611349,0.344937,0.038350,0.090084,0.050725,0.049834,0.742759,amu,1,2
3,0.415525,0.320765,0.357067,0.355146,0.490685,0.233617,0.187911,0.038932,0.222829,0.040867,...,0.661599,0.337101,0.039662,0.074729,0.045894,0.040150,0.789177,amu,1,3
4,0.578382,0.277224,0.455800,0.559710,0.627945,0.291699,0.210346,0.112102,0.258617,0.044221,...,0.602913,0.161629,0.090247,0.320940,0.045894,0.058668,0.764986,amu,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,0.455030,0.051548,0.459465,0.432665,0.428898,0.042556,0.136981,0.009610,0.209519,0.008897,...,0.456192,0.192274,0.065019,0.064878,0.086957,0.127309,0.693686,tri,10,1255
1255,0.220083,0.513768,0.031770,0.317742,0.347561,0.422764,0.358086,0.306875,0.228330,0.014988,...,0.193118,0.396111,0.023316,0.090696,0.077122,0.096082,0.760948,tri,10,1256
1256,0.270762,0.520014,0.024084,0.363869,0.372689,0.462329,0.259245,0.118235,0.245574,0.060586,...,0.183356,0.376332,0.022740,0.107705,0.091938,0.147996,0.786001,tri,10,1257
1257,0.450948,0.089263,0.419079,0.440764,0.442042,0.104093,0.162826,0.065396,0.205408,0.005887,...,0.317359,0.155720,0.063989,0.077539,0.134058,0.268334,0.721901,tri,10,1258


In [6]:
# Display the min-max-scaled video features together with emotion_id and file_id.
video_df

Unnamed: 0,AU01_r_mean,AU01_r_stddevNorm,AU01_r_percentile20.0,AU01_r_percentile50.0,AU01_r_percentile80.0,AU01_r_iqr60_80-20,AU01_r_numPeaks,AU02_r_mean,AU02_r_stddevNorm,AU02_r_percentile20.0,...,AU26_r_numPeaks,AU45_r_mean,AU45_r_stddevNorm,AU45_r_percentile20.0,AU45_r_percentile50.0,AU45_r_percentile80.0,AU45_r_iqr60_80-20,AU45_r_numPeaks,emotion_id,file_id
0,0.130612,0.240045,0.0,0.000000,0.238713,0.238713,0.250000,0.141816,0.451565,0.0,...,0.363636,0.090554,0.363398,0.0,0.007299,0.092857,0.092857,0.258065,amu,0
1,0.044369,0.260825,0.0,0.009934,0.095937,0.095937,0.142857,0.069400,0.354630,0.0,...,0.181818,0.086685,0.584047,0.0,0.007299,0.057143,0.057143,0.161290,amu,1
2,0.064720,0.237358,0.0,0.009934,0.110045,0.110045,0.285714,0.117179,0.324758,0.0,...,0.212121,0.057670,0.380347,0.0,0.029197,0.039286,0.039286,0.096774,amu,2
3,0.102294,0.225245,0.0,0.000000,0.173815,0.173815,0.071429,0.068794,0.339125,0.0,...,0.181818,0.035939,0.303059,0.0,0.014599,0.035714,0.035714,0.129032,amu,3
4,0.122895,0.296350,0.0,0.000000,0.174379,0.174379,0.178571,0.126430,0.469950,0.0,...,0.242424,0.073285,0.255539,0.0,0.058394,0.064286,0.064286,0.225806,amu,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,0.051192,0.192379,0.0,0.026490,0.081264,0.081264,0.321429,0.093234,0.390645,0.0,...,0.272727,0.387610,0.358930,0.0,0.000000,0.569286,0.569286,0.161290,tri,1255
1255,0.026198,0.275834,0.0,0.009934,0.028217,0.028217,0.535714,0.050406,0.353327,0.0,...,0.333333,0.170638,0.507585,0.0,0.021898,0.107143,0.107143,0.258065,tri,1256
1256,0.081445,0.338825,0.0,0.003311,0.079007,0.079007,0.464286,0.118401,0.490531,0.0,...,0.545455,0.205594,0.551471,0.0,0.007299,0.078571,0.078571,0.322581,tri,1257
1257,0.042385,0.270673,0.0,0.004967,0.070542,0.070542,0.214286,0.047610,0.394670,0.0,...,0.242424,0.260038,0.358960,0.0,0.000000,0.328571,0.328571,0.225806,tri,1258


In [7]:
# Display the merged audio + video features, with emotion_id as the last column.
multi_df

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,AU26_r_iqr60_80-20,AU26_r_numPeaks,AU45_r_mean,AU45_r_stddevNorm,AU45_r_percentile20.0,AU45_r_percentile50.0,AU45_r_percentile80.0,AU45_r_iqr60_80-20,AU45_r_numPeaks,emotion_id
0,0.579782,0.276111,0.533631,0.596311,0.632179,0.209500,0.178389,0.032262,0.261604,0.070276,...,0.296988,0.363636,0.090554,0.363398,0.0,0.007299,0.092857,0.092857,0.258065,amu
1,0.393945,0.343674,0.384970,0.423466,0.446001,0.147274,0.282919,0.175538,0.267522,0.068505,...,0.219880,0.181818,0.086685,0.584047,0.0,0.007299,0.057143,0.057143,0.161290,amu
2,0.487920,0.288303,0.466380,0.452395,0.584362,0.226165,0.321936,0.164298,0.311106,0.203101,...,0.301205,0.212121,0.057670,0.380347,0.0,0.029197,0.039286,0.039286,0.096774,amu
3,0.415525,0.320765,0.357067,0.355146,0.490685,0.233617,0.187911,0.038932,0.222829,0.040867,...,0.472892,0.181818,0.035939,0.303059,0.0,0.014599,0.035714,0.035714,0.129032,amu
4,0.578382,0.277224,0.455800,0.559710,0.627945,0.291699,0.210346,0.112102,0.258617,0.044221,...,0.154819,0.242424,0.073285,0.255539,0.0,0.058394,0.064286,0.064286,0.225806,amu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,0.455030,0.051548,0.459465,0.432665,0.428898,0.042556,0.136981,0.009610,0.209519,0.008897,...,0.134337,0.272727,0.387610,0.358930,0.0,0.000000,0.569286,0.569286,0.161290,tri
1255,0.220083,0.513768,0.031770,0.317742,0.347561,0.422764,0.358086,0.306875,0.228330,0.014988,...,0.256024,0.333333,0.170638,0.507585,0.0,0.021898,0.107143,0.107143,0.258065,tri
1256,0.270762,0.520014,0.024084,0.363869,0.372689,0.462329,0.259245,0.118235,0.245574,0.060586,...,0.243976,0.545455,0.205594,0.551471,0.0,0.007299,0.078571,0.078571,0.322581,tri
1257,0.450948,0.089263,0.419079,0.440764,0.442042,0.104093,0.162826,0.065396,0.205408,0.005887,...,0.250000,0.242424,0.260038,0.358960,0.0,0.000000,0.328571,0.328571,0.225806,tri


In [8]:
# Display the merged audio + video features, followed by actor_id and emotion_id.
multi_metadata_df

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,AU26_r_numPeaks,AU45_r_mean,AU45_r_stddevNorm,AU45_r_percentile20.0,AU45_r_percentile50.0,AU45_r_percentile80.0,AU45_r_iqr60_80-20,AU45_r_numPeaks,actor_id,emotion_id
0,0.579782,0.276111,0.533631,0.596311,0.632179,0.209500,0.178389,0.032262,0.261604,0.070276,...,0.363636,0.090554,0.363398,0.0,0.007299,0.092857,0.092857,0.258065,1,amu
1,0.393945,0.343674,0.384970,0.423466,0.446001,0.147274,0.282919,0.175538,0.267522,0.068505,...,0.181818,0.086685,0.584047,0.0,0.007299,0.057143,0.057143,0.161290,1,amu
2,0.487920,0.288303,0.466380,0.452395,0.584362,0.226165,0.321936,0.164298,0.311106,0.203101,...,0.212121,0.057670,0.380347,0.0,0.029197,0.039286,0.039286,0.096774,1,amu
3,0.415525,0.320765,0.357067,0.355146,0.490685,0.233617,0.187911,0.038932,0.222829,0.040867,...,0.181818,0.035939,0.303059,0.0,0.014599,0.035714,0.035714,0.129032,1,amu
4,0.578382,0.277224,0.455800,0.559710,0.627945,0.291699,0.210346,0.112102,0.258617,0.044221,...,0.242424,0.073285,0.255539,0.0,0.058394,0.064286,0.064286,0.225806,1,amu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,0.455030,0.051548,0.459465,0.432665,0.428898,0.042556,0.136981,0.009610,0.209519,0.008897,...,0.272727,0.387610,0.358930,0.0,0.000000,0.569286,0.569286,0.161290,10,tri
1255,0.220083,0.513768,0.031770,0.317742,0.347561,0.422764,0.358086,0.306875,0.228330,0.014988,...,0.333333,0.170638,0.507585,0.0,0.021898,0.107143,0.107143,0.258065,10,tri
1256,0.270762,0.520014,0.024084,0.363869,0.372689,0.462329,0.259245,0.118235,0.245574,0.060586,...,0.545455,0.205594,0.551471,0.0,0.007299,0.078571,0.078571,0.322581,10,tri
1257,0.450948,0.089263,0.419079,0.440764,0.442042,0.104093,0.162826,0.065396,0.205408,0.005887,...,0.242424,0.260038,0.358960,0.0,0.000000,0.328571,0.328571,0.225806,10,tri


## Create tsv files
These files are used to explore the data with the TensorFlow Embedding Projector: https://projector.tensorflow.org/

* Vectors:  
    Example of 3 vectors with dimension 4:  
    0.1\t0.2\t0.5\t0.9  
    0.2\t0.1\t5.0\t0.2  
    0.4\t0.1\t7.0\t0.8  
        
    
* Metadata  
    Example of 3 data points and 2 columns.  
    Note: If there is more than one column, the first row will be parsed as column labels.  
    actor_id\temotion_id  
    1\ttri  
    8\tint  
    6\tsur


### Multimodality

In [9]:
import csv

# Reload the fused dataset (features + actor_id + emotion_id) written earlier in this notebook
multi_df = pd.read_csv(os.path.join(output_path, 'audio_video_dataset_unsupervised_learning_with_metadata.csv'))

# Vectors: feature columns only
vectors_df = multi_df.drop(columns=['emotion_id', 'actor_id'])

# Metadata: labels derived from the ids.
# NOTE(review): the three lookups below are not defined in this notebook and are
# presumably provided by the global config; Series.map leaves NaN for ids it cannot resolve.
multi_df['emotion'] = multi_df['emotion_id'].map(emotion_id_to_emotion)
multi_df['valence'] = multi_df['emotion_id'].map(emotion_id_to_valence)
multi_df['sex'] = multi_df['actor_id'].map(actor_id_to_sex)
metadata_df = multi_df[['emotion', 'actor_id', 'sex', 'valence']]

# Make sure the tsv directory exists before writing into it
tsv_dir = os.path.join(output_path, 'tsv')
os.makedirs(tsv_dir, exist_ok=True)

# The vectors file is written without a header row; the metadata file is written with one
vectors_df.to_csv(os.path.join(tsv_dir, 'vectors.tsv'), sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(tsv_dir, 'metadata.tsv'), sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)

### Single modality

#### Audio

In [10]:
# Load audio data and discard the columns that are not used here
audio_df = pd.read_csv(os.path.join(audio_input_path, 'audio_data_step_1_egemaps_data_cleaning.csv'))
audio_df = audio_df.drop(columns=['frameIndex', 'frameTime', 'emotion', 'file'])

# Normalize audio data
X = audio_df.drop(columns=['emotion_id', 'actor_id', 'file_id'])   # Get features from dataset
min_max_scaler_egemaps = preprocessing.MinMaxScaler()              # eGeMAPS min max scaler
X_scaled = min_max_scaler_egemaps.fit_transform(X)                 # Fit and transform features
# Reuse X's index: pd.concat(axis=1) aligns rows by index label, so the scaled
# features must carry the same labels as the metadata they are joined with.
X_scaled_audio_df = pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)
metadata_df = audio_df[['emotion_id', 'actor_id', 'file_id']]      # Get metadata
audio_df = pd.concat([X_scaled_audio_df, metadata_df], axis=1)

# Vectors: scaled feature columns only
vectors_df = audio_df.drop(columns=['emotion_id', 'actor_id', 'file_id'])

# Metadata: labels derived from the ids.
# NOTE(review): the three lookups below are not defined in this notebook and are
# presumably provided by the global config; Series.map leaves NaN for ids it cannot resolve.
audio_df['emotion'] = audio_df['emotion_id'].map(emotion_id_to_emotion)
audio_df['valence'] = audio_df['emotion_id'].map(emotion_id_to_valence)
audio_df['sex'] = audio_df['actor_id'].map(actor_id_to_sex)
metadata_df = audio_df[['emotion', 'actor_id', 'sex', 'valence']]

# Make sure the tsv directory exists before writing into it
tsv_dir = os.path.join(output_path, 'tsv')
os.makedirs(tsv_dir, exist_ok=True)

# The vectors file is written without a header row; the metadata file is written with one
vectors_df.to_csv(os.path.join(tsv_dir, 'audio_vectors.tsv'), sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(tsv_dir, 'audio_metadata.tsv'), sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)
                    

#### Video

In [11]:
# Load video data
video_df = pd.read_csv(os.path.join(video_input_path, 'video_data_step_3_mean_intensity.csv'))

# Normalize video data
X = video_df.drop(columns=['emotion_id', 'actor_id', 'file_id'])   # Get features from dataset
min_max_scaler_video = preprocessing.MinMaxScaler()                # Video min max scaler
X_scaled = min_max_scaler_video.fit_transform(X)                   # Fit and transform features
# Reuse X's index: pd.concat(axis=1) aligns rows by index label, so the scaled
# features must carry the same labels as the metadata they are joined with.
X_scaled_video_df = pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)
metadata_df = video_df[['emotion_id', 'actor_id', 'file_id']]      # Get metadata
video_df = pd.concat([X_scaled_video_df, metadata_df], axis=1)

# Vectors: scaled feature columns only
vectors_df = video_df.drop(columns=['emotion_id', 'actor_id', 'file_id'])

# Metadata: labels derived from the ids.
# NOTE(review): the three lookups below are not defined in this notebook and are
# presumably provided by the global config; Series.map leaves NaN for ids it cannot resolve.
video_df['emotion'] = video_df['emotion_id'].map(emotion_id_to_emotion)
video_df['valence'] = video_df['emotion_id'].map(emotion_id_to_valence)
video_df['sex'] = video_df['actor_id'].map(actor_id_to_sex)
metadata_df = video_df[['emotion', 'actor_id', 'sex', 'valence']]

# Make sure the tsv directory exists before writing into it
tsv_dir = os.path.join(output_path, 'tsv')
os.makedirs(tsv_dir, exist_ok=True)

# The vectors file is written without a header row; the metadata file is written with one
vectors_df.to_csv(os.path.join(tsv_dir, 'video_vectors.tsv'), sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(tsv_dir, 'video_metadata.tsv'), sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)
                    