# Audio and Video Fusion

## Setup

In [4]:
import os
from glob import glob
from pathlib import Path

import numpy as np
import pandas as pd

### Paths

In [5]:
# --- Inputs -----------------------------------------------------------------
# Directory holding the eGeMAPS audio CSV read by the cells below
audio_input_path = '../out/audio/egemaps/'
# Directory holding the video CSV read by the cells below
video_input_path = '../out/video/'

# --- Outputs ----------------------------------------------------------------
# Directory that receives the fused CSV files (and the 'tsv' sub-directory)
output_path = '../out/fusion/'

# --- Configuration ----------------------------------------------------------
# Python file executed in the "Load global variables" cell
glob_conf_path = '../../config/global_config.py'

### Load global variables

In [6]:
# Execute the global configuration file in this notebook's namespace.
# Later cells use emotion_id_to_emotion, emotion_id_to_emotion_type and
# actor_id_to_gender, none of which is defined in this notebook; presumably
# they are created by this file (confirm against global_config.py).
# NOTE(review): exec() runs whatever the file contains; glob_conf_path must
# only ever point at a trusted, project-owned file.
with open(glob_conf_path) as glob_conf_file:      # context manager closes the handle
    # Compiling with the real filename makes tracebacks point at the config
    # file instead of "<string>"
    exec(compile(glob_conf_file.read(), glob_conf_path, 'exec'))

## Load and merge data (Unsupervised Learning)

In [4]:
from sklearn import preprocessing

# Create the output directory up front so the to_csv calls at the end of this
# cell do not fail when it does not exist yet
os.makedirs(output_path, exist_ok=True)

# Load audio data and drop the columns that are not used below
audio_df = pd.read_csv(audio_input_path + 'audio_data_step_1_egemaps_data_cleaning.csv')
audio_df = audio_df.drop(columns=['frameIndex', 'frameTime', 'emotion', 'file'])

# Normalize audio data
X = audio_df.drop(columns=['emotion_id','actor_id','file_id'])                  # Get features from dataset
min_max_scaler_egemaps = preprocessing.MinMaxScaler()                           # eGeMAPS min max scaler (default range [0, 1])
X_scaled = min_max_scaler_egemaps.fit_transform(X)                              # Fit and transform features
X_scaled_audio_df = pd.DataFrame(data=X_scaled, columns=X.columns.to_list())    # Create a dataframe from normalized features
metadata_df = audio_df[['emotion_id','actor_id','file_id']]                     # Get metadata
# Both frames carry the default RangeIndex (read_csv without index_col), so the
# column-wise concat pairs each scaled row with its own metadata
audio_df = pd.concat([X_scaled_audio_df,metadata_df], axis=1)


# Load video data
video_df = pd.read_csv(video_input_path + 'video_data_step_3_mean_intensity.csv')

# Normalize video data by a fixed factor
# NOTE(review): dividing by 5 assumes the AU*_r intensities lie in [0, 5] - confirm
X_scaled_video_df = video_df.drop(columns=['file_id','emotion_id'])/5           # Get features and normalize
metadata_df = video_df[['emotion_id','file_id']]
video_df = pd.concat([X_scaled_video_df,metadata_df], axis=1)

# Merge audio and video data on file_id; emotion_id is dropped from the audio
# side so the merged frame keeps a single emotion_id column (the one from video_df)
multi_df = pd.merge(audio_df.drop(columns='emotion_id'),video_df,how='inner',on='file_id')
multi_df = multi_df.drop(columns='file_id')

# Reorder columns: features first, metadata last
features = list(multi_df.columns.drop(['actor_id','emotion_id']))
multi_metadata_df = multi_df[features+['actor_id','emotion_id']]
multi_df = multi_df[features+['emotion_id']]
multi_df.to_csv(output_path + 'audio_video_dataset_unsupervised_learning.csv', index=False, header=True)
multi_metadata_df.to_csv(output_path + 'audio_video_dataset_unsupervised_learning_with_metadata.csv', index=False, header=True)

In [5]:
# Display the normalized audio frame: scaled features followed by emotion_id, actor_id, file_id
audio_df

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,emotion_id,actor_id,file_id
0,0.579782,0.276111,0.533631,0.596311,0.632179,0.209500,0.178389,0.032262,0.261604,0.070276,...,0.718996,0.330578,0.037114,0.117179,0.047826,0.088380,0.731259,amu,1,0
1,0.393945,0.343674,0.384970,0.423466,0.446001,0.147274,0.282919,0.175538,0.267522,0.068505,...,0.792587,0.264558,0.055237,0.089090,0.057971,0.000000,0.725443,amu,1,1
2,0.487920,0.288303,0.466380,0.452395,0.584362,0.226165,0.321936,0.164298,0.311106,0.203101,...,0.611349,0.344937,0.038350,0.090084,0.050725,0.049834,0.742759,amu,1,2
3,0.415525,0.320765,0.357067,0.355146,0.490685,0.233617,0.187911,0.038932,0.222829,0.040867,...,0.661599,0.337101,0.039662,0.074729,0.045894,0.040150,0.789177,amu,1,3
4,0.578382,0.277224,0.455800,0.559710,0.627945,0.291699,0.210346,0.112102,0.258617,0.044221,...,0.602913,0.161629,0.090247,0.320940,0.045894,0.058668,0.764986,amu,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,0.455030,0.051548,0.459465,0.432665,0.428898,0.042556,0.136981,0.009610,0.209519,0.008897,...,0.456192,0.192274,0.065019,0.064878,0.086957,0.127309,0.693686,tri,10,1255
1255,0.220083,0.513768,0.031770,0.317742,0.347561,0.422764,0.358086,0.306875,0.228330,0.014988,...,0.193118,0.396111,0.023316,0.090696,0.077122,0.096082,0.760948,tri,10,1256
1256,0.270762,0.520014,0.024084,0.363869,0.372689,0.462329,0.259245,0.118235,0.245574,0.060586,...,0.183356,0.376332,0.022740,0.107705,0.091938,0.147996,0.786001,tri,10,1257
1257,0.450948,0.089263,0.419079,0.440764,0.442042,0.104093,0.162826,0.065396,0.205408,0.005887,...,0.317359,0.155720,0.063989,0.077539,0.134058,0.268334,0.721901,tri,10,1258


In [6]:
# Display the video frame: features divided by 5, followed by emotion_id and file_id
video_df

Unnamed: 0,AU01_r,AU02_r,AU04_r,AU05_r,AU06_r,AU07_r,AU09_r,AU10_r,AU12_r,AU14_r,AU15_r,AU17_r,AU20_r,AU23_r,AU25_r,AU26_r,AU45_r,emotion_id,file_id
0,0.064383,0.021489,0.028213,0.002894,0.458000,0.374170,0.023447,0.543404,0.391106,0.112723,0.033404,0.133362,0.016468,0.019319,0.394809,0.123362,0.024128,amu,0
1,0.021871,0.010516,0.000000,0.004903,0.230000,0.146000,0.017871,0.357548,0.292710,0.074774,0.029097,0.082065,0.028774,0.016000,0.246774,0.110516,0.023097,amu,1
2,0.031902,0.017756,0.001854,0.004537,0.369122,0.227415,0.019756,0.606829,0.457659,0.221317,0.024927,0.105854,0.022000,0.055171,0.179024,0.128976,0.015366,amu,2
3,0.050424,0.010424,0.000000,0.014303,0.138848,0.009939,0.008303,0.486848,0.298545,0.180061,0.031030,0.073758,0.022121,0.034182,0.153212,0.168485,0.009576,amu,3
4,0.060579,0.019158,0.070632,0.015789,0.340789,0.182579,0.015632,0.556000,0.353421,0.246211,0.010053,0.095368,0.011947,0.047632,0.182842,0.058526,0.019526,amu,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,0.025234,0.014128,0.069872,0.007319,0.000000,0.046298,0.023957,0.000000,0.000000,0.000000,0.013319,0.071872,0.024128,0.014128,0.129957,0.069532,0.103277,tri,1255
1255,0.012914,0.007638,0.003328,0.007052,0.000000,0.000000,0.008017,0.000000,0.000000,0.005724,0.018466,0.072966,0.009569,0.015741,0.072776,0.088310,0.045466,tri,1256
1256,0.040147,0.017941,0.001412,0.006691,0.000000,0.001029,0.007500,0.000000,0.000000,0.000544,0.021265,0.068397,0.012838,0.008632,0.086441,0.097809,0.054779,tri,1257
1257,0.020893,0.007214,0.004214,0.003571,0.000000,0.016571,0.006607,0.000000,0.000786,0.055107,0.009464,0.050286,0.016357,0.006714,0.117643,0.090536,0.069286,tri,1258


In [7]:
# Display the merged audio + video frame: all features followed by emotion_id
multi_df

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,AU12_r,AU14_r,AU15_r,AU17_r,AU20_r,AU23_r,AU25_r,AU26_r,AU45_r,emotion_id
0,0.579782,0.276111,0.533631,0.596311,0.632179,0.209500,0.178389,0.032262,0.261604,0.070276,...,0.391106,0.112723,0.033404,0.133362,0.016468,0.019319,0.394809,0.123362,0.024128,amu
1,0.393945,0.343674,0.384970,0.423466,0.446001,0.147274,0.282919,0.175538,0.267522,0.068505,...,0.292710,0.074774,0.029097,0.082065,0.028774,0.016000,0.246774,0.110516,0.023097,amu
2,0.487920,0.288303,0.466380,0.452395,0.584362,0.226165,0.321936,0.164298,0.311106,0.203101,...,0.457659,0.221317,0.024927,0.105854,0.022000,0.055171,0.179024,0.128976,0.015366,amu
3,0.415525,0.320765,0.357067,0.355146,0.490685,0.233617,0.187911,0.038932,0.222829,0.040867,...,0.298545,0.180061,0.031030,0.073758,0.022121,0.034182,0.153212,0.168485,0.009576,amu
4,0.578382,0.277224,0.455800,0.559710,0.627945,0.291699,0.210346,0.112102,0.258617,0.044221,...,0.353421,0.246211,0.010053,0.095368,0.011947,0.047632,0.182842,0.058526,0.019526,amu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,0.455030,0.051548,0.459465,0.432665,0.428898,0.042556,0.136981,0.009610,0.209519,0.008897,...,0.000000,0.000000,0.013319,0.071872,0.024128,0.014128,0.129957,0.069532,0.103277,tri
1255,0.220083,0.513768,0.031770,0.317742,0.347561,0.422764,0.358086,0.306875,0.228330,0.014988,...,0.000000,0.005724,0.018466,0.072966,0.009569,0.015741,0.072776,0.088310,0.045466,tri
1256,0.270762,0.520014,0.024084,0.363869,0.372689,0.462329,0.259245,0.118235,0.245574,0.060586,...,0.000000,0.000544,0.021265,0.068397,0.012838,0.008632,0.086441,0.097809,0.054779,tri
1257,0.450948,0.089263,0.419079,0.440764,0.442042,0.104093,0.162826,0.065396,0.205408,0.005887,...,0.000786,0.055107,0.009464,0.050286,0.016357,0.006714,0.117643,0.090536,0.069286,tri


In [8]:
# Display the merged frame that additionally keeps actor_id (features, actor_id, emotion_id)
multi_metadata_df

Unnamed: 0,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevRisingSlope,F0semitoneFrom27.5Hz_sma3nz_meanFallingSlope,F0semitoneFrom27.5Hz_sma3nz_stddevFallingSlope,...,AU14_r,AU15_r,AU17_r,AU20_r,AU23_r,AU25_r,AU26_r,AU45_r,actor_id,emotion_id
0,0.579782,0.276111,0.533631,0.596311,0.632179,0.209500,0.178389,0.032262,0.261604,0.070276,...,0.112723,0.033404,0.133362,0.016468,0.019319,0.394809,0.123362,0.024128,1,amu
1,0.393945,0.343674,0.384970,0.423466,0.446001,0.147274,0.282919,0.175538,0.267522,0.068505,...,0.074774,0.029097,0.082065,0.028774,0.016000,0.246774,0.110516,0.023097,1,amu
2,0.487920,0.288303,0.466380,0.452395,0.584362,0.226165,0.321936,0.164298,0.311106,0.203101,...,0.221317,0.024927,0.105854,0.022000,0.055171,0.179024,0.128976,0.015366,1,amu
3,0.415525,0.320765,0.357067,0.355146,0.490685,0.233617,0.187911,0.038932,0.222829,0.040867,...,0.180061,0.031030,0.073758,0.022121,0.034182,0.153212,0.168485,0.009576,1,amu
4,0.578382,0.277224,0.455800,0.559710,0.627945,0.291699,0.210346,0.112102,0.258617,0.044221,...,0.246211,0.010053,0.095368,0.011947,0.047632,0.182842,0.058526,0.019526,1,amu
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1254,0.455030,0.051548,0.459465,0.432665,0.428898,0.042556,0.136981,0.009610,0.209519,0.008897,...,0.000000,0.013319,0.071872,0.024128,0.014128,0.129957,0.069532,0.103277,10,tri
1255,0.220083,0.513768,0.031770,0.317742,0.347561,0.422764,0.358086,0.306875,0.228330,0.014988,...,0.005724,0.018466,0.072966,0.009569,0.015741,0.072776,0.088310,0.045466,10,tri
1256,0.270762,0.520014,0.024084,0.363869,0.372689,0.462329,0.259245,0.118235,0.245574,0.060586,...,0.000544,0.021265,0.068397,0.012838,0.008632,0.086441,0.097809,0.054779,10,tri
1257,0.450948,0.089263,0.419079,0.440764,0.442042,0.104093,0.162826,0.065396,0.205408,0.005887,...,0.055107,0.009464,0.050286,0.016357,0.006714,0.117643,0.090536,0.069286,10,tri


## Create tsv files
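The files created below can be loaded into the TensorFlow Embedding Projector (https://projector.tensorflow.org/) to explore the data. It expects two tab-separated files: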

* Vectors:  
    Example of 3 vectors with dimension 4:  
    0.1\t0.2\t0.5\t0.9  
    0.2\t0.1\t5.0\t0.2  
    0.4\t0.1\t7.0\t0.8  
        
    
* Metadata  
    Example of 3 data points and 2 columns.  
    Note: If there is more than one column, the first row will be parsed as column labels.  
    actor_id\temotion_id  
    1\ttri  
    8\tint  
    6\tsur


### Multimodality

In [9]:
import csv

# Directory for the Embedding Projector files; created here so the writes
# below do not fail when it does not exist yet
tsv_dir = os.path.join(output_path, 'tsv')
os.makedirs(tsv_dir, exist_ok=True)

# Reload the fused dataset written by the merge cell
multi_df = pd.read_csv(output_path + 'audio_video_dataset_unsupervised_learning_with_metadata.csv')

# Vectors: every column except the two metadata columns
vectors_df = multi_df.drop(columns=['emotion_id','actor_id'])

# Metadata: labels looked up from emotion_id / actor_id. The three mappings are
# not defined in this notebook; presumably they come from the global config
# executed earlier (confirm).
multi_df['emotion'] = multi_df['emotion_id'].map(emotion_id_to_emotion)
multi_df['emotion_type'] = multi_df['emotion_id'].map(emotion_id_to_emotion_type)
multi_df['gender'] = multi_df['actor_id'].map(actor_id_to_gender)
metadata_df = multi_df[['emotion', 'actor_id', 'gender', 'emotion_type']]

# Vectors are written without a header row, metadata with one; row order is
# identical in both files because both derive from multi_df
vectors_df.to_csv(os.path.join(tsv_dir, 'vectors.tsv'), sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(tsv_dir, 'metadata.tsv'), sep='\t', index=False, header=True, quoting=csv.QUOTE_NONE)

### Single modality

#### Audio

In [10]:
# Load audio data
audio_df = pd.read_csv(audio_input_path + 'audio_data_step_1_egemaps_data_cleaning.csv')
audio_df = audio_df.drop(columns=['frameIndex', 'frameTime', 'emotion', 'file'])
audio_df

# Normalize audio data
X = audio_df.drop(columns=['emotion_id','actor_id','file_id'])                  # Get features from dataset
min_max_scaler_egemaps = preprocessing.MinMaxScaler()                           # eGeMAPS min max scaler
X_scaled = min_max_scaler_egemaps.fit_transform(X)                              # Fit and transform features
X_scaled_audio_df = pd.DataFrame(data=X_scaled, columns=X.columns.to_list())    # Create a dataframe from normalized features
metadata_df = audio_df[['emotion_id','actor_id','file_id']]                     # Get metadata
audio_df = pd.concat([X_scaled_audio_df,metadata_df], axis=1)


vectors_df = audio_df.drop(columns=['emotion_id','actor_id','file_id'])
audio_df['emotion'] = audio_df['emotion_id'].map(emotion_id_to_emotion)
audio_df['emotion_type'] = audio_df['emotion_id'].map(emotion_id_to_emotion_type)
audio_df['gender'] = audio_df['actor_id'].map(actor_id_to_gender)
metadata_df = audio_df[['emotion','actor_id','gender', 'emotion_type']]

vectors_df.to_csv(os.path.join(output_path, 'tsv', 'audio_vectors.tsv'), sep='\t', index=None, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(output_path, 'tsv', 'audio_metadata.tsv'), sep='\t', index=None, header=True, quoting=csv.QUOTE_NONE)   
                    

#### Video

In [11]:
# Load video data
video_df = pd.read_csv(video_input_path + 'video_data_step_3_mean_intensity.csv')
X_scaled_video_df = video_df.drop(columns=['file_id','emotion_id'])/5           # Get features and normalize
metadata_df = video_df[['emotion_id','file_id']]
video_df = pd.concat([X_scaled_video_df,metadata_df], axis=1)

# Merge on file_id with audio to get actor_id 
audio_df = pd.read_csv(audio_input_path + 'audio_data_step_1_egemaps_data_cleaning.csv')
video_df = pd.merge(video_df,audio_df[['file_id','actor_id']], how='inner', on='file_id')

vectors_df = video_df.drop(columns=['emotion_id','actor_id','file_id'])
video_df['emotion'] = video_df['emotion_id'].map(emotion_id_to_emotion)
video_df['emotion_type'] = video_df['emotion_id'].map(emotion_id_to_emotion_type)
video_df['gender'] = video_df['actor_id'].map(actor_id_to_gender)
metadata_df = video_df[['emotion','actor_id','gender','emotion_type']]

vectors_df.to_csv(os.path.join(output_path, 'tsv', 'video_vectors.tsv'), sep='\t', index=None, header=False, quoting=csv.QUOTE_NONE)
metadata_df.to_csv(os.path.join(output_path, 'tsv', 'video_metadata.tsv'), sep='\t', index=None, header=True, quoting=csv.QUOTE_NONE)   
                    