In [6]:
ENV='local' 
# ENV='colab'

In [7]:
if 'colab' in ENV:
    !sudo apt-get install sox
    !pip install opensmile

In [8]:
import os
from pathlib import Path
import tqdm
import glob

import soundfile as sf
import opensmile
from IPython.display import Audio

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from tqdm import tqdm

Funciones para extraccion de features de los archivos de audio:

In [9]:
def get_lld_feats(file_path):
    '''
    Devuelve los atributos de bajo nivel (LLD) de un audio.
    '''
    smile = opensmile.Smile(
        feature_set   = opensmile.FeatureSet.eGeMAPSv02,
        feature_level = opensmile.FeatureLevel.LowLevelDescriptors,
    )
    feats = smile.process_file(file_path, channel=0)
    return feats.reset_index()


def get_functional_feats(file_path):
    '''
    Devuelve los atributos de alto nivel (Functionals) de un audio.
    
    Se calculan a partir de los atributos de bajo nivel, calculando estadísticas 
    que resuman las secuencias. Los modelos que vimos no modelan secuencias!, 
    por ende, es necesario disponer de atributos.
    que consistan de vectores fijos.
    '''
    smile = opensmile.Smile(
        feature_set   = opensmile.FeatureSet.eGeMAPSv02,
        feature_level = opensmile.FeatureLevel.Functionals,
    )
    feats = smile.process_file(file_path, channel=0)
    return feats.reset_index()

def plot_feats(feats):
    feats_cols = [col for col in feats.columns if col not in ['file','start','end']]
    for col in feats_cols:
      plt.figure()
      plt.plot(feats.start.dt.total_seconds(),feats[col].values)
      plt.title(col)

Funciones para escuchar el audio de un ejemplo del dataset:

In [10]:
def play_audio(path, rate=None):
    x, sr = sf.read(path)
    display(Audio(x, rate=sr))
    time = np.arange(0,len(x)) / sr
    plt.plot(time,x)

def play(example):
    play_audio(example.file_path)

### Dataset

In [16]:
def download_audio_dataset(path, dataset_name):
    '''
    Descarga el dataset y los descomprime en el path pasado como parametro.
    '''
    if os.path.exists(path):
        print('Dataset alredy downloaded!')
        return

    filename = 'Audio_{}_Actors_01-24.zip'.format(dataset_name.capitalize())
    url = 'https://zenodo.org/record/1188976/files/{}'.format(filename)

    print('Download {} dataset...'.format(url))
    !wget $url
    !unzip $filename -d $path
    !rm $filename


class Dataset:
    def __init__(self, path='.', dataset_name='speech'):
        self.dataset_path = '{}/{}'.format(path, dataset_name)
        download_audio_dataset(self.dataset_path, dataset_name)

    def get_all(self, with_feats=True):
        '''
        Usa el metodo de busqueda search_by para consultar todos lo ejemplos del dataset.
        '''
        return self.search_by('Actor_*', '*', with_feats)

    def search_by(self, actor_pattern, filename_pattern, with_feats=True):
        '''
        Permite buscar ejemplos en el dataset buscando con un patro por artista y otro patro 
        para el nombre del archivo. 
        
        Devuelve una tabla donde cada ejemplo tiene como columnas todos los datos extraidos del
        nombre del archivo de audio y ademas todos los atributos extraido del audio con la 
        libreria opensmile.
        ''' 
        search_path = '{}/{}/{}.wav'.format(self.dataset_path, actor_pattern, filename_pattern)
        result_file_paths = glob.glob(search_path)

        examples = []        
        for index in tqdm(range(len(result_file_paths))):
            file_path = result_file_paths[index]

            file_name_parts = self.__get_file_parts(file_path) 
 
            example = {
                'actor'              : self.__get_actor_number_from(file_path),
                'file_path'          : file_path,
                'modality'           : file_name_parts[0],
                'vocal_channel'      : file_name_parts[1],
                'emotion'            : file_name_parts[2],
                'emotional_intensity': file_name_parts[3],
                'statement'          : file_name_parts[4],
                'repetition'         : file_name_parts[5],
                'actor'              : file_name_parts[6]
            }
            if with_feats:
                self.__append_feats(example)
            
            examples.append(example)

        return pd.DataFrame(examples)
    
    
    def __get_actor_number_from(self, path):
        actor_part = path.replace(self.dataset_path, '') .split('/')[1]
        return int(actor_part.split('_')[1])
    
    def __get_file_parts(self, file_path):
        filename       = Path(file_path).stem
        return filename.split('-')
    
    def __append_feats(self, example):
        feats = get_functional_feats(example['file_path'])            
        for feat_col in feats.columns:
            example[feat_col] = feats[feat_col][0]

Creamos el dataset: Este ya se encarga de bajar los datos por unica vez.

In [17]:
speech_dataset = Dataset(dataset_name='speech')

Dataset alredy downloaded!


In [19]:
song_dataset = Dataset(dataset_name='song')

Dataset alredy downloaded!


Le pedimos que nos devuelve todos los ejemplo del dataset:

In [22]:
speech_examples = speech_dataset.get_all()

100%|██████████| 1440/1440 [02:02<00:00, 11.74it/s]


In [23]:
song_examples = song_dataset.get_all()

100%|██████████| 1012/1012 [01:47<00:00,  9.38it/s]


In [24]:
speech_examples.head()

Unnamed: 0,actor,file_path,modality,vocal_channel,emotion,emotional_intensity,statement,repetition,file,start,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,23,./speech/Actor_23/03-01-04-02-01-01-23.wav,3,1,4,2,1,1,./speech/Actor_23/03-01-04-02-01-01-23.wav,0 days,...,0.044101,0.007548,0.015374,1.877934,1.425178,0.251667,0.247482,0.367143,0.419309,-39.951572
1,23,./speech/Actor_23/03-01-04-02-02-02-23.wav,3,1,4,2,2,2,./speech/Actor_23/03-01-04-02-02-02-23.wav,0 days,...,0.011092,0.00802,0.01409,1.694915,0.856531,0.3575,0.072241,0.634,0.483429,-42.310856
2,23,./speech/Actor_23/03-01-06-02-01-02-23.wav,3,1,6,2,1,2,./speech/Actor_23/03-01-06-02-01-02-23.wav,0 days,...,0.101003,0.014311,0.014765,2.339181,0.593472,0.635,0.335,0.683333,0.449024,-34.312801
3,23,./speech/Actor_23/03-01-07-01-01-02-23.wav,3,1,7,1,1,2,./speech/Actor_23/03-01-07-01-01-02-23.wav,0 days,...,0.062009,0.011443,0.021647,2.52809,1.424501,0.18,0.0998,0.416667,0.422834,-44.573288
4,23,./speech/Actor_23/03-01-03-01-01-02-23.wav,3,1,3,1,1,2,./speech/Actor_23/03-01-03-01-01-02-23.wav,0 days,...,0.032799,0.011666,0.009868,2.153846,0.625,0.57,0.32,0.67,0.452548,-36.173256


In [25]:
song_examples.head()

Unnamed: 0,actor,file_path,modality,vocal_channel,emotion,emotional_intensity,statement,repetition,file,start,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,23,./song/Actor_23/03-02-04-02-01-01-23.wav,3,2,4,2,1,1,./song/Actor_23/03-02-04-02-01-01-23.wav,0 days,...,0.087133,0.016804,0.010952,1.405975,0.70922,0.865,0.985051,0.69,0.45218,-34.619511
1,23,./song/Actor_23/03-02-06-01-02-02-23.wav,3,2,6,1,2,2,./song/Actor_23/03-02-06-01-02-02-23.wav,0 days,...,0.035799,0.012916,0.023227,1.931331,0.650759,0.806667,0.285346,0.703333,0.378535,-31.953684
2,23,./song/Actor_23/03-02-06-01-01-01-23.wav,3,2,6,1,1,1,./song/Actor_23/03-02-06-01-01-01-23.wav,0 days,...,0.041038,0.012036,0.028251,2.433628,1.342282,0.368333,0.246537,0.304286,0.415343,-36.003284
3,23,./song/Actor_23/03-02-02-01-01-01-23.wav,3,2,2,1,1,1,./song/Actor_23/03-02-02-01-01-01-23.wav,0 days,...,0.001443,0.009786,0.005541,1.492537,0.646552,0.826667,0.655862,0.5225,0.47636,-39.769188
4,23,./song/Actor_23/03-02-02-02-01-01-23.wav,3,2,2,2,1,1,./song/Actor_23/03-02-02-02-01-01-23.wav,0 days,...,0.012821,0.011602,0.006391,1.50376,0.56926,1.01,0.800875,0.5425,0.461919,-39.360474


In [26]:
speech_examples['audio_type'] = 'speech'
song_examples['audio_type'] = 'song'

In [27]:
dataset = pd.concat([speech_examples, song_examples])

In [28]:
dataset.to_csv('dataset.csv')

In [29]:
dataset.head()

Unnamed: 0,actor,file_path,modality,vocal_channel,emotion,emotional_intensity,statement,repetition,file,start,...,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp,audio_type
0,23,./speech/Actor_23/03-01-04-02-01-01-23.wav,3,1,4,2,1,1,./speech/Actor_23/03-01-04-02-01-01-23.wav,0 days,...,0.007548,0.015374,1.877934,1.425178,0.251667,0.247482,0.367143,0.419309,-39.951572,speech
1,23,./speech/Actor_23/03-01-04-02-02-02-23.wav,3,1,4,2,2,2,./speech/Actor_23/03-01-04-02-02-02-23.wav,0 days,...,0.00802,0.01409,1.694915,0.856531,0.3575,0.072241,0.634,0.483429,-42.310856,speech
2,23,./speech/Actor_23/03-01-06-02-01-02-23.wav,3,1,6,2,1,2,./speech/Actor_23/03-01-06-02-01-02-23.wav,0 days,...,0.014311,0.014765,2.339181,0.593472,0.635,0.335,0.683333,0.449024,-34.312801,speech
3,23,./speech/Actor_23/03-01-07-01-01-02-23.wav,3,1,7,1,1,2,./speech/Actor_23/03-01-07-01-01-02-23.wav,0 days,...,0.011443,0.021647,2.52809,1.424501,0.18,0.0998,0.416667,0.422834,-44.573288,speech
4,23,./speech/Actor_23/03-01-03-01-01-02-23.wav,3,1,3,1,1,2,./speech/Actor_23/03-01-03-01-01-02-23.wav,0 days,...,0.011666,0.009868,2.153846,0.625,0.57,0.32,0.67,0.452548,-36.173256,speech
