# Speech Feature Extraction using OpenSMILE (GeMAPSv01b + ComParE config)

In [1]:
%load_ext autoreload
%autoreload 2

from pathlib import Path
from typing import List, Optional, Tuple
import traceback
from multiprocess import Pool

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# opensmile
import torchaudio
import opensmile


In [11]:
# configure user
# `user` selects the machine-specific data root.
# `extracted_feats = True` skips the feature-extraction cells further down
# (they are guarded by `if not extracted_feats`).
user = "jonas"
extracted_feats = False

if user.lower() == "jonas":
    BASE_PATH = Path("/media/uz_study/EEGstudy2_idlab_cloud")
elif user.lower() == "mitchel":
    #     BASE_PATH = Path("Z:/shares/ghep_lab/2021_VanhollebekeKappen_EEGStudy2_MIST_Cyberball_Audio/")
    BASE_PATH = Path("D:/Data/EEG_Study_2")
else:
    # Fail here with a clear message instead of a NameError on BASE_PATH below.
    raise ValueError(f"Unknown user {user!r}; expected 'jonas' or 'mitchel'")
DATA_PATH = BASE_PATH.joinpath("Data/Raw/Audio")


# Extracting features

useful links:
* [opensmile config folder](https://github.com/audeering/opensmile/tree/v3.0.0/config)
* differences between the GeMAPS versions: [CHANGES.txt](https://github.com/audeering/opensmile/blob/v3.0.0/config/gemaps/CHANGES.txt)

**note**: `eGeMAPS` is an _extended_ version of the GeMAPS

`feature_level`:
* `Functionals`: global, segment-based features (one row of features per segment)
* `LowLevelDescriptors`: sliding-window features (one row of features per window)

In [3]:
# Feature extraction configs: two extractors that share the GeMAPSv01b feature
# set and differ only in the feature level.
#   func_gemaps -> FeatureLevel.Functionals
#   lld_gemaps  -> FeatureLevel.LowLevelDescriptors
func_gemaps, lld_gemaps = (
    opensmile.Smile(
        feature_set=opensmile.FeatureSet.GeMAPSv01b,
        feature_level=level,
    )
    for level in (
        opensmile.FeatureLevel.Functionals,
        opensmile.FeatureLevel.LowLevelDescriptors,
    )
)


In [9]:
# ---------- Helper functions ----------
def _parse_feat_df(df_feat: pd.DataFrame, wav_path: Path) -> pd.DataFrame:
    df_feat["file"] = df_feat["file"].astype(str)
    df_feat["fileName"] = wav_path.name

    ppt_id, task = wav_path.parent.name.split("_")[1:3]
    df_feat["participantNum"] = int(ppt_id)
    df_feat["taskType"] = task
    desc_type, exp_phase = wav_path.stem.split("_")[1:3]
    # description type [picture or referential]
    df_feat["descriptionType"] = desc_type
    # experimental phase [baseline, control or stress]
    df_feat["experimentPhase"] = exp_phase
    return df_feat


def _extract_parse_smile_resample(s: opensmile.Smile, wav_path: Path) -> pd.DataFrame:
    """Extract openSMILE features from a mono, 16 kHz version of ``wav_path``.

    The file is loaded with ``torchaudio.load(..., normalize=True)``, down-mixed
    to one channel, resampled to 16 kHz and passed to ``s.process_signal``.
    The index of the resulting frame is moved into columns, and the frame is
    annotated by ``_parse_feat_df``.

    Args:
        s: the configured openSMILE extractor.
        wav_path: path of the wav file to process.

    Returns:
        The annotated feature frame for this file.
    """
    target_fs = 16_000

    wav_orig, fs_orig = torchaudio.load(wav_path, normalize=True)
    # torchaudio returns a (channels, time) tensor. Average the channels first:
    # calling ravel() on a multi-channel array would append the channels one
    # after another and feed openSMILE a signal that is N times too long.
    # For a single-channel file this leaves the samples unchanged.
    wav_mono = wav_orig.mean(dim=0, keepdim=True)

    arr_16khz_n = (
        torchaudio.functional.resample(
            wav_mono, orig_freq=fs_orig, new_freq=target_fs
        )
        .numpy()
        .ravel()
    )

    df_feat = s.process_signal(
        signal=arr_16khz_n,
        sampling_rate=target_fs,
        file=str(wav_path),
    ).reset_index(drop=False)
    return _parse_feat_df(df_feat, wav_path=wav_path)


def _extract_parse_smile(s: opensmile.Smile, wav_path: Path) -> pd.DataFrame:
    """Run extractor ``s`` directly on the wav file and annotate the result.

    The index of the frame returned by ``s.process_file`` is moved into regular
    columns before the frame is handed to ``_parse_feat_df``.
    """
    df_indexed = s.process_file(file=wav_path)
    df_flat = df_indexed.reset_index(drop=False)
    return _parse_feat_df(df_flat, wav_path=wav_path)


def _parse_concat_df(df_conc: pd.DataFrame) -> pd.DataFrame:
    df_conc["participantNum"] = df_conc["participantNum"].astype(int)
    for c in ["taskType", "descriptionType", "experimentPhase", "file", "fileName"]:
        df_conc[c] = df_conc[c].astype("category")
    df_conc["start"] = df_conc["start"].dt.total_seconds()
    df_conc["end"] = df_conc["end"].dt.total_seconds()
    return df_conc


## On the raw WAV data

In [5]:
def _extract_opensmile(file) -> Tuple[pd.DataFrame, ...]:
    """Extract GeMAPS features from the raw wav ``file``.

    Returns a ``(functionals, low-level descriptors)`` pair of frames, produced
    by ``func_gemaps`` and ``lld_gemaps`` respectively.
    """
    df_functionals = _extract_parse_smile(func_gemaps, wav_path=file)
    df_llds = _extract_parse_smile(lld_gemaps, wav_path=file)
    return df_functionals, df_llds


In [7]:
# Extract functionals + LLDs from every raw wav file in parallel and store the
# concatenated frames as parquet files next to the audio data.
if not extracted_feats:
    arr_files = list(DATA_PATH.glob("*_speech/audio_*.wav"))
    print(len(arr_files))
    if not arr_files:
        # pd.concat([]) below would fail with an unrelated message otherwise.
        raise FileNotFoundError(
            f"No files matching '*_speech/audio_*.wav' found under {DATA_PATH}"
        )

    out: List = []
    with Pool(processes=8) as pool:
        results = pool.imap_unordered(_extract_opensmile, arr_files)
        results = tqdm(results, total=len(arr_files))
        try:
            out = list(results)
        except BaseException:
            # Stop the workers and surface the real error (also on Ctrl-C).
            # Swallowing it would leave `out` empty and make the concat below
            # fail with a misleading "No objects to concatenate".
            pool.terminate()
            raise
        finally:
            pool.close()
            pool.join()

    df_fnc_gemaps = _parse_concat_df(pd.concat([o[0] for o in out], ignore_index=True))
    df_lld_gemaps = _parse_concat_df(pd.concat([o[1] for o in out], ignore_index=True))
    df_fnc_gemaps.to_parquet(DATA_PATH / "df_gemaps_func.parquet", engine="fastparquet")
    df_lld_gemaps.to_parquet(DATA_PATH / "df_gemaps_lld.parquet", engine="fastparquet")
    # Free the large intermediates; kernel memory persists across cells.
    del df_lld_gemaps, df_fnc_gemaps, out, pool, results, arr_files
else:
    print("Nothing to do")


623


  0%|          | 0/623 [00:00<?, ?it/s]

## On the 16kHz normalized data

In [10]:
def _extract_opensmile_rs(file) -> Tuple[pd.DataFrame, ...]:
    """Extract GeMAPS features from the resampled (16 kHz) version of ``file``.

    Returns a ``(functionals, low-level descriptors)`` pair of frames, produced
    by ``func_gemaps`` and ``lld_gemaps`` respectively.
    """
    df_functionals = _extract_parse_smile_resample(func_gemaps, wav_path=file)
    df_llds = _extract_parse_smile_resample(lld_gemaps, wav_path=file)
    return df_functionals, df_llds


# Extract functionals + LLDs from the 16 kHz resampled audio in parallel and
# store the concatenated frames as parquet files next to the audio data.
if not extracted_feats:
    arr_files = list(DATA_PATH.glob("*_speech/audio_*.wav"))
    print(len(arr_files))
    if not arr_files:
        # pd.concat([]) below would fail with an unrelated message otherwise.
        raise FileNotFoundError(
            f"No files matching '*_speech/audio_*.wav' found under {DATA_PATH}"
        )

    out: List = []
    with Pool(processes=8) as pool:
        results = pool.imap_unordered(_extract_opensmile_rs, arr_files)
        results = tqdm(results, total=len(arr_files))
        try:
            out = list(results)
        except BaseException:
            # Stop the workers and surface the real error (also on Ctrl-C).
            # Swallowing it would leave `out` empty and make the concat below
            # fail with a misleading "No objects to concatenate".
            pool.terminate()
            raise
        finally:
            pool.close()
            pool.join()

    df_fnc_gemaps = _parse_concat_df(pd.concat([o[0] for o in out], ignore_index=True))
    df_lld_gemaps = _parse_concat_df(pd.concat([o[1] for o in out], ignore_index=True))
    df_fnc_gemaps.to_parquet(
        DATA_PATH / "df_gemaps_func_16khz.parquet", engine="fastparquet"
    )
    df_lld_gemaps.to_parquet(
        DATA_PATH / "df_gemaps_lld_16khz.parquet", engine="fastparquet"
    )
    # Free the large intermediates; kernel memory persists across cells.
    del df_lld_gemaps, df_fnc_gemaps, out, pool, results, arr_files
else:
    print("Nothing to do")


623


  0%|          | 0/623 [00:00<?, ?it/s]