# Speech Feature Extraction using OpenSMILE (GeMapsv01b + ComParE config)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
from typing import List, Optional

import opensmile
import pandas as pd
from IPython.display import display
from tqdm.auto import tqdm

from speech_study.path_conf import loc_data_dir
from speech_study.audio_ptcpt_filtering import add_audio_mask


In [3]:
# configure user
USER = "jonas"
EXTRACTED_FEATS = False  # set this to False if you want to extract the features again

if USER.lower() == "jonas":
    BASE_PATH = Path("/users/jonvdrdo/jonas/data/aaa_contextaware/raw/uz_study/")
elif USER.lower() == "mitchel":
    BASE_PATH = "D:/Data/EEG_Study_1/"  # Go check 1_audio_preprocess.py
else:
    raise ValueError("unknown user")

DATA_PATH = BASE_PATH.joinpath("aligned_data")


# Extracting features

useful links:
* [opensmile config folder](https://github.com/audeering/opensmile/tree/v3.0.0/config)
* difference between GeMAPS versions [here](https://github.com/audeering/opensmile/blob/v3.0.0/config/gemaps/CHANGES.txt')

**note**: `eGeMAPS` is an _extended_ version of the GeMAPS

feature-level`
* `Functionals`: global segment based features (1 feature per segment)
* `LowLevelDescriptor`: sliding window features (1 feature per window)

In [4]:
func_gemaps = opensmile.Smile(
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
    feature_level=opensmile.FeatureLevel.Functionals,
)

lld_gemaps = opensmile.Smile(
    feature_set=opensmile.FeatureSet.GeMAPSv01b,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)

# we will use the ComParE LLD to calculate frequency-based features on `F0final_sma`
lld_compare = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors,
)


In [5]:
if not EXTRACTED_FEATS:
    df_gemaps_func_list: List[pd.DataFrame] = []
    df_gemaps_lld_list: List[pd.DataFrame] = []
    df_compare_lld_list: List[pd.DataFrame] = []

    def _extract_parse_smile_df(s: opensmile.Smile, f: Path) -> pd.DataFrame:
        """process the audio file with the given OpenSmile conf & add relevant metadata.

        Parameters
        ----------
        s: opensmile.Smile
            The OpenSmile feature extraction conf
        f: Union[Path, str]
            The file that will be processed

        Returns
        -------
        pd.DataFrame
            The DataFrame output of the processed file with additional metadata,
            i.e., the "fileName", the "fileNumber", and the "participantNum"

        """
        # 1. process the audio file with the given opensmile conf
        df_feat = s.process_file(str(f))
        df_feat = df_feat.reset_index(drop=False)
        # 2. add metadata; i.e., file-path, filename, file number, participant number
        df_feat["file"] = df_feat["file"].astype("str")

        df_feat["fileName"] = f.name
        df_feat["fileNum"] = pd.to_numeric(
            df_feat.fileName.map(lambda x: x.split(".")[0].split("_")[-1]),
            errors="coerce",
        )
        df_feat["participantNum"] = int(f.parent.parent.name)
        return df_feat

    # iterate over all the audio files
    for file in tqdm(list(DATA_PATH.glob("*/audio/sentences_*.wav"))):
        # calculate the global utterance features
        df_gemaps_func_list.append(_extract_parse_smile_df(func_gemaps, f=file))

        # calculate sliding window based utterance features
        df_gemaps_lld_list.append(_extract_parse_smile_df(lld_gemaps, f=file))
        df_compare_lld_list.append(_extract_parse_smile_df(lld_compare, f=file))

    def _parse_concat_df(df_conc: pd.DataFrame) -> pd.DataFrame:
        """parse a just-concatenated DataFrame."""
        # Convert columns to categorical typ
        df_conc["file"] = df_conc["file"].astype("category")
        df_conc["fileName"] = df_conc["fileName"].astype("category")

        # Merge the OpenSmile feature dataframe with the SAM output
        return pd.merge(
            df_conc,
            pd.read_csv(loc_data_dir.joinpath("SAMsCompiled.csv")),
            on=["participantNum", "fileNum"],
        )

    df_gemaps_func = _parse_concat_df(pd.concat(df_gemaps_func_list, ignore_index=True))
    df_gemaps_lld = _parse_concat_df(pd.concat(df_gemaps_lld_list, ignore_index=True))
    df_compare_lld = _parse_concat_df(pd.concat(df_compare_lld_list, ignore_index=True))

    # Save the dataframes
    # As the gemaps dataframe is used in further analysis, we already apply a 
    # filter to it
    add_audio_mask(df_gemaps_func)
    df_gemaps_func[~df_gemaps_func.bad_audio].reset_index(drop=True).to_parquet(
        loc_data_dir.joinpath("df_gemaps_func.parquet"), engine="fastparquet"
    )
    df_gemaps_lld.to_parquet(
        loc_data_dir.joinpath("df_gemaps_lld.parquet"), engine="fastparquet"
    )
    df_compare_lld.to_parquet(
        loc_data_dir.joinpath("df_compare_lld.parquet"), engine="fastparquet"
    )

    # Delete future unused variables
    del (
        file,
        df_gemaps_func_list,
        df_gemaps_lld_list,
        df_compare_lld_list,
        _parse_concat_df,
        _extract_parse_smile_df,
    )

else:  # Already extracted features -> load the dataframes
    df_gemaps_func = pd.read_parquet(loc_data_dir.joinpath("df_gemaps_func.parquet"))
    df_gemaps_lld = pd.read_parquet(loc_data_dir.joinpath("df_gemaps_lld.parquet"))
    df_compare_lld = pd.read_parquet(loc_data_dir.joinpath("df_compare_lld.parquet"))


  0%|          | 0/794 [00:00<?, ?it/s]

## Extract `F0`-range from the LLD's

use `tsflex` to extract FO-quantile-range based features from the LLD's

In [7]:
from tsflex.features import FuncWrapper
from tsflex.features.utils import make_robust
import numpy as np


In [8]:
# ------------- define some additional feature functions -------------
def quantile_nz(a: np.ndarray, q=List[float]) -> List[Optional[float]]:
    """non-zero quantiles"""
    a_nz = a[a > 0]
    if len(a_nz):
        return np.quantile(a_nz, q=q)
    else:
        return [None] * len(q)


def nonzero_count(a: np.ndarray) -> int:
    return sum(a > 0)


def return_func_series_list(a: np.ndarray, f_list: List[FuncWrapper]) -> pd.Series:
    """apply each function on a and return a pd.Series with index `feat_name`
    and value `feat_value`.
    """
    s = pd.Series(dtype="float64")
    for f in f_list:
        s = pd.concat([s, pd.Series(data=f(a), index=f.output_names)])
    return s.sort_index()


qs = sum([[1 - q, q] for q in [0, 0.01, 0.02, 0.03, 0.05, 0.1, 0.15, 0.2]], []) + [0.5]
display(str(qs))


'[1, 0, 0.99, 0.01, 0.98, 0.02, 0.97, 0.03, 0.95, 0.05, 0.9, 0.1, 0.85, 0.15, 0.8, 0.2, 0.5]'

### GeMAPS LLD

Calculate the F0-range features on the gemaps LLD's

In [9]:
df_gemaps_lld.filter(like="F0").columns
# logRelF0-H1-H2_sma3nz -> log freq difference between the harmonics
# SMA -> moving average window
# NZ -> no-zero


Index(['F0semitoneFrom27.5Hz_sma3nz', 'logRelF0-H1-H2_sma3nz',
       'logRelF0-H1-A3_sma3nz', 'F1amplitudeLogRelF0_sma3nz',
       'F2amplitudeLogRelF0_sma3nz', 'F3amplitudeLogRelF0_sma3nz'],
      dtype='object')

In [10]:
# define the signal on which the function will be performed, and the functions
s_name = "F0semitoneFrom27.5Hz_sma3nz"

f_gemaps_lld_funcs: List[FuncWrapper] = [
    make_robust(
        FuncWrapper(quantile_nz, output_names=[s_name + f"_q={q}" for q in qs], q=qs),
        min_nb_samples=3,
        passthrough_nans=False,
    ),
    make_robust(FuncWrapper(nonzero_count, output_names=[f"{s_name}_nzcount"])),
]

# Apply the functions on each group
df_gemaps_lld_F0 = (
    df_gemaps_lld.groupby(by=["fileName", "fileNum", "participantNum"])[[s_name]]
    .apply(lambda x: return_func_series_list(x.values, f_list=f_gemaps_lld_funcs))
    .reset_index()
)
display(df_gemaps_lld_F0)

df_gemaps_lld_F0.to_parquet(loc_data_dir.joinpath("df_gemaps_lld_F0.parquet"))


Unnamed: 0,fileName,fileNum,participantNum,F0semitoneFrom27.5Hz_sma3nz_nzcount,F0semitoneFrom27.5Hz_sma3nz_q=0,F0semitoneFrom27.5Hz_sma3nz_q=0.01,F0semitoneFrom27.5Hz_sma3nz_q=0.02,F0semitoneFrom27.5Hz_sma3nz_q=0.03,F0semitoneFrom27.5Hz_sma3nz_q=0.05,F0semitoneFrom27.5Hz_sma3nz_q=0.1,...,F0semitoneFrom27.5Hz_sma3nz_q=0.2,F0semitoneFrom27.5Hz_sma3nz_q=0.5,F0semitoneFrom27.5Hz_sma3nz_q=0.8,F0semitoneFrom27.5Hz_sma3nz_q=0.85,F0semitoneFrom27.5Hz_sma3nz_q=0.9,F0semitoneFrom27.5Hz_sma3nz_q=0.95,F0semitoneFrom27.5Hz_sma3nz_q=0.97,F0semitoneFrom27.5Hz_sma3nz_q=0.98,F0semitoneFrom27.5Hz_sma3nz_q=0.99,F0semitoneFrom27.5Hz_sma3nz_q=1
0,sentences_occ_0.wav,0.0,2,1277.0,12.435101,13.466747,13.814878,14.208255,15.278906,19.285241,...,23.580828,33.262379,34.514874,34.700854,34.983155,35.451363,35.759081,36.293914,37.114670,37.659714
1,sentences_occ_0.wav,0.0,3,1412.0,12.439996,13.128016,13.414092,13.858814,14.567879,16.099171,...,24.109766,34.545498,35.626716,35.856668,36.069822,36.454732,36.764560,37.275663,37.682383,38.439503
2,sentences_occ_0.wav,0.0,4,1425.0,12.037051,12.537432,12.777922,12.877586,13.102240,13.631784,...,19.587413,23.304682,24.538245,24.755706,25.053704,25.609200,25.926469,26.324248,26.992587,34.139542
3,sentences_occ_0.wav,0.0,5,1460.0,12.260077,13.294396,13.548588,13.964571,14.831079,18.212867,...,32.812402,34.778267,36.406091,36.671796,37.035429,37.782782,37.973423,38.117235,38.503780,39.581223
4,sentences_occ_0.wav,0.0,6,1387.0,12.124546,12.720390,13.350441,13.605247,14.101310,18.546955,...,30.957187,33.413910,36.554290,37.356292,37.810281,38.729762,39.351277,39.905043,40.727717,41.366283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9238,sentences_occ_99.wav,8.0,79,,,,,,,,...,,,,,,,,,,
9239,sentences_occ_99.wav,8.0,80,,,,,,,,...,,,,,,,,,,
9240,sentences_occ_99.wav,8.0,81,,,,,,,,...,,,,,,,,,,
9241,sentences_occ_99.wav,8.0,82,,,,,,,,...,,,,,,,,,,


### ComPaRE LLD

Calculate the F0-range features using the COMPARE LLD's.

In [11]:
df_compare_lld.filter(like="F0").columns


Index(['F0final_sma'], dtype='object')

In [12]:
s_name = "F0final_sma"

f_compare_lld_funcs: List[FuncWrapper] = [
    make_robust(
        FuncWrapper(quantile_nz, output_names=[s_name + f"_q={q}" for q in qs], q=qs),
        min_nb_samples=3,
        passthrough_nans=False,
    ),
    make_robust(FuncWrapper(nonzero_count, output_names=[f"{s_name}_nzcount"])),
]

df_compare_lld_F0 = df_compare_lld.groupby(
    by=["fileName", "fileNum", "participantNum"]
)[[s_name]].apply(
    lambda x: return_func_series_list(x.values, f_list=f_compare_lld_funcs)
)
display(df_compare_lld_F0)

df_compare_lld_F0.to_parquet(loc_data_dir.joinpath("df_compare_lld_F0.parquet"))


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,F0final_sma_nzcount,F0final_sma_q=0,F0final_sma_q=0.01,F0final_sma_q=0.02,F0final_sma_q=0.03,F0final_sma_q=0.05,F0final_sma_q=0.1,F0final_sma_q=0.15,F0final_sma_q=0.2,F0final_sma_q=0.5,F0final_sma_q=0.8,F0final_sma_q=0.85,F0final_sma_q=0.9,F0final_sma_q=0.95,F0final_sma_q=0.97,F0final_sma_q=0.98,F0final_sma_q=0.99,F0final_sma_q=1
fileName,fileNum,participantNum,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
sentences_occ_0.wav,0.0,2,1305.0,52.270561,55.465402,58.989787,60.276768,62.634573,75.496953,94.804201,104.724744,187.486130,201.684525,204.056589,207.396536,212.945419,216.872056,223.309805,234.459472,242.135727
sentences_occ_0.wav,0.0,3,1438.0,52.501904,55.533387,58.160414,59.298107,61.579994,68.426038,79.080122,102.008745,201.861099,215.115482,218.025338,220.745692,225.795508,229.817769,236.702899,242.387048,253.291046
sentences_occ_0.wav,0.0,4,1484.0,52.134758,53.612640,54.619430,55.355605,56.663897,58.621768,60.349733,66.947612,104.856693,112.976273,114.692679,116.485724,120.628426,122.790532,125.235626,130.250058,197.712845
sentences_occ_0.wav,0.0,5,1484.0,52.093735,54.789709,56.277618,58.652993,61.590350,70.324927,103.999651,180.025381,204.573944,225.088889,228.420805,233.426773,243.818459,246.425437,248.622313,254.159949,270.558350
sentences_occ_0.wav,0.0,6,1445.0,52.266987,53.462184,55.384617,56.776181,58.894749,63.552700,84.447299,149.108212,186.940353,224.350107,237.027103,243.401270,257.295056,266.497603,275.378060,288.549900,299.944214
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sentences_occ_99.wav,8.0,79,,,,,,,,,,,,,,,,,,
sentences_occ_99.wav,8.0,80,,,,,,,,,,,,,,,,,,
sentences_occ_99.wav,8.0,81,,,,,,,,,,,,,,,,,,
sentences_occ_99.wav,8.0,82,,,,,,,,,,,,,,,,,,


In [13]:
del df_compare_lld, df_gemaps_lld


# Join into one big dataframe

In [14]:
df_feat_tot = df_gemaps_func.merge(
    df_gemaps_lld_F0.reset_index(), on=["participantNum", "fileNum", "fileName"]
).merge(df_compare_lld_F0.reset_index(), on=["participantNum", "fileNum", "fileName"])
df_feat_tot


Unnamed: 0,file,start,end,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,...,F0final_sma_q=0.2,F0final_sma_q=0.5,F0final_sma_q=0.8,F0final_sma_q=0.85,F0final_sma_q=0.9,F0final_sma_q=0.95,F0final_sma_q=0.97,F0final_sma_q=0.98,F0final_sma_q=0.99,F0final_sma_q=1
0,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:22.759541667,21.179815,0.180716,18.353466,22.343269,24.074480,5.721014,190.196594,...,65.196510,99.138233,110.025970,112.053701,114.192257,117.051977,119.368555,121.879068,124.523561,142.359802
1,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:22.959541667,22.066591,0.160472,21.380507,22.858728,24.399647,3.019140,221.611267,...,91.588757,102.468681,112.185510,114.339508,117.557909,120.699886,123.230260,125.037055,135.241089,185.964005
2,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:22.879541667,22.135927,0.156605,21.288734,22.825014,24.613152,3.324417,216.903305,...,90.732471,102.212540,113.653793,116.742743,119.183051,122.307113,125.257780,127.775723,130.264922,149.820251
3,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:23.079541667,21.619080,0.194843,20.401165,22.689894,24.232353,3.831188,171.961365,...,70.690057,101.297112,110.718977,114.317660,116.602289,120.677321,123.679845,124.687764,127.257471,132.555847
4,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:24.879541667,21.188087,0.204497,16.445494,22.498318,24.085922,7.640429,347.091370,...,61.879944,99.730675,109.675598,111.428383,113.501854,116.884407,119.938060,122.286217,126.239799,150.601013
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
706,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:24.399541667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
707,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:22.519541667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
708,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:21.399541667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,
709,/users/jonvdrdo/jonas/data/aaa_contextaware/ra...,0 days,0 days 00:00:21.519541667,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,,,,,,,,,,


In [19]:
# save the features
df_feat_tot.to_feather(loc_data_dir.joinpath("df_speech_feat_tot.feather"))
