# Feature Extraction from Speech Data with openSMILE and the GeMAPS Configuration

In [1]:
import glob
import os
import subprocess
import numpy as np
import pandas as pd
from scipy.io import arff
from io import StringIO
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler, PowerTransformer, QuantileTransformer, Normalizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from joblib import dump, load
from sklearn.svm import SVC
from sklearn.feature_selection import f_classif
from joblib import dump, load
from pathlib import Path

In [2]:
# Root folder of the recordings; the extraction step globs it for "*/*.wav".
DATA_PATH = "C:/Users/mitch/OneDrive - UGent/UGent/Projects/5. VOP project/Voice_and_App/"
# openSMILE configuration file. The value carries its own double quotes because
# it is formatted verbatim into a command-line string.
OPENSMILE_CONFIG = '"C:/Users/mitch/OneDrive - UGent/UGent/Topics/Facial/OpenSmile/opensmile-2.3.0/config/gemaps/GeMAPSv01a.conf"'

# Folders that receive / provide the extracted .arff files, one per condition.
FEATURES_PATH = f"{DATA_PATH}features_emobase/"
STRESSED_PATH = f"{FEATURES_PATH}stressed/"
UNSTRESSED_PATH = f"{FEATURES_PATH}unstressed/"

# Extracting features

In [3]:
## Only run this cell ONCE (set RUN_EXTRACTION back to False once the features are extracted)
RUN_EXTRACTION = False

if RUN_EXTRACTION:
    # Create the output folders if they do not exist yet
    Path(STRESSED_PATH).mkdir(parents=True, exist_ok=True)
    Path(UNSTRESSED_PATH).mkdir(parents=True, exist_ok=True)

    # Recording name -> output folder ("stressed" = post measures, "unstressed" = pre measures)
    OUTPUT_DIRS = {"stressed": STRESSED_PATH, "unstressed": UNSTRESSED_PATH}

    def extract(source, destination):
        """Run SMILExtract on one recording and return its exit code.

        Parameters
        ----------
        source : str
            Path of the input .wav file, already wrapped in double quotes.
        destination : str
            Path of the output .arff file, already wrapped in double quotes.

        Returns
        -------
        int
            Exit code of the SMILExtract process (a warning is printed when it is non-zero).
        """
        # Generate extraction command
        cmd = "C:/Users/mitch/OneDrive/Documents/UGent/Topics/Facial/OpenSmile/opensmile-2.3.0/SMILExtract_Release -C {} -I {} -O {}".format(OPENSMILE_CONFIG, source, destination)
        # stdout is discarded instead of piped: a pipe that is never read can fill
        # up and block the child process forever while we wait for it.
        completed = subprocess.run(cmd, stdout=subprocess.DEVNULL, shell=True)
        if completed.returncode != 0:
            print("SMILExtract exited with code {} for {}".format(completed.returncode, source))
        return completed.returncode

    last_out_path = None
    for wav_file in glob.glob(DATA_PATH + "*/*.wav"):
        # The folder that directly contains the recording identifies the participant
        participant_dir = os.path.basename(os.path.dirname(wav_file))
        name = os.path.basename(wav_file).split(".")[0]

        path = OUTPUT_DIRS.get(name)
        if path is None:
            # Neither "stressed" nor "unstressed": skip it instead of reusing the
            # output folder chosen for the previous file.
            continue

        out_path = path + name + "_" + participant_dir + ".arff"

        # Quote both paths because they contain spaces
        wav_file = '"' + wav_file.replace(os.sep, '/') + '"'
        out_path = '"' + out_path + '"'

        extract(wav_file, out_path)
        last_out_path = out_path

    # Show the last file written (nothing to show when no recording was found)
    if last_out_path is not None:
        print(last_out_path)

# Load in data

In [4]:
def arff_to_dataframe(path):
    """Load one openSMILE ARFF file into a DataFrame.

    Parameters
    ----------
    path : str
        Path of an ``.arff`` file. The participant id is read from the base
        name: the text after the last underscore, up to the first dot.

    Returns
    -------
    pandas.DataFrame
        The rows parsed from the file plus two extra columns: ``id`` (the id
        taken from the file name, as a string) and ``file`` (the base name of
        ``path``). If the file holds more than one row, the second row is dropped.
    """
    file_name = os.path.basename(path)
    did = file_name.split('_')[-1].split('.')[0]
    with open(path, "r") as handle:
        arff_content = handle.read()

    # scipy ARFF implementation doesn't support non-numeric attributes so remove them (aren't informative anyways)...
    # This is just a string replace, a little hacky, should probably look for a better ARFF library
    # The tokens are removed one after another, in this order.
    unsupported_tokens = (
        "'noname',",
        "@attribute name string",
        "'liveturn_0',",
        ",unassigned",
        "@attribute class numeric",
        "@attribute emotion unknown",
        ",?",
        "'unknown',",
    )
    for token in unsupported_tokens:
        arff_content = arff_content.replace(token, "")

    records, _meta = arff.loadarff(StringIO(arff_content))
    data = pd.DataFrame(records)
    data["id"] = did
    # Use the file that was actually parsed, not whatever a caller's loop variable happens to hold
    data["file"] = file_name
    if data.shape[0] > 1:
        data = data.drop(data.index[1])
    return data

In [5]:
# Parse every extracted ARFF file into its own DataFrame, one list per condition
stressed_datas = []
unstressed_datas = []

for stressed_file in glob.glob(STRESSED_PATH + "*.arff"):
    stressed_datas.append(arff_to_dataframe(stressed_file))

for unstressed_file in glob.glob(UNSTRESSED_PATH + "*.arff"):
    unstressed_datas.append(arff_to_dataframe(unstressed_file))

# One frame per condition, tagged with its label and an integer participant id
stressed = pd.concat(stressed_datas)
stressed['stressed'] = 1
stressed["id"] = stressed["id"].astype(int)

unstressed = pd.concat(unstressed_datas)
unstressed['stressed'] = 0
unstressed["id"] = unstressed["id"].astype(int)

# Stack both conditions; rows are ordered by id ascending and, within one id,
# by label descending (the stressed row comes before the unstressed one).
data = (
    pd.concat([unstressed, stressed])
    .sort_values(['id', 'stressed'], ascending=[True, False])
    .reset_index(drop=True)
)
# Remove rows carrying the label -1
rows_with_minus_one = data.loc[data["stressed"] == -1].index
data = data.drop(rows_with_minus_one)

# Move the bookkeeping columns out of the feature table
ids = data.pop('id')
files = data.pop('file')
labels = data.pop('stressed').values

# Select relevant features

In [6]:
# Derived feature: mean first-formant frequency divided by mean second-formant frequency
f1_mean = data['F1frequency_sma3nz_amean']
f2_mean = data['F2frequency_sma3nz_amean']
data['F1F2Ratio'] = f1_mean / f2_mean

# Features kept for the rest of the analysis
selected_features = [
    'F0semitoneFrom27.5Hz_sma3nz_amean',
    'jitterLocal_sma3nz_amean',
    'F1F2Ratio',
    'HNRdBACF_sma3nz_amean',
    'MeanVoicedSegmentLengthSec',
    'VoicedSegmentsPerSec',
]
cleanData = pd.DataFrame(data[selected_features])

In [7]:
# Split the feature rows into PRE (unstressed, label 0) and POST (stressed, label 1).
# The split uses each row's condition label rather than its position: the rows were
# sorted with the stressed recording first within every participant, so an
# "odd row = pre" rule would swap the two conditions.
is_post = np.asarray(labels) == 1

dataPre = cleanData.loc[~is_post].copy()
dataPost = cleanData.loc[is_post].copy()

# Participant number of every row, kept as a separate column
idPre = ids[~is_post].tolist()
idPost = ids[is_post].tolist()

# The later steps line the two tables up row by row, so both must list the same
# participants in the same order.
if idPre != idPost:
    raise ValueError(
        "Pre and post recordings do not pair up: every participant needs exactly "
        "one unstressed and one stressed recording."
    )

dataPre.insert(0, "ID", idPre)
dataPost.insert(0, "ID", idPost)

# Keep the original row label as an "index" column and renumber the rows from 0
dataPre = dataPre.reset_index()
dataPost = dataPost.reset_index()

dataPre.to_csv("../../Data/geMapsFeatures_pre.csv", index=False)
dataPost.to_csv("../../Data/geMapsFeatures_post.csv", index=False)

In [8]:
# Build the delta table (post minus pre, column by column) and then mark the
# feature columns of both tables with a PRE_ / POST_ prefix.
first_feature_position = 2  # the first two columns are skipped (IDs)
pre_feature_names = list(dataPre.columns)[first_feature_position:]
post_feature_names = list(dataPost.columns)[first_feature_position:]

dataDelta = pd.DataFrame()
for position, feature_name in enumerate(pre_feature_names, start=first_feature_position):
    # Delta columns are named after the (not yet prefixed) pre column
    dataDelta['Delta_' + feature_name] = dataPost.iloc[:, position] - dataPre.iloc[:, position]

dataPre.rename(columns={name: 'PRE_' + name for name in pre_feature_names}, inplace=True)
dataPost.rename(columns={name: 'POST_' + name for name in post_feature_names}, inplace=True)

In [9]:
# One wide table: pre columns, post columns and delta columns next to each other
tables_side_by_side = [dataPre, dataPost, dataDelta]
dataPrePost = pd.concat(tables_side_by_side, axis="columns")

# Save the combined table
dataPrePost.to_csv("../../Data/speechFeatures_geMaps_PrePostDelta.csv", index=False)

# Merge voice feature data with other data

In [10]:
# Load other data - questionnaires - mood - demographics - EDA
def _comma_decimal_to_numeric(frame):
    """Replace ',' by '.' in every cell of `frame` and convert the columns to numbers.

    Cells that cannot be converted become NaN (errors='coerce').
    """
    with_points = frame.stack().str.replace(',', '.').unstack()
    return with_points.apply(pd.to_numeric, errors='coerce')


otherData = pd.read_csv(
    "C:/Users/mitch/OneDrive - UGent/UGent/Projects/5. VOP project/Paper/Submissions/Main folder/Upload/Data/dataKristof.csv",
    sep=";")

# Demographics, used as they are
df1 = pd.DataFrame(otherData[['ID', 'AGE', 'GENDER', 'KNOWS_EXPERIMENT_GOAL', 'RIGHTHANDED']])

# Column families selected by name pattern and converted to numbers
df2 = _comma_decimal_to_numeric(otherData.filter(regex='DASS'))
df3 = _comma_decimal_to_numeric(otherData.filter(regex='RRS'))
df4 = _comma_decimal_to_numeric(otherData.filter(regex='SCL'))

# Mood before / after, plus the change between the two
df5 = _comma_decimal_to_numeric(pd.DataFrame(otherData[['Affect2', 'Affect3']]))
df5 = df5.rename(columns={"Affect2": "MoodPre", "Affect3": "MoodPost"})
df5["VAS_StressReactivity"] = df5["MoodPost"] - df5["MoodPre"]

# Negative affect before / after, plus the change between the two
df6 = _comma_decimal_to_numeric(pd.DataFrame(otherData[['NA2', 'NA3']]))
df6 = df6.rename(columns={"NA2": "NegAffPre", "NA3": "NegAffPost"})
df6["NA_Reactivity"] = df6["NegAffPost"] - df6["NegAffPre"]

# Single mood items: the columns suffixed 2 and 3 are renumbered to 1 and 2
mood_items = ['MOE', 'KRACHTIG', 'BOOS', 'TEVREDEN', 'GESPANNEN', 'NEERSLACHTIG', 'PRETTIG']
mood_columns = [item + '2' for item in mood_items] + [item + '3' for item in mood_items]
mood_renumbering = {}
for item in mood_items:
    mood_renumbering[item + '2'] = item + '1'
    mood_renumbering[item + '3'] = item + '2'
df7 = pd.DataFrame(otherData[mood_columns]).rename(columns=mood_renumbering)

# Put all parts next to each other, make the id numeric and store the result
joe = pd.concat([df1, df2, df3, df4, df5, df6, df7], axis=1, sort=False)
joe['ID'] = joe['ID'].apply(pd.to_numeric, errors='coerce')
joe = joe.rename(columns={'ID': 'IDS'})
joe.to_csv("../../Data/DataSeg.csv", index=False)

In [11]:
# Reload the prepared questionnaire / mood / demographics table
prepared_table = pd.read_csv("../../Data/DataSeg.csv")

# Delete last row because something strange happening here when loading in the data
strange_row_label = 148
otherData = prepared_table.drop(index=[strange_row_label])

In [12]:
# Merge all data into one table, side by side.
# NOTE(review): the two tables are lined up by row label, not by participant id -
# confirm that both list the same participants in the same order.
dataComplete = pd.concat(objs=[otherData, dataPrePost], axis="columns")

# Capitalize column names
dataComplete.columns = [str.upper(label) for label in dataComplete.columns]

# Write the merged table to csv
dataComplete.to_csv("../../Data/completeData.csv", index=False)

In [13]:
# Show the reloaded questionnaire table as a visual check
otherData

Unnamed: 0,ID,AGE,GENDER,KNOWS_EXPERIMENT_GOAL,RIGHTHANDED,DASS1,DASS2,DASS3,DASS5,DASS6,...,GESPANNEN1,NEERSLACHTIG1,PRETTIG1,MOE2,KRACHTIG2,BOOS2,TEVREDEN2,GESPANNEN2,NEERSLACHTIG2,PRETTIG2
0,2.0,22,Vrouw,FALSE,TRUE,2.0,1.0,1.0,1.0,2.0,...,71,5,35,47,40,17,48,54,15,37
1,3.0,19,Man,FALSE,TRUE,1.0,1.0,0.0,1.0,2.0,...,28,0,85,35,35,0,72,50,0,49
2,4.0,27,Man,FALSE,TRUE,0.0,2.0,0.0,0.0,0.0,...,8,1,63,44,23,11,54,42,10,52
3,5.0,21,Man,FALSE,TRUE,1.0,3.0,2.0,1.0,0.0,...,19,47,40,44,44,40,61,28,40,53
4,6.0,23,Man,FALSE,TRUE,1.0,2.0,1.0,1.0,1.0,...,24,0,72,33,36,0,53,35,0,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143,154.0,58,Man,TRUE,TRUE,1.0,0.0,0.0,0.0,0.0,...,8,0,69,15,79,1,86,16,0,75
144,155.0,21,Man,FALSE,TRUE,0.0,0.0,1.0,2.0,1.0,...,8,6,78,9,100,70,70,44,29,69
145,156.0,21,Man,FALSE,TRUE,0.0,1.0,0.0,0.0,0.0,...,4,2,54,2,4,1,54,12,2,50
146,157.0,36,Vrouw,FALSE,TRUE,0.0,0.0,0.0,0.0,1.0,...,4,0,79,55,65,0,70,30,3,55


In [16]:
# List the column names of the merged table
dataComplete.columns

Index(['ID', 'AGE', 'GENDER', 'KNOWS_EXPERIMENT_GOAL', 'RIGHTHANDED', 'DASS1',
       'DASS2', 'DASS3', 'DASS5', 'DASS6', 'DASS7', 'DASS8', 'DASS9', 'DASS10',
       'DASS11', 'DASS12', 'DASS13', 'DASS14', 'DASS15', 'DASS16', 'DASS17',
       'DASS18', 'DASS19', 'DASS20', 'DASS21', 'DASS4_IMPUTED', 'DASS_ANXIETY',
       'DASS_DEPRESSION', 'DASS_STRESS', 'RRS1', 'RRS3', 'RRS4', 'RRS5',
       'RRS6', 'RRS7', 'RRS8', 'RRS9', 'RRS10', 'RRS11', 'RRS12', 'RRS13',
       'RRS14', 'RRS15', 'RRS16', 'RRS17', 'RRS18', 'RRS19', 'RRS23', 'RRS24',
       'RRS25', 'RRS26', 'RRS_BROODING', 'RRS_REFLECTION',
       'RRS_TREYNOR_TOTAAL', 'MEAN_SCL_PRE_LEFT', 'MEAN_SCL_STRESS_LEFT',
       'MEAN_SCL_POST_LEFT', 'MEAN_SCL_PRE_RIGHT', 'MEAN_SCL_STRESS_RIGHT',
       'MEAN_SCL_POST_RIGHT', 'MOODPRE', 'MOODPOST', 'VAS_STRESSREACTIVITY',
       'MOE1', 'KRACHTIG1', 'BOOS1', 'TEVREDEN1', 'GESPANNEN1',
       'NEERSLACHTIG1', 'PRETTIG1', 'MOE2', 'KRACHTIG2', 'BOOS2', 'TEVREDEN2',
       'GESPANNEN2', 'NEERSL