# About

Proof of concept notebook for obtaining and preprocessing MapTask data from 
my MapTask pipeline. 

## Setup 

In [20]:
# Download libraries for environment. 

import sys 
import os 

# Env. vars to check if the notebook is running on colab, kaggle etc. 
IS_COLAB = "google.colab" in sys.modules 
IS_KAGGLE = "kaggle_secrets" in sys.modules 
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

if IS_COLAB:
    # Install the packages 
    %pip install -q -U tensorflow-addons
    %pip install -q -U transformers
    %pip install -q -U datasets
    print("You can safely ignore the package incompatibility errors.")
    # Mount the drive 
    from google.colab import drive 
    drive.mount("/drive")

In [21]:

import time
import pandas as pd
from sklearn import preprocessing 
import numpy as np
import scipy.io as io
import glob 
import shutil 
import torch 

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


In [22]:
# --  Set environment global vars. 

# Shared env. vars. 
GLOBAL_SEED = 42 
IS_CUDA_ENV = torch.cuda.is_available()
GLOBAL_DEVICE = torch.device('cuda') if IS_CUDA_ENV else torch.device('cpu')
SET_SEED = True # If true, sets the global seeds for this notebook. 

if IS_LOCAL:
    SMALL_DATASET = True if not IS_CUDA_ENV else False # Use a small dataset if no cuda env. 
    SMALL_DATASET_SIZE = 3 

if IS_COLAB:
    SMALL_DATASET = False 

In [23]:
# Configuring env. 
if SET_SEED:
    # to make this notebook's output stable across runs
    np.random.seed(GLOBAL_SEED) 
    torch.manual_seed(GLOBAL_SEED)

In [38]:
# Project Paths
NOTEBOOK_NAME = "1.0-MU-Maptask-preprocess-POC"
PROJECT_ROOT_DIR = "/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous" 
# --- Input data dirs. 
DATASET_NAME = "1.0-MU-Maptask-preprocess-POC"
DATASET_TYPE = "csv"
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "processed", DATASET_NAME)
RAW_DATA_DIR = os.path.join(PROJECT_ROOT_DIR,"data", "raw", "maptask")

# --- Result dirs. 
# NOTE: The model dir will have to change depending on where the models are stored. 
REPORTS_DIR = os.path.join(PROJECT_ROOT_DIR,"reports",NOTEBOOK_NAME)

os.makedirs(REPORTS_DIR,exist_ok=True)
os.makedirs(PROCESSED_DATA_DIR,exist_ok=True)


In [39]:
MAPTASK_DIR = os.path.join(RAW_DATA_DIR,"maptaskv2-1")
GEMAPS_DIR = os.path.join(RAW_DATA_DIR,"audio_features/egemaps_v02_50ms")

# Paths within the maptask corpus 
STEREO_AUDIO_PATH = os.path.join(MAPTASK_DIR,"Data/signals/dialogues")
MONO_AUDIO_PATH = os.path.join(MAPTASK_DIR,"Data/signals/mono_signals")
# NOTE: The timed units are also used for Voice Activity annotations. 
TIMED_UNIT_PATHS = os.path.join(MAPTASK_DIR,"Data/timed-units") 
POS_PATH = os.path.join(MAPTASK_DIR,"Data/pos")


## MapTask Corpus

### Utility Methods

In [40]:
PARTICIPANT_LABELS_MAPTASK = ["f","g"] # NOTE: f = follower ; g = giver.


In [41]:
def get_maptask_participant(csv_path):
    filename, ext = os.path.splitext(os.path.basename(csv_path))
    filename_split = filename.split(".")
    participant = filename_split[1]
    return participant

def get_maptask_dialogue(csv_path):
    filename, ext = os.path.splitext(os.path.basename(csv_path))
    filename_split = filename.split(".")
    dialogue = filename_split[0]
    return dialogue


In [42]:
def read_data(dir_path,dialogue_name, participant,ext):
    """
    Assumption is that the basename . is the dialogue name. 
    """
    results = []
    data_paths = [p for p in os.listdir(dir_path)]
    data_paths = [os.path.join(dir_path,p) for p in data_paths if os.path.splitext(p)[1][1:] == ext]
    for path in data_paths:
       if get_maptask_dialogue(path) == dialogue_name and \
                get_maptask_participant(path) == participant:
            results.append(path)
    return results 

def get_mono_audio(dialogue_name, participant):
    return read_data(MONO_AUDIO_PATH,dialogue_name, participant,"wav")[0]

def get_stereo_audio(dialogue_name):
    return read_data(STEREO_AUDIO_PATH,dialogue_name,"mix","wav")[0]

def get_timed_unit(dialogue_name, participant):
    return read_data(TIMED_UNIT_PATHS,dialogue_name, participant,"xml")[0]


In [43]:
# Getting the name of all of the dialogues 
DIALOGUE_NAMES = sorted([ get_maptask_dialogue(p) for p in glob.glob("{}/*.xml".format(TIMED_UNIT_PATHS))])
if SMALL_DATASET:
    DIALOGUE_NAMES_SPLIT = DIALOGUE_NAMES[:SMALL_DATASET_SIZE]
else:
    DIALOGUE_NAMES_SPLIT = DIALOGUE_NAMES
DIALOGUE_NAMES_SPLIT

['q1ec1', 'q1ec1', 'q1ec2']

In [44]:
get_timed_unit(DIALOGUE_NAMES_SPLIT[0],"f")

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/raw/maptask/maptaskv2-1/Data/timed-units/q1ec1.f.timed-units.xml'

In [45]:
get_mono_audio(DIALOGUE_NAMES_SPLIT[0],"f")

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/raw/maptask/maptaskv2-1/Data/signals/mono_signals/q1ec1.f.wav'

In [46]:
get_stereo_audio(DIALOGUE_NAMES_SPLIT[0])

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/raw/maptask/maptaskv2-1/Data/signals/dialogues/q1ec1.mix.wav'

In [47]:
read_data(GEMAPS_DIR,DIALOGUE_NAMES_SPLIT[0], "f","csv")

['/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/raw/maptask/audio_features/egemaps_v02_50ms/q1ec1.f.eGeMAPSv02.csv']

### Feature Extraction - Skantze 2017 - GEeMaps

The goal in this section is to extract all the features that are required for the original LSTM model. 

These features include:
1. Voice Activity --> From dataset annotations 
2. Pitch --> From opensmile
3. Spectral Stability --> Not sure 
4. Parts of Speech --> Annotations supplied with the data 

In [48]:
FRAME_STEP_MS = 50 # In the original paper, features were extracted every 50 ms. 
FRAME_SIZE_MS = 50 # In the original paper, each frame was 50 ms long.

GEMAPS_CSV_DELIMITER = ";"

# NOTE: These categories are defined in the original paper. 
# NOTE: There are other features extracted by OpenSmile but they were not defined 
# in the original paper - and are not used here - but these might be useful later. 
# Ex. Mfccs. 
# The names have been adapted for use with the results produced by opensmile. 


GEMAPS_FREQUENCY_FEATURES = [
    'F0semitoneFrom27.5Hz_sma3nz', # Pitch: logarithmic F0 on a semitone frequency scale, starting at 27.5 Hz (semitone 0)
    "jitterLocal_sma3nz", # Jitter, deviations in individual consecutive F0 period lengths.
     # Formant 1, 2, and 3 frequency, centre frequency of first, second, and third formant
    "F1frequency_sma3nz",
    "F2frequency_sma3nz", 
    "F3frequency_sma3nz", 
    "F1bandwidth_sma3nz"
] 
    
GEMAPS_ENERGY_FEATURES = [
    "shimmerLocaldB_sma3nz", # Shimmer, difference of the peak amplitudes of consecutive F0 periods.
    "Loudness_sma3", # Loudness, estimate of perceived signal intensity from an auditory spectrum.
    "HNRdBACF_sma3nz" # Harmonics-to-Noise Ratio (HNR), relation of energy in harmonic components to energy in noiselike components.
]

GEMAPS_SPECTRAL_FEATURES = [
    "alphaRatio_sma3", #  Alpha Ratio, ratio of the summed energy from 50–1000 Hz and 1–5 kHz
    "hammarbergIndex_sma3",  # Hammarberg Index, ratio of the strongest energy peak in the 0–2 kHz region to the strongest peak in the 2–5 kHz region
    # Spectral Slope 0–500 Hz and 500–1500 Hz, linear regression slope of the logarithmic power spectrum within the two given bands
    "slope0-500_sma3", 
    "slope500-1500_sma3", 
    # Formant 1, 2, and 3 relative energy, as well as the ratio of the energy of the spectral harmonic
    # peak at the first, second, third formant’s centre frequency to the energy of the spectral peak at F0.
    "F1amplitudeLogRelF0_sma3nz", 
    "F2amplitudeLogRelF0_sma3nz", 
    "F3amplitudeLogRelF0_sma3nz", 
    "logRelF0-H1-H2_sma3nz", # Harmonic difference H1–H2, ratio of energy of the first F0 harmonic (H1) to the energy of the second F0 harmonic (H2)
    "logRelF0-H1-A3_sma3nz" # Harmonic difference H1–A3, ratio of energy of the first F0 harmonic (H1) to the energy of the highest harmonic in the third formant range (A3).
]

# These are all the GeMAPS features we are interested in. 
RELEVANT_GEMAP_FEATURES = GEMAPS_FREQUENCY_FEATURES + GEMAPS_ENERGY_FEATURES + \
    GEMAPS_SPECTRAL_FEATURES

In [49]:
# Utility function for reading GeMaps since the delimiter may be either , or ; 
# TODO: For some reason q1ec1 is using , as delimiter instead of ; 
def read_gemaps_as_df(path):
    for delimiter in (",",";"):
        try:
            gemaps_df = pd.read_csv(path,delimiter=delimiter,index_col=False)
            x = gemaps_df["frameTime"]
            return gemaps_df
        except:
            pass 

In [57]:


def verify_correct_gemaps(gemaps_dir, dialogue_name, participant,result_dir):
    """
    Verify that the gemaps data does not have any duplicated frame times and 
    that there are no null values. 
    """
    gemaps_paths = read_data(gemaps_dir,dialogue_name, participant,"csv")
    for path in gemaps_paths:
        filename = os.path.splitext(os.path.basename(path))[0]
        gemaps_df = read_gemaps_as_df(path)
        # NOTE: This is required because opensmile is producing some duplicated frameTimes. 
        gemaps_df.drop_duplicates(subset=['frameTime'], inplace=True)
        # Drop the 'name' column 
        gemaps_df.drop(columns=['name'],inplace=True)
        # Check that the frameTime steps are as expected 
        for i in range(len(gemaps_df['frameTime']) -1):
            difference = np.abs(gemaps_df['frameTime'].iloc[i+1] - gemaps_df['frameTime'].iloc[i])
            assert (FRAME_STEP_MS/1000 - 1e-3) < difference < (FRAME_STEP_MS/1000 + 1e-3) 
        # Ensure that none of the values is null. 
        assert not gemaps_df.isnull().values.any()
        gemaps_df.to_csv(os.path.join(result_dir,"{}.csv".format(filename)), sep=GEMAPS_CSV_DELIMITER)
    


In [58]:
CORRECTED_GEMAPS_DIR = os.path.join(PROCESSED_DATA_DIR,"corrected_gemaps_50ms")

In [59]:

if os.path.isdir(CORRECTED_GEMAPS_DIR):
    shutil.rmtree(CORRECTED_GEMAPS_DIR)
os.makedirs(CORRECTED_GEMAPS_DIR)

In [60]:
verify_correct_gemaps(GEMAPS_DIR,DIALOGUE_NAMES_SPLIT[0],PARTICIPANT_LABELS_MAPTASK[0],CORRECTED_GEMAPS_DIR) 

In [61]:

for dialogue_name in DIALOGUE_NAMES_SPLIT:
    for participant in PARTICIPANT_LABELS_MAPTASK:
        verify_correct_gemaps(GEMAPS_DIR, dialogue_name, participant,CORRECTED_GEMAPS_DIR) 

#### Voice Activity 

In [62]:
# Minimum utterance duration for it to be considered voice activity. 
MINIMUM_VA_CLASSIFICATION_TIME_MS = 25 
VOICE_ACTIVITY_LABEL = 1 # This means that voice activity was detected. 


In [63]:
import xml 

In [64]:

# NOTE: This voice activity is for 50ms intervals. 
def get_voice_activity_annotations(dialogue_name, participant):
    timed_unit_path = get_timed_unit(dialogue_name,participant)
    # Read the xml file 
    tree = xml.etree.ElementTree.parse(timed_unit_path).getroot()
    # Extracting the audio end time from te timed units file. 
    audio_end_time_ms = float(list(tree.iter())[-1].get('end')) *1000
    tu_tags = tree.findall('tu')
    # Getting all the times in which there are voice activity annotations in the corpus. 
    va_times = []
    for tu_tag in tu_tags:
        start_time_s = float(tu_tag.get('start'))
        end_time_s = float(tu_tag.get('end'))
        if end_time_s - start_time_s >= MINIMUM_VA_CLASSIFICATION_TIME_MS/1000:
            va_times.append((start_time_s,end_time_s))
    # Get the frame times based on the final times unit time. 
    # NOTE: This is being generated based on the step size for now. 
    frame_times_s = np.arange(0,audio_end_time_ms,FRAME_STEP_MS) / 1000
    # Array to store voice  activity - initially all zeros means no voice activity. 
    voice_activity = np.zeros((frame_times_s.shape[0]))
    # For each activity detected, get the start and end index of the nearest frame being 
    # considered from the input audio. 
    for start_time_s, end_time_s in va_times:
        # Obtaining index relative to the frameTimes being considered for 
        # which there is voice activity. 
        start_idx = np.abs(frame_times_s-start_time_s).argmin()
        end_idx = np.abs(frame_times_s-end_time_s).argmin()
        voice_activity[start_idx:end_idx+1] = VOICE_ACTIVITY_LABEL
    # Ensure that there are no nan values introduced in the data. 
    assert not np.isnan(voice_activity).any() and not np.isnan(frame_times_s).any()
    return pd.DataFrame({
        "frameTime" : frame_times_s,
        "voiceActivity" :  voice_activity
    })

In [65]:
# Getting VA annotations 
voice_activity = get_voice_activity_annotations(
    DIALOGUE_NAMES_SPLIT[0],PARTICIPANT_LABELS_MAPTASK[0])


In [66]:
np.array(np.where(voice_activity == 0)).shape, np.array(np.where(voice_activity == VOICE_ACTIVITY_LABEL)).shape

((2, 4939), (2, 365))

In [67]:
# NOTE: Percentage of frames with voice activity 
(np.array(np.where(voice_activity == VOICE_ACTIVITY_LABEL)).shape[1] / \
np.array(np.where(voice_activity == 0)).shape[1]) * 100 

7.390159951407167

In [69]:
voice_activity_save_dir = os.path.join(PROCESSED_DATA_DIR,"voice_activity_poc")
os.makedirs(voice_activity_save_dir,exist_ok=True)

In [70]:
# Save the voice activity for both f and g participants. 

voice_activity_f_df = get_voice_activity_annotations(
    DIALOGUE_NAMES_SPLIT[0],"f")
voice_activity_g_df = get_voice_activity_annotations(
    DIALOGUE_NAMES_SPLIT[0],"g")
voice_activity_f_df.to_csv("{}/{}.f.voice_activity.csv".format(voice_activity_save_dir,DIALOGUE_NAMES_SPLIT[0]))
voice_activity_g_df.to_csv("{}/{}.g.voice_activity.csv".format(voice_activity_save_dir,DIALOGUE_NAMES_SPLIT[0]))

#### Pitch 

In [71]:
PITCH_FEATURE_LABELS = "F0semitoneFrom27.5Hz_sma3nz"


In [72]:
# Read the frameTimes 
# NOTE: Not sure why the original code is doing this. 
gemaps_path = read_data(CORRECTED_GEMAPS_DIR,DIALOGUE_NAMES_SPLIT[0], "g","csv")[0]


In [73]:
# Define the amount by which the opensmile features are shifted back
# NOTE: This is because we are using a 50ms timestep and the extracted features 
# are also on a 50 ms timescale. 

# Read the gemaps feature file. 
gemaps_df = pd.read_csv(gemaps_path,delimiter=GEMAPS_CSV_DELIMITER,index_col=0)
# Extract the relevant raw gemap features into a separate file. 
relevant_gemaps_df = gemaps_df[RELEVANT_GEMAP_FEATURES]
# Obtain the z normalized values for each column individually. 
z_normalized_feat_df = relevant_gemaps_df.apply(
    lambda col: preprocessing.scale(col),axis=1,result_type='broadcast')
# Make sure there are no nan values introduced 
assert not relevant_gemaps_df.isnull().values.any()
assert not z_normalized_feat_df.isnull().values.any()


In [74]:
gemaps_df.shape, relevant_gemaps_df.shape, z_normalized_feat_df.shape

((5301, 26), (5301, 18), (5301, 18))

In [75]:
# The original paper uses both the absolute and relevant pitch values and a 
# binary label indicating whether the frame was voiced. 
absolute_pitch = relevant_gemaps_df[PITCH_FEATURE_LABELS]
z_normalized_pitch = z_normalized_feat_df[PITCH_FEATURE_LABELS]
assert len(absolute_pitch) == len(z_normalized_pitch)
# Determine whether frame was voiced 
frame_times_s = gemaps_df["frameTime"] 
data = {
    "frameTime" : frame_times_s,
    "{}Absolute".format(PITCH_FEATURE_LABELS) : absolute_pitch, 
    "{}Znormelized".format(PITCH_FEATURE_LABELS) : z_normalized_pitch, 
}
pitch_df = pd.DataFrame(data)
assert not pitch_df.isnull().values.any()

In [36]:
# TODO: -- we can merge the VA annotations later / when making the final dataframe. 
# Merge with VA annotations based on frameTime 
# NOTE: If we don't merge, then VA annotations df might have extra rows.
# pitch_df = pd.merge(pitch_df, voice_activity,on='frameTime')

In [76]:
pitch_save_dir = os.path.join(PROCESSED_DATA_DIR,"pitch_poc")
os.makedirs(pitch_save_dir ,exist_ok=True)

In [77]:
pitch_df.to_csv("{}/{}.g.pitch.csv".format(pitch_save_dir ,DIALOGUE_NAMES_SPLIT[0]))

In [78]:
# Creating a method for extraction. 

def extract_pitch(dialogue_name, participant):
    # Define the amount by which the opensmile features are shifted back
    # NOTE: This is because we are using a 50ms timestep and the extracted features 
    # are also on a 50 ms timescale. 

    # Read the gemaps feature file. 
    gemaps_path = read_data(CORRECTED_GEMAPS_DIR,dialogue_name, participant,"csv")[0]
    gemaps_df = pd.read_csv(gemaps_path,delimiter=GEMAPS_CSV_DELIMITER,index_col=0)
    # Extract the relevant raw gemap features into a separate file. 
    relevant_gemaps_df = gemaps_df[RELEVANT_GEMAP_FEATURES]
    # Obtain the z normalized values for each column individually. 
    z_normalized_feat_df = relevant_gemaps_df.apply(
        lambda col: preprocessing.scale(col),axis=1,result_type='broadcast')
    # Make sure there are no nan values introduced 
    assert not relevant_gemaps_df.isnull().values.any()
    assert not z_normalized_feat_df.isnull().values.any()
    # The original paper uses both the absolute and relevant pitch values and a 
    # binary label indicating whether the frame was voiced. 
    absolute_pitch = relevant_gemaps_df[PITCH_FEATURE_LABELS]
    z_normalized_pitch = z_normalized_feat_df[PITCH_FEATURE_LABELS]
    assert len(absolute_pitch) == len(z_normalized_pitch)
    # Determine whether frame was voiced 
    frame_times_s = gemaps_df["frameTime"] 
    data = {
        "frameTime" : frame_times_s,
        "{}Absolute".format(PITCH_FEATURE_LABELS) : absolute_pitch, 
        "{}Znormalized".format(PITCH_FEATURE_LABELS) : z_normalized_pitch, 
    }
    pitch_df = pd.DataFrame(data)
    assert not pitch_df.isnull().values.any()
    return pitch_df

In [79]:
# Save for both speakers in one file 
pitch_f_df = extract_pitch(DIALOGUE_NAMES_SPLIT[0], "f")
pitch_g_df = extract_pitch(DIALOGUE_NAMES_SPLIT[0], "g")


In [80]:
# NOTE: These files will no longer contain voice activity annotations as these 
# can be added later. 
pitch_f_df.to_csv("{}/{}.f.pitch.csv".format(pitch_save_dir ,DIALOGUE_NAMES_SPLIT[0]))
pitch_g_df.to_csv("{}/{}.g.pitch.csv".format(pitch_save_dir ,DIALOGUE_NAMES_SPLIT[0]))

#### Power / Intensity 

For now, we consider Loudness from the GeMAPS feature set as a measure of 
intensity. However, there are other energy related features that may be used 
later. 

In [82]:
power_save_dir = os.path.join(PROCESSED_DATA_DIR,"power_poc")
os.makedirs(power_save_dir ,exist_ok=True)

In [83]:
POWER_FEATURE_LABELS = "Loudness_sma3"

In [84]:
def extract_power(dialogue_name, participant):
    # Define the amount by which the opensmile features are shifted back
    # NOTE: This is because we are using a 50ms timestep and the extracted features 
    # are also on a 50 ms timescale. 
    # Read the gemaps feature file. 
    gemaps_path = read_data(CORRECTED_GEMAPS_DIR,dialogue_name, participant,"csv")[0]
    gemaps_df = pd.read_csv(gemaps_path,delimiter=GEMAPS_CSV_DELIMITER,index_col=0)
    # Extract the relevant raw gemap features into a separate file. 
    relevant_gemaps_df = gemaps_df[RELEVANT_GEMAP_FEATURES]
    # Obtain the z normalized values for each column individually. 
    z_normalized_feat_df = relevant_gemaps_df.apply(
        lambda col: preprocessing.scale(col),axis=1,result_type='broadcast')
    # Make sure there are no nan values introduced 
    assert not relevant_gemaps_df.isnull().values.any()
    assert not z_normalized_feat_df.isnull().values.any()
    # The original paper uses power / intensity in dB - 
    # TODO: Check what the units of loudness are in the GeMAPS set. 
    absolute_power = relevant_gemaps_df[POWER_FEATURE_LABELS]
    z_normalized_power = z_normalized_feat_df[POWER_FEATURE_LABELS]
    # Determine whether frame was voiced 
    frame_times_s = gemaps_df["frameTime"] 
    data = {
        "frameTime" : frame_times_s,
        "{}_Absolute".format(POWER_FEATURE_LABELS) :  absolute_power,
        "{}_Znormalized".format(POWER_FEATURE_LABELS) : z_normalized_power
    }
    power_df = pd.DataFrame(data)
    assert not power_df.isnull().values.any()
    return power_df

In [85]:
power_f_df = extract_power(DIALOGUE_NAMES_SPLIT[0], "f")
power_g_df = extract_power(DIALOGUE_NAMES_SPLIT[0], "g")

In [86]:
# Save 
power_f_df.to_csv("{}/{}.f.power.csv".format(power_save_dir,DIALOGUE_NAMES_SPLIT[0]))
power_g_df.to_csv("{}/{}.g.power.csv".format(power_save_dir,DIALOGUE_NAMES_SPLIT[0]))

#### Spectral stability 

In [87]:
spectral_save_dir = os.path.join(PROCESSED_DATA_DIR,"spectral_poc")
os.makedirs(spectral_save_dir ,exist_ok=True)

In [88]:
SPECTRAL_FEATURE_LABELS = 'spectralFlux_sma3'

In [89]:
def extract_spectral_flux(dialogue_name, participant):
    # Define the amount by which the opensmile features are shifted back
    # NOTE: This is because we are using a 50ms timestep and the extracted features 
    # are also on a 50 ms timescale. 
    # Read the gemaps feature file. 
    gemaps_path = read_data(CORRECTED_GEMAPS_DIR,dialogue_name, participant,"csv")[0]
    gemaps_df = pd.read_csv(gemaps_path,delimiter=GEMAPS_CSV_DELIMITER)
    # Extract the relevant raw gemap features into a separate file. 
    spectral_flux_df = gemaps_df[SPECTRAL_FEATURE_LABELS]
    # Obtain the z normalized values for each column individually. 
    z_normalized_spectral_flux = preprocessing.scale(spectral_flux_df)
    # Make sure there are no nan values introduced 
    assert not spectral_flux_df.isnull().values.any()
    assert not  np.isnan(z_normalized_spectral_flux).any()
    # Determine whether frame was voiced 
    frame_times_s = gemaps_df["frameTime"] 
    data = {
        "frameTime" : frame_times_s,
        "{}_Znormalized".format(SPECTRAL_FEATURE_LABELS) : z_normalized_spectral_flux
    }
    result_df = pd.DataFrame(data)
    assert not result_df.isnull().values.any()
    return result_df


In [90]:
spectral_flux_f_df = extract_spectral_flux(DIALOGUE_NAMES_SPLIT[0], "f")
spectral_flux_g_df = extract_spectral_flux(DIALOGUE_NAMES_SPLIT[0], "g")

In [91]:
# Save 
spectral_flux_f_df .to_csv("{}/{}.f.spectral_flux.csv".format(spectral_save_dir,DIALOGUE_NAMES_SPLIT[0]))
spectral_flux_g_df .to_csv("{}/{}.g.spectral_flux.csv".format(spectral_save_dir,DIALOGUE_NAMES_SPLIT[0]))

#### Parts of Speech Annotations

These are directly obtained from the annotations received with the MapTask corpus. 

In the original paper, there are 59 different POS tags, and all the tags are 
represented as a one hot feature vector. Additionally, to simulate the delay 
in extracting POS tags, the feature vector was set to 0 by default but the 
corresponding feature vector was set to 1 (since it is a one-hot encoded vector)
a 100ms after the word had ended. 

In [92]:
POS_DELAY_TIME_MS = 100 # Assume that each POS calculation is delayed by 100ms. 


# Create a vocabulary from all the POS annotation tags
# Documentation to the MapTask POS tags:  https://groups.inf.ed.ac.uk/maptask/interface/expl.html
POS_TAGS = [
    "vb", 
    "vbd", 
    "vbg",
    "vbn", 
    "vbz",
    "nn",
    "nns",
    "np",
    "jj",
    "jjr",
    "jjt",
    "ql",
    "qldt",
    "qlp",
    "rb",
    "rbr",
    "wql",
    "wrb",
    "not",
    "to",
    "be",
    "bem",
    "ber",
    "bez",
    "do",
    "doz",
    "hv",
    "hvz",
    "md",
    "dpr",
    "at",
    "dt",
    "ppg",
    "wdt",
    "ap",
    "cd",
    "od",
    "gen",
    "ex",
    "pd",
    "wps",
    "wpo",
    "pps",
    "ppss",
    "ppo",
    "ppl",
    "ppg2\"",
    "pr",
    "pn",
    "in",
    "rp",
    "cc",
    "cs",
    "aff",
    "fp",
    "noi",
    "pau",
    "frag",
    "sent"
]
len(POS_TAGS)

59

In [94]:
pos_save_dir = os.path.join(PROCESSED_DATA_DIR,"pos_poc")
os.makedirs(pos_save_dir ,exist_ok=True)

In [95]:
import xml 

In [96]:
# Need to read the timed-unit file for the corresponding start and end times 
# for the tags since we need to assign it
timed_unit_path = read_data(
    TIMED_UNIT_PATHS,DIALOGUE_NAMES_SPLIT[0],PARTICIPANT_LABELS_MAPTASK[1],"xml")[0]
tree_timed_unit = xml.etree.ElementTree.parse(timed_unit_path).getroot()
timed_unit_tags = list(tree_timed_unit.iter())
tu_tags = tree_timed_unit.findall("tu")



In [97]:
# Read the appropriate pos file
pos_path = read_data(
    POS_PATH,DIALOGUE_NAMES_SPLIT[0],PARTICIPANT_LABELS_MAPTASK[1],"xml")[0]
tree_pos = xml.etree.ElementTree.parse(pos_path).getroot()
# The pos is the tag attribute in all the tw tags. 
tw_tags = tree_pos.findall("tw")

In [98]:

# Getting all the times in which there are voice activity annotations in the corpus. 
va_times = []
for tu_tag in tu_tags:
    start_time_s = float(tu_tag.get('start'))
    end_time_s = float(tu_tag.get('end'))
    if end_time_s - start_time_s >= MINIMUM_VA_CLASSIFICATION_TIME_MS/1000:
        va_times.append((start_time_s,end_time_s))
        

In [99]:
# Extracting the audio end time from te timed units file. 
audio_end_time_ms = float(list(tree_timed_unit.iter())[-1].get('end')) *1000
# Get the frame times based on the final times unit time. 
# NOTE: This is being generated based on the step size for now. 
frame_times_s = np.arange(0,audio_end_time_ms,FRAME_STEP_MS) / 1000
# Ensure that there are no null values here. 
assert not np.isnan(frame_times_s).any()
frame_times_s

array([0.0000e+00, 5.0000e-02, 1.0000e-01, ..., 2.6495e+02, 2.6500e+02,
       2.6505e+02])

In [100]:
# We need to have a POS annotation per frame - not simply per detected word time. 
pos_annotations = [0] * frame_times_s.shape[0] 
len(pos_annotations)

5302

In [101]:
# Collecting the end time of the word and the corresponding POS tag. 
word_annotations = []
for tu_tag in tu_tags:
    tu_tag_id = tu_tag.get("id")[7:]
    end_time_s = float(tu_tag.get('end'))
    for tw_tag in tw_tags:
        # NOTE: Not sure if this is the correct way to extract the corresponding 
        # timed-unit id. 
        href = list(tw_tag.iter())[1].get("href")
        href_filename, href_ids = href.split("#")
        # Look at the appropriate file tags based on the filename. 
        href_ids = href_ids.split("..")
        for href_id in href_ids:
            href_id = href_id[href_id.find("(")+8:href_id.rfind(")")]
            if href_id == tu_tag_id:
                word_annotations.append((end_time_s,tw_tag.get("tag")))

In [102]:
pos_tags_to_idx = {}
idx_to_pos_tag = {}
# NOTE: Indices start from 1 here because 0 already represents unknown categories. 
for i,tag in enumerate(POS_TAGS):
    pos_tags_to_idx[tag] = i +1
    idx_to_pos_tag[i+1] = tag 


In [103]:
# For all the collected word end times and POS tags, we need to introduce 
# a delay and add the POS annotation to the delayed frame. 
pos_annotations = np.zeros((frame_times_s.shape[0]))
for end_time_s, pos_tag in word_annotations:
    frame_idx = np.abs(frame_times_s-(end_time_s +POS_DELAY_TIME_MS/1000)).argmin()
    pos_annotations[frame_idx] = pos_tags_to_idx[pos_tag] 

In [104]:
from sklearn.preprocessing import OneHotEncoder

In [105]:
# This encoder will ignore any unknown tags by replacing them with all zeros. 
onehot_encoder = OneHotEncoder(sparse=False,handle_unknown="ignore")
onehot_encoder.fit(np.asarray(list(pos_tags_to_idx.values())).reshape(-1,1))
onehot_encoder.categories_

[array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
        35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
        52, 53, 54, 55, 56, 57, 58, 59])]

In [106]:
onehot_encoder.categories_[0].shape

(59,)

In [107]:
# NOTE: The POS annotation file may have data for more time frames than the 
# audio features extracted from opensmile. Therefore, when merging later, 
# we should always merge the audio features first. 
encoded_pos = onehot_encoder.transform(pos_annotations.reshape(-1,1))
encoded_pos.shape

(5302, 59)

In [108]:
# Create a dataframe from the extracted features and saving. 
pos_annotations_df = pd.DataFrame(encoded_pos,columns=POS_TAGS)
pos_annotations_df


Unnamed: 0,vb,vbd,vbg,vbn,vbz,nn,nns,np,jj,jjr,...,in,rp,cc,cs,aff,fp,noi,pau,frag,sent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
# Combinging the above individual cells to create a single method to extract POS 
# features 

def extract_pos_annotations_with_delay(dialogue_name, participant):
    # Need to read the timed-unit file for the corresponding start and end times 
    # for the tags since we need to assign it
    timed_unit_path = read_data(
        TIMED_UNIT_PATHS,dialogue_name,participant,"xml")[0]
    tree_timed_unit = xml.etree.ElementTree.parse(timed_unit_path).getroot()
    tu_tags = tree_timed_unit.findall("tu")
    # Read the appropriate pos file
    pos_path = read_data(POS_PATH,dialogue_name,participant,"xml")[0]
    tree_pos = xml.etree.ElementTree.parse(pos_path).getroot()
    # The pos is the tag attribute in all the tw tags. 
    tw_tags = tree_pos.findall("tw")
    # Getting all the times in which there are voice activity annotations in the corpus. 
    va_times = []
    for tu_tag in tu_tags:
        start_time_s = float(tu_tag.get('start'))
        end_time_s = float(tu_tag.get('end'))
        if end_time_s - start_time_s >= MINIMUM_VA_CLASSIFICATION_TIME_MS/1000:
            va_times.append((start_time_s,end_time_s))
    # Extracting the audio end time from the timed units file. 
    audio_end_time_ms = float(list(tree_timed_unit.iter())[-1].get('end')) *1000
    # Get the frame times based on the final times unit time. 
    # NOTE: This is being generated based on the step size for now. 
    frame_times_s = np.arange(0,audio_end_time_ms,FRAME_STEP_MS) / 1000
    # Collecting the end time of the word and the corresponding POS tag. 
    word_annotations = []
    for tu_tag in tu_tags:
        tu_tag_id = tu_tag.get("id")[7:]
        end_time_s = float(tu_tag.get('end'))
        for tw_tag in tw_tags:
            # NOTE: Not sure if this is the correct way to extract the corresponding 
            # timed-unit id. 
            href = list(tw_tag.iter())[1].get("href")
            href_filename, href_ids = href.split("#")
            # Look at the appropriate file tags based on the filename. 
            href_ids = href_ids.split("..")
            for href_id in href_ids:
                href_id = href_id[href_id.find("(")+8:href_id.rfind(")")]
                if href_id == tu_tag_id:
                    if tw_tag.get("tag") in POS_TAGS:
                        word_annotations.append((end_time_s,tw_tag.get("tag")))
    # For all the collected word end times and POS tags, we need to introduce 
    # a delay and add the POS annotation to the delayed frame. 
    pos_annotations = np.zeros((frame_times_s.shape[0]))
    for end_time_s, pos_tag in word_annotations:
        frame_idx = np.abs(frame_times_s-(end_time_s +POS_DELAY_TIME_MS/1000)).argmin()
        # Convert to integer based on the vocabulary dictionary. 
        pos_annotations[frame_idx] = pos_tags_to_idx[pos_tag] 
    # The pos annotations should not have any nan values 
    assert not np.isnan(pos_annotations).any()
    # This encoder will ignore any unknown tags by replacing them with all zeros. 
    onehot_encoder = OneHotEncoder(sparse=False,handle_unknown="ignore")
    onehot_encoder.fit(np.asarray(list(pos_tags_to_idx.values())).reshape(-1,1))
    encoded_pos = onehot_encoder.transform(pos_annotations.reshape(-1,1))
    pos_annotations_df = pd.DataFrame(encoded_pos,columns=POS_TAGS)
    # Add frametimes to the df 
    pos_annotations_df.insert(0,"frameTime",frame_times_s)
    # Remove any duplicated frameTimes 
    pos_annotations_df.drop_duplicates(subset=['frameTime'], inplace=True)
    assert not pos_annotations_df.isnull().values.any()
    return pos_annotations_df
        

In [110]:
pos_annotations_f_df = extract_pos_annotations_with_delay(
    DIALOGUE_NAMES_SPLIT[0], "f")
pos_annotations_g_df = extract_pos_annotations_with_delay(
    DIALOGUE_NAMES_SPLIT[0], "g")
pos_annotations_f_df.to_csv("{}/{}.f.pos_onehot.csv".format(pos_save_dir,DIALOGUE_NAMES_SPLIT[0]))
pos_annotations_g_df.to_csv("{}/{}.g.pos_onehot.csv".format(pos_save_dir,DIALOGUE_NAMES_SPLIT[0]))

### Feature Extraction Pipeline

In [111]:
from functools import reduce, partial

In [113]:
pipeline_save_dir = os.path.join(PROCESSED_DATA_DIR,"pipeline_poc")
os.makedirs(pipeline_save_dir ,exist_ok=True)

In [114]:
# NOTE: Can be made more efficient by putting each extraction into its own thread.
def extract_skantze_2017_features(dialogue_name, participant, output_dir,feature_set):
    # Get the VA annotations from maptask. 
    voice_activity_df = get_voice_activity_annotations(
        dialogue_name, participant)

    # Get pitch, power, and flux from the opensmile audio features 
    pitch_df = extract_pitch(dialogue_name, participant)
    power_df = extract_power(dialogue_name, participant)
    spectral_flux_df = extract_spectral_flux(dialogue_name, participant)

    if feature_set == "prosody":
        # Does not include pos features
        result_df = reduce(lambda x,y: pd.merge(
        x,y, on='frameTime', how='inner'),
        [voice_activity_df,pitch_df,power_df,spectral_flux_df])
    # POS annotations only extracted for the full dataset. 
    elif feature_set =="full":
        pos_df = extract_pos_annotations_with_delay(dialogue_name, participant)
        result_df = reduce(lambda x,y: pd.merge(
            x,y, on='frameTime', how='inner'),
            [voice_activity_df,pitch_df,power_df,spectral_flux_df,pos_df])
    else:
        raise Exception("Feature set not supported")
        
    result_df.to_csv("{}/{}.{}.skantze_2017_features.{}.csv".format(
        output_dir,dialogue_name,participant,feature_set))
    # Make sure there are no nan values 
    assert not result_df.isnull().values.any()
    return result_df 
    

In [115]:
extract_skantze_2017_features(
    DIALOGUE_NAMES_SPLIT[0],PARTICIPANT_LABELS_MAPTASK[0],pipeline_save_dir,"full")

Unnamed: 0,frameTime,voiceActivity,F0semitoneFrom27.5Hz_sma3nzAbsolute,F0semitoneFrom27.5Hz_sma3nzZnormalized,Loudness_sma3_Absolute,Loudness_sma3_Znormalized,spectralFlux_sma3_Znormalized,vb,vbd,vbg,...,in,rp,cc,cs,aff,fp,noi,pau,frag,sent
0,0.00,0.0,0.0,-0.414430,0.181512,-0.414211,-0.355010,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.05,0.0,0.0,-0.409845,0.186722,-0.409593,-0.201524,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.10,0.0,0.0,-0.405466,0.172331,-0.405204,-0.103440,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.15,0.0,0.0,-0.411621,0.144899,-0.411431,-0.164591,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.20,0.0,0.0,-0.416682,0.104665,-0.416558,-0.233114,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5296,264.80,1.0,0.0,-0.389374,0.339836,-0.388882,4.040398,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5297,264.85,1.0,0.0,-0.394844,0.270952,-0.394457,3.761027,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5298,264.90,1.0,0.0,-0.402629,0.507726,-0.401937,2.955869,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5299,264.95,1.0,0.0,-0.410715,0.498677,-0.410069,3.894783,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [116]:
extract_skantze_2017_features(
    DIALOGUE_NAMES_SPLIT[0],PARTICIPANT_LABELS_MAPTASK[0], pipeline_save_dir,"prosody")

Unnamed: 0,frameTime,voiceActivity,F0semitoneFrom27.5Hz_sma3nzAbsolute,F0semitoneFrom27.5Hz_sma3nzZnormalized,Loudness_sma3_Absolute,Loudness_sma3_Znormalized,spectralFlux_sma3_Znormalized
0,0.00,0.0,0.0,-0.414430,0.181512,-0.414211,-0.355010
1,0.05,0.0,0.0,-0.409845,0.186722,-0.409593,-0.201524
2,0.10,0.0,0.0,-0.405466,0.172331,-0.405204,-0.103440
3,0.15,0.0,0.0,-0.411621,0.144899,-0.411431,-0.164591
4,0.20,0.0,0.0,-0.416682,0.104665,-0.416558,-0.233114
...,...,...,...,...,...,...,...
5296,264.80,1.0,0.0,-0.389374,0.339836,-0.388882,4.040398
5297,264.85,1.0,0.0,-0.394844,0.270952,-0.394457,3.761027
5298,264.90,1.0,0.0,-0.402629,0.507726,-0.401937,2.955869
5299,264.95,1.0,0.0,-0.410715,0.498677,-0.410069,3.894783


In [117]:
from multiprocessing import Pool 
# Multiprocessing has issues running in jupyter -  using multiprocess instead. 
import multiprocess as mp
import tqdm 

In [118]:
# Multi-threaded function to extract the Skantze features 

def extract_skantze_2017_features_multithreaded(
        dialogue_names_split, output_dir, feature_set="full", num_workers=4):

    start_time = time.time()
    print(output_dir)
    assert os.path.isdir(output_dir)
    print("Running pipeline for {} dialogues, each for participants {}...".format(
        len(dialogue_names_split),PARTICIPANT_LABELS_MAPTASK))
    # Collect all the arguments for the non-multithread method. 
    collected_args = []
    for dialogue_name in dialogue_names_split:
        for participant in PARTICIPANT_LABELS_MAPTASK:
            collected_args.append(
                (dialogue_name,participant, output_dir,feature_set))
    print("Using {} workers...".format(num_workers))
    with mp.Pool(num_workers) as pool:
        results = list(tqdm.tqdm(pool.starmap(
            extract_skantze_2017_features, collected_args),
            total=len(collected_args),desc="Extracting Features"))
    elapsed_time = time.time() - start_time
    print("Elapsed time: {:.3f} seconds".format(elapsed_time))
    print("Results saved to {}".format(output_dir))
    print("Completed!")



In [119]:
pipeline_results_path = os.path.join(PROCESSED_DATA_DIR,"skantze2017_pipeline")
os.makedirs(pipeline_results_path,exist_ok=True)
pipeline_results_path

'/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline'

In [120]:

extract_skantze_2017_features_multithreaded(
    DIALOGUE_NAMES_SPLIT,pipeline_results_path ,"full")

/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline
Running pipeline for 3 dialogues, each for participants ['f', 'g']...
Using 4 workers...


Extracting Features: 100%|██████████| 6/6 [00:00<00:00, 15837.52it/s]

Elapsed time: 16.695 seconds
Results saved to /Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline
Completed!





In [121]:
extract_skantze_2017_features_multithreaded(
    DIALOGUE_NAMES_SPLIT,pipeline_results_path ,"prosody")

/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline
Running pipeline for 3 dialogues, each for participants ['f', 'g']...
Using 4 workers...


Extracting Features: 100%|██████████| 6/6 [00:00<00:00, 29093.44it/s]

Elapsed time: 14.086 seconds
Results saved to /Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline
Completed!





In [122]:
# Read the results and make sure there are no nan values 
extracted_feature_set_paths = glob.glob("{}/*.csv".format(pipeline_results_path))
extracted_feature_set_paths

['/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline/q1ec2.g.skantze_2017_features.full.csv',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline/q1ec1.f.skantze_2017_features.full.csv',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline/q1ec2.f.skantze_2017_features.full.csv',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline/q1ec1.g.skantze_2017_features.full.csv',
 '/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_pipeline/q1ec1.f.skantze_

In [123]:
for path in extracted_feature_set_paths:
    feature_df = pd.read_csv(path,delimiter=",")
    assert not feature_df.isnull().values.any()

In [125]:
# NOTE: This cell extracts all the feature sets for all files - and will take 
# a while to run. 
# TODO: Still need to add ability to the pipeline to generate feature sets 
# with different frame step sizes (This notebook only does for 50ms). 
# Once that is done, UPDATE scripts to reflect the completed pipeline. 

for feature_set in ("full", "prosody"):
    # Set up the directories. 
    output_dir_path = os.path.join(PROCESSED_DATA_DIR, "skantze2017_feature_sets",feature_set)
    if os.path.isdir(output_dir_path):
        shutil.rmtree(output_dir_path)
    os.makedirs(output_dir_path)
    # First, produce a corrected version of all the dialogues
    for dialogue_name in DIALOGUE_NAMES:
        for participant in PARTICIPANT_LABELS_MAPTASK:
            verify_correct_gemaps(GEMAPS_DIR, dialogue_name, participant,CORRECTED_GEMAPS_DIR) 
    # Extract the features 
    extract_skantze_2017_features_multithreaded(
        DIALOGUE_NAMES, output_dir_path , feature_set)
    # Ensure that all the data is correct 
    extracted_feature_set_paths = glob.glob("{}/*.csv".format(output_dir_path))
    for path in extracted_feature_set_paths:
        feature_df = pd.read_csv(path,delimiter=",")
        assert not feature_df.isnull().values.any()
    
    

/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_feature_sets/full
Running pipeline for 256 dialogues, each for participants ['f', 'g']...
Using 4 workers...


Extracting Features: 100%|██████████| 512/512 [00:00<00:00, 704323.93it/s]


Elapsed time: 4405.649 seconds
Results saved to /Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_feature_sets/full
Completed!
/Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_feature_sets/prosody
Running pipeline for 256 dialogues, each for participants ['f', 'g']...
Using 4 workers...


Extracting Features: 100%|██████████| 512/512 [00:00<00:00, 445906.07it/s]


Elapsed time: 1870.222 seconds
Results saved to /Users/muhammadumair/Documents/Repositories/mumair01-repos/TRP-Modeling/skantze_2017_continuous/data/processed/1.0-MU-Maptask-preprocess-POC/skantze2017_feature_sets/prosody
Completed!
