# Track formants using ESPS "formant" and "ifcformant"

In [None]:
import os, re
import subprocess
import pandas as pd
from phonlab.utils import dir2df
import fnmatch

# Prepare the metadata and audio dataframes

These are directories containing the audio data and caches for the formant trackers on Meg Cychosz's laptop:

In [None]:
# Root directory holding the audio data; each formant tracker caches its
# output in its own subdirectory of that root.
audiodir = '/home/ubuntu/Desktop/Shared/sf_Box_Sync/Diss_data/Fall_2019_LRAP/Word_lists/2018_vowels/2018_data'
ifcdir = audiodir + '/formants_ifc'  # For ifcformant output files
covdir = audiodir + '/formants_cov'  # For covariance method output files
acdir = audiodir + '/formants_ac'    # For autocorrelation method output files

# directories to place data from lower lpc orders:
#covdir_lpc8 = '/home/ubuntu/Desktop/Shared/sf_Box_Sync/Diss_data/Fall_2019_LRAP/Word_lists/2018_vowels/2018_data/formants_cov_lpcorder_8' # For covariance method output files of 4-6 y/os that are run with lpc order=8
#acdir_lpc8 = '/home/ubuntu/Desktop/Shared/sf_Box_Sync/Diss_data/Fall_2019_LRAP/Word_lists/2018_vowels/2018_data/formants_ac_lpcorder_8' # for autocorrelation method output files of 4-6 y/os that are run with lpc order=8

Load in participant metadata:

In [None]:
# Speaker metadata table; the later merges join it on 'age_yrs' and 'spkr'.
metadata_csv = '~/Desktop/Shared/sf_Box_Sync/Diss_data/Fall_2019_LRAP/Word_lists/speaker_metadata.csv'
md = pd.read_csv(metadata_csv)
md.head()

Add lpc parameters based on speaker type. These are strings because they are command line arguments.

In [None]:
# LPC order is only an argument to the ESPS functions; ifcformant does not
# take one.
#
# Rule of thumb: lpc order = (#formants * 2) + 2.
# NOTE: the child lpcorder/#formants settings may fail as currently
# specified - there might be a minimum lpc order of 10.

# age_yrs -> (num_form, lpc_order, nom_f1). Values are strings because they
# are passed straight through as command line arguments.
lpc_params_by_age = {
    'adult': ('4', '12', '600'),
    '10': ('3', '10', '700'),
    '9': ('3', '10', '700'),
    '8': ('3', '10', '700'),
    '7': ('3', '10', '700'),
    '6': ('3', '10', '700'),  # adjust the #formants and lpc order here
    '5': ('3', '10', '700'),
    '4': ('3', '10', '700'),
}
lpcargs = pd.DataFrame(
    [(age,) + params for age, params in lpc_params_by_age.items()],
    columns=['age_yrs', 'num_form', 'lpc_order', 'nom_f1'],
)
lpcargs

Put it all together:

In [None]:
# Attach the per-age LPC settings to every speaker; a left join keeps all
# metadata rows even when an age has no entry in lpcargs.
md_merged = pd.merge(md, lpcargs, how='left', on='age_yrs')
md_merged.head()

Now load in the audio files:

In [None]:
# Filename pattern: <spkr>_<age_yrs>_<task>.wav, where age_yrs is either a
# number or the literal 'adult'. A raw string is used so that the regex
# escapes \d and \. are not treated as (invalid) Python string escapes.
fnpat = r'^(?P<spkr>[^_]+)_(?P<age_yrs>\d+|adult)_(?P<task>.+)\.wav$' # filenames


wavdf = dir2df(
    audiodir,
    fnpat=fnpat,
    addcols=['dirname', 'barename', 'ext']   # addcols spits out relevant file names
)
wavdf.head()

Merge audio files with metadata:

In [None]:
# Join on both speaker id and age; the left join keeps every audio file.
wavdf = pd.merge(
    wavdf,
    md_merged,
    how='left',
    on=['spkr', 'age_yrs'],
)

# Identify which participants don't have ifc tracking already

First see if we have any existing ifc files in the cache so that we don't have to run them again. 

In [None]:
# List whatever is already present in the ifcformant cache directory.
ifcdf = dir2df(
    ifcdir,
    addcols=['dirname', 'barename', 'ext'],
)

Merge the ifc file information with our metadata file to identify participants who are missing tracking information. Their ifc information will just come out as NaN. 

In [None]:
# Left join of the audio files against the cached ifc files on 'barename'.
# Columns present in both frames keep their name for the audio side and get
# an '_ifc' suffix (instead of the default '_y') for the cache side.
ifcdf = pd.merge(
    wavdf,
    ifcdf,
    how='left',
    left_on=['barename'],
    right_on=['barename'],
    suffixes=['', '_ifc'],
)
ifcdf.head()

Select those rows that don't have formant measurements yet (NaN values). We'll see a snippet of that resulting dataframe here:

In [None]:
# Rows with no cached .ifc file yet. Take an explicit copy so the column
# assignments below modify an independent frame rather than a slice of
# ifcdf (which triggers pandas' SettingWithCopyWarning).
missingifc = ifcdf[ifcdf['fname_ifc'].isnull()].copy()

# Fill in where the output should go: the cache directory and a filename
# built from the audio file's barename.
missingifc['dirname_ifc'] = ifcdir
missingifc['fname_ifc'] = missingifc['barename'] + '.ifc'
missingifc.head()

missingifc.relpath.values

# Actually ready to track some formants

Get the rows from missingifc to use as input for ifcformant. Save the output to the cache directory. The run_ifcformant function uses subprocess to execute ifcformant on the values contained in the rows. Here we define that function:

In [None]:
def run_ifcformant(row, errors='raise'):
    '''Perform formant analysis with the ifcformant command.
    
    Parameters
    ----------
    
    row : namedtuple that contains formant analysis parameters
          in fields:
        'dirname' (base pathname to input .wav file),
        'relpath' (relative path to audio file from dirname; it is
            joined into both the input and the output path),
        'fname' (name of .wav file),
        'age' (ifcformant speaker type, one of 'female',
            'male', 'child')
        'dirname_ifc' (base cache directory name),
        'fname_ifc' (name of output .ifc file),
             
    errors : str (default 'raise')
        How to handle errors if `check_call()` fails. If
        'ignore', print debug statement to STDERR and return the
        ifcformant return code; any other value reraises
        the CalledProcessError.
        
    Returns
    -------
    
    The `ifcformant` return code is returned by this function,
    0 for success or non-zero for errors.
    '''
    # Imported here because the notebook's import cell does not import sys.
    import sys

    outfile = os.path.join(row.dirname_ifc, row.relpath, row.fname_ifc)
    infile = os.path.join(row.dirname, row.relpath, row.fname)
    try:
        subprocess.check_call([
            "ifcformant",
            "--speaker", row.age,    # female, male, or child
            "--print-header",
            "--output", outfile,
            infile
        ])
    except subprocess.CalledProcessError as e:
        if errors != 'ignore':
            # Bare raise keeps the original traceback intact.
            raise
        sys.stderr.write(
            'Caught error while invoking ifcformant:\n{:}\n'.format(e)
        )
        return e.returncode
    return 0

Before calling the run_ifcformant function, we check that the appropriate output directories in the cache directory are created. We create these by looping over the unique relpath values in missingifc and concatenating these with the base cache directory name found in ifcdir, then calling os.makedirs(). This is only relevant if subjects have their own directories, for example. 

In [None]:
#for cdir in missingifc.relpath.unique():
#    os.makedirs(os.path.join(ifcdir, cdir), exist_ok=True)

Actually run the function. This will loop over all of the rows in the missingifc dataframe. 

In [None]:
# Run ifcformant once for every audio file that has no cached output yet.
# run_ifcformant is called with its default errors='raise', so a failed
# ifcformant call raises CalledProcessError and stops the loop.
for row in missingifc.itertuples(): # TODO: add print message
    run_ifcformant(row)

Sanity check. When we reload the filenames in the ifcformant cache directory, the newly created files should now appear:

In [None]:
# Re-scan the ifcformant cache directory to see what it contains now.
ifcdf = dir2df(
    ifcdir,
    addcols=['barename', 'dirname', 'ext'],
)
ifcdf.head()

# Define LPC covariance and autocorrelation tracking methods:

ESPS formant commands used. The tracking method - covariance or autocorrelation - can be selected with the lpc_type parameter. The other analysis parameters, lpc_order and nom_f1, are already included in wavdf and will be passed as part of a dataframe row.

In [None]:
def run_formant(row, lpc_type, errors='raise'):
    '''
    Run ESPS formant command with covariance or autocorrelation settings.
    
    Parameters
    ----------
    
    row : namedtuple that contains formant analysis parameters
          in fields:
        'dirname' (base pathname to input .wav file),
        'fname' (name of .wav file),
        'num_form' (number of formants to track)
        'lpc_order' (order of lpc analysis)
        'nom_f1' (nominal value of first formant frequency, in Hz)
        'dirname_out' (base cache directory name, passed to `-O`)
        The three analysis parameters are converted with `str()`, so
        they may be strings or numbers. A 'relpath' field is not
        currently used: the input path is dirname/fname.
        
    lpc_type : str ('cov' for covariance or 'ac' for autocorrelation)
             
    errors : str (default 'raise')
        How to handle errors if `check_call()` fails. If
        'ignore', print debug statement to STDERR and return the
        formant return code; any other value reraises
        the CalledProcessError.
        
    Returns
    -------
    
    The `formant` return code is returned by this function,
    0 for success or non-zero for errors.

    Raises
    ------

    ValueError if `lpc_type` is neither 'cov' nor 'ac'.
    '''
    # Imported here because the notebook's import cell does not import sys.
    import sys

    # Per-method settings: (window duration for -w, lpc type flag).
    lpc_settings = {
        'cov': ('0.025', '-t1'),
        'ac': ('0.049', '-t0'),
    }
    if lpc_type not in lpc_settings:
        raise ValueError(
            "lpc_type must be 'cov' or 'ac', got {!r}".format(lpc_type)
        )
    wdur, lpc_opt = lpc_settings[lpc_type]
    max_buff_bytes = '317520000'   # value handed to formant's -B option

    # NOTE(review): -B follows the input filename, as in the original call;
    # confirm that formant's option parser accepts options after the operand.
    try:
        subprocess.check_call([
            "formant",
            "-n", str(row.num_form),
            "-o", str(row.lpc_order),
            "-N", str(row.nom_f1),
            lpc_opt,
            "-w", wdur,
            "-O", row.dirname_out,
            os.path.join(row.dirname, row.fname),
            "-B", max_buff_bytes
        ])
    except subprocess.CalledProcessError as e:
        if errors != 'ignore':
            # Bare raise keeps the original traceback intact.
            raise
        sys.stderr.write(
            'Caught error while invoking formant:\n{:}\n'.format(e)
        )
        return e.returncode
    return 0

In [None]:
# Load cached covariance .fb files - this will be empty the first time it's run.
# The pattern is a raw string so the regex escape \. is not read as an
# (invalid) Python string escape.
covdf = dir2df(covdir, fnpat=r'\.fb$', addcols=['barename', 'dirname', 'ext'])

In [None]:
# Left join of `wavdf` against the cached covariance files on 'barename'.
# Columns present in both frames get an '_out' suffix (instead of the
# default '_y') for the cache side.
covdf = pd.merge(
    wavdf,
    covdf,
    how='left',
    left_on=['barename'],
    right_on=['barename'],
    suffixes=['', '_out'],
)
covdf.head()

In [None]:
# Find missing cached covariance files - who hasn't been run yet?
# Take an explicit copy so the column assignments below modify an
# independent frame rather than a slice of covdf (which triggers pandas'
# SettingWithCopyWarning).
missingcov = covdf[covdf['fname_out'].isnull()].copy()
missingcov['dirname_out'] = covdir
missingcov['fname_out'] = missingcov['barename'] + '.cov'
missingcov.head()

In [None]:
# Make sure that ESPS parameters are strings, not integers: they are passed
# as command line arguments. DataFrame.astype returns a new frame, so this
# does not assign into a slice of covdf.
missingcov = missingcov.astype({
    'lpc_order': str,
    'num_form': str,
    'nom_f1': str,
})

In [None]:
# Ensure output directories are created. - this is only relevant when subjects each have their own directories
#for cdir in missingcov.relpath.unique():
#    os.makedirs(os.path.join(covdir, cdir), exist_ok=True)

In [None]:
# option to subset only 4, 5, 6-year olds
# to run lpc order=8 on them
#missingcov = missingcov[missingcov.age_yrs.isin(['4', '5', '6'])]
#missingcov

In [None]:
# Option 1: Run formant command with covariance method.
# One `formant` call per audio file without cached output. run_formant is
# called with its default errors='raise', so a failed call raises
# CalledProcessError and stops the loop.
for row in missingcov.itertuples(): # regular lpc
    run_formant(row, 'cov')

In [None]:
# Check your work: re-scan the covariance cache directory for .fb files.
# The pattern is a raw string so the regex escape \. is not read as an
# (invalid) Python string escape.
covdf = dir2df(covdir, fnpat=r'\.fb$', addcols=['barename', 'dirname', 'ext'])
covdf.head()

In [None]:
# Option 2: Autocorrelation method
# Load cached autocorrelation .fb files - again, this df will be empty the first time you run it.
# The pattern is a raw string so the regex escape \. is not read as an
# (invalid) Python string escape.
acdf = dir2df(acdir, fnpat=r'\.fb$', addcols=['barename', 'dirname', 'ext'])
acdf

In [None]:
# Left join of `wavdf` against the cached autocorrelation files on
# 'barename'. Columns present in both frames get an '_out' suffix for the
# cache side.
acdf = pd.merge(
    wavdf,
    acdf,
    how='left',
    left_on=['barename'],
    right_on=['barename'],
    suffixes=['', '_out'],
)
acdf.head()

In [None]:
# Find missing cached autocorrelation files.
# Take an explicit copy so the column assignments below modify an
# independent frame rather than a slice of acdf (which triggers pandas'
# SettingWithCopyWarning).
missingac = acdf[acdf['fname_out'].isnull()].copy()
missingac['dirname_out'] = acdir
missingac['fname_out'] = missingac['barename'] + '.ac'
missingac.head()

In [None]:
# make sure that ESPS parameters are strings, not integers 
#missingac['lpc_order'] = missingac['lpc_order'].astype(str)
#missingac['num_form'] = missingac['num_form'].astype(str)
#missingac['nom_f1'] = missingac['nom_f1'].astype(str)

In [None]:
# option to subset only 4, 5, 6-year olds
# to run lpc order=8 on them
#missingac = missingac[missingac.age_yrs.isin(['4', '5', '6'])]
#missingac

In [None]:
# Ensure output directories are created. - only relevant when subjects are organized into individual directories
#for cdir in missingac.relpath.unique():
#    os.makedirs(os.path.join(acdir, cdir), exist_ok=True)

In [None]:
# Run formant command with autocorrelation method. 
# One `formant` call per audio file without cached output. run_formant is
# called with its default errors='raise', so a failed call raises
# CalledProcessError and stops the loop.
for row in missingac.itertuples():
    run_formant(row, 'ac')

In [None]:
# Check your work: re-scan the autocorrelation cache directory for .fb files.
# The pattern is a raw string so the regex escape \. is not read as an
# (invalid) Python string escape.
acdf = dir2df(acdir, fnpat=r'\.fb$', addcols=['barename', 'dirname', 'ext'])
acdf.head()