# Hands-on ac2art in a few minutes

In [1]:
# If this fails, check that the dependencies are satisfied.
# Also check your config.json file in the package's folder.
import ac2art

### Preprocessing a corpus


Here we'll use the mocha-timit corpus, whose support is implemented under `ac2art.corpora.mocha`. Note that the latter must be explicitly imported to be used.

In [2]:
from ac2art.corpora import mocha

In [3]:
# Produce the acoustic and articulatory features.
# EMA data and binary voicing are always produced;
# the acoustic features, however, must be specified.

# Here, we only use the abkhazia-computed MFCC features.
# For this function (and all the others), use Python's
# `help` function to access detailed documentation on
# how to tweak it.

# Note that when run in a console, the timing prints
# overwrite each other.

mocha.preprocess.extract_utterances_data('mfcc')

Running MFCC computation with abkhazia...
Successfully ran mfcc computation with abkhazia.
Successfully ran scp to txt conversion with kaldi.
Succesfully extracted 920 utterances' data to .npy files.
Done producing raw MFCC coefficients with abkhazia.
22:46:13 : Done with utterance msak0_460.


In [4]:
# Normalize the previously extracted features.
# Normalization is run separately for each feature type, and
# alternative normalization parameters may be passed instead.
#
# The parameters used below are equivalent to using CMVN
# parameters in abkhazia: use the latter when producing
# features for data outside of the supported corpora.
#
# Note that this will NOT overwrite existing normalization
# values generated with similar arguments. To overwrite them,
# call `mocha.preprocess.compute_moments` beforehand (or
# delete the files containing the parameters).

for feature_type in ('mfcc', 'ema'):
    mocha.preprocess.normalize_files(
        feature_type, norm_type='stds', scope='speaker'
    )

In [5]:
# Split the corpus into train/validation/test sets.
# The split ensures that triphone coverage is respected
# across the subsets. Additionally, when the corpus has
# multiple speakers, the same utterances are assigned to
# each subset for all of the speakers.

mocha.preprocess.split_corpus(pct_train=70)

### Using data from a corpus

Implemented under the `ac2art.corpora.<corpus>.load` submodule, `load_audio` and `load_ema` are fully modular functions to load either kind of representation for a given utterance. However, a more practical way of loading the data is to set up a pool of arguments that will be used as (overridable) default arguments by other data loading functions wrapping the former (`load_utterance` and `load_dataset`).

In [6]:
# Read-only access to the default loading arguments
# (the "loading setup" used by the data loading functions).
mocha.load.get_loading_setup()

{'audio_type': 'mfcc_stds_byspeaker',
 'articulators': 'all',
 'context_window': 5,
 'dynamic_ema': True,
 'ema_norm': 'mean_byspeaker',
 'zero_padding': True}

In [7]:
# Override some of the default loading arguments.
# To change an argument, just pass a non-None value for it.

new_defaults = {'ema_norm': 'stds_byspeaker', 'context_window': 0}
mocha.load.change_loading_setup(**new_defaults)
mocha.load.get_loading_setup()

{'audio_type': 'mfcc_stds_byspeaker',
 'articulators': 'all',
 'context_window': 0,
 'dynamic_ema': True,
 'ema_norm': 'stds_byspeaker',
 'zero_padding': True}

In [8]:
# Example of loading the data of a single utterance.
# Unless overridden, the arguments used are those
# set in the "loading setup" dict.

utterance_data = mocha.load.load_utterance('fsew0_001')
acoustic, articulatory = utterance_data
print(acoustic.shape, articulatory.shape)

(132, 48) (132, 43)


In [9]:
# Load a full subset of utterances.
# The result is a pair of flat numpy arrays (acoustic and
# articulatory data, respectively), each containing one
# 2-D numpy array of data per utterance.

# Use `get_utterances_list` to get the list of utterances
# of the corpus, or of one of its subsets.

x_train, y_train = mocha.load.load_dataset('train')
x_valid, y_valid = mocha.load.load_dataset('validation')
print(x_train.shape, y_train.shape)

(644,) (644,)


### Setting up and using a neural network for ac2art inversion

In [10]:
# Here we focus on the end-to-end approach.
# All other implemented classes of models
# (so far) inherit from this one.

from ac2art.networks import MultilayerPerceptron

In [11]:
# Set up a network made of a dense layer followed by a bidirectional
# RNN, whose outputs are filtered by a fixed low-pass filter.
# The input size (48) and the total output size (43) match the shapes
# of the utterance data loaded with the current loading setup.
rnn = MultilayerPerceptron(
    input_shape=(None, None, 48),  # batches of time sequences of input vectors made of 48 coefficients
    n_targets=15,                  # 14 continuous targets + 1 binary track (voicing)
    binary_tracks=[14],            # last output dimension is binary
    use_dynamic=True,              # add delta and deltadelta to the continuous targets (total dim: 43)
    layers_config=[                # hidden layers stack:
        ('dense_layer', 300),      #    fully-connected layer of 300 units with default (relu) activation
        ('bi_rnn_stack', 300)      #    bidirectional RNN with single layers of 300 units of default activation,
                                   #    default cell-type (lstm) and default aggregation (concatenate)
    ],
    top_filter=(                   # filter the outputs using a fixed lowpass frequency of 20 Hz
        'lowpass_filter', 20, {'learnable': False}
    )
)

In [12]:
# Process one (or more) sequences of input vectors, i.e. invert them.
# Note: the results are uninteresting at this point, as the model
# has not been trained yet.

rnn.predict(x_train[0])

array([[ 0.11681165,  0.32188216, -0.09232032, ...,  0.01175005,
         0.0210929 ,  0.02007481],
       [ 0.19524243,  0.39777154, -0.12937222, ...,  0.0157961 ,
         0.02269749,  0.01722598],
       [ 0.28528634,  0.4140218 , -0.14360052, ...,  0.01892811,
         0.02223003,  0.01154162],
       ...,
       [ 0.16779609,  0.23129499,  0.13174754, ..., -0.0017896 ,
         0.00045088, -0.04568204],
       [ 0.1097638 ,  0.18255684,  0.14121087, ..., -0.00273296,
        -0.00193995, -0.03676203],
       [ 0.06221345,  0.12735417,  0.11843851, ..., -0.00432341,
        -0.00389861, -0.02653102]], dtype=float32)

### Scoring models

In [13]:
# Score the model on one (or more) utterances.

rnn.score(x_valid[0], y_valid[0])

array([[0.9393352 , 0.9238293 , 1.1329341 , 0.87665   , 1.1511712 ,
        1.1322352 , 1.5790507 , 1.0087804 , 1.09318   , 0.9669999 ,
        1.2592554 , 1.2350925 , 0.65715945, 0.55236495, 0.5742287 ,
        0.11269346, 0.10926371, 0.10734623, 0.10094201, 0.11920125,
        0.09529941, 0.06640099, 0.11326186, 0.08011833, 0.0745648 ,
        0.06267537, 0.13867272, 0.06208453, 0.05442723, 0.0305159 ,
        0.03192915, 0.02956705, 0.02804492, 0.03618066, 0.02752729,
        0.02645637, 0.02847949, 0.02768154, 0.02494742, 0.02511004,
        0.03402368, 0.0351573 , 0.0294042 ]], dtype=float32)

In [14]:
# For both `score` and `predict`, if the model is set up
# for batch processing, you may pass a list of utterances'
# data and get a similarly structured output.
# The implementation takes care of zero-padding the
# sequences, avoiding useless computations on the pads
# and removing them in the end.

rnn.score(x_valid[:2], y_valid[:2])

array([[0.93933517, 0.92382926, 1.1329341 , 0.87665   , 1.1511712 ,
        1.1322353 , 1.5790509 , 1.0087804 , 1.09318   , 0.9669999 ,
        1.2592555 , 1.2350925 , 0.6571595 , 0.55236495, 0.5742287 ,
        0.11269345, 0.10926372, 0.10734624, 0.10094201, 0.11920125,
        0.09529941, 0.06640099, 0.11326185, 0.08011833, 0.0745648 ,
        0.06267537, 0.13867272, 0.06208453, 0.05442725, 0.03051589,
        0.03192915, 0.02956706, 0.02804492, 0.03618066, 0.02752729,
        0.02645637, 0.0284795 , 0.02768154, 0.02494742, 0.02511004,
        0.03402368, 0.0351573 , 0.0294042 ],
       [0.9110082 , 1.2083749 , 1.0382272 , 1.021703  , 1.0071024 ,
        1.192074  , 0.61097956, 0.9199053 , 1.2847241 , 1.2516372 ,
        0.74275976, 0.91278875, 0.39567268, 0.78875756, 0.5710404 ,
        0.09672309, 0.09738339, 0.09096681, 0.10522888, 0.09947834,
        0.08568613, 0.04871767, 0.06752534, 0.06458824, 0.08320254,
        0.06654856, 0.12617496, 0.04241813, 0.03431138, 0.02548349,
   

In [15]:
# To compute scoring metrics over an iterable
# of utterances, use `score_corpus`.

rnn.score_corpus(x_valid[:2], y_valid[:2])

array([0.92950416, 1.03156981, 1.10099993, 0.92955761, 1.10328573,
       1.15329568, 1.32569464, 0.97892711, 1.16319145, 1.07440933,
       1.1075561 , 1.13369805, 0.57989904, 0.64439732, 0.57312193,
       0.10713908, 0.10513674, 0.10164146, 0.10241399, 0.11234034,
       0.09196135, 0.06027194, 0.09737282, 0.07476444, 0.07756144,
       0.06404831, 0.13433413, 0.05521093, 0.04744617, 0.02871928,
       0.03017358, 0.03125241, 0.02714093, 0.03272434, 0.02815488,
       0.02826203, 0.02547628, 0.02952454, 0.02303402, 0.02558742,
       0.03369197, 0.03112822, 0.03004007])

### Training models

In [16]:
# Run a single training step, using some input / output
# vector(s) (here, the data of two training utterances).

rnn.run_training_function(x_train[:2], y_train[:2])

In [17]:
# Here is an example of a (shortened) training loop.

# You may want to add further actions on the regular
# score check-ups, such as focusing on parts of the
# score output, de-normalizing it, checking whether it
# is improving, saving the model and/or triggering
# early stopping criteria...

# Here, we train on random batches of ten utterances
# and check what the (raw) scores are every twenty
# batches.

import numpy as np

# Use a dedicated, seeded random generator so that the
# batches drawn are the same each time this cell is run
# (instead of depending on the global numpy random state).
rng = np.random.RandomState(0)

for step in range(100):
    # Regular check-up of the scores on the validation set.
    if step % 20 == 0:
        scores = rnn.score_corpus(x_valid, y_valid)
        print('Step %3i:' % step, scores)
    # Draw ten distinct training utterances and train on them.
    batch = rng.choice(len(x_train), size=10, replace=False)
    rnn.run_training_function(x_train[batch], y_train[batch])

Step   0: [1.02978767 1.06072456 0.98860962 1.10188606 1.18272216 1.30777922
 1.02376768 0.91330324 1.16536146 0.98272865 1.50169419 1.00942094
 1.73960479 2.46286461 0.58550236 0.10208998 0.11745203 0.09855865
 0.1080503  0.1004002  0.10037166 0.07912243 0.08744027 0.06995177
 0.08590896 0.09131701 0.12749374 0.07887209 0.07203317 0.03217299
 0.03263936 0.03188306 0.02505049 0.03192314 0.02903864 0.02360825
 0.02817176 0.02425792 0.0212578  0.02957154 0.03363444 0.03118113
 0.04180009]
Step  20: [0.86497475 0.80858884 0.83272146 0.87148201 0.8509646  0.72916925
 0.90430639 0.76306025 0.91811251 0.86246549 0.97012825 0.79723529
 0.85910932 0.93779301 0.35138728 0.08955047 0.0976416  0.08224566
 0.09176718 0.08457348 0.07618139 0.06889063 0.06859979 0.06323359
 0.07398233 0.08094213 0.09633651 0.06750751 0.05592361 0.04564988
 0.0606576  0.04556537 0.04400237 0.04660961 0.06413935 0.03622612
 0.05571635 0.02378946 0.04429782 0.02989118 0.07068502 0.04077778
 0.03127039]
Step  40: [0.796

### Saving and restoring models

In [18]:
# Save the model to a .npy file, which is used in the
# following cells to restore and re-instantiate it.

rnn.save_model('dummy_rnn.npy')

In [19]:
# Reset the model's weights to random values.

rnn.reset_model()

In [20]:
# Restore the model's weights from a previously saved state.

rnn.restore_model('dummy_rnn.npy')

In [21]:
# Re-instantiate the model from the dump file and restore it.

rnn = ac2art.networks.load_dumped_model('dummy_rnn.npy')

### Running the inversion

In [22]:
# Running the inversion requires having produced normalized acoustic
# features and trained a neural network for ac2art inversion.

# Do read the docs to see which data formats may be used as inputs
# and outputs.

help(ac2art.run_inversion)

Help on function run_inversion in module ac2art._invert:

run_inversion(source, inverter, destination, keep_channels=None)
    Run acoustic-to-articulatory inversion of a set of features.
    
    Requires pre-computed acoustic features and a pre-trained
    acoustic-to-articulatory inverter neural network.
    
    source        : path to the **normalized** input features, which may
                    be stored as a single ark, scp or ark-like txt file,
                    or as npy files in a given folder
    inverter      : NeuralNetwork-inheriting instance, or path to
                    a .npy file recording a dumped model of such kind
    destination   : path where to output the inverted features, which
                    may be written as .npy files in a given folder or
                    compiled in a .ark, .scp or ark-like .txt file
    keep_channels : optional list of indexes of channel of inverted
                    features to keep (default None, implying all)



### Extract features to compute ABX metrics

In [23]:
# Features are extracted using the `<corpus>.abx.extract_h5_features`
# function, whose documentation is displayed here.

help(mocha.abx.extract_h5_features)

Help on function extract_h5_features in module ac2art.corpora.prototype.abx._abx:

extract_h5_features(audio_features=None, ema_features=None, inverter=None, output_name='mocha_features', articulators=None, dynamic_ema=True, sampling_rate=100)
    Build an h5 file recording audio features associated with mocha data.
    
    audio_features : optional name of audio features to use, including
                     normalization indications
    ema_features   : optional name of ema features' normalization to use
                     (use '' for raw data and None for no EMA data)
    inverter       : optional acoustic-articulatory inverter whose
                     predictions to use, based on the audio features
    output_name    : base name of the output file (default 'mocha_features')
    articulators   : optional list of articulators to keep among EMA data
    dynamic_ema    : whether to include dynamic articulatory features
                     (bool, default True)
    sampling_rate  

In [24]:
# Here, we extract the normalized articulatory (EMA) features
# to an h5 file whose base name is 'mocha_ema'.

mocha.abx.extract_h5_features(
    ema_features='stds_byspeaker', output_name='mocha_ema'
)

In [25]:
# Here, we invert the normalized acoustic features using the
# trained network (`rnn`), and record its predictions in an
# h5 file whose base name is 'mocha_inv_rnn'.

mocha.abx.extract_h5_features(
    audio_features='mfcc_stds_byspeaker', inverter=rnn,
    output_name='mocha_inv_rnn'
)

### Compute ABX metrics

In [26]:
# ABX metrics are computed using the `<corpus>.abx.abx_from_features`
# function, whose documentation is displayed here.

help(mocha.abx.abx_from_features)

Help on function abx_from_features in module ac2art.corpora.prototype.abx._abx:

abx_from_features(features, fileset=None, byspeaker=True, limit_phones=False, n_jobs=1)
    Run the ABXpy pipeline on a set of pre-extracted mocha features.
    
    features     : name of a h5 file of mocha features created with
                   the `extract_h5_features` function (str)
    fileset      : optional name of a fileset whose utterances'
                   features to use (str)
    byspeaker    : whether to discriminate pairs from the same
                   speaker only (bool, default True)
    limit_phones : whether to aggregate some phonemes, using
                   the 'ipa_reduced' column of the mocha symbols
                   file as mapping (bool, default False)
    n_jobs       : number of CPU cores to use (positive int, default 1)



In [27]:
# Here, we compute ABX metrics on the validation set, using
# the true articulatory data extracted earlier in this notebook
# (i.e. the 'mocha_ema' h5 features file).

mocha.abx.abx_from_features(
    features='mocha_ema', fileset='validation'
)

Using found /home/archeo/Bureau/stage_coml/datasets/processed_mocha_timit/abx/mocha_validation_byspk_task.abx file.
ABXpy distance module was successfully run.
ABXpy score module was successfully run.
ABXpy analyze module was successfully run.
Done running ABXpy. Results were written to '/home/archeo/Bureau/stage_coml/datasets/processed_mocha_timit/abx/mocha_ema_validation_byspk_abx.csv'.
Replacing phoneme symbols with IPA ones...
Done updating scores file.


### Load the computed ABX scores

In [28]:
# Use the `<corpus>.abx.load_abx_scores` function.
# This function reads the produced csv file and aggregates scores on the fly.
# NOTE(review): the argument looks like the name of the scores file written
# by `abx_from_features`, stripped of its '_abx.csv' suffix - confirm.

mocha.abx.load_abx_scores('mocha_ema_validation_byspk')

Unnamed: 0_level_0,score,n
phones,Unnamed: 1_level_1,Unnamed: 2_level_1
aɪ_iː,1.000000,4
aɪ_j,1.000000,4
aɪ_m,1.000000,4
aɪ_y,1.000000,4
aɪ_æ,1.000000,4
aɪ_ɑː,0.937500,16
aɪ_ə,0.850000,100
aɪ_ɛ,0.583333,12
aɪ_ɛɪ,0.500000,8
aɪ_ɪ,0.833333,24
