# Training Acoustic Model with Connectionist Temporal Classification (CTC) Criteria
The CNTK implementation of CTC is based on the paper by A. Graves et al., *"Connectionist temporal classification: labeling unsegmented sequence data with recurrent neural networks"*. The implementation trains on several sequences in parallel, on either GPU or CPU, to maximize hardware utilization.

## Data Preparation
CNTK consumes Acoustic Model (AM) training data in HTK/MLF format and typically expects three input files:
* [SCP file with features](https://github.com/Microsoft/CNTK/blob/master/Tests/EndToEndTests/Speech/Data/glob_0000.scp)
* [MLF file with labels](https://github.com/Microsoft/CNTK/blob/master/Tests/EndToEndTests/Speech/Data/glob_0000.mlf)
* [States list file](https://github.com/Microsoft/CNTK/blob/master/Tests/EndToEndTests/Speech/Data/state_ctc.list)

The example state list file contains the CTC blank label "s_blank" as the last entry, i.e. at index 132.

## Feature Input Definition


## Normalize Features and Define a Network with LSTM Layers

In [1]:
import os
import cntk as C
import numpy as np


# Select the target device: when the TEST_DEVICE environment variable is set,
# 'cpu' selects the CPU and any other value selects the first GPU. When it is
# not set, the default device is left untouched.
requested_device = os.environ.get('TEST_DEVICE')
if requested_device is not None:
    target_device = C.device.cpu() if requested_device == 'cpu' else C.device.gpu(0)
    C.device.try_set_default_device(target_device)

# Location of the sample speech data, relative to the directory the tutorial
# is started from.
data_dir = os.path.join("..", "Tests", "EndToEndTests", "Speech", "Data")
print("Current directory {0}".format(os.getcwd()))

# data_dir is relative, so once we have changed into it, resolving it again
# points somewhere else and the realpath comparison alone can never report
# "already there". Re-running the cell would then call os.chdir() on a path
# that does not exist. Also recognize the data directory by the trailing
# components of the current directory so the cell can be run repeatedly.
current_dir = os.path.realpath(os.getcwd())
data_dir_suffix = os.sep + os.path.join("Tests", "EndToEndTests", "Speech", "Data")
already_in_data_dir = os.path.normcase(current_dir).endswith(
    os.path.normcase(data_dir_suffix))

if not already_in_data_dir and os.path.realpath(data_dir) != current_dir:
    print("Changing to data directory {0}".format(data_dir))
    os.chdir(data_dir)

# Sequence inputs of the network: one for the acoustic features and one for
# the labels.
feature_dimension = 33
label_dimension = 133
feature = C.sequence.input(feature_dimension)
label = C.sequence.input(label_dimension)

# File names are relative to the current working directory.
train_feature_filepath = "glob_0000.scp"
train_label_filepath = "glob_0000.mlf"
mapping_filepath = "state_ctc.list"

# Feature deserializer reading the SCP file; the stream is exposed under the
# name "amazing_feature".
feature_stream_defs = C.io.StreamDefs(
    amazing_feature=C.io.StreamDef(shape=feature_dimension,
                                   scp=train_feature_filepath))
train_feature_stream = C.io.HTKFeatureDeserializer(feature_stream_defs)

# Label deserializer reading the MLF file with the state list as mapping; the
# stream is exposed under the name "awesome_label".
# NOTE(review): the trailing True is passed positionally, as in the original;
# presumably it enables phoneme-boundary handling - confirm against the
# HTKMLFDeserializer signature.
label_stream_defs = C.io.StreamDefs(
    awesome_label=C.io.StreamDef(shape=label_dimension,
                                 mlf=train_label_filepath))
train_label_stream = C.io.HTKMLFDeserializer(mapping_filepath,
                                             label_stream_defs,
                                             True)

# Combine both deserializers into one minibatch source and map the network
# inputs to the corresponding streams.
train_data_reader = C.io.MinibatchSource(
    [train_feature_stream, train_label_stream],
    frame_mode=False)
train_input_map = {
    feature: train_data_reader.streams.amazing_feature,
    label: train_data_reader.streams.awesome_label,
}


Current directory D:\CNTK\CNTK\Tutorials
Changing to data directory ..\Tests\EndToEndTests\Speech\Data


In [2]:
# Load the precomputed global statistics; only the first feature_dimension
# values of each file are read.
# NOTE(review): the second file is named "var.363" but its content is used as
# an inverse standard deviation - confirm what the file actually stores.
stats_dir = "GlobalStats"
feature_mean = np.fromfile(os.path.join(stats_dir, "mean.363"),
                           dtype=float, count=feature_dimension)
feature_inverse_stddev = np.fromfile(os.path.join(stats_dir, "var.363"),
                                     dtype=float, count=feature_dimension)

# Normalize the input features with the loaded statistics.
feature_normalized = (feature - feature_mean) * feature_inverse_stddev

# Three stacked recurrent LSTM layers with 1024 cells each, followed by a
# dense layer with one output per label. The layers are created inside the
# default_options block so that the sigmoid default activation applies.
with C.default_options(activation=C.sigmoid):
    lstm_layers = C.layers.For(
        range(3),
        lambda: C.layers.Recurrence(C.layers.LSTM(1024)))
    output_layer = C.layers.Dense(label_dimension)
    network = C.layers.Sequential([lstm_layers, output_layer])
    z = network(feature_normalized)

## Define Training Parameters, Criteria and Error
The CTC criterion is implemented by a combination of the **labels_to_graph** and **forward_backward** functions.

In [3]:
# Training schedule: minibatch size passed to the reader, number of
# minibatches per epoch and number of epochs.
mbsize = 1024
mbs_per_epoch = 10
max_epochs = 5

# Index of the CTC blank label, used by both the criterion and the error.
blank_token_id = 132

# CTC training criterion: forward-backward over the label graph and the
# network output.
label_graph = C.labels_to_graph(label)
criteria = C.forward_backward(label_graph, z,
                              blankTokenId=blank_token_id,
                              delayConstraint=3)

# Evaluation metric: edit distance between network output and labels, with
# the blank token ignored.
err = C.edit_distance_error(z, label,
                            squashInputs=True,
                            tokensToIgnore=[blank_token_id])

# Momentum SGD learner with per-sample learning rate and momentum schedules.
lr = C.learning_rate_schedule([(3, .01), (1, .001)], C.UnitType.sample)
mm = C.momentum_schedule([(1000, 0.9), (0, 0.99)], mbsize)
learner = C.momentum_sgd(z.parameters, lr, mm)

trainer = C.Trainer(z, (criteria, err), learner)

## Train and Save the Model

In [4]:
# Report the model size and set up per-epoch progress reporting.
C.logging.log_number_of_parameters(z)
progress_printer = C.logging.progress_print.ProgressPrinter(tag='Training',
                                                            num_epochs=max_epochs)

# Run max_epochs epochs of mbs_per_epoch minibatches each.
for epoch in range(max_epochs):
    for mb in range(mbs_per_epoch):
        # Fetch the next minibatch, train on it and record the progress.
        minibatch = train_data_reader.next_minibatch(mbsize,
                                                     input_map=train_input_map)
        trainer.train_minibatch(minibatch)
        progress_printer.update_with_trainer(trainer, with_metric=True)

    # Cumulative number of samples the trainer has seen so far.
    samples_seen = trainer.total_number_of_samples_seen
    print('Trained on a total of ' + str(samples_seen) + ' frames')
    progress_printer.epoch_summary(with_metric=True)

# Uncomment to save the model
# z.save('CTC_' + str(max_epochs) + 'epochs_' + str(mbsize) + 'mbsize_' + str(mbs_per_epoch) + 'mbs.model')

Training 21255301 parameters in 11 parameter tensors.
Trained on a total of 8428 frames
Finished Epoch[1 of 5]: [Training] loss = 3.720116 * 8428, metric = 100.00% * 8428 46.220s (182.3 samples/s);
Trained on a total of 17094 frames
Finished Epoch[2 of 5]: [Training] loss = 3.513337 * 8666, metric = 98.07% * 8666 34.085s (254.2 samples/s);
Trained on a total of 25662 frames
Finished Epoch[3 of 5]: [Training] loss = 3.498874 * 8568, metric = 98.23% * 8568 38.782s (220.9 samples/s);
Trained on a total of 35282 frames
Finished Epoch[4 of 5]: [Training] loss = 3.512962 * 9620, metric = 98.23% * 9620 36.212s (265.7 samples/s);
Trained on a total of 43890 frames
Finished Epoch[5 of 5]: [Training] loss = 3.508142 * 8608, metric = 98.12% * 8608 37.102s (232.0 samples/s);


## Test the Model

For simplicity, we will use a portion of the training set for testing here.

In [5]:
# Take one more minibatch from the training reader and evaluate the trainer's
# metric on it.
test_data = train_data_reader.next_minibatch(mbsize, input_map=train_input_map)
test_error = trainer.test_minibatch(test_data)

# Print the metric rounded to two decimals.
print(round(test_error, 2))

0.98
