# CNN

The following notebook presents the CNN approach to sentence boundary detection. Similarly to previous attempts, it also has the problem of not allowing variable-length input. Thus, what we concentrate on here is a fixed amount of time before and after the break. This might (in the case of a pause) or might not imply discontinuities. In general, all features are first calculated on the original, long recording and only then split into smaller word-level elements depending on the frame size. This is very important: if we calculated the features on the cut data, we would distort the edges and hence change what the situation looks like.

If one does not want to use all available GPUs, the following cell sets the devices to be used. Here, we use the devices with ids 1 and 2.

In [1]:
import os
# Order CUDA devices by their PCI bus id and expose only the devices with
# ids 1 and 2 to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1,2",
})

We start by importing all the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import glob as glob
import os
from sklearn.utils import class_weight
from keras.layers import Dense, Dropout, Concatenate, Input, MaxPool1D, MaxPool2D, AveragePooling1D, AveragePooling2D, Flatten, Conv2D, Conv1D, BatchNormalization, ReLU
from keras.models import Model, load_model
from keras import optimizers
from keras.constraints import max_norm
from keras.regularizers import l2
import keras.backend as K
from keras.callbacks import ModelCheckpoint, TensorBoard, EarlyStopping
from datetime import datetime
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
import librosa
from sklearn.model_selection import train_test_split
from preprocessing import create_data_frame
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

Using TensorFlow backend.


We create the lists of training and validation files by globbing the previously prepared directories. The expected data format is RTTM, which is present in the broadcast and switchboard datasets.

In [2]:
# glob.glob returns its matches in arbitrary (filesystem-dependent) order;
# sorting keeps the order in which files are processed reproducible.
training_files = sorted(glob.glob("data/broadcast/train/*.rttm"))
validation_files = sorted(glob.glob("data/broadcast/validation/*.rttm"))
# Alternative kept for reference: a random 90/10 split of a single file list.
#training_files, validation_files = train_test_split(files, test_size=0.1)

The `prepare_data` routine takes the basic information about a file and returns all the necessary features. It operates on frames rather than on time, hence the factor $\dfrac{sr}{512}$ used to convert seconds into frame indices (512 samples being the hop length between consecutive frames).

In [3]:
def _zero_padded_window(feature, start, stop):
    """Return ``feature[:, start:stop]`` with exactly ``stop - start`` columns.

    Columns requested before frame 0 or past the last frame are filled with
    zeros, so the result has a fixed width even at the edges of a recording.
    """
    n_rows, n_frames = feature.shape
    width = max(stop - start, 0)
    lo = min(max(start, 0), n_frames)
    hi = min(max(stop, lo), n_frames)
    window = feature[:, lo:hi]
    if window.shape[1] == width:
        # Fully inside the recording: identical to a plain slice.
        return window

    pad_left = min(max(-start, 0), width)
    pad_right = width - pad_left - window.shape[1]
    return np.concatenate(
        (
            np.zeros((n_rows, pad_left), dtype=feature.dtype),
            window,
            np.zeros((n_rows, pad_right), dtype=feature.dtype),
        ),
        axis=1,
    )


def prepare_data(df, audio, sr, size, hop_length=512):
    """Extract fixed-width feature windows around every word boundary.

    Chroma, MFCC (50 coefficients) and RMS energy are computed once on the
    whole recording. For every row of ``df`` the last ``size`` frames of the
    current word are joined with the first ``size`` frames of the next word,
    giving ``2 * size`` frames per sample.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per word; the columns ``beginning_time``, ``final_time``,
        ``pause``, ``turn`` and ``end_of_sentence`` are read. The next word is
        looked up with ``df.loc[index + 1]``, so consecutive integer labels
        are presumably expected -- TODO confirm against ``create_data_frame``.
    audio : np.ndarray
        Audio samples of the whole recording.
    sr : int
        Sampling rate of ``audio``.
    size : int
        Number of frames taken on each side of the boundary.
    hop_length : int, optional
        Hop length in samples, used both for the feature extraction and for
        converting times to frame indices. Defaults to 512.

    Returns
    -------
    tuple
        ``(mfcc_features, chroma_features, energy_features, pause, turn, y)``,
        six lists with one entry per row of ``df``.
    """
    # Obtain the chroma (pitch), mfcc and energy features
    print("Obtaining chroma")
    chroma = librosa.feature.chroma_stft(y=audio, sr=sr, hop_length=hop_length)
    print("Obtaining chroma [DONE]")
    print("Obtaining mfcc")
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=50, hop_length=hop_length)
    print("Obtaining mfcc [DONE]")
    print("Obtaining energy")
    energy = librosa.feature.rms(y=audio, hop_length=hop_length)
    print("Obtaining energy [DONE]")

    def to_frame(seconds):
        # Convert a time in seconds to the index of the matching frame.
        return int(round(seconds*sr/hop_length))

    mfcc_features = []
    chroma_features = []
    energy_features = []
    pause = []
    turn = []
    y = []
    collectors = (
        (mfcc, mfcc_features),
        (chroma, chroma_features),
        (energy, energy_features),
    )

    # Crawl through the rows of the data frame and add the features
    for index, row in df.iterrows():
        finish = to_frame(row['final_time'])
        moved_finish = finish - size

        # Only the lookup of the next word may legitimately fail (last row).
        try:
            next_begin = df.loc[index+1, 'beginning_time']
        except KeyError:
            next_begin = None

        if next_begin is not None:
            start_next = to_frame(next_begin)
        # As before, the last word only uses real trailing frames when more
        # than ``size`` frames remain after it; otherwise it is zero-padded.
        room_after_last = energy.shape[1] > finish + size

        for feature, collected in collectors:
            before = _zero_padded_window(feature, moved_finish, finish)
            if next_begin is not None:
                after = _zero_padded_window(feature, start_next, start_next + size)
            elif room_after_last:
                after = feature[:, finish:finish + size]
            else:
                after = np.zeros([feature.shape[0], size])
            collected.append(np.concatenate((before, after), axis=1))

        pause.append(row['pause'])
        turn.append(row['turn'])
        y.append(row['end_of_sentence'])

    return mfcc_features, chroma_features, energy_features, pause, turn, y

Before using the data we need to normalise it. This step is common in machine learning, as it can drastically decrease the training time and improve convergence. Here the standard normalisation is used, i.e. $\dfrac{x-\mu}{\sigma}$, where $\mu$ is the mean and $\sigma$ is the standard deviation.

In [5]:
def preprocess(files, mfcc_norm=None, energy_norm=None, pause_norm=None, size=10):
    """Load every annotation/audio pair, extract the features and normalise them.

    For each entry of ``files`` the audio is read from the file with the same
    base name and a ``.wav`` extension.

    Parameters
    ----------
    files : sequence of str
        Paths of the annotation files.
    mfcc_norm, energy_norm, pause_norm : tuple(mean, std), optional
        Normalisation constants. When ``None`` they are computed from the data
        loaded here; pass the values returned for the training set to apply
        the same normalisation to another set.
    size : int, optional
        Number of frames taken on each side of a word boundary (forwarded to
        ``prepare_data``). Defaults to 10, which gives the 20-frame windows
        that ``CNN_model`` expects.

    Returns
    -------
    tuple
        ``(features, y, mfcc_norm, energy_norm, pause_norm)`` where
        ``features`` is the list ``[mfcc, chroma, energy, pause, turn]`` of
        stacked arrays. Chroma and turn are stacked but not normalised.

    Raises
    ------
    ValueError
        If no samples were extracted (for example when ``files`` is empty).
    """
    features = [[], [], [], [], []]
    y = []

    for index, file in enumerate(files):
        print(f"Starting analysing file number {index + 1} out of {len(files)}")
        df = create_data_frame((file,))
        base = os.path.splitext(file)[0] + '.wav'

        print("Reading audio")
        audio, sr = librosa.load(base, sr=None)
        print("Reading audio [DONE]")
        current = prepare_data(df, audio, sr, size)
        for collected, new in zip(features, current[:5]):
            collected.extend(new)

        y += current[5]

    if not y:
        # Fail early with a clear message instead of NaN statistics followed
        # by an opaque np.stack error.
        raise ValueError("preprocess() found no samples to process; check the list of input files")

    print("Normalising data")
    # ``is None`` rather than truthiness: only a missing value should trigger
    # a recomputation of the statistics.
    if mfcc_norm is None:
        mfcc_norm = (np.mean(features[0]), np.std(features[0]))

    if energy_norm is None:
        energy_norm = (np.mean(features[2]), np.std(features[2]))

    if pause_norm is None:
        pause_norm = (np.mean(features[3]), np.std(features[3]))

    # MFCC and chroma get a trailing channel axis for the 2D convolutions.
    mfcc = (np.stack(features[0]) - mfcc_norm[0])/mfcc_norm[1]
    features[0] = np.expand_dims(mfcc, axis=-1)
    features[1] = np.expand_dims(np.stack(features[1]), axis=-1)

    # Swap the last two axes so energy is (sample, frame, 1) for the 1D convolutions.
    features[2] = np.transpose((np.stack(features[2]) - energy_norm[0])/energy_norm[1], (0, 2, 1))
    features[3] = (np.stack(features[3]) - pause_norm[0])/pause_norm[1]
    features[4] = np.stack(features[4])
    print("Normalising data [DONE]")

    return features, np.stack(y), mfcc_norm, energy_norm, pause_norm

Here we obtain the features together with the normalising factors, which are then reused for the validation (test) data.

In [6]:
# The normalisation constants come from the training files and are then
# reused unchanged for the validation files.
train_result = preprocess(training_files)
x_train, y_train = train_result[:2]
mfcc_norm, energy_norm, pause_norm = train_result[2:]

x_test, y_test = preprocess(validation_files, mfcc_norm, energy_norm, pause_norm)[:2]

Starting analysing file number 1 out of 19
Reading audio
Reading audio [DONE]
Obtaining chroma
Obtaining chroma [DONE]
Obtaining mfcc
Obtaining mfcc [DONE]
Obtaining energy
Obtaining energy [DONE]
Starting analysing file number 2 out of 19
Reading audio
Reading audio [DONE]
Obtaining chroma
Obtaining chroma [DONE]
Obtaining mfcc
Obtaining mfcc [DONE]
Obtaining energy
Obtaining energy [DONE]
Starting analysing file number 3 out of 19
Reading audio
Reading audio [DONE]
Obtaining chroma
Obtaining chroma [DONE]
Obtaining mfcc
Obtaining mfcc [DONE]
Obtaining energy
Obtaining energy [DONE]
Starting analysing file number 4 out of 19
Reading audio
Reading audio [DONE]
Obtaining chroma
Obtaining chroma [DONE]
Obtaining mfcc
Obtaining mfcc [DONE]
Obtaining energy
Obtaining energy [DONE]
Starting analysing file number 5 out of 19
Reading audio
Reading audio [DONE]
Obtaining chroma
Obtaining chroma [DONE]
Obtaining mfcc
Obtaining mfcc [DONE]
Obtaining energy
Obtaining energy [DONE]
Starting analys

As a sanity check we confirm that the shape of the training data is consistent. This test is quite manual, as it requires setting the expected shape by hand.

In [6]:
# Report every MFCC sample whose shape differs from the expected one.
expected_shape = (50, 20, 1)
for position, sample in enumerate(x_train[0]):
    if sample.shape == expected_shape:
        continue
    print("ups")
    print(sample.shape)
    print(position)

We also define the recall, precision, F1 and NIST scores, which are not available in keras/tensorflow by default.

In [7]:
# Define help functions for f-score, etc.
def recall_m(y_true, y_pred):
    """Recall: true positives divided by the number of actual positives.

    Values are clipped to [0, 1] and rounded before counting; K.epsilon()
    keeps the division defined when there are no actual positives.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision: true positives divided by the number of predicted positives.

    Values are clipped to [0, 1] and rounded before counting; K.epsilon()
    keeps the division defined when nothing is predicted positive.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of precision_m and recall_m."""
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    # K.epsilon() keeps the ratio defined when both scores are zero.
    ratio = (p*r)/(p+r+K.epsilon())
    return 2*ratio

def nist(y_true, y_pred):
    """NIST-style error rate: wrong predictions divided by the number of true positives.

    Predictions are clipped to [0, 1] and rounded before being compared with
    the labels. K.epsilon() is added to the denominator, as in the other
    metrics, so that a batch without any positive label does not divide by
    zero.
    """
    predicted = K.round(K.clip(y_pred, 0, 1))
    errors = K.sum(K.abs(predicted - y_true))
    return errors / (K.sum(y_true) + K.epsilon())

The neural model used in this notebook is a CNN network. It consists of multiple layers and uses some of the common techniques to counteract problems such as overfitting and data imbalance.
We start with five inputs. Each of them is treated a bit differently. All the time-dependent quantities (MFCC, chroma and energy) are fed through separate CNNs with a variable number of convolutional layers (2D for MFCC and chroma, 1D for energy). All the layers are normalised using batch normalisation, and ReLU is the activation function used. Batch normalisation is a very good measure against overfitting and comes from the idea that if it is good to normalise the input data, why not normalise the intermediate stages as well. In the end we flatten the layers and concatenate them with the pause duration and the information about the speaker turn. Afterwards, this is fed through fully connected layers that can be used together with a dropout layer. The dropout layer here counteracts overfitting and in essence gives the network the opportunity to test multiple models in one. The Adam optimizer is used as it is almost always the standard choice. Other optimizers seem not to converge as nicely as Adam does and take more time to run. The loss function used is the binary cross-entropy.

In [7]:
def _conv_branch(tensor, conv_cls, max_pool_cls, avg_pool_cls, kernel, extra_blocks):
    """Build one convolutional branch and return its flattened output.

    Layout: Conv(16) -> BN -> ReLU -> max pooling, then ``extra_blocks`` times
    Conv(32) -> BN -> ReLU, then average pooling and Flatten.
    """
    def conv_bn_relu(t, filters):
        t = conv_cls(filters, kernel, padding='same')(t)
        t = BatchNormalization()(t)
        return ReLU()(t)

    tensor = conv_bn_relu(tensor, 16)
    tensor = max_pool_cls()(tensor)
    for _ in range(extra_blocks):
        tensor = conv_bn_relu(tensor, 32)
    tensor = avg_pool_cls()(tensor)
    return Flatten()(tensor)


def CNN_model(conv_size, number_of_deep_layers, size_of_deep_layers, dropout_size):
    """Build and compile the five-input CNN for sentence boundary detection.

    Parameters
    ----------
    conv_size : int
        Number of extra 32-filter convolution blocks in each branch.
    number_of_deep_layers : int
        Number of dense ReLU layers after the concatenation.
    size_of_deep_layers : int
        Number of units in each of those dense layers.
    dropout_size : float
        Currently unused: the Dropout layers are disabled in this model.

    Returns
    -------
    keras.models.Model
        Model compiled with Adam, binary cross-entropy and the accuracy, F1,
        precision, recall and NIST metrics.
    """
    # Inputs: MFCC and chroma as single-channel images, energy as a sequence,
    # pause and turn as scalars.
    mfcc_input = Input(shape=(50, 20, 1))
    chroma_input = Input(shape=(12, 20, 1))
    energy_input = Input(shape=(20, 1))
    pause_input = Input(shape=(1,))
    turn_input = Input(shape=(1,))
    all_inputs = [mfcc_input, chroma_input, energy_input, pause_input, turn_input]

    # One branch per time-dependent feature: 2D for MFCC/chroma, 1D for energy.
    mfcc = _conv_branch(mfcc_input, Conv2D, MaxPool2D, AveragePooling2D, (3, 3), conv_size)
    chroma = _conv_branch(chroma_input, Conv2D, MaxPool2D, AveragePooling2D, (3, 3), conv_size)
    energy = _conv_branch(energy_input, Conv1D, MaxPool1D, AveragePooling1D, 3, conv_size)

    # Join the branch outputs with the scalar inputs and apply the dense head.
    x = Concatenate()([mfcc, chroma, energy, pause_input, turn_input])
    for _ in range(number_of_deep_layers):
        x = Dense(size_of_deep_layers, activation='relu')(x)
    x = Dense(1, activation='sigmoid')(x)

    # Compile the model
    model = Model(inputs=all_inputs, outputs=x)
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', f1_m, precision_m, recall_m, nist],
    )

    return model

As an example we can summarise a given network and see how many parameters it has. Depending on the setup this can vary significantly, from a couple of thousand to millions of parameters. These control how complex the function we are estimating can be and how easily it can overfit. Hence, with too few parameters we will underfit, and with too many we will overfit. To help visualise the network we also plot it.

In [8]:
# Build an example model, print its layer summary and render its graph as SVG.
model = CNN_model(
    conv_size=0,
    number_of_deep_layers=1,
    size_of_deep_layers=652,
    dropout_size=0.1,
)
model.summary()
SVG(model_to_dot(model).create(prog='dot', format='svg'))

W1006 19:35:01.345139 140189795104576 deprecation_wrapper.py:119] From /local/scratch/mac224/anaconda/envs/SUNDER/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1006 19:35:01.416724 140189795104576 deprecation_wrapper.py:119] From /local/scratch/mac224/anaconda/envs/SUNDER/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1006 19:35:01.443614 140189795104576 deprecation_wrapper.py:119] From /local/scratch/mac224/anaconda/envs/SUNDER/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1006 19:35:01.489161 140189795104576 deprecation_wrapper.py:119] From /local/scratch/mac224/anaconda/envs/SUNDER/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The n

NameError: name 'f1_m' is not defined

To tune the hyperparameters we define some lists of values to test. These allow us to run long experiments and see in which direction we should perturb the network for it to work best. The validation set can have a big influence on the obtained result; hence, we use the fixed one obtained above.

In [None]:
# Define layers to test
conv_sizes = [2,]#np.arange(0, 10)
numbers_hidden = [1,]#np.arange(1, 5)
sizes_hidden = [652,]#np.arange(20, 160, 40)
dropout_sizes = [0,]#np.arange(0.2, 0.7, 0.1)

# Define the weights to use. Keyword arguments are used because recent
# scikit-learn releases no longer accept `classes` and `y` positionally.
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# Keras expects `class_weight` as a dict mapping class index -> weight.
# The dict used to be built under a misspelt name and the raw array was
# passed to fit() instead, so the balancing did not reach the training.
# NOTE(review): enumerate() assumes the labels are 0..k-1 -- confirm.
class_weights = {index: value for index, value in enumerate(weights)}

# ModelCheckpoint writes into this directory; make sure it exists.
os.makedirs('training', exist_ok=True)

# Go through the possible sizes and evaluate the model
for dropout in dropout_sizes:
    for conv in conv_sizes:
        for size in sizes_hidden:
            for number in numbers_hidden:
                model = CNN_model(conv, number, size, dropout)

                name = f'Broadcast-new-validation-no-dropout-CNN-mfcc-chroma-energy-pause-turn-automatic-input-weights-number-conv-{conv}-dropout-{dropout}-number-hidden-{number}-size-hidden-{size}-{datetime.now().strftime("%d_%m_%Y-%H_%M_%S")}'
                # Keep only the weights with the best validation F1 score.
                checkpoint = ModelCheckpoint(f'training/{name}.h5', monitor='val_f1_m', verbose=1, save_best_only=True, mode='max')
                tensorboard = TensorBoard(log_dir=f"training/tensorboard/{name}")
                #early_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=6, min_delta=0.001)

                model.fit(x_train, y_train, epochs=1000, batch_size=1000, callbacks=[checkpoint, tensorboard], validation_data=(x_test, y_test), shuffle=True, class_weight=class_weights)