In [None]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from matplotlib.ticker import MaxNLocator
from python_speech_features import logfbank
from scipy.io import wavfile
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef
from sklearn.metrics import precision_score, f1_score, confusion_matrix
from sklearn.metrics._plot.confusion_matrix import ConfusionMatrixDisplay
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Reshape, BatchNormalization, Dropout
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, ReLU
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

from KWS_utils import *

In [None]:
def create_model():
    """
    Build the convolutional network used as a hotword feature extractor.

    The network is a reshape + batch-norm front end, one
    conv / batch-norm / ReLU / max-pool / dropout stage per entry in
    ``filters``, two named dense stages ("features512" and "features256")
    and a softmax output over ``NUM_CLASSES``. The model summary is printed
    before returning.
    :return: Uncompiled Sequential model
    """

    net = Sequential()

    # Front end: reshape the raw input, then normalise it.
    stack = [
        Reshape(input_shape=INPUT_SHAPE, target_shape=TARGET_SHAPE),
        BatchNormalization(),
    ]

    # One convolution + pooling stage per configured filter count.
    for n_kernels in filters:
        stack += [
            Conv2D(n_kernels, kernel_size=KERNEL_SIZE, padding="same"),
            BatchNormalization(),
            ReLU(),
            MaxPooling2D(pool_size=POOL_SIZE),
            Dropout(DROPOUT),
        ]

    # Classification head: flatten, then two named dense stages.
    stack.append(Flatten())
    for width, tag in ((DENSE_1, "features512"), (DENSE_2, "features256")):
        stack += [
            Dense(width, name=tag),
            BatchNormalization(),
            ReLU(),
            Dropout(DROPOUT),
        ]
    stack.append(Dense(NUM_CLASSES, activation="softmax"))

    # Layers were instantiated in network order; attach them in that order.
    for layer in stack:
        net.add(layer)

    net.summary()

    return net

In [None]:
def model_train():
    """
    Trains model which is used as a feature extractor

    Downloads the speech-commands data, builds the train and validation input
    pipelines, fits the network returned by create_model() with early stopping
    and learning-rate reduction (both driven by validation loss), then saves
    the trained model and its training history under ../models/ and plots the
    history.
    :return: None
    """

    data_path = "/input/speech_commands/"
    model_path = "../models/sheila_kws.h5"
    history_path = "../models/sheila_kws_history.pickle"

    # Download data
    downloadData(data_path=data_path)

    # Get data dictionary
    dataDict = getDataDict(data_path=data_path)

    # Obtain dataframe for each dataset
    trainDF = getDataframe(dataDict["train"])
    valDF = getDataframe(dataDict["val"])
    devDF = getDataframe(dataDict["dev"])
    testDF = getDataframe(dataDict["test"])

    print("Dataset statistics")
    print("Train files: {}".format(trainDF.shape[0]))
    print("Validation files: {}".format(valDF.shape[0]))
    print("Dev test files: {}".format(devDF.shape[0]))
    print("Test files: {}".format(testDF.shape[0]))

    # Use TF Data API for efficient data input
    train_data, train_steps = getDataset(df=trainDF, batch_size=BATCH_SIZE, cache_file="train_cache", shuffle=True)

    val_data, val_steps = getDataset(df=valDF, batch_size=BATCH_SIZE, cache_file="val_cache", shuffle=False)

    # create_model() prints the model summary itself, so it is not repeated here.
    model = create_model()

    # Stop training if the validation loss doesn't improve
    earlyStopping = EarlyStopping(monitor="val_loss", patience=PATIENCE, verbose=1)

    # Reduce LR on validation loss plateau. The patience is shorter than the
    # early-stopping patience: with equal values the LR would only be lowered
    # on the epoch where training stops, so the reduction could never help.
    reduceLR = ReduceLROnPlateau(monitor="val_loss", patience=max(1, PATIENCE // 2), verbose=1)

    # Compile the model
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=Adam(learning_rate=LEARNING_RATE),
        metrics=["sparse_categorical_accuracy"],
    )

    # Train the model. Both datasets repeat indefinitely, so the explicit
    # step counts define where each epoch / validation pass ends.
    history = model.fit(
        train_data.repeat(),
        steps_per_epoch=train_steps,
        validation_data=val_data.repeat(),
        validation_steps=val_steps,
        epochs=EPOCHS,
        callbacks=[earlyStopping, reduceLR],
    )

    # Make sure the output directory exists so a finished training run is not
    # lost to a missing folder at save time.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    # Save model
    print("Saving model")
    model.save(model_path)

    # Save history data
    print("Saving training history")
    with open(history_path, "wb") as file:
        pickle.dump(history.history, file, protocol=pickle.HIGHEST_PROTOCOL)

    plot_history(history=history)


In [None]:
def model_test():
    """
    Tests the Sheila hotword detector

    Merges the dev and test splits (including unknown categories), labels each
    row +1 for "sheila" and -1 otherwise, extracts the "features256" layer
    activations of the trained model for every sample and passes them with the
    true labels to OC_Statistics.
    :return: None
    """

    data_path = "/input/speech_commands/"

    # Download data
    downloadData(data_path=data_path)

    # Get dictionary with files and labels
    dataDict = getDataDict(data_path=data_path)

    # Obtain dataframe by merging dev and test dataset
    devDF = getDataframe(dataDict["dev"], include_unknown=True)
    testDF = getDataframe(dataDict["test"], include_unknown=True)

    evalDF = pd.concat([devDF, testDF], ignore_index=True)

    print("Test files: {}".format(evalDF.shape[0]))

    # Obtain Sheila - Other separated data: +1 for "sheila", -1 for the rest.
    # Vectorised so it also handles an empty frame and avoids a per-row lambda.
    evalDF["class"] = np.where(evalDF["category"] == "sheila", 1, -1)
    # NOTE(review): the original called evalDF.drop("category", axis=1) and
    # discarded the result, so the column was never removed. It is kept here
    # because getDataset may rely on it -- confirm before actually dropping.
    test_true_labels = evalDF["class"].tolist()

    eval_data, _ = getDataset(df=evalDF, batch_size=BATCH_SIZE, cache_file="kws_val_cache", shuffle=False)

    # Load trained model (same path model_train() saves to)
    model = load_model("../models/sheila_kws.h5")

    # Expose the second dense layer's output as the embedding.
    layer_name = "features256"
    feature_extractor = Model(inputs=model.input, outputs=model.get_layer(layer_name).output)

    # Extract the feature embeddings. use_multiprocessing is not passed: it
    # only applies to generator / Sequence inputs and newer Keras rejects it.
    X_test = feature_extractor.predict(eval_data)

    # Evaluate the embeddings against the true labels (OC_Statistics is
    # defined outside this notebook section).
    OC_Statistics(X_test, test_true_labels, "sheila_cm_without_noise")


In [None]:
# Run training followed by evaluation when executed as the top-level module.
# The guard must compare against "__main__"; the previous "main" never matched,
# so neither step ever ran.
if __name__ == "__main__":
    model_train()
    model_test()
