# Code for creating classes and for running experiments

At the beginnning of our work we got inspired by [this](https://github.com/rachhek/speech_recognition_using_lstm) repository.

In [None]:
import copy
import csv
import os
import warnings
from itertools import compress

from matplotlib import pyplot
import numpy as np
import pandas as pd
import seaborn as sns 
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,normalize
from sklearn.preprocessing import MinMaxScaler

import scipy.io.wavfile as wav
from scipy.signal import resample
from python_speech_features import mfcc
from python_speech_features import logfbank

import keras
from keras.layers import SimpleRNN, LSTM, GRU
from keras.layers import Dense, Dropout, Embedding, Masking, Bidirectional, Flatten, SpatialDropout1D, SpatialDropout2D, SpatialDropout3D
from keras.callbacks import EarlyStopping
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.utils import plot_model
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

from sklearn.metrics import accuracy_score

warnings.filterwarnings("ignore", category=DeprecationWarning)

AUDIO_FORMAT = "WAV"

In [None]:
def read_data(
    train_audio_path_without_silence, 
    amount=100, 
    new_sample_rate = 8000, 
    number_of_cepstral_coefficients = 13, 
    if_unknown=False,
    silence_mode="silence",
):

    """
    Function reading data from given location and preprocessing them.

    """

    class_names = []

    for file in os.listdir(train_audio_path_without_silence):
        d = os.path.join(train_audio_path_without_silence, file)
        if os.path.isdir(d):
            class_names.append(str(os.path.basename(d)))
    
    if not if_unknown:
        class_names.remove('unknown')

    if silence_mode == "silence":
        class_names.remove("silence_background_noise")
    else:
        class_names.remove("silence")

    scaler = MinMaxScaler(feature_range=(0,1))
    class_list_train = []
    mfcc_list_train = []

    for class_name in class_names:
        class_path_without_silence = f"{train_audio_path_without_silence}/{class_name}"
        number=0
        
        class_files = [f for f in os.listdir(class_path_without_silence) 
            if os.path.isfile(os.path.join(class_path_without_silence, f))]
        for class_file in class_files:
            
            if number> amount:
                print(f'class: {class_name}, number: {number}')
                break

            try:
                # read wavfile
                sample_rate, sample = wav.read(f"{class_path_without_silence}/{class_file}")

                # resample 16k -> 8k
                sample = resample(sample, int(new_sample_rate/sample_rate * sample.shape[0]))

                # padding
                sample = pad_sequences([sample], maxlen=8000, dtype='float', padding='post', truncating='post', value=0.0)

                # normalization
                if max:= np.max(np.abs(sample)) != 0:
                    sample = sample / max
        
                # mfcc
                mfccs = mfcc(
                    sample,
                    new_sample_rate,
                    numcep=number_of_cepstral_coefficients,
                )

                # normalization of mfcc
                scaler = scaler.fit(mfccs)
                normalized = scaler.transform(mfccs)

                mfcc_list_train.append(normalized)
                class_list_train.append(class_name)

                number+=1
            except ValueError:
                pass


    #One hot encoding the labels
    label_encoder_test = LabelEncoder()
    class_list_train_2=copy.deepcopy(class_list_train)
    class_list_train_2.append("zzz_unknown") #potrzebne do test√≥w z unknown
    vec_labels_train = label_encoder_test.fit_transform(class_list_train_2)

    one_hot_labels_train = keras.utils.to_categorical(vec_labels_train[:-1], num_classes=len(class_names))
    print(vec_labels_train)
    print(one_hot_labels_train.shape)
    Y_train = one_hot_labels_train
    X_train = np.array(mfcc_list_train,dtype=np.float32)

    return X_train, Y_train, label_encoder_test, class_names


In [None]:
train_audio_path_without_silence = '/work/deep_learning/project_2/train/audio_without_silence'


X_train, Y_train, label_encoder_unknownTrue_silence, class_names_unknownTrue_silence = read_data(train_audio_path_without_silence, amount=50, if_unknown=True, silence_mode="silence")
X_train_unknownTrue_silence, X_valid_unknownTrue_silence, y_train_unknownTrue_silence, y_valid_unknownTrue_silence = train_test_split(X_train, Y_train, test_size=0.33, random_state=42, shuffle=True)

X_train, Y_train, label_encoder_unknownFalse_silence, class_names_unknownFalse_silence = read_data(train_audio_path_without_silence, amount=50, if_unknown=False, silence_mode="silence")
X_train_unknownFalse_silence, X_valid_unknownFalse_silence, y_train_unknownFalse_silence, y_valid_unknownFalse_silence = train_test_split(X_train, Y_train, test_size=0.33, random_state=42, shuffle=True)

X_train, Y_train, label_encoder_unknownTrue_silencebn, class_names_unknownTrue_silencebn = read_data(train_audio_path_without_silence, amount=50, if_unknown=True, silence_mode="silence_background_noise")
X_train_unknownTrue_silencebn, X_valid_unknownTrue_silencebn, y_train_unknownTrue_silencebn, y_valid_unknownTrue_silencebn = train_test_split(X_train, Y_train, test_size=0.33, random_state=42, shuffle=True)

X_train, Y_train, label_encoder_unknownFalse_silencebn, class_names_unknownFalse_silencebn = read_data(train_audio_path_without_silence, amount=50, if_unknown=False, silence_mode="silence_background_noise")
X_train_unknownFalse_silencebn, X_valid_unknownFalse_silencebn, y_train_unknownFalse_silencebn, y_valid_unknownFalse_silencebn = train_test_split(X_train, Y_train, test_size=0.33, random_state=42, shuffle=True)

class: eight, number: 51
class: cat, number: 51
class: house, number: 51
class: down, number: 51
class: on, number: 51
class: off, number: 51
class: sheila, number: 51
class: four, number: 51
class: six, number: 51
class: seven, number: 51
class: one, number: 51
class: five, number: 51
class: happy, number: 51
class: bird, number: 51
class: go, number: 51
class: bed, number: 51
class: nine, number: 51
class: marvin, number: 51
class: right, number: 51
class: no, number: 51
class: left, number: 51
class: silence, number: 51
class: dog, number: 51
class: stop, number: 51
[ 6  6  6 ... 24 24 25]
(1230, 25)


In [None]:
class SimpleRNN_class:
    """
    Class for creating and dealing with Simple RNN network.
    """

    def __init__(self, units, num_classes, bidirectional=False, dropout=0.0):
        self.units = units
        self.num_classes = num_classes

        rnn = SimpleRNN(self.units, return_sequences=False)
        if bidirectional:
            rnn = Bidirectional(rnn)

        self.model = Sequential()
        self.model.add(rnn)
        self.model.add(Dropout(dropout))      
        self.model.add(Dense(self.num_classes, activation='softmax'))

        self.bidirectional = bidirectional
        self.dropout = dropout
  
    def train(self, X_train, y_train, X_valid, y_valid, learning_rate=0.001, epochs=300, batch_size=64, silence="silence", unknown_mode=2, list_of_good_clases=None, label_encoder=None):
        """
        Training model, saving to the folder.
        """
        optimizer = Adam(amsgrad=True, lr=learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
           
        callbacks = [EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, mode='min')]
        
        samples_per_epoch = len(range(0, len(X_train), batch_size))
        training = self.model.fit(
            X_train, 
            y_train,
            validation_data=(X_valid, y_valid), 
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1,
        )
        
        if unknown_mode == 1:
            y_pred = self.test_change_pred(X_valid, list_of_good_clases, label_encoder)      
        elif unknown_mode == 2:
            y_pred = np.argmax(self.model.predict(X_valid), axis=1)
        else:
            y_pred = self.test_pred_proba(X_valid, label_encoder) 

        y_valid=np.argmax(y_valid, axis=1)
        accuracy=accuracy_score(y_valid, y_pred)
        
        self.model.save(f"./models/simple_rnn_{silence}_unknownmode_{unknown_mode}_bidir_{self.bidirectional}_dropout_{self.dropout}_lr_{learning_rate}_batchsize_{batch_size}_acc_{round(accuracy, 3)}")

        return training, y_pred

    def test_change_pred(self, X, list_good_classes, label_encoder):
        """
        Changing labels of predictions, unknown handling version 1.
        """

        list_good_classes_encoded=label_encoder.transform(list_good_classes)
        unknown_encoded=label_encoder.transform(['zzz_unknown'])
        y_pred_org = np.argmax(self.model.predict(X), axis=1)

        for i in range(len(y_pred_org)):
            if y_pred_org[i] not in list_good_classes_encoded:
                y_pred_org[i]=unknown_encoded

        return y_pred_org

    def test_pred_proba(self, X, label_encoder, num_classes=5):
        """
        Changing labels of predictions, unknown handling version 3.
        """
        unknown_encoded=label_encoder.transform(['zzz_unknown'])
        y_pred_proba=self.model.predict(X)
        y_pred_org=copy.deepcopy(y_pred_proba)
        y_pred=[]

        for i in range(len(y_pred_proba)):
            y_pred_proba[i].sort()
            y_pred_proba[i]=y_pred_proba[i][::-1]
            if abs(y_pred_proba[i][num_classes-1]-y_pred_proba[i][0])<0.05:
                y_pred.append(unknown_encoded[0])
            else:
                y_pred.append(np.argmax(y_pred_org[i]))
        
        return y_pred

# # Test
# rnn = SimpleRNN_class(
#     units=200, 
#     num_classes=len(class_names_unknownFalse_silence),
#     bidirectional=True,
#     dropout=0.2,
# )
# training, y_pred = rnn.train(
#     X_train_unknownFalse_silence, y_train_unknownFalse_silence, X_valid_unknownFalse_silence, y_valid_unknownFalse_silence,
#     learning_rate=0.001,
#     batch_size=64,
#     epochs=3,
#     unknown_mode=1,
#     list_of_good_clases=['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence'],
#     label_encoder=label_encoder_unknownFalse_silence,
# )

In [None]:
class GRU_class:
    """
    Class for creating and dealing with GRU network.
    """
    def __init__(self, units, num_classes, bidirectional=False, dropout=0.0):
        self.units = units
        self.num_classes = num_classes
        self.bidirectional = bidirectional
        self.dropout = dropout

        rnn = GRU(self.units, return_sequences=False)
        if bidirectional:
            rnn = Bidirectional(rnn)

        self.model = Sequential()
        self.model.add(rnn)
        self.model.add(Dropout(dropout))
        self.model.add(Dense(self.num_classes, activation='softmax'))
  
    def train(self, X_train, y_train, X_valid, y_valid, learning_rate=0.001, epochs=300, batch_size=64, silence="silence", unknown_mode=2, list_of_good_clases=None, label_encoder=None):
        """
        Training model, saving to the folder.
        """
        optimizer = Adam(amsgrad=True, lr=learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
           
        callbacks = [EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, mode='min')]
        
        training = self.model.fit(
            X_train, 
            y_train,
            validation_data=(X_valid, y_valid), 
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1,
        )
        
        if unknown_mode == 1:
            y_pred = self.test_change_pred(X_valid, list_of_good_clases, label_encoder)      
        elif unknown_mode == 2:
            y_pred = np.argmax(self.model.predict(X_valid), axis=1)
        else:
            y_pred = self.test_pred_proba(X_valid, label_encoder) 

        y_valid=np.argmax(y_valid, axis=1)
        accuracy=accuracy_score(y_valid, y_pred)
        
        self.model.save(f"./models/gru_{silence}_unknownmode_{unknown_mode}_bidir_{self.bidirectional}_dropout_{self.dropout}_lr_{learning_rate}_batchsize_{batch_size}_acc_{round(accuracy, 3)}")

        return training, y_pred

    def test_change_pred(self, X, list_good_classes, label_encoder):
        """
        Changing labels of predictions, unknown handling version 1.
        """

        list_good_classes_encoded=label_encoder.transform(list_good_classes)
        unknown_encoded=label_encoder.transform(['zzz_unknown'])
        y_pred_org = np.argmax(self.model.predict(X), axis=1)

        for i in range(len(y_pred_org)):
            if y_pred_org[i] not in list_good_classes_encoded:
                y_pred_org[i]=unknown_encoded

        return y_pred_org

    def test_pred_proba(self, X, label_encoder, num_classes=5):
        """
        Changing labels of predictions, unknown handling version 3.
        """

        unknown_encoded=label_encoder.transform(['zzz_unknown'])
        y_pred_proba=self.model.predict(X)
        y_pred_org=copy.deepcopy(y_pred_proba)
        y_pred=[]

        for i in range(len(y_pred_proba)):
            y_pred_proba[i].sort()
            y_pred_proba[i]=y_pred_proba[i][::-1]
            if abs(y_pred_proba[i][num_classes-1]-y_pred_proba[i][0])<0.05:
                y_pred.append(unknown_encoded[0])
            else:
                y_pred.append(np.argmax(y_pred_org[i]))
        
        return y_pred


# # Test
# gru = GRU_class(
#     units=200, 
#     num_classes=len(class_names_unknownFalse_silence),
#     bidirectional=True,
#     dropout=0.2,
# )
# training, y_pred = gru.train(
#     X_train_unknownFalse_silence, y_train_unknownFalse_silence, X_valid_unknownFalse_silence, y_valid_unknownFalse_silence,
#     learning_rate=0.001,
#     batch_size=64,
#     epochs=2,
#     unknown_mode=1,
#     list_of_good_clases=['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence'],
#     label_encoder=label_encoder_unknownFalse_silence,
# )


In [None]:
class LSTM_class:
    """
    Class for creating and dealing with LSTM network.
    """
    def __init__(self, units, input_shape, num_classes, bidirectional=False, dropout=0.0):
        self.units = units
        self.input_shape = input_shape
        self.num_classes = num_classes
        self.bidirectional = bidirectional
        self.dropout = dropout
        
        rnn = LSTM(self.units, input_shape=self.input_shape, return_sequences=False)
        if bidirectional:
            rnn = Bidirectional(rnn)

        self.model = Sequential()
        self.model.add(rnn)
        self.model.add(Dropout(dropout))
        self.model.add(Dense(self.num_classes, activation='softmax'))
  
    def train(self, X_train, y_train, X_valid, y_valid, learning_rate=0.001, epochs=300, batch_size=64, silence="silence", unknown_mode=2, list_of_good_clases=None, label_encoder=None):
        """
        Training model, saving to the folder.
        """
        optimizer = Adam(amsgrad=True, lr=learning_rate)
        self.model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
           
        callbacks = [EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5, mode='min')]
        
        training = self.model.fit(
            X_train, 
            y_train,
            validation_data=(X_valid, y_valid), 
            shuffle=True,
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
            verbose=1,
        )
        
        if unknown_mode == 1:
            y_pred = self.test_change_pred(X_valid, list_of_good_clases, label_encoder)      
        elif unknown_mode == 2:
            y_pred = np.argmax(self.model.predict(X_valid), axis=1)
        else:
            y_pred = self.test_pred_proba(X_valid, label_encoder) 

        y_valid=np.argmax(y_valid, axis=1)
        accuracy=accuracy_score(y_valid, y_pred)
        
        self.model.save(f"./models/lstm_{silence}_unknownmode_{unknown_mode}_bidir_{self.bidirectional}_dropout_{self.dropout}_lr_{learning_rate}_batchsize_{batch_size}_acc_{round(accuracy, 3)}")

        return training, y_pred

    def test_change_pred(self, X, list_good_classes, label_encoder):
        """
        Changing labels of predictions, unknown handling version 1.
        """

        list_good_classes_encoded=label_encoder.transform(list_good_classes)
        unknown_encoded=label_encoder.transform(['zzz_unknown'])
        y_pred_org = np.argmax(self.model.predict(X), axis=1)

        for i in range(len(y_pred_org)):
            if y_pred_org[i] not in list_good_classes_encoded:
                y_pred_org[i]=unknown_encoded

        return y_pred_org

    def test_pred_proba(self, X, label_encoder, num_classes=5):
        """
        Changing labels of predictions, unknown handling version 3.
        """
        unknown_encoded=label_encoder.transform(['zzz_unknown'])
        y_pred_proba=self.model.predict(X)
        y_pred_org=copy.deepcopy(y_pred_proba)
        y_pred=[]

        for i in range(len(y_pred_proba)):
            y_pred_proba[i].sort()
            y_pred_proba[i]=y_pred_proba[i][::-1]
            if abs(y_pred_proba[i][num_classes-1]-y_pred_proba[i][0])<0.05:
                y_pred.append(unknown_encoded[0])
            else:
                y_pred.append(np.argmax(y_pred_org[i]))
        
        return y_pred

# # Test
# lstm = LSTM_class(
#     units=200, 
#     input_shape=(X_train.shape[1], X_train.shape[2]),
#     num_classes=len(class_names_unknownTrue_silence),
#     bidirectional=True,
#     dropout=0.2,
# )
# training, y_pred = lstm.train(
#     X_train_unknownFalse_silence, y_train_unknownFalse_silence, X_valid_unknownFalse_silence, y_valid_unknownFalse_silence,
#     learning_rate=0.001,
#     batch_size=64,
#     epochs=2,
#     unknown_mode=1,
#     list_of_good_clases=['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence'],
#     label_encoder=label_encoder_unknownFalse_silence,
# )

In [None]:
def train_change_pred(y, list_good_classes, label_encoder):
    """
        Changing labels of predictions for train dataset.
        """
    y_max=np.argmax(y, axis=1)
    list_good_classes_encoded=label_encoder.transform(list_good_classes)
    unknown_encoded=label_encoder.transform(['zzz_unknown'])

    for i in range(len(y_max)):
        if y_max[i] not in list_good_classes_encoded:
            y_max[i]=unknown_encoded

    return y_max

### Experiments

#### For silence from background noise

In [2]:
list_of_good_classes = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence_background_noise']

unknown_modes = [1, 2, 3]

# architecture parameter
dropouts = [0.1, 0.3]
bidirectionals = [True, False]

# training parameter 
learning_rates = [0.001, 0.01] 
batch_sizes = [32, 64]

epochs = 1

for unknown_mode in unknown_modes:
    for dropout in dropouts:
        for bidirectional in bidirectionals:
            for learning_rate in learning_rates:
                for batch_size in batch_sizes:
                    if unknown_mode in [1, 3]:
                        X_train = X_train_unknownFalse_silencebn
                        y_train = y_train_unknownFalse_silencebn
                        X_valid = X_valid_unknownFalse_silencebn
                        y_valid = y_valid_unknownFalse_silencebn
                        class_names = class_names_unknownFalse_silencebn
                        label_encoder = label_encoder_unknownFalse_silencebn

                    else:
                        X_train = X_train_unknownTrue_silencebn
                        y_train = y_train_unknownTrue_silencebn
                        X_valid = X_valid_unknownTrue_silencebn
                        y_valid = y_valid_unknownTrue_silencebn
                        class_names = class_names_unknownTrue_silencebn
                        label_encoder = label_encoder_unknownTrue_silencebn

                    print(f"LSTM {unknown_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    lstm = LSTM_class(
                        units=200, 
                        input_shape=(X_train.shape[1], X_train.shape[2]), 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout
                        
                    )          
                    training, y_pred = lstm.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence_background_noise",
                        unknown_mode=unknown_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )           

                    print(f"GRU {unknown_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    gru = GRU_class(
                        units=200, 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )          
                    training, y_pred = gru.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence_background_noise",
                        unknown_mode=unknown_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )  

                    print(f"SimpleRNN {unknown_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    rnn = SimpleRNN_class(
                        units=200, 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )
                    training, y_pred = rnn.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence_background_noise",
                        unknown_mode=unknown_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )

#### For silence from all audios

In [3]:
list_of_good_classes = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence']

unknown_modes = [1, 2, 3]

# architecture parameter
dropouts = [0.1, 0.3]
bidirectionals = [True, False]

# training parameter 
learning_rates = [0.001, 0.01] 
batch_sizes = [32, 64]

epochs = 1


for unknown_mode in unknown_modes:
    for dropout in dropouts:
        for bidirectional in bidirectionals:
            for learning_rate in learning_rates:
                for batch_size in batch_sizes:
                    if unknown_mode in [1, 3]:
                        X_train = X_train_unknownFalse_silence
                        y_train = y_train_unknownFalse_silence
                        X_valid = X_valid_unknownFalse_silence
                        y_valid = y_valid_unknownFalse_silence
                        class_names = class_names_unknownFalse_silence
                        label_encoder = label_encoder_unknownFalse_silence

                    else:
                        X_train = X_train_unknownTrue_silence
                        y_train = y_train_unknownTrue_silence
                        X_valid = X_valid_unknownTrue_silence
                        y_valid = y_valid_unknownTrue_silence
                        class_names = class_names_unknownTrue_silence
                        label_encoder = label_encoder_unknownTrue_silence

                    print(f"LSTM {unknown_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    lstm = LSTM_class(
                        units=200, 
                        input_shape=(X_train.shape[1], X_train.shape[2]), 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout
                        
                    )          
                    training, y_pred = lstm.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=unknown_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )           

                    print(f"GRU {unknown_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    gru = GRU_class(
                        units=200, 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )          
                    training, y_pred = gru.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=unknown_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )  

                    print(f"SimpleRNN {unknown_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    rnn = SimpleRNN_class(
                        units=200, 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )
                    training, y_pred = rnn.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=unknown_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )

In [None]:
list_of_good_classes = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence']

X_train = X_train_unknownFalse_silence
y_train = y_train_unknownFalse_silence
X_valid = X_valid_unknownFalse_silence
y_valid = y_valid_unknownFalse_silence
class_names = class_names_unknownFalse_silence
label_encoder = label_encoder_unknownFalse_silence

y_valid_unknownFalse_silencebn_with_unknown = train_change_pred(y_valid,  
                                                               list_good_classes=['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence'], 
                                                               label_encoder=label_encoder_unknownFalse_silence,)

u_modes = [1,2,3]

dropouts = [0.3]
bidirectionals = [True]
learning_rates = [0.01] 
batch_sizes = [32]

epochs = 10
u_mode = 2
iterations = 3

for iter in range(iterations):
    for dropout in dropouts:
        for bidirectional in bidirectionals:
            for learning_rate in learning_rates:
                for batch_size in batch_sizes:

                    print(f"SimpleRNN {iter = }, {u_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    rnn = SimpleRNN_class(
                        units=200, 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )
                    training, y_pred = rnn.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=u_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )

dropouts = [0.3]
bidirectionals = [True]
learning_rates = [0.001] 
batch_sizes = [64]

for iter in range(iterations):
    for dropout in dropouts:
        for bidirectional in bidirectionals:
            for learning_rate in learning_rates:
                for batch_size in batch_sizes:


                    print(f"LSTM {iter = } {u_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    lstm = LSTM_class(
                        units=200, 
                        input_shape=(X_train.shape[1], X_train.shape[2]), 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout
                        
                    )          
                    training, y_pred = lstm.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=u_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )           

dropouts = [0.3]
bidirectionals = [True]
learning_rates = [0.001] 
batch_sizes = [32]

for iter in range(iterations):
    for dropout in dropouts:
        for bidirectional in bidirectionals:
            for learning_rate in learning_rates:
                for batch_size in batch_sizes:

                    print(f"GRU {iter = } {u_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    gru = GRU_class(
                        units=200, 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )          
                    training, y_pred = gru.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=u_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )  

In [None]:
list_of_good_classes = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence']

X_train = X_train_unknownFalse_silence
y_train = y_train_unknownFalse_silence
X_valid = X_valid_unknownFalse_silence
y_valid = y_valid_unknownFalse_silence
class_names = class_names_unknownFalse_silence
label_encoder = label_encoder_unknownFalse_silence

y_valid_unknownFalse_silencebn_with_unknown=train_change_pred(y_valid,  
                                                               list_good_classes=['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence'], 
                                                               label_encoder=label_encoder_unknownFalse_silence,)

u_modes = [1,2,3]

dropouts = [0.3]
bidirectionals = [True]
learning_rates = [0.01] 
batch_sizes = [32]

epochs = 10

for u_mode in u_modes:
    for dropout in dropouts:
        for bidirectional in bidirectionals:
            for learning_rate in learning_rates:
                for batch_size in batch_sizes:

                    print(f"LSTM {u_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    rnn = LSTM_class(
                        units=200, 
                        num_classes=len(class_names),
                        input_shape=(X_train.shape[1], X_train.shape[2]),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )
                    training, y_pred = rnn.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=u_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )
                    if u_mode==1:
                        y_pred_unknown=rnn.test_change_pred(X_valid, list_of_good_classes, label_encoder)
                        accuracy=balanced_accuracy_score(y_valid_unknownFalse_silencebn_with_unknown, y_pred_unknown)
                    elif u_mode==3:
                        y_pred_unknown=rnn.test_change_pred(X_valid, list_of_good_classes, label_encoder)
                        accuracy=balanced_accuracy_score(y_valid_unknownFalse_silencebn_with_unknown, y_pred_unknown)
                    else:
                        accuracy=10
                    print(f'LSTM Mode: {u_mode}, balanced accuracy: {accuracy}.')


dropouts = [0.3]
bidirectionals = [True] 
learning_rates = [0.001] 
batch_sizes = [32]

for u_mode in u_modes:
    for dropout in dropouts:
        for bidirectional in bidirectionals:
            for learning_rate in learning_rates:
                for batch_size in batch_sizes:

                    print(f"GRU {u_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    rnn = GRU_class(
                        units=200, 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )
                    training, y_pred = rnn.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=u_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )
                    if u_mode==1:
                        y_pred_unknown=rnn.test_change_pred(X_valid, list_of_good_classes, label_encoder)
                        accuracy=balanced_accuracy_score(y_valid_unknownFalse_silencebn_with_unknown, y_pred_unknown)
                    elif u_mode==3:
                        y_pred_unknown=rnn.test_change_pred(X_valid, list_of_good_classes, label_encoder)
                        accuracy=balanced_accuracy_score(y_valid_unknownFalse_silencebn_with_unknown, y_pred_unknown)
                    else:
                        accuracy=10
                    print(f'GRU Mode: {u_mode}, balanced accuracy: {accuracy}.')

dropouts = [0.3]
bidirectionals = [True] 
learning_rates = [0.001] 
batch_sizes = [64]

for u_mode in u_modes:
    for dropout in dropouts:
        for bidirectional in bidirectionals:
            for learning_rate in learning_rates:
                for batch_size in batch_sizes:

                    print(f"GRU {u_mode = }, {dropout = }, {bidirectional = }, {learning_rate = }, {batch_size = }")
                    rnn = SimpleRNN_class(
                        units=200, 
                        num_classes=len(class_names),
                        bidirectional=bidirectional,
                        dropout=dropout,
                    )
                    training, y_pred = rnn.train(
                        X_train, 
                        y_train,
                        X_valid,
                        y_valid,
                        learning_rate=learning_rate,
                        batch_size=batch_size,
                        epochs=epochs,
                        silence="silence",
                        unknown_mode=u_mode,
                        list_of_good_clases=list_of_good_classes,
                        label_encoder=label_encoder,
                    )
                    if u_mode==1:
                        y_pred_unknown=rnn.test_change_pred(X_valid, list_of_good_classes, label_encoder)
                        accuracy=balanced_accuracy_score(y_valid_unknownFalse_silencebn_with_unknown, y_pred_unknown)
                    elif u_mode==3:
                        y_pred_unknown=rnn.test_change_pred(X_valid, list_of_good_classes, label_encoder)
                        accuracy=balanced_accuracy_score(y_valid_unknownFalse_silencebn_with_unknown, y_pred_unknown)
                    else:
                        accuracy=10
                    print(f'SimpleRNN Mode: {u_mode}, balanced accuracy: {accuracy}.')

## Change test data

In [None]:
test_audio_path = 'test/audio'
test_audio_path_without_silence = 'test/audio_without_silence'

files = [f for f in os.listdir(test_audio_path) if os.path.isfile(os.path.join(test_audio_path, f))]

for file in files:

    # remove silence  
    sound = AudioSegment.from_file(f"{test_audio_path}/{file}", format=AUDIO_FORMAT) 
    
    if sound.dBFS >= -45:
        audio_chunks = split_on_silence(
            sound,
            min_silence_len = 100,
            silence_thresh = -45,
            keep_silence = 50
        )

    combined = AudioSegment.empty()
    for chunk in audio_chunks:
        combined += chunk

    if len(combined) == 0:
        sound.export(f"{test_audio_path_without_silence}/{file}", format=AUDIO_FORMAT)
    else: 
        combined.export(f"{test_audio_path_without_silence}/{file}", format=AUDIO_FORMAT)



files = [f for f in os.listdir(test_audio_path) if os.path.isfile(os.path.join(test_audio_path, f))]
for file in files:
    try:
        # read wavfile
        sample_rate, sample = wav.read(f"{test_audio_path}/{file}")

        # resample 16k -> 8k
        sample = resample(sample, int(new_sample_rate/sample_rate * sample.shape[0]))

        # padding
        sample = pad_sequences([sample], maxlen=8000, dtype='float', padding='post', truncating='post', value=0.0)

        # normalization
        if max:= np.max(np.abs(sample)) != 0:
            sample = sample / max

        # mfcc
        mfccs = mfcc(
            sample,
            new_sample_rate,
            numcep=number_of_cepstral_coefficients,
        )

        # normalization of mfcc
        scaler = scaler.fit(mfccs)
        normalized = scaler.transform(mfccs)

        mfcc_list_train.append(normalized)

        number+=1
    except ValueError:
        pass

X_test = np.array(mfcc_list_train,dtype=np.float32)
np.save('X_test', X_test)

## Kaggle competition

In [None]:
X_test = np.load('./X_test.npy')
print(X_test.shape)

models = [
    'models_grid_search\gru_silence_unknownmode_2_bidir_True_dropout_0.3_lr_0.001_batchsize_32_acc_0.909',
    'models_grid_search\lstm_silence_unknownmode_2_bidir_True_dropout_0.3_lr_0.001_batchsize_32_acc_0.9',
]

j = 0
for model in models:
    model = load_model(model)

    y_pred = model.predict(X_test)

    list_of_good_classes = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 'silence']
    def train_change_pred(y, list_good_classes, label_encoder):
        y_max=np.argmax(y, axis=1)
        list_good_classes_encoded=label_encoder.transform(list_good_classes)
        unknown_encoded=label_encoder.transform(['zzz_unknown'])

        for i in range(len(y_max)):
            if y_max[i] not in list_good_classes_encoded:
                y_max[i]=unknown_encoded

        return y_max

    dict_classes = {
        30: 'yes', 
        14: 'no', 
        28: 'up', 
        4: 'down', 
        11: 'left', 
        18: 'right', 
        16: 'on', 
        15: 'off', 
        23: 'stop', 
        8: 'go', 
        21: 'silence',
        32: 'unknown',
    }

    y_pred_unknown = train_change_pred(y_pred, list_of_good_classes, label_encoder_unknownTrue_silence)

    y_pred_labels = np.copy(y_pred_unknown)
    y_pred_labels = list(y_pred_labels)

    for key, value in zip(dict_classes, dict_classes.values()):
        for i in range(len(y_pred_labels)):
            if key == y_pred_labels[i]:
                y_pred_labels[i] = value

    test_audio_path = 'test/audio'

    files = [f for f in os.listdir(test_audio_path) if os.path.isfile(os.path.join(test_audio_path, f))]

    df = pd.DataFrame({
        "fname": files,
        "label": y_pred_labels
    })

    df.to_csv(f"submission_{j}.csv", index=False)
    j += 1

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6ec8773e-c89e-4114-ab03-f7a026dc9b15' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>