# A demo narrow-domain speech recognition neural network for YUHacks
Recognizes the following 6 words:
- spirit animal
- shit
- poggers
- lit
- boomer
- simp

In [1]:
import os
import tensorflow as tf
import numpy as np
from tensorflow.keras import datasets, layers, models
from scipy.io import wavfile        # scipy module for reading wavs

In [2]:
# seed the NumPy and TensorFlow random number generators
np.random.seed(124497)
tf.random.set_seed(5619)

# vocabulary to recognize; a word's position in this list is used as its class label
words = ["spirit_animal", "shit", "poggers", "lit", "boomer", "simp"]

# one folder of recordings per speaker
dataset_folders = [
    "Joanna_audio_dataset",
    "jadas audio",
    "kevin recordings",
    "marias_audio",
    "pranish_audio",
]

# fraction of each speaker's clips (per word) that goes to the training set
train_valid_split = 0.8

fftsize = 512  # number of samples in each short-time Fourier transform (STFT) window
windowsize = 256  # hop, in samples, between the starts of consecutive STFT windows

# word -> list of spectrograms; filled in while the dataset folders are read
training_set = {word: [] for word in words}
validation_set = {word: [] for word in words}

# read all files from folders

# Every clip is cut or zero-padded to exactly this many samples.
# NOTE(review): 32000 samples is "2 seconds" only for 16 kHz recordings; the sample rate
# returned by wavfile.read is not checked here -- confirm against the dataset.
clip_samples = 32000


def _clip_to_spectrogram(data):
    """Turn one mono clip into a volume-normalized magnitude spectrogram.

    Args:
        data: 1-D array of audio samples.

    Returns:
        2-D float array with one row per STFT window and ``fftsize`` columns,
        scaled so that its largest value is 1 (left as-is if the clip is silent).
    """
    numsamples = data.shape[0]
    if numsamples > clip_samples:  # too long: cut it off
        data = data[:clip_samples]
    elif numsamples < clip_samples:  # too short: pad with silence
        data = np.concatenate((data, np.zeros(clip_samples - numsamples)))

    # Window start offsets. The upper bound is exclusive, so the window that would end
    # exactly at the last sample is left out; this keeps the frame count at 123 for the
    # default settings, which is the input height the model is built for.
    starts = range(0, clip_samples - fftsize, windowsize)
    frames = np.array([data[start:start + fftsize] for start in starts])

    # FFT of every window at once; only the magnitude is kept
    spectrogram = np.abs(np.fft.fft(frames, axis=-1))

    # normalize into [0, 1] (essentially normalizing volume)
    peak = np.amax(spectrogram)
    if peak != 0:
        spectrogram = spectrogram / peak
    return spectrogram


for folder in dataset_folders:  # for each folder (one person)
    # word -> list of spectrograms of this person saying that word
    clips = {word: [] for word in words}

    # os.listdir returns entries in arbitrary order; sort so every run sees the same order
    for file in sorted(os.listdir(folder)):
        # the label is the file name up to (not including) the last underscore
        label = "_".join(file.split("_")[0:-1])
        if label not in clips:
            # stray files (e.g. OS metadata) or misnamed clips would otherwise crash the run
            print("skipping file with unrecognized label:", os.path.join(folder, file))
            continue

        # read the .wav file into a numpy array
        samplerate, data = wavfile.read(os.path.join(folder, file))

        # if the clip is stereo, take only the first channel
        if len(data.shape) >= 2:
            data = data[:, 0]

        clips[label].append(_clip_to_spectrogram(data))

    # shuffle each word's clips, then split them into training and validation sets
    for key in clips:
        # seeded NumPy RNG, so the shuffle is reproducible across runs
        order = np.random.permutation(len(clips[key]))
        shuffled = [clips[key][i] for i in order]
        cut = int(len(shuffled) * train_valid_split)
        training_set[key].extend(shuffled[:cut])
        validation_set[key].extend(shuffled[cut:])

# flat sample/label lists that will become the arrays fed to tensorflow
train_input, train_output = [], []
valid_input, valid_output = [], []

# flatten the per-word dictionaries; the label of a clip is the index of its word
for label_index, label_word in enumerate(words):
    word_train_clips = training_set[label_word]
    train_input.extend(word_train_clips)
    train_output.extend([label_index] * len(word_train_clips))

    word_valid_clips = validation_set[label_word]
    valid_input.extend(word_valid_clips)
    valid_output.extend([label_index] * len(word_valid_clips))

# stack the clips and append a trailing axis of size 1 (the Conv2D channel axis)
train_input = np.array(train_input)
tshape = train_input.shape
train_input = train_input.reshape((tshape[0], tshape[1], tshape[2], 1))

valid_input = np.array(valid_input)
vshape = valid_input.shape
valid_input = valid_input.reshape((vshape[0], vshape[1], vshape[2], 1))

# integer class labels, one per clip
train_output = np.array(train_output)
valid_output = np.array(valid_output)

In [3]:
# sanity check: the validation tensor shape is (clips, STFT windows, FFT bins, 1)
print(valid_input.shape)

(60, 123, 512, 1)


In [4]:
# creates a simple sequential convolutional neural network using tensorflow keras
def create_model():
    """Build the (uncompiled) word-classification CNN.

    The network takes inputs of shape (123, fftsize, 1) and stacks three 5x5
    convolutions (32, 64 and 64 filters) with 2x2 max pooling after the first
    two, then flattens into a 64-unit dense layer followed by a 6-way softmax.

    Returns:
        The freshly constructed keras Sequential model.
    """
    feature_extractor = [
        layers.Conv2D(32, (5, 5), activation='relu', input_shape=(123, fftsize, 1)),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (5, 5), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (5, 5), activation='relu'),
    ]
    classifier_head = [
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        # one output per word, as probabilities
        layers.Dense(6, activation='softmax'),
    ]
    return models.Sequential(feature_extractor + classifier_head)

In [5]:
# creates a model and trains it using the training and validation sets
model = create_model()
model.compile(
    optimizer='adam',
    # create_model() ends in a softmax layer, so the model already outputs probabilities;
    # from_logits=True would make the loss apply softmax a second time
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
    metrics=['accuracy'],
)
history = model.fit(
    train_input,
    train_output,
    epochs=20,
    validation_data=(valid_input, valid_output),
)

# save the model
model.save("model_1")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
INFO:tensorflow:Assets written to: model_1\assets
