# Optimized Keras Sequential Conv1D version

In [None]:
# io
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd

# Scientific Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Visualization
import matplotlib.pyplot as plt
import tensorflow as tf
import plotly.offline as py
import plotly.graph_objs as go

# Audio
import IPython.display as ipd

# Deep learning
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras import Input, layers
from tensorflow.keras import backend as K
from tensorflow.keras.models import load_model

import random
import copy
import librosa

%matplotlib inline

In [None]:
# Show the entries found directly under the competition input directory
print(os.listdir('../input/tensorflow-speech-recognition-challenge/'))

In [None]:
# Root directory of the training audio; the cells below list and join paths under it
train_audio_path = '../input/tensorflow-speech-recognition-challenge/train/audio/'
# Show what that directory contains
print(os.listdir(train_audio_path))

In [None]:
# Load Data
# Keep only the sub-directories of the training audio folder, sorted by name.
dirs = sorted(
    entry for entry in os.listdir(train_audio_path)
    if isdir(join(train_audio_path, entry))
)
# The first sorted entry is left out of the count (presumably
# '_background_noise_', which sorts ahead of the lowercase word folders).
print(f'Number of labels: {len(dirs[1:])}')
print(dirs)

In [None]:
# Load every clip at 16 kHz, resample it to 8 kHz and sort it into one of
# three groups: target words (all_wav), other words (unknow_wav) and
# background noise (background_noise).
all_wav = []
unknow_wav = []
label_all = []
label_value = {}
target_list = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
unknow_list = {d for d in dirs if d not in target_list and d != '_background_noise_'}
print('target_list: ', end='')
print(target_list)
print('unknowns_list: ', end='')
print(unknow_list)
print('silence: _background_noise_')

noise_dir = join(train_audio_path, '_background_noise_')
background = [f for f in os.listdir(noise_dir) if f.endswith('.wav')]
background_noise = []
for wav in background:
    # Load at 16 kHz like the speech clips below; without `sr` librosa first
    # resamples to its 22050 Hz default, so the noise would be resampled twice.
    samples, sample_rate = librosa.load(join(noise_dir, wav), sr=16000)
    # The rates are passed by keyword: librosa >= 0.10 rejects positional rates.
    samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)
    background_noise.append(samples)

# dirs[1:] skips the first sorted directory (presumably '_background_noise_').
for i, d in enumerate(dirs[1:]):
    waves = [f for f in os.listdir(join(train_audio_path, d)) if f.endswith('.wav')]
    label_value[d] = i
    print(str(i)+':'+str(d)+' ', end='')
    for wav in waves:
        samples, sample_rate = librosa.load(join(train_audio_path, d, wav), sr=16000)
        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=8000)
        # Only clips that are exactly 8000 samples long after resampling are kept.
        if len(samples) != 8000:
            continue
        if d in unknow_list:
            unknow_wav.append(samples)
        else:
            label_all.append(d)
            all_wav.append([samples, d])

In [None]:
# split wav, label
# Read both columns straight from the [samples, label] pairs. Passing the pair
# list through np.delete makes NumPy build an array from rows that mix an
# ndarray with a string, which NumPy >= 1.24 rejects with a ValueError.
wav_all = np.empty(len(all_wav), dtype=object)  # 1-D object array, one clip per slot
for idx, (clip, _label) in enumerate(all_wav):
    wav_all[idx] = clip
# Each label stays wrapped in a one-element list, the same layout the
# np.delete(...).tolist() call produced.
label_all = [[word] for _clip, word in all_wav]

In [None]:
# Data Augmentation, 10% amplitude from 8000 samples in 1 sec noise
def get_one_noise(noise_num=0, length=8000):
    """Return a random window of ``length`` samples from one background-noise clip.

    Args:
        noise_num: Index into the module-level ``background_noise`` list.
        length: Number of consecutive samples to return (8000 by default).

    Returns:
        ``clip[start:start + length]`` for a uniformly random ``start``.

    Raises:
        ValueError: If the selected clip holds fewer than ``length`` samples.
    """
    selected_noise = background_noise[noise_num]
    # Valid start offsets run from 0 to len - length inclusive. The previous
    # bound (len - 1 - length) skipped the last window and raised on a clip
    # that was exactly ``length`` samples long.
    last_start = len(selected_noise) - length
    if last_start < 0:
        raise ValueError(
            'noise clip %d has %d samples, fewer than the requested %d'
            % (noise_num, len(selected_noise), length)
        )
    start_idx = random.randint(0, last_start)
    return selected_noise[start_idx:start_idx + length]

In [None]:
# Noise augmentation: for each pass, add one scaled background-noise window to
# a copy of every clip and collect the results in noised_wav.
max_ratio = 0.1
noised_wav = []
augment = 1

# Drop clips that are not exactly 8000 samples long, together with their
# labels. The filtered results are assigned back; np.delete returns a new
# array, so calling it without using the result removed nothing.
keep = np.array([len(s) == 8000 for s in wav_all], dtype=bool)
delete_index = [idx for idx, ok in enumerate(keep) if not ok]
wav_all = wav_all[keep]
label_all = [label for label, ok in zip(label_all, keep) if ok]

for noise_idx in range(augment):
    noise = get_one_noise(noise_idx)
    for s in wav_all:
        # Build a new array instead of `s += ...`: the in-place add also
        # changed the clean clip held in wav_all, so the "clean" and "noised"
        # sets ended up identical.
        noised_wav.append(s + max_ratio * noise)

In [None]:
# Stack the per-clip arrays into one array and take a shallow copy of the labels
wav_vals = np.array(list(wav_all))
label_vals = list(label_all)
# Notebook display of the stacked shape
wav_vals.shape

In [None]:
# Append one more copy of the original labels per augmentation pass, then
# lay the result out as a single column.
labels = copy.deepcopy(label_vals)
for _ in range(augment):
    label_vals = np.concatenate([label_vals, labels])
label_vals = label_vals.reshape((-1, 1))

In [None]:
# Random sampling from unknown wav data
augment_unknown = 2
num_unknown = 2000 * (augment_unknown + 1)  # upper bound on unknown clips kept
np.random.shuffle(unknow_wav)
# Slice the shuffled list before converting it, so only the kept clips are
# copied into the array rather than the whole unknown set.
unknown = np.array(unknow_wav[:num_unknown])
# One label per clip actually kept. A fixed count of num_unknown labels would
# not match the data when fewer unknown clips are available.
unknown_label = np.full((len(unknown), 1), 'unknown')

In [None]:
# Remove every unknown clip whose length is not 8000, along with its label
delete_index = [idx for idx, clip in enumerate(unknown) if len(clip) != 8000]
unknown = np.delete(unknown, delete_index, axis=0)
unknown_label = np.delete(unknown_label, delete_index, axis=0)

In [None]:
# Random sampling from 'background_noise'
silence_wav = []
augment_silence = 1
# Number of windows drawn from each noise clip
num_wav = (2000 * (augment_silence + 1)) // len(background_noise)
for i in range(len(background_noise)):
    silence_wav.extend(get_one_noise(i) for _ in range(num_wav))
silence_wav = np.array(silence_wav)
# One 'silence' label per drawn window, laid out as a column
silence_label = np.array(['silence'] * (num_wav * len(background_noise))).reshape(-1, 1)
silence_wav.shape, silence_label.shape

In [None]:
# Bring all four wav collections to the same (-1, 8000) layout
shape_ = (-1, 8000)
wav_vals, noised_wav, unknown, silence_wav = [
    np.reshape(arr, shape_)
    for arr in (wav_vals, noised_wav, unknown, silence_wav)
]

In [None]:
# check dimensions of the four wav arrays, in the order they are concatenated
for arr in (wav_vals, noised_wav, unknown, silence_wav):
    print(arr.shape)

In [None]:
# check dimensions of the three label arrays
for arr in (label_vals, unknown_label, silence_label):
    print(arr.shape)

In [None]:
# concatenate wavs: clean, noised, unknown and silence rows, in that order
wav_vals = np.concatenate((wav_vals, noised_wav, unknown, silence_wav), axis=0)

In [None]:
# concatenate labels in the same order as the wav rows
label_vals = np.concatenate((label_vals, unknown_label, silence_label), axis=0)

In [None]:
# Notebook display: row counts of the combined wav and label arrays
len(wav_vals), len(label_vals)

In [None]:
# Prepare train and validation data
# train_wav, test_wav, train_label, test_label = train_test_split(wav_vals, label_vals,\
#     test_size=0.2, random_state=1993, shuffle=True)

In [None]:
# Prepare train data
# Both arrays go through one sklearn.utils.shuffle call; no random_state is
# passed, so the order differs from run to run.
train_wav, train_label = shuffle(wav_vals, label_vals)

In [None]:
# Number of training examples after shuffling
print(len(train_wav))
# print(len(test_wav))

In [None]:
# Hyper-parameters
lr = 0.001                # learning rate handed to the optimizer
generations = 20000       # not referenced elsewhere in the visible notebook
num_gens_to_wait = 250    # not referenced elsewhere in the visible notebook
batch_size = 512          # mini-batch size used by model.fit
drop_out_rate = 0.5       # rate shared by every Dropout layer
input_shape = (8000, 1)   # samples per clip, one channel

In [None]:
# Notebook display: shape of the training array before the channel axis is added
train_wav.shape

In [None]:
# For Conv1D and Channel: add a trailing axis of size 1 to each 8000-sample row
train_wav = np.reshape(train_wav, (-1, 8000, 1))
# test_wav = test_wav.reshape(-1, 8000, 1)

In [None]:
# Shape of the training array after the reshape to (-1, 8000, 1)
print(train_wav.shape)
# print(test_wav.shape)

In [None]:
# Class names in index order: the target words followed by 'unknown' and
# 'silence'. A new list is built instead of appending through an alias of
# target_list, which changed target_list itself and added duplicate class
# names every time the cell was re-run.
label_value = target_list + ['unknown', 'silence']

In [None]:
# Map each class name to its position in the list, and build the inverse map
new_label_value = {name: idx for idx, name in enumerate(label_value)}
label_value = new_label_value
label_num2word = dict(zip(label_value.values(), label_value.keys()))

In [None]:
# Notebook display: name -> index and index -> name mappings
label_value, label_num2word

In [None]:
# Make label data 'string' to class num
# Each row of train_label holds one class name; look up its integer index.
temp = [label_value[row[0]] for row in train_label]
train_label = np.array(temp)
# temp = []
# for v in test_label:
#     temp.append(label_value[v[0]])
# test_label = np.array(temp)

In [None]:
# Make label data one hot vector
# The class count passed to to_categorical is the number of entries in label_value.
train_label = keras.utils.to_categorical(train_label, len(label_value))
# test_label = keras.utils.to_categorical(test_label, len(label_value))

In [None]:
# Report the final training tensor shapes and the class count
print(f'train_wav dimension: {train_wav.shape}')
print(f'train_label dimension: {train_label.shape}')
# print('test_wav dimension: ' + str(test_wav.shape))
# print('test_label dimension: ' + str(test_label.shape))
print(f'number of labels: {len(label_value)}')

In [None]:
# Conv1D Model
# Five Conv1D -> MaxPooling1D stages, each except the last followed by
# Dropout; then Flatten, two Dense+Dropout layers and a softmax output with
# one unit per entry in label_value.
conv_stages = [(8, 11), (16, 7), (32, 5), (64, 5), (128, 3)]  # (filters, kernel_size)

input_tensor = Input(shape=input_shape)
x = input_tensor
for stage_idx, (filters, kernel_size) in enumerate(conv_stages):
    x = layers.Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1)(x)
    x = layers.MaxPooling1D(2)(x)
    # The last conv stage feeds Flatten directly, without Dropout.
    if stage_idx < len(conv_stages) - 1:
        x = layers.Dropout(drop_out_rate)(x)
x = layers.Flatten()(x)
for units in (256, 128):
    x = layers.Dense(units, activation='relu')(x)
    x = layers.Dropout(drop_out_rate)(x)
output_tensor = layers.Dense(len(label_value), activation='softmax')(x)

model = tf.keras.Model(input_tensor, output_tensor)
# `learning_rate` is the supported argument name; the old `lr` alias is
# deprecated and rejected by current Keras optimizers.
model.compile(
    loss=keras.losses.categorical_crossentropy,
    optimizer=keras.optimizers.Adam(learning_rate=lr),
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the compiled model
model.summary()

In [None]:
# train with validation set
# history = model.fit(train_wav, train_label, validation_data=[test_wav, test_label],
#     batch_size=batch_size, epochs=100, verbose=1)

In [None]:
# train without validation set
# Runs 100 epochs over the full training arrays with the batch_size set in the
# hyper-parameter cell; the returned History object is kept for plotting.
history = model.fit(train_wav, train_label, batch_size=batch_size, epochs=100, verbose=1)

In [None]:
# plot loss & accuracy for both train and validation set
# plt.plot(history.history['acc'])
# plt.plot(history.history['val_acc'])
# plt.title('model accuracy')
# plt.ylabel('accuracy')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()
# # summarize history for loss
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('model loss')
# plt.ylabel('loss')
# plt.xlabel('epoch')
# plt.legend(['train', 'test'], loc='upper left')
# plt.show()

In [None]:
# plot loss & accuracy for train set
# The model is compiled with metrics=['accuracy'], which tf.keras 2.x records
# under the key 'accuracy'; older Keras releases used 'acc'. Pick whichever is
# present so the lookup does not raise KeyError.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
plt.plot(history.history[acc_key])
plt.plot(history.history['loss'])
plt.title('model acc/loss')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['accuracy', 'loss'], loc='upper left')
plt.show()

In [None]:
# save model weights together with architecture
# Written to 'model.h5', a path relative to the current working directory.
model.save('model.h5')

In [None]:
# test_idx = 888
# res = model.predict(test_wav[test_idx].reshape((-1, 8000, 1)))
# print(label_num2word[np.argmax(res)], label_num2word[np.argmax(test_label[test_idx])])
# librosa.output.write_wav('x.wav', test_wav[test_idx], 8000)
# sr, ss = wavfile.read('x.wav')
# ipd.Audio(ss, rate=8000)

In [None]:
# Y_test = np.argmax(test_label, axis=1) # Convert one-hot to index
# y_pred = np.argmax(model.predict(test_wav), axis=1) # Convert one-hot to index
# print(classification_report(Y_test, y_pred))

In [None]:
# https://www.kaggle.com/hemingwei/tensorflow-speech-recognition-public-test-set
# !7z x ../input/tensorflow-speech-recognition-public-test-set/test.7z

In [None]:
# predict for test data and output result
# test_path = './test/audio/'
# df = pd.read_csv('../input/tensorflow-speech-recognition-challenge/sample_submission.csv')
# # test_data = np.zeros((158538, 8000))
# # for i in range(len(df)):
# #     samples, sample_rate = librosa.load(join(test_path, df.loc[i]['fname']), sr=16000)
# #     samples = librosa.resample(samples, sample_rate, 8000)
# #     test_data[i] = samples
# # test_data = test_data.reshape(-1, 8000, 1)
# # result = model.predict(test_data)
# for i in range(len(df)):
#     samples, sample_rate = librosa.load(join(test_path, df.loc[i]['fname']), sr=16000)
#     samples = librosa.resample(samples, sample_rate, 8000)
#     result = model.predict(np.reshape(samples, (-1, 8000, 1)))
#     df.loc[i]['label'] = label_num2word[np.argmax(result)]
# df.to_csv('submission.csv', index=False)

In [None]:
# ls -lh

In [None]:
# download output file
# from IPython.display import FileLink
# FileLink('model.h5')

In [None]:
# bbb = load_model('model.h5')

In [None]:
# train_wav[:1].shape

In [None]:
# bbb.predict(train_wav[:1])