### Preprocessing 

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
import librosa
import cv2
import os
import datetime

def float_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding a ``FloatList``.

    ``value`` is forwarded unchanged as the ``value`` argument of
    ``tf.train.FloatList``, so it must be an iterable of floats rather than a
    single scalar.
    """
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

def int64_feature(value):
    """Wrap ``value`` in a ``tf.train.Feature`` holding an ``Int64List``.

    ``value`` is forwarded unchanged as the ``value`` argument of
    ``tf.train.Int64List``, so it must be an iterable of integers rather than a
    single scalar.
    """
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

def load_audio_file(file_path, input_length=16000, sr=16000):
    """Load an audio file and force it to exactly ``input_length`` samples.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.
        input_length: Number of samples in the returned array.
        sr: Sample rate requested from ``librosa.load``.

    Returns:
        A 1-D array of ``input_length`` samples. Longer recordings keep only
        their first ``input_length`` samples; shorter ones are zero-padded at
        the end.
    """
    data = librosa.load(file_path, sr=sr)[0]

    if len(data) >= input_length:
        return data[:input_length]

    # Pad relative to input_length (not a second hard-coded constant) so the
    # two can never drift apart.
    return np.pad(data, (0, input_length - len(data)), "constant", constant_values=0)

def get_spectdata(wav, sr=16000, size=12800):
    """Compute a log-scaled mel spectrogram and its flattened copy.

    Args:
        wav: 1-D audio signal.
        sr: Sample rate of ``wav``.
        size: Total number of values expected in the spectrogram; the
            flattened output is reshaped to exactly this length, so a
            mismatch raises ``ValueError``. The training and prediction code
            reshapes the 12800 values back to 128 x 100.

    Returns:
        ``(data, log_spect)`` where ``log_spect`` is the 2-D dB-scaled mel
        spectrogram and ``data`` is the same values as a 1-D array of length
        ``size``.
    """
    # The signal is passed as ``y=`` because recent librosa releases accept
    # the audio argument by keyword only; the keyword also works on older
    # releases.
    spect = librosa.feature.melspectrogram(y=wav, sr=sr, hop_length=161, n_fft=2048)

    # NOTE(review): melspectrogram presumably returns a power spectrogram with
    # its default settings, for which librosa provides power_to_db. The call
    # is left unchanged because switching would rescale every feature and
    # invalidate existing TFRecords/checkpoints -- confirm before changing.
    log_spect = librosa.core.amplitude_to_db(spect)

    data = np.asarray(log_spect).reshape(size)
    return data, log_spect

def speed_tuning(wav):
    """Randomly stretch or compress ``wav`` in time and return 16000 samples.

    The signal is resampled to between 0.9x and 1.1x of its length with
    ``cv2.resize`` (the 1-D signal is resized as a one-pixel-wide image).
    A result longer than 16000 samples is centre-cropped; a shorter one is
    padded on both sides with low-amplitude uniform noise in [-0.001, 0.001).
    """
    target_len = 16000

    rate = np.random.uniform(0.9, 1.1)
    stretched = cv2.resize(wav, (1, int(len(wav) * rate))).squeeze()
    n_samples = len(stretched)

    if n_samples >= target_len:
        # Drop the same amount (rounded down) from the front as from the back.
        offset = int((n_samples - target_len) / 2)
        return stretched[offset: offset + target_len]

    missing = target_len - n_samples
    # The left pad is drawn before the right pad; any odd sample goes right.
    left_noise = np.random.uniform(-0.001, 0.001, int(missing / 2))
    right_noise = np.random.uniform(-0.001, 0.001, int(np.ceil(missing / 2)))
    return np.r_[left_noise, stretched, right_noise]

def pitch_tuning(wav, sample_rate=16000):
    """Shift the pitch of ``wav`` by a random amount.

    The shift is drawn uniformly from [-4, 4) steps, where one octave is
    divided into 24 steps.

    Args:
        wav: 1-D audio signal (a NumPy array; it is cast to float64).
        sample_rate: Sample rate of ``wav``.

    Returns:
        The pitch-shifted signal as returned by
        ``librosa.effects.pitch_shift``.
    """
    bins_per_octave = 24
    pitch_pm = 4
    pitch_change = pitch_pm * 2 * (np.random.uniform() - 0.5)

    # ``sr`` is passed by keyword: recent librosa releases reject it as a
    # positional argument, and the keyword form also works on older releases.
    wav_pitch_changed = librosa.effects.pitch_shift(wav.astype('float64'),
                                                    sr=sample_rate,
                                                    n_steps=pitch_change,
                                                    bins_per_octave=bins_per_octave)
    return wav_pitch_changed

def bg_mixing(wav, bg):
    """Mix a random 16000-sample window of ``bg`` into ``wav``.

    Args:
        wav: 1-D signal of 16000 samples.
        bg: 1-D background recording of at least 16000 samples.

    Returns:
        ``wav`` scaled by a random gain in [0.8, 1.2) plus the background
        window scaled by a random gain in [0, 0.1).

    Raises:
        ValueError: If ``bg`` is shorter than 16000 samples.
    """
    clip_len = 16000

    last_start = bg.shape[0] - clip_len
    if last_start < 0:
        raise ValueError(
            "background clip has {} samples, need at least {}".format(bg.shape[0], clip_len))

    # randint's upper bound is exclusive, so +1 keeps the final window
    # reachable and allows a background of exactly clip_len samples.
    start_ = np.random.randint(last_start + 1)
    bg_slice = bg[start_: start_ + clip_len]

    wav_with_bg = wav * np.random.uniform(0.8, 1.2) + bg_slice * np.random.uniform(0, 0.1)
    return wav_with_bg

def choice_bg():
    """Load one randomly chosen background-noise recording.

    A file name is picked uniformly from the module-level ``bg_list`` and
    loaded from ``bg_path`` at its native sample rate (``sr=None``).

    Returns:
        The audio samples of the chosen file as a 1-D array.
    """
    # Index over the real list length instead of a fixed count, so every
    # listed file can be chosen and a shorter list cannot cause an IndexError.
    chosen = bg_list[np.random.randint(0, len(bg_list))]
    return librosa.load(bg_path + chosen, sr=None)[0]


# The twelve classes the label encoder knows about.
target_label = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "unknown", "silence"]
# Folder names that keep their own label; every other folder becomes "unknown".
valid_labels = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "silence"]

lb = LabelBinarizer().fit(target_label)

bg_path = './train/audio/_background_noise_/'
# Keep only the audio files: this drops README.md (whether or not it exists)
# and any other stray entry such as .DS_Store. Sorting makes the list order
# independent of the filesystem.
bg_list = sorted(name for name in os.listdir(bg_path) if name.endswith('.wav'))

train_audio_path = './train/audio/'
test_audio_path = './test/audio/'
labels = os.listdir(train_audio_path)

# NOTE(review): the training and prediction scripts read from "./tfrecords/",
# not "./tfrecord_new/" -- confirm which directory is intended.
train_path = "./tfrecord_new/train.tfrecord"
test_path = "./tfrecord_new/test.tfrecord"

# The writers need their target directory to exist before they are opened.
for record_path in (train_path, test_path):
    record_dir = os.path.dirname(record_path)
    if not os.path.isdir(record_dir):
        os.makedirs(record_dir)

train_writer = tf.python_io.TFRecordWriter(train_path)
test_writer = tf.python_io.TFRecordWriter(test_path)


# Write the training TFRecord: each audio file contributes the original clip
# plus seven augmented variants, all stored with the folder's one-hot label.
#
# NOTE(review): bg_path lives under train_audio_path, so `labels` presumably
# contains "_background_noise_" as well; it is not in valid_labels, so the
# first second of each noise recording is written as "unknown" -- confirm
# that this is intended.
try:
    for label in labels:
        file_path = train_audio_path + label + "/"

        # Stray files in the audio root (e.g. .DS_Store) are not class folders.
        if not os.path.isdir(file_path):
            continue

        # Folders outside the target vocabulary are pooled into "unknown".
        if label not in valid_labels:
            label = "unknown"

        encoded_label = lb.transform([label])[0]

        for fname in os.listdir(file_path):
            if fname == '.DS_Store' or fname == 'README.md':
                continue

            signal = load_audio_file(file_path + fname)

            # Original clip followed by single and combined augmentations.
            variants = [signal,
                        speed_tuning(signal),
                        pitch_tuning(signal)]

            bg = choice_bg()
            variants.append(bg_mixing(signal, bg))

            bg = choice_bg()
            variants.append(bg_mixing(speed_tuning(signal), bg))

            variants.append(speed_tuning(pitch_tuning(signal)))

            bg = choice_bg()
            variants.append(bg_mixing(pitch_tuning(signal), bg))

            bg = choice_bg()
            variants.append(bg_mixing(speed_tuning(pitch_tuning(signal)), bg))

            for variant in variants:
                spec = get_spectdata(variant)[0]
                feature = {
                    "spectrum": float_feature(spec),
                    "label": int64_feature(encoded_label)
                }

                features = tf.train.Features(feature=feature)
                example = tf.train.Example(features=features)
                train_writer.write(example.SerializeToString())
finally:
    # Close the record file even if a clip fails to load or augment.
    train_writer.close()

# Write the test TFRecord: one un-augmented spectrogram per test clip.
file_path = test_audio_path

# The prediction script pairs records with the rows of sample_submission.csv
# purely by position, so the write order must be deterministic; os.listdir
# alone returns files in arbitrary order.
# NOTE(review): sorting by name assumes sample_submission.csv lists its
# fname column in the same sorted order -- confirm against the CSV.
files = sorted(os.listdir(file_path))

try:
    for file in files:
        if file == '.DS_Store' or file == 'README.md':
            continue

        signal = load_audio_file(file_path + file)
        spec = get_spectdata(signal)[0]

        feature = {
            "spectrum": float_feature(spec),
        }

        features = tf.train.Features(feature=feature)
        example = tf.train.Example(features=features)
        test_writer.write(example.SerializeToString())
finally:
    # Close the record file even if a clip fails to load.
    test_writer.close()


### Model

In [None]:
import tensorflow as tf
import numpy as np


class DenseNet:
    """DenseNet-style classifier graph assembled from ``tf.layers`` calls.

    The network is a strided initial convolution, four dense blocks of
    6, 12, 24 and 16 bottleneck layers (each layer appends 12 feature maps to
    its input), a transition layer after each of the first three blocks, and
    a head made of global average pooling plus a 12-unit dense layer.

    Inputs are treated as 4-D tensors with channels on axis 3. Every method
    forwards ``reuse`` to the layers it creates, so calling ``get_logits`` a
    second time with ``reuse=True`` shares the weights of the first call.
    """

    def batch_norm_relu(self, inputs, is_training, reuse, name):
        """Apply batch normalisation (named ``name``) followed by ReLU."""
        normalized = tf.layers.batch_normalization(inputs,
                                                   training=is_training,
                                                   reuse=reuse,
                                                   name=name)
        return tf.nn.relu(normalized)

    def initial_conv(self, inputs, reuse=False):
        """Apply the stem: a 3x3, stride-2, 16-filter convolution ``init_conv``."""
        return tf.layers.conv2d(inputs=inputs,
                                filters=16,
                                kernel_size=3,
                                strides=2,
                                padding='SAME',
                                name='init_conv',
                                reuse=reuse)

    def composite_layer(self, inputs, keep_prob, name, is_training=True, reuse=False):
        """Run one bottleneck layer and concatenate its output onto ``inputs``.

        The layer is BN-ReLU-1x1 conv (48 filters), BN-ReLU-3x3 conv
        (12 filters), then dropout.

        ``keep_prob`` is handed to ``tf.layers.dropout`` as its ``rate``
        argument, i.e. despite the name it is the fraction of units dropped.
        """
        growth_rate = 12

        bottleneck = self.batch_norm_relu(inputs, is_training, reuse, name=name + '_bn1')
        bottleneck = tf.layers.conv2d(bottleneck,
                                      filters=4 * growth_rate,
                                      kernel_size=1,
                                      strides=1,
                                      padding='SAME',
                                      name=name + '_conv1',
                                      reuse=reuse)

        new_maps = self.batch_norm_relu(bottleneck, is_training, reuse, name=name + '_bn2')
        new_maps = tf.layers.conv2d(new_maps,
                                    filters=growth_rate,
                                    kernel_size=3,
                                    strides=1,
                                    padding='SAME',
                                    name=name + '_conv2',
                                    reuse=reuse)

        new_maps = tf.layers.dropout(new_maps, rate=keep_prob, training=is_training)

        # Dense connectivity: the layer's input is carried forward unchanged.
        return tf.concat([inputs, new_maps], axis=3)

    def transition_layer(self, inputs, name, is_training=True, reuse=False):
        """Halve the channel count with a 1x1 conv, then 2x2 average-pool."""
        in_channels = inputs.get_shape().as_list()[3]
        out_channels = int(in_channels * 0.5)

        compressed = self.batch_norm_relu(inputs, is_training, reuse, name=name + '_bn')
        compressed = tf.layers.conv2d(compressed,
                                      filters=out_channels,
                                      kernel_size=1,
                                      strides=1,
                                      padding='SAME',
                                      name=name + '_conv',
                                      reuse=reuse)

        return tf.layers.average_pooling2d(compressed, pool_size=2, strides=2, name='pool')

    def dense_net(self, inputs, keep_prob=0.2, is_training=True, reuse=False):
        """Build the stem and the four dense blocks; return the feature map."""
        # (variable scope, number of composite layers, transition name or None)
        stages = (
            ('block1', 6, 'transition1'),
            ('block2', 12, 'transition2'),
            ('block3', 24, 'transition3'),
            ('block4', 16, None),  # the last block feeds the head directly
        )

        net = self.initial_conv(inputs=inputs, reuse=reuse)

        for scope_name, n_layers, transition_name in stages:
            with tf.variable_scope(scope_name):
                for i in range(n_layers):
                    net = self.composite_layer(net,
                                               keep_prob,
                                               name='dense_layer{}'.format(i),
                                               is_training=is_training,
                                               reuse=reuse)

                if transition_name is not None:
                    net = self.transition_layer(net,
                                                name=transition_name,
                                                is_training=is_training,
                                                reuse=reuse)

        return net

    def get_logits(self, inputs, is_training=True, reuse=False):
        """Return the 12-way logits for a batch of inputs."""
        feature_map = self.dense_net(inputs, keep_prob=0.2, is_training=is_training, reuse=reuse)
        activated = self.batch_norm_relu(feature_map, is_training, reuse, name='last_bn')

        # Global average pooling: the pooling window spans the whole map.
        static_shape = activated.get_shape().as_list()
        window = (static_shape[1], static_shape[2])
        pooled = tf.layers.average_pooling2d(activated, pool_size=window, strides=1, padding='VALID')

        flat = tf.layers.flatten(pooled)
        return tf.layers.dense(flat, 12, name='final_dense', reuse=reuse)
    
    
class CnnLstm:
    """CNN -> LSTM -> fully-connected classifier built from TF 1.x layers.

    ``get_logits`` runs a stack of convolutional layers, reshapes the feature
    map into a sequence, takes the last output of a single LSTM layer and
    passes it through a stack of dense layers ending in a ``num_classes``
    output layer.

    The ``*_dropout_keep_prob`` lists of the convolutional and dense stacks
    are passed to ``tf.layers.dropout`` as ``rate``, i.e. they are drop
    fractions despite their names. ``lstm_dropout_keep_prob`` is a true keep
    probability (it is passed as ``output_keep_prob``).
    """

    def __init__(self):
        self.num_classes = 12
        # Per-layer settings of the convolutional stack (zipped together).
        self.num_filters = [8, 16, 32, 32]
        self.filter_sizes = [7, 3, 3, 3]
        self.pool_sizes = [2, 2, 1, 1]  # 1 means "no pooling after this layer"
        self.cnn_dropout_keep_prob = [0, 0.3, 0.4, 0.4]
        # Per-layer settings of the dense stack.
        self.fc_hidden_units = [1028, 512, 256]
        self.fc_dropout_keep_prob = [0.2, 0.3, 0.35]
        # Settings of the (single) LSTM layer.
        self.lstm_n_hiddens = [512]
        self.lstm_dropout_keep_prob = [0.5]
        # 1-based layer indices used to build layer names.
        self.idx_convolutional_layers = range(1, len(self.filter_sizes) + 1)
        self.idx_fc_layers = range(1, len(self.fc_hidden_units) + 1)
        self.idx_lstm_layers = range(1, len(self.lstm_n_hiddens) + 1)

    def convolutional_layer(self, inputs, is_training=True, reuse=False):
        """Apply the conv stack: conv -> BN -> ReLU -> dropout [-> max-pool]."""
        l = inputs

        for i, num_filter, filter_size, pool_size, keep_prob in zip(self.idx_convolutional_layers,
                                                                    self.num_filters,
                                                                    self.filter_sizes,
                                                                    self.pool_sizes,
                                                                    self.cnn_dropout_keep_prob):
            l = tf.layers.conv2d(l,
                                 filters=num_filter,
                                 kernel_size=filter_size,
                                 strides=1,
                                 padding="SAME",
                                 name="conv" + str(i),
                                 reuse=reuse)

            l = tf.layers.batch_normalization(l, training=is_training, name="conv_bn" + str(i), reuse=reuse)
            l = tf.nn.relu(l, name="conv_relu" + str(i))
            l = tf.layers.dropout(l, rate=keep_prob, training=is_training, name="conv_dropout" + str(i))

            if pool_size != 1:
                l = tf.layers.max_pooling2d(l, pool_size=pool_size, strides=pool_size, padding="SAME")

        return l

    def fc_layer(self, inputs, is_training=True, reuse=False):
        """Apply the dense stack: dense -> BN -> ReLU -> dropout per layer."""
        l = inputs

        for i, units, keep_prob in zip(self.idx_fc_layers, self.fc_hidden_units, self.fc_dropout_keep_prob):
            # Each layer consumes the previous layer's output. Feeding
            # `inputs` here would leave every layer but the last one
            # disconnected from the result.
            l = tf.layers.dense(l, units=units, reuse=reuse, name="fc" + str(i))
            l = tf.layers.batch_normalization(l, training=is_training, name="fc_bn" + str(i), reuse=reuse)
            l = tf.nn.relu(l, name="fc_relu" + str(i))
            l = tf.layers.dropout(l, rate=keep_prob, training=is_training, name="fc_dropout" + str(i))

        return l

    def lstm_layer(self, inputs, is_training=True, reuse=False):
        """Run one LSTM layer over ``inputs`` and return its last output.

        ``is_training`` must be a Python bool: it selects the output keep
        probability when the graph is built (the configured value while
        training, 1 otherwise).
        """
        keep_prob = self.lstm_dropout_keep_prob[0] if is_training else 1

        cell = tf.nn.rnn_cell.BasicLSTMCell(self.lstm_n_hiddens[0], reuse=reuse)
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=keep_prob)

        outputs, states = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
        # Move the time axis first so that [-1] selects the final time step.
        outputs = tf.transpose(outputs, [1, 0, 2])
        outputs = outputs[-1]

        return outputs

    def get_reshaped_cnn_to_rnn(self, inputs):
        """Turn a 4-D feature map into a 3-D sequence for the LSTM.

        Axes 1 and 2 are swapped, then the old axis 1 and axis 3 are merged,
        so axis 2 of ``inputs`` becomes the sequence axis.
        """
        shape = inputs.get_shape().as_list()
        inputs = tf.transpose(inputs, [0, 2, 1, 3])
        reshaped_inputs = tf.reshape(inputs, [-1, shape[2], shape[1] * shape[3]])

        return reshaped_inputs

    def get_logits(self, inputs, is_training=True, reuse=False):
        """Return the ``num_classes``-way logits for a batch of inputs."""
        with tf.variable_scope("conv_layers"):
            l = self.convolutional_layer(inputs, is_training, reuse)

        with tf.variable_scope("lstm_layers"):
            reshaped_l = self.get_reshaped_cnn_to_rnn(l)
            l = self.lstm_layer(reshaped_l, is_training, reuse)

        with tf.variable_scope("fc_layers"):
            l = tf.layers.flatten(l)
            l = self.fc_layer(l, is_training, reuse)

        output = tf.layers.dense(l, units=self.num_classes, reuse=reuse, name='out')

        return output
    

def train_parser(serialized_example):
    """Decode one serialized training ``Example``.

    Returns:
        ``(spectrum, label)``: a float32 tensor of 12800 values and an int64
        tensor of 12 values, parsed from the "spectrum" and "label" features.
    """
    schema = {
        "spectrum": tf.FixedLenFeature([12800], tf.float32),
        "label": tf.FixedLenFeature([12], tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example, schema)

    return parsed['spectrum'], parsed['label']
        
    
def test_parser(serialized_example):
    """Decode one serialized test ``Example``.

    Returns:
        The "spectrum" feature as a float32 tensor of 12800 values.
    """
    schema = {
        "spectrum": tf.FixedLenFeature([12800], tf.float32),
    }
    parsed = tf.parse_single_example(serialized_example, schema)

    return parsed['spectrum']

### Train

In [None]:
import tensorflow as tf
import numpy as np
from model import DenseNet, CnnLstm, train_parser


batch_size = 128

tf.reset_default_graph()

# NOTE(review): the preprocessing cell writes its records under
# "./tfrecord_new/", not "./tfrecords/" -- confirm which directory is intended.
train_data_dir = "./tfrecords/train.tfrecord"

# Input pipeline: parse -> shuffle (reshuffled every epoch) -> batch.
train_dataset = tf.data.TFRecordDataset(train_data_dir).map(train_parser)
train_dataset = train_dataset.shuffle(500000, seed=1, reshuffle_each_iteration=True)
train_dataset = train_dataset.batch(batch_size)

# tf.data.Iterator is the same API family as the tf.data dataset above;
# the tf.contrib.data alias is deprecated.
train_itr = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)

spec, label = train_itr.get_next()
# Restore the flat 12800-value spectrum to a single-channel 128 x 100 image.
spec = tf.reshape(spec, [-1, 128, 100, 1])
spec = tf.cast(spec, tf.float32)

train_init_op = train_itr.make_initializer(train_dataset)

model = DenseNet()
# model = CnnLstm()

height = 128
width = 100
num_classes = 12
learning_rate = 0.01
epochs = 5

model_path = "./densenet/"
model_file = "densenet"

with tf.device('/gpu:0'):
    X = tf.placeholder(tf.float32, [None, height, width, 1])
    Y = tf.placeholder(tf.float32, [None, num_classes])
    global_step = tf.Variable(0, trainable=False, name='global_step')

    logits_train = model.get_logits(X)

    loss = tf.losses.softmax_cross_entropy(Y, logits_train)

    # Batch-norm moving averages are updated through UPDATE_OPS, so the
    # training op must depend on them.
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Second tower sharing the weights, built in inference mode.
    logits_eval = model.get_logits(X, is_training=False, reuse=True)
    predict_proba_ = tf.nn.softmax(logits_eval)
    prediction = tf.argmax(predict_proba_, 1)

    # Plain per-batch accuracy. A streaming tf.metrics.accuracy would keep
    # accumulating across batches and epochs (and be updated once more every
    # time the merged summary is evaluated), which skews per-epoch numbers.
    correct = tf.equal(tf.argmax(Y, 1), prediction)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    tf.summary.scalar('loss', loss)
    tf.summary.scalar('accuracy', accuracy)

    merged = tf.summary.merge_all()


saver = tf.train.Saver(tf.global_variables())

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)

# The context manager releases the session (and its GPU memory) even if
# training is interrupted.
with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())

    writer = tf.summary.FileWriter(model_path, sess.graph)

    for epoch in range(epochs):
        sess.run(train_init_op)
        acc = []
        c = float('nan')  # stays NaN if the dataset yields no batch

        while True:
            # Only the batch fetch signals the end of the epoch, so only it
            # is guarded.
            try:
                _spec, _label = sess.run([spec, label])
            except tf.errors.OutOfRangeError:
                break

            step = sess.run(global_step)

            # Accuracy is fetched in the same run as the training step: the
            # merged summary already evaluates it there, so a separate
            # forward pass is not needed.
            _, c, _summ, acc_train = sess.run([optimizer, loss, merged, accuracy],
                                              feed_dict={X: _spec, Y: _label})

            acc.append(acc_train)

            writer.add_summary(_summ, step)

            if step % 500 == 0:
                print('step: {}, cost: {}'.format(step, c))

        epoch_acc = np.mean(acc) if acc else float('nan')
        print('epoch: {}, cost : {}, train_acc: {}'.format(epoch, c, epoch_acc))

    writer.close()

    saver.save(sess, model_path + model_file + '.ckpt', global_step=sess.run(global_step))

print("Model is saved.")


### Prediction


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import csv
from model import DenseNet, CnnLstm, test_parser


batch_size = 128

tf.reset_default_graph()

# NOTE(review): the preprocessing cell writes its records under
# "./tfrecord_new/", not "./tfrecords/" -- confirm which directory is intended.
test_data_dir = "./tfrecords/test.tfrecord"

test_dataset = tf.data.TFRecordDataset(test_data_dir).map(test_parser)
test_dataset = test_dataset.batch(batch_size)

# tf.data.Iterator is the same API family as the tf.data dataset above;
# the tf.contrib.data alias is deprecated.
test_itr = tf.data.Iterator.from_structure(test_dataset.output_types, test_dataset.output_shapes)

test_spec = test_itr.get_next()

# Restore the flat 12800-value spectrum to a single-channel 128 x 100 image.
test_spec = tf.reshape(test_spec, [-1, 128, 100, 1])
test_spec = tf.cast(test_spec, tf.float32)

test_init_op = test_itr.make_initializer(test_dataset)

height = 128
width = 100

model_path = "./densenet/"
model_file = "densenet"

model = DenseNet()
# model = CnnLstm()

with tf.device('/gpu:0'):
    X = tf.placeholder(tf.float32, [None, height, width, 1])

    logits_test = model.get_logits(X, is_training=False, reuse=False)
    test_predict_proba_ = tf.nn.softmax(logits_test)
    test_prediction = tf.argmax(test_predict_proba_, 1)

# Restore INTO the variables of the graph built above. Importing the
# checkpoint's meta graph instead would add a second copy of the network and
# load the trained weights into that copy, leaving the tensors evaluated
# below on freshly initialised weights.
saver = tf.train.Saver()

checkpoint = tf.train.latest_checkpoint(model_path)
if checkpoint is None:
    raise IOError("No checkpoint found in " + model_path)

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)

# One pass over the test set yields both the class indices and the
# probabilities; batches are collected in lists and joined once at the end.
predict_batches = []
proba_batches = []

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options)) as sess:
    # restore() assigns every variable in the graph, so no initializer run
    # is needed.
    saver.restore(sess, checkpoint)

    sess.run(test_init_op)

    while True:
        try:
            test_spec_ = sess.run(test_spec)
        except tf.errors.OutOfRangeError:
            break

        batch_pred, batch_proba = sess.run([test_prediction, test_predict_proba_],
                                           feed_dict={X: test_spec_})
        predict_batches.append(batch_pred)
        proba_batches.append(batch_proba)

if not predict_batches:
    raise ValueError("No records were read from " + test_data_dir)

predict = np.hstack(predict_batches)
predict_proba = np.vstack(proba_batches)


# Create submission file
# Alphabetical class order, i.e. the column order LabelBinarizer produced
# for the training labels.
class_names = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'silence', 'stop', 'unknown', 'up', 'yes']

df = pd.read_csv("sample_submission.csv")
files = df['fname']

# Predictions are matched to file names purely by position, so the TFRecord
# must contain one record per CSV row, in the same order.
if len(files) != len(predict):
    raise ValueError("Got {} predictions for {} rows of sample_submission.csv".format(
        len(predict), len(files)))

with open(model_path + 'sub_' + model_file + '.csv', 'w') as f:
    fieldnames = ['fname', 'label']
    writer = csv.DictWriter(f, fieldnames=fieldnames)

    writer.writeheader()

    for fname, class_idx in zip(files, predict):
        writer.writerow({'fname': fname, 'label': class_names[class_idx]})

print("Submission file is created.")


# Create predict proba file
print(predict_proba.shape)

# NOTE(review): index=False drops the file names set as index here, so the
# CSV holds only the 12 probability columns -- confirm that is intended.
pp = pd.DataFrame(predict_proba, index=files)
pp.to_csv(model_path + 'proba_' + model_file + '.csv', index=False)

print("Proba file is created.")
