Based on tensorflow starter code from https://www.kaggle.com/alexozerin/end-to-end-baseline-tf-estimator-lb-0-72

In [3]:
from utils import *
import array 

from pydub import AudioSegment
import tensorflow as tf

from keras.models import Model, Sequential
from keras.layers import Input, Conv2D, Flatten, MaxPooling2D, Activation, BatchNormalization, GlobalAveragePooling2D, GlobalMaxPool2D, concatenate, Dense, Dropout
from keras.optimizers import Adam
from tensorflow.python.keras.utils import to_categorical




from keras_tqdm import TQDMNotebookCallback
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
# import matplotlib.pyplot as plt
# %matplotlib inline

In [4]:
# Target vocabulary: ten command words plus the two catch-all classes.
# The position of a word in this list is its integer class id.
POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
# class id -> label name
id2name = dict(enumerate(POSSIBLE_LABELS))
# label name -> class id
name2id = dict((label_name, class_id) for class_id, label_name in enumerate(POSSIBLE_LABELS))
len(id2name)

12

In [40]:
def load_data(data_dir):
    """Index the training wav files and split them into four DataFrames.

    Every file matching ``<data_dir>/train/audio/*/*wav`` is parsed into a
    ``(label, label_id, user_id, wav_file)`` row. Folders whose name is not in
    ``POSSIBLE_LABELS`` are relabelled ``'unknown'``; the raw
    ``_background_noise_`` folder is skipped.

    Rows are routed as follows:
      * ``unknown`` and ``silence`` rows go to their own frames, whatever the
        user id;
      * rows whose user id appears in ``train/validation_list.txt`` go to the
        validation frame;
      * everything else goes to the training frame.

    Args:
        data_dir: directory containing ``train/audio`` and
            ``train/validation_list.txt``.

    Returns:
        ``(train_df, valid_df, unknown_df, silent_df)`` -- in that order --
        each with the columns ``label``, ``label_id``, ``user_id``,
        ``wav_file``.
    """
    # Seed NumPy's global RNG. This must be a call: assigning
    # ``np.random.seed = 1`` replaces the seeding function with an int and
    # seeds nothing.
    np.random.seed(1)

    # Path regexp with three groups: optional prefix, label folder, user id.
    # Both '/' and '\' are accepted as separators. A raw string keeps the
    # backslashes from being read as (invalid) string escapes.
    pattern = re.compile(r"(.+[\/\\])?(\w+)[\/\\]([^_]+)_.+wav")
    all_files = glob(os.path.join(data_dir, 'train/audio/*/*wav'))

    # Only the user id of each listed validation file is kept, so the
    # train/validation split is made per speaker rather than per file.
    valset = set()
    with open(os.path.join(data_dir, 'train/validation_list.txt'), 'r') as fin:
        for entry in fin:
            r = pattern.match(entry)
            if r:
                valset.add(r.group(3))

    possible = set(POSSIBLE_LABELS)

    train, val, silent, unknown = [], [], [], []

    for entry in all_files:
        r = pattern.match(entry)
        if not r:
            continue
        label, uid = r.group(2), r.group(3)

        # The noise recordings were already cut into 1-second chunks stored
        # under the 'silence' folder, so the raw folder is ignored.
        if label == '_background_noise_':
            continue

        if label not in possible:
            label = 'unknown'

        sample = (label, name2id[label], uid, entry)

        if label == "unknown":
            unknown.append(sample)
        elif label == "silence":
            silent.append(sample)
        elif uid in valset:
            val.append(sample)
        else:
            train.append(sample)

    print('There are {} train and {} val samples'.format(len(train), len(val)))

    columns_list = ['label', 'label_id', 'user_id', 'wav_file']

    train_df = pd.DataFrame(train, columns=columns_list)
    valid_df = pd.DataFrame(val, columns=columns_list)
    silent_df = pd.DataFrame(silent, columns=columns_list)
    unknown_df = pd.DataFrame(unknown, columns=columns_list)

    return train_df, valid_df, unknown_df, silent_df

In [41]:
# Index the audio under ./data/ and split it into train / validation / unknown / silence frames.
train_df, valid_df, unknown_df, silent_df = load_data('./data/')

There are 21105 train and 2577 val samples


In [42]:
# Show one row to check the column layout of the frames.
train_df.head(1)

Unnamed: 0,label,label_id,user_id,wav_file
0,left,4,cb8f8307,./data/train/audio/left/cb8f8307_nohash_1.wav


In [43]:
# Number of training samples per label.
train_df.label.value_counts()

stop     2134
yes      2116
up       2115
go       2112
right    2111
on       2110
left     2106
no       2105
off      2101
down     2095
Name: label, dtype: int64

In [44]:
# Number of validation samples per label (before silence/unknown rows are added).
valid_df.label.value_counts()

no       270
down     264
yes      261
up       260
go       260
on       257
off      256
right    256
left     247
stop     246
Name: label, dtype: int64

In [46]:
# Add silence and unknown clips to the validation set. Each group is sized at
# 10% of the current validation frame, drawn with a fixed random_state, and
# removed from the pool it came from so it cannot be used for training.
# (Original note: made with step=250 when generating silence files.)
extra_data_size = int(0.1 * valid_df.shape[0])

unknown_val = unknown_df.sample(n=extra_data_size, random_state=1)
unknown_df = unknown_df.drop(unknown_val.index)

silent_val = silent_df.sample(n=extra_data_size, random_state=1)
silent_df = silent_df.drop(silent_val.index)

valid_df = pd.concat([valid_df, silent_val, unknown_val], axis=0)

In [11]:
# silence_files = train_df[train_df.label == 'silence']
# train_df      = train_df[train_df.label != 'silence']

In [47]:
%%time

# Decode every silence clip left in silent_df once, so augment_wav can pick
# background noise from memory instead of re-reading files.
silence_files_AS = [AudioSegment.from_wav(x) for x in silent_df.wav_file.values]

CPU times: user 196 ms, sys: 64 ms, total: 260 ms
Wall time: 260 ms


In [48]:
# Display one randomly chosen silence segment as a sanity check.
random.choice(silence_files_AS)

In [49]:
# A silent segment built with duration=1000 and frame_rate=16000.
# NOTE(review): `filler` is not referenced by any other cell visible here.
filler = AudioSegment.silent(duration=1000, frame_rate = 16000)


In [124]:
def fill_to_1sec(wav):
    """Return ``wav`` cropped or left-padded so that ``len(wav) == 1000``.

    ``len()`` of a pydub ``AudioSegment`` is its duration in milliseconds, so
    1000 corresponds to one second.

    * Longer input: a random 1000 ms window is cut out. Every start offset
      from 0 to ``len(wav) - 1000`` inclusive can be chosen.
    * Shorter input: silence (16 kHz) is prepended to make up the difference.
    * Exact length: the input is returned unchanged.

    Args:
        wav: the segment to normalise.

    Returns:
        A segment of length 1000.
    """
    target_ms = 1000  # 1 sec
    sample_rate = 16000

    current_ms = len(wav)
    if current_ms > target_ms:
        # randint's upper bound is exclusive; the +1 makes the window that
        # ends exactly at the end of the clip reachable as well.
        start = np.random.randint(0, current_ms - target_ms + 1)
        return wav[start:start + target_ms]
    if current_ms < target_ms:
        padding = AudioSegment.silent(target_ms - current_ms, frame_rate=sample_rate)
        return padding + wav
    return wav

In [148]:
def augment_wav(wav,pval=0.5):
    """Randomly augment a clip and return it at a length of 1000 ms.

    Steps, in order:
      1. with probability ``pval``, change the speed by a factor in
         [0.95, 1.05] using the ``speed_change`` helper (defined outside this
         cell);
      2. with probability ``pval``, apply a gain drawn from [-10, 10] dB;
      3. crop / pad to 1000 ms with ``fill_to_1sec``;
      4. shift the audio by 100 ms to the left or to the right (50/50); the
         part pushed past the edge is dropped and the gap is filled with
         silence, so the length stays 1000 ms;
      5. with probability ``pval``, overlay a random segment from
         ``silence_files_AS``, attenuated when necessary so that it sits at
         least 10 dB below the clip.

    Args:
        wav: pydub ``AudioSegment`` to augment.
        pval: probability used for each of the optional steps (1, 2 and 5).

    Returns:
        The augmented ``AudioSegment``.
    """
    sample_rate = 16000
    L = 1000  # target length in ms (1 sec)

    # Adjust speed, with probability pval.
    wav = speed_change(wav,1.+random.uniform(-1, 1)*0.05) if np.random.random() < pval else wav

    # Adjust volume, with probability pval. The gain is drawn even when it is
    # not applied, which keeps the sequence of random draws unchanged.
    db_adjustment = random.uniform(-1, 1)*10
    wav = wav + db_adjustment if np.random.random() < pval else wav

    # Fill to 1 second.
    wav = fill_to_1sec(wav)

    # Shift the audio by 100 ms. Moving the content means slicing off the side
    # the audio travels towards and padding the opposite side.
    shift_length = 100
    gap = AudioSegment.silent(shift_length,frame_rate=sample_rate)
    if np.random.random() < 0.5:
        # Shift to the left: drop the first 100 ms, pad the end.
        wav = wav[shift_length:] + gap
    else:
        # Shift to the right: pad the start, drop the last 100 ms.
        wav = gap + wav[:L-shift_length]

    # Blend the clip with background noise, with probability pval.
    if np.random.random() < pval:
        noise = random.choice(silence_files_AS)
        db_delta = (wav.dBFS - noise.dBFS) -10.

        # Turn down a loud background; if it is already quiet, leave it be.
        if db_delta < 0:
            noise = noise + db_delta
        wav = wav.overlay(noise)

    return wav

In [150]:

def process_wav_file(record, reshape=False, augment=True,pval=0.5):
    """Load one wav file and turn it into model input features.

    Args:
        record: either a path string (test files, which have no label) or a
            row object exposing ``wav_file`` and ``label`` attributes.
        reshape: accepted for compatibility with existing callers; this
            function does not use it.
        augment: when true, the clip goes through ``augment_wav``, unless its
            label is ``"silence"``.
        pval: probability forwarded to ``augment_wav``.

    Returns:
        ``log_mel(AS_to_raw(wav))`` for the 1-second clip. Both helpers are
        defined outside this cell; presumably the result is the (128, 32, 1)
        array the model input expects -- confirm in utils.
    """
    # isinstance also accepts str subclasses, unlike an exact type comparison.
    if isinstance(record, str):  # test files
        fname, label = record, "test"
    else:
        fname, label = record.wav_file, record.label

    wav = AudioSegment.from_wav(fname)

    if augment and label != "silence":  # silence clips are never augmented
        wav = augment_wav(wav,pval)
    else:
        # Make sure the segment is 1 second long.
        wav = fill_to_1sec(wav)

    samples = AS_to_raw(wav)
    return log_mel(samples)


In [130]:
def train_generator(train_batch_size):
    """Endlessly yield shuffled, augmented training batches.

    For every pass a fresh pool is assembled: 2000 rows sampled from each
    ``label_id`` group of ``train_df``, plus silence and unknown rows, each
    group sized at 10% of that sample. The pool is visited in random order in
    chunks of ``train_batch_size`` (the last chunk of a pass may be shorter).

    Yields:
        ``(x_batch, y_batch)``: an array stacking the ``process_wav_file``
        output of each row, and the one-hot encoded ``label_id`` values.
    """
    while True:
        per_class = train_df.groupby('label_id').apply(lambda grp: grp.sample(n = 2000))
        n_extra = int(per_class.shape[0]* 0.1)
        silence_rows = silent_df.sample(n_extra)
        unknown_rows = unknown_df.sample(n_extra)
        pool = pd.concat([silence_rows, per_class, unknown_rows])

        n_rows = pool.shape[0]
        visit_order = random.sample(range(n_rows), n_rows)
        for start in range(0, n_rows, train_batch_size):
            # Slicing clamps at the end of the list, so the last batch is short.
            picked = visit_order[start:start + train_batch_size]
            features = [process_wav_file(pool.iloc[row], reshape=True) for row in picked]
            targets = [pool.label_id.values[row] for row in picked]
            x_batch = np.array(features)
            y_batch = to_categorical(targets, num_classes = len(POSSIBLE_LABELS))
            yield x_batch, y_batch

In [131]:
def valid_generator(val_batch_size):
    """Endlessly yield un-augmented validation batches in ``valid_df`` order.

    Each pass walks ``valid_df`` from its first row to its last in chunks of
    ``val_batch_size`` (the final chunk of a pass may be shorter), then starts
    again from the first row.

    Yields:
        ``(x_batch, y_batch)``: an array stacking the ``process_wav_file``
        output of each row, and the one-hot encoded ``label_id`` values.
    """
    while True:
        n_rows = valid_df.shape[0]
        for start in range(0, n_rows, val_batch_size):
            stop = min(start + val_batch_size, n_rows)
            rows = range(start, stop)
            features = [process_wav_file(valid_df.iloc[row], reshape=True, augment=False)
                        for row in rows]
            targets = [valid_df.label_id.values[row] for row in rows]
            x_batch = np.array(features)
            y_batch = to_categorical(targets, num_classes = len(POSSIBLE_LABELS))
            yield x_batch, y_batch

In [132]:
# Training callbacks. All file names below are derived from exp_name.
from keras.callbacks import TensorBoard
exp_name = "pyramid_conv"
             # Stops training on val_loss with patience=5.
callbacks = [EarlyStopping(monitor='val_loss',
                           patience=5,
                           verbose=1,
                           mode='min'),
             # Scales the learning rate by factor=0.1 on a val_loss plateau (patience=3).
             # NOTE(review): `epsilon` is the threshold keyword of the Keras version used
             # here; later Keras releases name it `min_delta` -- confirm before upgrading.
             ReduceLROnPlateau(monitor='val_loss',
                               factor=0.1,
                               patience=3,
                               verbose=1,
                               epsilon=0.01,
                               mode='min'),
             # Writes weights only, and only for the best val_loss seen so far.
             ModelCheckpoint(monitor='val_loss',
                             filepath='weights/starter_{}.hdf5'.format(exp_name),
                             save_best_only=True,
                             save_weights_only=True,
                             mode='min'),
#              TQDMNotebookCallback(), 
            # TensorBoard logs go to ./logs_<exp_name>.
            TensorBoard(log_dir='./logs_{}'.format(exp_name), histogram_freq=0, batch_size=64, write_graph=True)]

In [133]:
#a common practice is to choose a filter size in time which spans 2/3 o

In [134]:
# Convolutional classifier: three conv stages, global max pooling, one hidden
# dense layer and a softmax over POSSIBLE_LABELS.
# Dropout rate shared by every Dropout layer below.
p = 0.5

# Single-channel 128 x 32 input. This is what process_wav_file is expected to
# produce -- presumably (n_mels, time frames, 1); confirm against utils.log_mel.
x_in = Input(shape = (128,32,1))

# Normalise the raw input features before the first convolution.
x = BatchNormalization()(x_in)

# Stage 1: 64 filters with a 9x10 kernel. The (3,2) 'same' pooling reduces
# 128 x 32 to 43 x 16 (see the model.summary() output).
x = Conv2D(64, (9,10),activation='relu',padding='same')(x)
x = Dropout(p)(x)
x = BatchNormalization()(x)
x = MaxPooling2D((3,2),padding='same')(x)

# Stage 2: 128 filters with a 4x5 kernel, again followed by (3,2) pooling.
x = Conv2D(128, (4,5),activation='relu',padding='same')(x)
x = Dropout(p)(x)
x = BatchNormalization()(x)
x = MaxPooling2D((3,2),padding='same')(x)


# Stage 3: 128 filters with a 2x2 kernel; MaxPooling2D uses its default pool size here.
x = Conv2D(128, (2,2),activation='relu',padding='same')(x)
x = Dropout(p)(x)
x = BatchNormalization()(x)
x = MaxPooling2D(padding='same')(x)



# Collapse the remaining spatial grid to one value per filter.
x = GlobalMaxPool2D()(x)

# x = Flatten()(x)
x = Dense(64, activation = 'relu')(x)
x = Dropout(p)(x)

# x = Dense(64, activation = 'relu')(x)
# x = Dropout(0.3)(x)

# One softmax output per entry of POSSIBLE_LABELS.
x = Dense(len(POSSIBLE_LABELS), activation = 'softmax', name='targets')(x)




model = Model(inputs = x_in, outputs = x)
# Categorical cross-entropy matches the one-hot targets produced by the generators.
model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['accuracy'])
# model.summary()

In [135]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 128, 32, 1)        0         
_________________________________________________________________
batch_normalization_33 (Batc (None, 128, 32, 1)        4         
_________________________________________________________________
conv2d_25 (Conv2D)           (None, 128, 32, 64)       5824      
_________________________________________________________________
dropout_32 (Dropout)         (None, 128, 32, 64)       0         
_________________________________________________________________
batch_normalization_34 (Batc (None, 128, 32, 64)       256       
_________________________________________________________________
max_pooling2d_25 (MaxPooling (None, 43, 16, 64)        0         
_________________________________________________________________
conv2d_26 (Conv2D)           (None, 43, 16, 128)       163968    
__________

In [136]:
# model.load_weights('weights/starter.hdf5')

In [137]:
# Validation set size divided by the batch size of 64. The displayed result (48)
# is a whole number, i.e. the division truncated on the kernel that produced it.
valid_df.shape[0]/64

48

Epoch 14/100
300/300 [==============================] - 214s - loss: 0.7612 - acc: 0.7370 - val_loss: 1.0481 - val_acc: 0.6508



Epoch 31/100
350/350 [==============================] - 227s - loss: 0.4294 - acc: 0.8518 - val_loss: 0.9436 - val_acc: 0.7179
Epoch 32/100



In [138]:
# Train with the endless generators defined in earlier cells.
#
# steps_per_epoch uses floor division so it is an int on every Python version.
#
# validation_steps must round UP so that one validation run consumes exactly one
# full pass of valid_generator, including the short final batch. With integer
# division inside np.ceil (as on the kernel that printed 48 for
# valid_df.shape[0]/64) the ceil has no effect, the last batch is skipped, and
# because valid_generator never resets, every later epoch starts its validation
# run part-way through a pass. Dividing by 64. forces float division, matching
# the predict_generator cells.
history = model.fit_generator(generator=train_generator(64),
                              steps_per_epoch=train_df.shape[0] // 64,
                              epochs=100,
                              callbacks=callbacks,
                              validation_data=valid_generator(64),
                              validation_steps=int(np.ceil(valid_df.shape[0] / 64.)))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 00040: reducing learning rate to 0.00010000000475.
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 00053: reducing learning rate to 1.0000000475e-05.
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 00058: reducing learning rate to 1.00000006569e-06.
Epoch 60/100
Epoch 61/100
Epoch 00060: early stopping


In [139]:
# Restore the weights file written by the ModelCheckpoint callback
# (save_best_only=True, monitored on val_loss) for exp_name "pyramid_conv".
model.load_weights('./weights/starter_pyramid_conv.hdf5')

In [140]:
# Paths of all test clips; test_generator reads them in this list order.
test_paths = glob(os.path.join('./data/', 'test/audio/*wav'))

In [141]:
# Number of test clips found.
len(test_paths)

158538

In [155]:
def test_generator(test_batch_size,augment=False):
    """Endlessly yield feature batches for the files in ``test_paths``.

    The paths are visited in list order in chunks of ``test_batch_size``; the
    final chunk of a pass may be shorter. After the last chunk the generator
    starts over from the first path.

    Args:
        test_batch_size: maximum number of files per batch.
        augment: forwarded to ``process_wav_file`` (used for test-time
            augmentation); ``pval`` is fixed at 0.5.

    Yields:
        An array stacking the ``process_wav_file`` output of each path.
    """
    while True:
        n_paths = len(test_paths)
        for start in range(0, n_paths, test_batch_size):
            # Slicing clamps at the end of the list, so no explicit min() is needed.
            chunk = test_paths[start:start + test_batch_size]
            features = [process_wav_file(path, reshape=True, augment=augment, pval=0.5)
                        for path in chunk]
            yield np.array(features)

In [143]:
# Softmax outputs for every test clip, without augmentation. The step count is
# rounded up (float division by 64.) so the short final batch is included.
predictions = model.predict_generator(test_generator(64,augment=False), int(np.ceil(len(test_paths)/64.)), verbose=1)



In [152]:
# Cache the un-augmented predictions on disk.
np.save("cache/predictions_pyramid_noaug.npy",predictions)

In [156]:
# Test-time augmentation: add num_aug augmented prediction passes on top of the
# un-augmented predictions already held in `predictions`.
# NOTE: `+=` accumulates into `predictions`, so re-running this cell adds further
# passes to the running sum.
num_aug = 2 
for i in range(num_aug):
    predictions +=  model.predict_generator(test_generator(64,augment=True), int(np.ceil(len(test_paths)/64.)), verbose=1)
    



In [157]:
# Turn the accumulated sum into a mean over the un-augmented pass plus num_aug augmented passes.
predictions = predictions/(num_aug + 1.0)

In [158]:
# Highest-scoring class id for each test clip.
classes = np.argmax(predictions, axis=1)

In [159]:
# Map each test file's base name to its predicted label name.
# NOTE(review): the original remark here said the last batch contains padding and
# duplicates must be removed. test_generator as written yields a shorter final
# batch rather than padding it, so only the first len(test_paths) rows are read.
submission = dict()
for i in range(len(test_paths)):
    fname, label = os.path.basename(test_paths[i]), id2name[classes[i]]
    submission[fname] = label

In [160]:
# Write the submission CSV: a 'fname,label' header, then one line per test file.
# The file name embeds exp_name.
with open('subm/starter_submission{}_3xtta.csv'.format(exp_name), 'w') as fout:
    fout.write('fname,label\n')
    for fname, label in submission.items():
        fout.write('{},{}\n'.format(fname, label))