In [None]:
import gzip
import pickle

import numpy as np
import pandas as pd

# Locations of inputs, outputs and the speech dataset on disk.
INPUT_DIR = '../input/'
OUTPUT_DIR = '../output/'
MAIN_DIR = 'C:/data/tf_speech/'
TRAIN_DIR = MAIN_DIR + 'train/'
TRAIN_MODIFIED_DIR = MAIN_DIR + 'train_modified/'
TRAIN_AUDIO_DIR = TRAIN_DIR + 'audio/'
BG_DIR = TRAIN_DIR + 'audio/_background_noise_/'
TEST_DIR = MAIN_DIR + 'test/audio/'
PREDS_DIR = '../output/preds/'
KFOLD_FILENAME = 'kfold_cache_4.pklz'

# Load the cached folds. The context manager closes the gzip handle as soon
# as unpickling finishes instead of leaving it open for the kernel's lifetime.
# NOTE: pickle.load can execute arbitrary code; only load caches created locally.
with gzip.open(TRAIN_MODIFIED_DIR + KFOLD_FILENAME, 'rb') as fin:
    folds = pickle.load(fin)

# Each fold is indexed as fold[0] / fold[1]; print the size of both parts.
for f in folds:
    print(len(f[0]), len(f[1]))


In [None]:
# Peek at the first ten entries of the first fold's first list
# (load_fold treats fold[0] as the training files).
folds[0][0][:10]

In [None]:
import os
import glob
from utilities import *
from tqdm import tqdm, tqdm_notebook

def load_fold(fold):
    """Split the modified training audio into train/val samples for one fold.

    Args:
        fold: pair ``(train_paths, val_paths)`` of audio file paths. Only the
            last two path components (label directory and file name) are used
            for matching, so the paths may use either ``\\`` or ``/``.

    Returns:
        ``(train, val)``: two lists of ``(label_id, uid, fname)`` tuples.
        ``fname`` is the path found on disk, ``uid`` is the file-name prefix
        before the first underscore, and ``label_id`` is ``NAME2ID`` of the
        label directory after remapping (``_background_noise_`` -> ``silence``,
        anything not in ``LABELS`` -> ``unknown``).
    """
    def _dir_and_file(path):
        # Accept both separators so cached fold paths and glob results match
        # regardless of the OS that produced them.
        parts = path.replace('\\', '/').split('/')
        return parts[-2], parts[-1]

    # Sets give O(1) membership tests; with lists the loop below was
    # quadratic in the number of files. Tuples (rather than concatenated
    # strings) cannot collide across directory/file boundaries.
    train_files = {_dir_and_file(f) for f in fold[0]}
    val_files = {_dir_and_file(f) for f in fold[1]}

    all_files = glob.glob(os.path.join(TRAIN_MODIFIED_AUDIO_DIR, '*/*wav'))

    train, val = [], []
    for fname in tqdm(all_files):
        label, basename = _dir_and_file(fname)
        uid = basename.split('_')[0]
        # Match on the on-disk directory name, before any label remapping.
        sample_id = (label, basename)

        if label == '_background_noise_':
            label = 'silence'
        if label not in LABELS:
            label = 'unknown'

        sample = (NAME2ID[label], uid, fname)
        # Validation membership wins if a file is listed in both parts.
        if sample_id in val_files:
            val.append(sample)
        elif sample_id in train_files:
            train.append(sample)
    print('There are {} train and {} val samples'.format(len(train), len(val)))

    return train, val

# Load every fold in turn; each call prints its split sizes. Only the result
# of the last fold (index 3) stays bound to trainset / valset.
for fold_idx in range(4):
    trainset, valset = load_fold(folds[fold_idx])

In [None]:
# Rebind trainset / valset to the samples of the first fold.
trainset, valset = load_fold(folds[0])

In [None]:
# Per-fold prediction CSVs, listed in fold order.
files = [
    '../output/preds/r50_f/fold_0/train.csv',
    '../output/preds/r50_f/fold_1/train.csv',
    '../output/preds/r50_f/fold_2/train.csv',
    '../output/preds/r50_f/fold_3/train.csv',
]

# Rebuild the train/val sample lists of all four folds, in order.
(trainset0, valset0), (trainset1, valset1), (trainset2, valset2), (trainset3, valset3) = [
    load_fold(folds[fold_idx]) for fold_idx in range(4)
]

# One list of label ids (first tuple element) per fold's validation samples.
labels = []
for valset in (valset0, valset1, valset2, valset3):
    labels.append([sample[0] for sample in valset])
    
    

In [10]:
# Flatten the per-fold label lists into one flat list.
# Rebinds `labels`, so this cell is not idempotent: re-running it on the
# already-flat list will not give the same result.
labels = [item for sublist in labels for item in sublist]

In [11]:
# Total number of validation labels across all folds.
len(labels)

72221

In [21]:
# One value matrix per fold CSV, in the same order as `files`; the 'fname'
# column becomes the index and is therefore excluded from `.values`.
# Needs pandas imported as `pd` (see the import cell at the top).
preds = [pd.read_csv(f, index_col='fname').values for f in files]


In [22]:
# Concatenate the per-file matrices into one flat list of rows.
# Rebinds `preds`, so re-running this cell alone flattens a second time.
preds = [item for sublist in preds for item in sublist]

In [23]:
# Stack the list of rows into a single ndarray.
preds = np.array(preds)

In [24]:
# Sanity check: the row count should equal len(labels) from the earlier cell.
preds.shape

(72221, 12)

In [25]:
# Display the stacked prediction matrix (numpy abbreviates the middle rows).
preds

array([[5.45341440e-07, 1.12733570e-02, 5.68119200e-09, ...,
        5.33063660e-04, 1.10157160e-06, 8.19366130e-04],
       [3.07502160e-05, 4.20786740e-02, 1.83860070e-07, ...,
        1.01446700e-02, 4.59887670e-05, 4.46214970e-02],
       [1.19450780e-04, 1.40774030e-02, 1.34401860e-03, ...,
        2.29340960e-02, 5.63298500e-03, 4.74986100e-01],
       ...,
       [2.74742580e-04, 8.54635440e-05, 8.02750400e-06, ...,
        6.07135100e-04, 9.98027400e-01, 5.09377200e-04],
       [1.49946390e-05, 3.99721700e-05, 1.28649670e-05, ...,
        3.90805070e-04, 9.98484700e-01, 8.63943800e-04],
       [4.91329900e-04, 1.03470775e-05, 3.34988200e-06, ...,
        5.97465260e-04, 5.93591750e-01, 3.91461520e-01]])

In [26]:
# Predicted class per row = column index of the largest value in that row.
pred_labels = preds.argmax(axis=1)
pred_labels.shape

(72221,)

In [30]:
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix

# Pin the row/column order of the matrix to LABELS so that conf[row] is always
# the row of LABELS[row]. Without `labels=`, sklearn builds the matrix only
# from the classes that occur in the inputs, so a class missing from both
# `labels` and `pred_labels` would silently shift every later row index.
# NOTE(review): assumes `labels` and `pred_labels` are row-aligned, i.e. the
# CSV rows are in the same order as load_fold's validation lists - confirm.
class_ids = [NAME2ID[l] for l in LABELS]
conf = confusion_matrix(labels, pred_labels, labels=class_ids)
for row, l in enumerate(LABELS):
    # Row total = number of true samples of this class.
    total = conf[row].sum()
    # Share of those samples that were predicted as the same class.
    acc = 1.0 * conf[row, row] / total
    print('{}, total: {}, accuracy: {}'.format(l, total,  acc))

yes, total: 2350, accuracy: 0.9829787234042553
no, total: 2347, accuracy: 0.9190455901150405
up, total: 2323, accuracy: 0.9130434782608695
down, total: 2329, accuracy: 0.8634607127522542
left, total: 2315, accuracy: 0.9520518358531318
right, total: 2320, accuracy: 0.8939655172413793
on, total: 2324, accuracy: 0.898881239242685
off, total: 2321, accuracy: 0.950021542438604
stop, total: 2347, accuracy: 0.9203238176395399
go, total: 2333, accuracy: 0.911701671667381
silence, total: 8347, accuracy: 0.9595064094884389
unknown, total: 40565, accuracy: 0.7977320350055467


In [None]:
yes total: 2350 accuracy: 0.9829787234042553
no total: 2347 accuracy: 0.9190455901150405
up total: 2323 accuracy: 0.9130434782608695
down total: 2329 accuracy: 0.8634607127522542
left total: 2315 accuracy: 0.9520518358531318

### Diff: count the rows on which two prediction files disagree

In [2]:
import pandas as pd
from utilities import *
# diff -y --suppress-common-lines a b | grep '^' | wc -l
def diff_preds(f1, f2):
    """Count the rows on which two label CSVs disagree.

    Rows are compared by position, so both files must list the same samples
    in the same order.

    Args:
        f1: path of the first CSV; must contain a ``label`` column.
        f2: path of the second CSV; must contain a ``label`` column.

    Returns:
        int: number of row positions whose labels differ.

    Raises:
        ValueError: if the two files do not have the same number of rows.
    """
    p1 = pd.read_csv(f1).label.values
    p2 = pd.read_csv(f2).label.values

    # Without this guard a one-row file silently broadcasts against the other
    # file, and other length mismatches give a meaningless comparison.
    if len(p1) != len(p2):
        raise ValueError(
            'Cannot diff predictions of different length: {} has {} rows, {} has {}'.format(
                f1, len(p1), f2, len(p2)))

    return int((p1 != p2).sum())

def diff_preds2(f1, f2):
    """Count disagreements between a score CSV and a label CSV.

    Rows are compared by position, so both files must list the same samples
    in the same order.

    Args:
        f1: path of a CSV with an ``fname`` column (used as index) and one
            numeric column per class; the predicted class of a row is the
            position of its largest value.
            NOTE(review): assumes the column order matches the ids in
            ``NAME2ID`` - confirm.
        f2: path of a CSV with a ``label`` column holding class names that
            are keys of ``NAME2ID``.

    Returns:
        int: number of row positions whose class ids differ.

    Raises:
        ValueError: if the two files do not have the same number of rows.
        KeyError: if ``f2`` contains a label that is not in ``NAME2ID``.
    """
    scores = pd.read_csv(f1, index_col='fname').values
    p1 = np.argmax(scores, axis=1)
    names = pd.read_csv(f2).label.values
    p2 = np.array([NAME2ID[name] for name in names])

    # Without this guard a one-row file silently broadcasts against the other
    # file, and other length mismatches give a meaningless comparison.
    if len(p1) != len(p2):
        raise ValueError(
            'Cannot diff predictions of different length: {} has {} rows, {} has {}'.format(
                f1, len(p1), f2, len(p2)))

    return int((p1 != p2).sum())

# Pairwise disagreement counts between the r50_f scores, the resnet50_224
# labels and the majority submission.
r50_f_scores = '../output/preds/r50_f.csv'
resnet50_224_labels = '../output/resnet50_224.csv'
majority_labels = '../output/subm_majorty.csv'

print(diff_preds2(r50_f_scores, majority_labels))
print(diff_preds(resnet50_224_labels, majority_labels))
print(diff_preds2(r50_f_scores, resnet50_224_labels))

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


16146
10432
16266


In [3]:
# Disagreements between the r50_199_f labels and the majority submission.
# NOTE(review): both lines compare the same pair of files (hence the identical
# numbers in the output); one of them was presumably meant to use another file.
print(diff_preds('../output/r50_199_f.csv', '../output/subm_majorty.csv'))
print(diff_preds('../output/r50_199_f.csv', '../output/subm_majorty.csv'))

16653
16653


In [4]:
# Pairwise disagreement counts between the r50_199_f scores, the
# resnet50_fixed labels and the majority submission.
r50_199_scores = '../output/preds/r50_199_f.csv'
resnet50_fixed_labels = '../output/resnet50_fixed.csv'
majority_labels = '../output/subm_majorty.csv'

print(diff_preds2(r50_199_scores, majority_labels))
print(diff_preds(resnet50_fixed_labels, majority_labels))
print(diff_preds2(r50_199_scores, resnet50_fixed_labels))

15140
9307
14555


In [12]:
# Pairwise disagreement counts: inc_139 vs majority, incres_140 vs majority,
# and inc_139 vs incres_140.
inc_labels = '../output/inc_139.csv'
incres_labels = '../output/incres_140.csv'
majority_labels = '../output/subm_majorty.csv'

for left, right in [(inc_labels, majority_labels),
                    (incres_labels, majority_labels),
                    (inc_labels, incres_labels)]:
    print(diff_preds(left, right))

17688
9458
17625


In [13]:
# Pairwise disagreement counts: xcep vs majority, xception vs majority,
# and xcep vs xception.
xcep_labels = '../output/xcep.csv'
xception_labels = '../output/xception.csv'
majority_labels = '../output/subm_majorty.csv'

for left, right in [(xcep_labels, majority_labels),
                    (xception_labels, majority_labels),
                    (xcep_labels, xception_labels)]:
    print(diff_preds(left, right))

20459
9565
19008


In [None]:
# Load the r50_199_f score matrix ('fname' becomes the index, so `p` holds only the value columns).
p = pd.read_csv('../output/preds/r50_199_f.csv', index_col='fname').values


In [25]:
import numpy as np
from utilities import *

# Scores to post-process and the reference labels (as class ids).
p = pd.read_csv('../output/preds/r50_199_f.csv', index_col='fname').values
best = [NAME2ID[name] for name in pd.read_csv('../output/subm_majorty.csv').label.values]

# Baseline: disagreements between the raw argmax and the reference.
c = p.argmax(axis=1)
print(sum(c != best))

# For rows currently predicted as class 11, zero the class-11 score whenever
# any of the first len(LABELS) - 1 columns exceeds 0.4, then re-take the argmax.
# NOTE(review): 11 is presumably the id of 'unknown' - confirm against NAME2ID.
other_is_confident = (p[:, :len(LABELS) - 1] > 0.4).any(axis=1)
p[np.logical_and(c == 11, other_is_confident), 11] = 0.0
c = p.argmax(axis=1)

print(sum(c != best))

15140
15229


In [21]:
# Show the current per-row class ids.
print(c)

[ 1 11 11 ...  8 11 11]


In [30]:
# Reload the raw r50_199_f scores and the labels of the matching label CSV,
# converting the label names to class ids.
p = pd.read_csv('../output/preds/r50_199_f.csv', index_col='fname').values
submitted = pd.read_csv('../output/r50_199_f.csv')
c = [NAME2ID[name] for name in submitted.label.values]

array([ 1, 11, 11, ...,  8, 11, 11], dtype=int64)

In [33]:
# Number of rows where the label CSV's class id differs from the argmax of the score CSV.
print(sum(c != np.argmax(p, axis=1)))

6933


### New partition: rebuild the train/validation split and the 4-fold cache

In [16]:
from utilities import *

all_files = glob.glob(os.path.join(TRAIN_AUDIO_DIR, '*/*wav'))

# Collect the uid (file-name prefix before the first '_') of every file
# listed in validation_list.txt.
with open(os.path.join(TRAIN_DIR, 'validation_list.txt'), 'r') as fin:
    validation_files = fin.readlines()
valset = set()
for fname in validation_files:
    splits = fname.split('/')
    valset.add(splits[-1].split('_')[0].rstrip())

train, val = [], []
# NOTE(review): this seed entry uses a label id while appended samples use the
# label name - confirm that downstream code expects the mix.
noise = [(NAME2ID['silence'], '', '')]
for fname in all_files:
    # Normalise separators so the label directory and file name are found
    # whether glob returned '\\' (Windows) or '/' (elsewhere).
    splits = fname.replace('\\', '/').split('/')
    label, uid = splits[-2], splits[-1].split('_')[0]
    if label == '_background_noise_':
        label = 'silence'

    sample = (label, uid, fname)
    # A uid in the validation list sends all of its files to `val`; remaining
    # background-noise files go to `noise`, everything else to `train`.
    if uid in valset:
        val.append(sample)
    elif label == 'silence':
        noise.append(sample)
    else:
        train.append(sample)

print('There are {} train and {} val samples'.format(len(train), len(val)))


There are 57923 train and 6798 val samples


In [23]:
from collections import Counter
# Per-label sample count and share of the training split.
count_train = Counter(sample[0] for sample in train)
n_train = len(train)
for k, n_samples in count_train.items():
    print('{}:\t{},\t{}'.format(k, n_samples, n_samples / n_train))

four:	2092,	0.036116913833882915
stop:	2134,	0.03684201439842549
yes:	2116,	0.03653125701362153
dog:	1576,	0.027208535469502616
right:	2111,	0.036444935517842655
left:	2106,	0.03635861402206377
bed:	1516,	0.02617267752015607
five:	2115,	0.03651399271446576
three:	2108,	0.036393142620375324
bird:	1569,	0.027087685375412184
wow:	1579,	0.027260328366969944
on:	2110,	0.036427671218686876
happy:	1553,	0.02681145658891977
no:	2105,	0.036341349722908
six:	2107,	0.03637587832121955
nine:	2134,	0.03684201439842549
up:	2115,	0.03651399271446576
two:	2137,	0.036893807295892825
seven:	2114,	0.03649672841530998
go:	2112,	0.03646219981699843
cat:	1565,	0.027018628178789084
one:	2140,	0.03694560019336015
house:	1577,	0.027225799768658392
down:	2095,	0.03616870673135024
zero:	2116,	0.03653125701362153
tree:	1567,	0.027053156777100632
marvin:	1586,	0.027381178461060373
off:	2101,	0.036272292526284895
sheila:	1558,	0.02689777808469865
eight:	2109,	0.0364104069195311


In [24]:
# Per-label sample count and share of the validation split.
count_val = Counter(sample[0] for sample in val)
n_val = len(val)
for k, n_samples in count_val.items():
    print('{}:\t{},\t{}'.format(k, n_samples, n_samples / n_val))

four:	280,	0.04118858487790527
stop:	246,	0.03618711385701677
yes:	261,	0.038393645189761696
dog:	170,	0.025007355104442484
right:	256,	0.03765813474551338
left:	247,	0.03633421594586643
bed:	197,	0.028979111503383347
five:	242,	0.03559870550161812
three:	248,	0.03648131803471609
bird:	162,	0.02383053839364519
wow:	166,	0.024418946749043838
on:	257,	0.03780523683436305
happy:	189,	0.027802294792586054
no:	270,	0.03971756398940865
six:	262,	0.038540747278611356
nine:	230,	0.033833480435422184
up:	260,	0.03824654310091203
two:	236,	0.03471609296852015
seven:	263,	0.038687849367461016
go:	260,	0.03824654310091203
cat:	168,	0.02471315092674316
one:	230,	0.033833480435422184
house:	173,	0.025448661370991468
down:	264,	0.038834951456310676
zero:	260,	0.03824654310091203
tree:	166,	0.024418946749043838
marvin:	160,	0.023536334215945868
off:	256,	0.03765813474551338
sheila:	176,	0.025889967637540454
eight:	243,	0.03574580759046778


In [78]:
all_files = glob.glob(os.path.join(TRAIN_MODIFIED_AUDIO_DIR, '*/*wav'))
uids = []
f_sil = []
for fname in all_files:
    # Normalise separators so this also works where glob returns '/'.
    splits = fname.replace('\\', '/').split('/')
    label, uid = splits[-2], splits[-1].split('_')[0]
    # Background-noise files are tracked by full path; every other file
    # contributes its uid (file-name prefix before the first '_').
    if label == '_background_noise_':
        f_sil.append(fname)
    else:
        uids.append(uid)
# np.unique de-duplicates and sorts, giving a stable order for KFold.
uids = np.unique(uids)
f_sil = np.unique(f_sil)

In [80]:
from sklearn.model_selection import KFold

# Seeded 4-way shuffle split, applied independently to the uids and to the
# background-noise files; each entry is [train_items, test_items].
kf = KFold(n_splits=4, shuffle=True, random_state=17)
uids_folds = [[uids[train_index], uids[test_index]]
              for train_index, test_index in kf.split(uids)]
sil_folds = [[f_sil[train_index], f_sil[test_index]]
             for train_index, test_index in kf.split(f_sil)]

In [82]:
# Every uid fold: print the sizes, then check no train uid is also a test uid.
for uid_f in uids_folds:
    train_uids, test_uids = uid_f[0], uid_f[1]
    print (len(train_uids), len(test_uids))
    assert all(uid not in test_uids for uid in train_uids)

# Same check for the background-noise file folds.
for f in sil_folds:
    train_sil, test_sil = f[0], f[1]
    print (len(train_sil), len(test_sil))
    assert all(path not in test_sil for path in train_sil)

1409 470
1409 470
1409 470
1410 469
6264 2089
6265 2088
6265 2088
6265 2088


In [83]:
all_files = glob.glob(os.path.join(TRAIN_MODIFIED_AUDIO_DIR, '*/*wav'))
folds = []
for i in range(len(uids_folds)):
    # Sets give O(1) membership tests; testing against the numpy arrays
    # directly scans the whole array for every file.
    train_uids, test_uids = set(uids_folds[i][0]), set(uids_folds[i][1])
    train_sil, test_sil = set(sil_folds[i][0]), set(sil_folds[i][1])

    train_f = []
    test_f = []
    for fname in all_files:
        # Normalise separators so the file name is found on any OS.
        uid = fname.replace('\\', '/').split('/')[-1].split('_')[0]
        # A file belongs to a side if its uid or its full path (for
        # background-noise files) was assigned to that side.
        if uid in train_uids or fname in train_sil:
            train_f.append(fname)
        elif uid in test_uids or fname in test_sil:
            test_f.append(fname)
        else:
            # File matched neither side of this fold; report it.
            print(uid)
            print('WTF?!')
    folds.append([train_f, test_f])

In [84]:
# Print the size of both parts of every fold and verify that no file is on
# both sides. A set-based check is linear; testing each train path against
# the test list with `in` is quadratic in the number of files.
for f in folds:
    print (len(f[0]), len(f[1]))
    assert set(f[0]).isdisjoint(f[1])

54171 18056
54556 17671
53922 18305
54599 17628


In [63]:
from collections import Counter

def test_folds(folds):
    """Print split sizes and the per-label distribution of every fold.

    For each fold the files under ``TRAIN_MODIFIED_AUDIO_DIR`` are matched
    against the fold's two path lists by (label directory, file name). The
    function then prints, per label seen in the training part: train count,
    val count, train share and val share. Labels that occur only in the
    validation part are not listed.

    Args:
        folds: iterable of ``(train_paths, val_paths)`` pairs; the paths may
            use either ``\\`` or ``/`` as separator.

    Returns:
        None. The report is written to stdout.
    """
    def _dir_and_file(path):
        # Accept both separators so cached fold paths and glob results match
        # regardless of the OS that produced them.
        parts = path.replace('\\', '/').split('/')
        return parts[-2], parts[-1]

    # The directory listing does not depend on the fold, so list it once.
    all_files = glob.glob(os.path.join(TRAIN_MODIFIED_AUDIO_DIR, '*/*wav'))

    for fold in folds:
        # Sets give O(1) membership tests; lists made this loop quadratic.
        train_files = {_dir_and_file(f) for f in fold[0]}
        val_files = {_dir_and_file(f) for f in fold[1]}

        train, val = [], []
        for fname in all_files:
            label, basename = _dir_and_file(fname)
            uid = basename.split('_')[0]
            # Match on the on-disk directory name, before renaming the label.
            sample_id = (label, basename)
            if label == '_background_noise_':
                label = 'silence'
            sample = (label, uid, fname)
            if sample_id in val_files:
                val.append(sample)
            elif sample_id in train_files:
                train.append(sample)
        print('There are {} train and {} val samples'.format(len(train), len(val)))

        count_train = Counter(s[0] for s in train)
        count_val = Counter(s[0] for s in val)
        for k in count_train.keys():
            # An empty validation part would otherwise divide by zero.
            val_share = count_val[k] / len(val) if val else 0.0
            print('{}:\t{}\t-\t{},\t{}\t-\t{}'.format(k, count_train[k], count_val[k], count_train[k] / len(train), val_share))
        print('==============================================================')

In [85]:
# Report the label distribution of the freshly built folds.
test_folds(folds)

There are 54171 train and 18056 val samples
four:	1769	-	573,	0.0326558490705359	-	0.03173460345591493
stop:	1748	-	599,	0.03226818777574717	-	0.033174568010633586
yes:	1776	-	574,	0.032785069502132135	-	0.031789986708019496
dog:	1288	-	453,	0.02377655941370844	-	0.0250886132033673
right:	1729	-	591,	0.031917446604271656	-	0.032731501993797076
left:	1733	-	582,	0.03199128685089808	-	0.03223305272485601
bed:	1280	-	431,	0.023628878920455593	-	0.0238701816570669
five:	1727	-	601,	0.031880526480958446	-	0.03328533451484271
three:	1707	-	558,	0.03151132524782633	-	0.030903854674346476
bird:	1276	-	446,	0.02355503867382917	-	0.024700930438635357
wow:	1269	-	465,	0.02342581824223293	-	0.025753212228622063
on:	1769	-	555,	0.0326558490705359	-	0.030737704918032786
happy:	1275	-	454,	0.023536578612172565	-	0.025143996455471867
no:	1780	-	567,	0.03285890974875856	-	0.03140230394328755
silence:	6367	-	1986,	0.11753521256760997	-	0.10999113867966327
six:	1760	-	564,	0.03248970851562644	-	0.0312361

In [86]:
# Persist the folds as a gzip-compressed pickle under TRAIN_MODIFIED_DIR.
with gzip.open(TRAIN_MODIFIED_DIR + 'kfold4_max.pklz', 'wb') as f:
    pickle.dump(folds, f)

In [88]:
# Round-trip check: what was written must unpickle to the same folds.
# The context manager closes the gzip handle once loading is done.
with gzip.open(TRAIN_MODIFIED_DIR + 'kfold4_max.pklz', 'rb') as fin:
    my_folds = pickle.load(fin)
assert (my_folds == folds)

In [65]:
import pickle
import gzip

# Load the previously cached folds for comparison; the context manager closes
# the gzip handle once loading is done.
# NOTE: pickle.load can execute arbitrary code; only load caches created locally.
with gzip.open(TRAIN_MODIFIED_DIR + KFOLD_FILENAME, 'rb') as fin:
    our_folds = pickle.load(fin)

In [66]:
# Report the label distribution of the previously cached folds.
# In the recorded output several labels have zero validation samples.
test_folds(our_folds)

There are 53504 train and 18717 val samples
four:	2342	-	0,	0.043772428229665074	-	0.0
stop:	1768	-	579,	0.033044258373205744	-	0.030934444622535664
dog:	1741	-	0,	0.03253962320574163	-	0.0
right:	1742	-	578,	0.03255831339712919	-	0.030881017257039057
left:	1723	-	592,	0.03220319976076555	-	0.03162900037399156
bed:	1711	-	0,	0.03197891746411483	-	0.0
five:	2328	-	0,	0.043510765550239236	-	0.0
three:	2265	-	0,	0.04233328349282297	-	0.0
bird:	1722	-	0,	0.03218450956937799	-	0.0
yes:	1734	-	616,	0.032408791866028706	-	0.03291125714591014
on:	1724	-	600,	0.03222188995215311	-	0.03205641929796442
happy:	1729	-	0,	0.03231534090909091	-	0.0
no:	1762	-	585,	0.03293211722488038	-	0.03125500881551531
silence:	6040	-	2307,	0.11288875598086125	-	0.12325693220067319
six:	2324	-	0,	0.043436004784688995	-	0.0
seven:	2336	-	0,	0.04366028708133971	-	0.0
up:	1734	-	589,	0.032408791866028706	-	0.03146871827750174
two:	2339	-	0,	0.04371635765550239	-	0.0
sheila:	1720	-	0,	0.03214712918660287	-	0.0
go:	173