In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Class vocabulary; a label's position in this list is its integer class id.
LABELS = [
    'yes', 'no', 'up', 'down', 'left', 'right',
    'on', 'off', 'stop', 'go', 'silence', 'unknown',
]
# Lookup tables in both directions: class id -> name and name -> class id.
ID2NAME = dict(enumerate(LABELS))
NAME2ID = {label: idx for idx, label in enumerate(LABELS)}


In [37]:
# Read the ensemble's raw predictions; rows are indexed by the `fname` column and
# every remaining column is treated as a score for one class.
df = pd.read_csv('../output/preds/subm_ensemble_weights_raw_8.csv', index_col='fname')
fnames = df.index
probs = df.values

# Highest score per row, and the class name that achieved it.
# NOTE(review): mapping column positions through ID2NAME assumes the CSV columns
# are ordered like LABELS — confirm against the code that wrote this file.
max_probs = probs.max(axis=1)
labels = np.array([ID2NAME[class_id] for class_id in probs.argmax(axis=1)])

In [43]:
# Report how many clips there are in total, how many have a top score above 0.9,
# and how many are at or below it (one number per line).
n_total = len(fnames)
n_confident = len(fnames[max_probs > 0.9])
n_uncertain = len(fnames[max_probs <= 0.9])
print(n_total, n_confident, n_uncertain, sep='\n')

158538
115576
42962


In [47]:
# Split the clips at a top score of 0.9. Both comparisons are spelled out (rather
# than negating one mask) so rows with a NaN score fall into neither group.
is_confident = max_probs > 0.9
is_uncertain = max_probs <= 0.9

# Confident clips keep their predicted label; the rest are recorded by name only.
pseudo_label = pd.DataFrame({
    'fname': fnames[is_confident],
    'label': labels[is_confident],
})
no_label = pd.DataFrame({'fname': fnames[is_uncertain]})

# Persist both lists without the positional index column.
pseudo_label.to_csv('../input/pseudo_label.csv', index=False)
no_label.to_csv('../input/no_label.csv', index=False)

In [59]:
# `os` is imported explicitly: no other visible cell imports it, so this cell
# would otherwise depend on the wildcard import below (or on leftover kernel
# state) to provide `os.path.join`. It is placed before the wildcard import so
# any `os` that `utilities` exports still takes precedence, exactly as before.
import os

# NOTE(review): the wildcard import is presumably where TEST_DIR (used here) and
# TRAIN_MODIFIED_DIR (used in a later cell) come from — confirm, then consider
# importing those names explicitly.
from utilities import *

# Turn every pseudo-labelled clip name into a path under TEST_DIR.
pl_fnames = np.array([os.path.join(TEST_DIR, clip_name) for clip_name in pseudo_label.fname])

In [60]:
# Preview the first ten entries of pl_fnames.
pl_fnames[:10]

array(['C:/data/tf_speech/test/audio/clip_000044442.wav',
       'C:/data/tf_speech/test/audio/clip_0000adecb.wav',
       'C:/data/tf_speech/test/audio/clip_0000d4322.wav',
       'C:/data/tf_speech/test/audio/clip_0001d1559.wav',
       'C:/data/tf_speech/test/audio/clip_0002256ed.wav',
       'C:/data/tf_speech/test/audio/clip_0002a4a1f.wav',
       'C:/data/tf_speech/test/audio/clip_0002d9b83.wav',
       'C:/data/tf_speech/test/audio/clip_000373a5b.wav',
       'C:/data/tf_speech/test/audio/clip_0003e6aee.wav',
       'C:/data/tf_speech/test/audio/clip_00049951d.wav'], dtype='<U47')

In [74]:
from sklearn.model_selection import KFold
import pickle
import gzip

# File name of a previously saved, gzip-compressed pickle of folds.
MY_KFOLD_NOISE_FILENAME = 'kfold4_max_new_noise.pklz'

# Load the existing folds. The `with` block closes the gzip handle once the pickle
# has been read (previously the handle was opened inline and never closed).
# NOTE(review): `folds` is not used again in this cell — confirm a later cell needs it.
# NOTE(review): pickle.load can execute arbitrary code; only load files you trust.
with gzip.open(TRAIN_MODIFIED_DIR + MY_KFOLD_NOISE_FILENAME, 'rb') as folds_file:
    folds = pickle.load(folds_file)

# 4-way shuffled split with a fixed seed so the partition is reproducible.
kf = KFold(n_splits=4, shuffle=True, random_state=17)

# Each entry is [train_paths, held_out_paths] for one fold of the pseudo-labelled clips.
pl_folds = []
for train_index, test_index in kf.split(pl_fnames):
    pl_folds.append([pl_fnames[train_index], pl_fnames[test_index]])

# Print the train / held-out sizes of every fold as a sanity check.
for fold in pl_folds:
    print(len(fold[0]), len(fold[1]))

# Save the pseudo-label folds as a gzip-compressed pickle.
with gzip.open(TRAIN_MODIFIED_DIR + 'kfold4_max_pseudo_label.pklz', 'wb') as f:
    pickle.dump(pl_folds, f)

86682 28894
86682 28894
86682 28894
86682 28894


In [64]:
# Preview the first ten entries of pl_fnames (same expression as an earlier cell).
pl_fnames[:10]

array(['C:/data/tf_speech/test/audio/clip_000044442.wav',
       'C:/data/tf_speech/test/audio/clip_0000adecb.wav',
       'C:/data/tf_speech/test/audio/clip_0000d4322.wav',
       'C:/data/tf_speech/test/audio/clip_0001d1559.wav',
       'C:/data/tf_speech/test/audio/clip_0002256ed.wav',
       'C:/data/tf_speech/test/audio/clip_0002a4a1f.wav',
       'C:/data/tf_speech/test/audio/clip_0002d9b83.wav',
       'C:/data/tf_speech/test/audio/clip_000373a5b.wav',
       'C:/data/tf_speech/test/audio/clip_0003e6aee.wav',
       'C:/data/tf_speech/test/audio/clip_00049951d.wav'], dtype='<U47')

In [72]:
# Break the first path in pl_fnames into its '/'-separated components and show them.
splits = pl_fnames[0].split('/')
splits

['C:', 'data', 'tf_speech', 'test', 'audio', 'clip_000044442.wav']

In [69]:
# Number of pseudo-labelled clips per predicted class, most frequent first.
pseudo_label['label'].value_counts()

unknown    69004
yes         4948
stop        4763
right       4457
off         4454
left        4435
on          4338
no          4069
down        4043
silence     3827
go          3693
up          3545
Name: label, dtype: int64

In [70]:
# Distinct values in `labels` and how often each occurs (no confidence filter applied).
np.unique(labels, return_counts=True)

(array(['down', 'go', 'left', 'no', 'off', 'on', 'right', 'silence',
        'stop', 'unknown', 'up', 'yes'], dtype='<U7'),
 array([ 5527,  5452,  6088,  6065,  5936,  5904,  5413,  8801,  5544,
        92873,  5384,  5551], dtype=int64))

In [77]:
# Load the pseudo-label CSV (the same path an earlier cell writes to), using the
# `fname` column as the index.
df_pl = pd.read_csv('../input/pseudo_label.csv', index_col='fname')

In [78]:
# Show the first rows of df_pl.
df_pl.head()

fname,label
clip_000044442.wav,no
clip_0000adecb.wav,unknown
clip_0000d4322.wav,unknown
clip_0001d1559.wav,unknown
clip_0002256ed.wav,unknown


In [82]:
# Look up the label stored for a single clip, by its file name.
df_pl.loc['clip_000044442.wav', 'label']

'no'