In [1]:
# Будем использовать библиотеку librosa
import librosa

In [2]:
% pylab inline
import os
import pandas as pd
import glob
import librosa.display
import numpy as np

Populating the interactive namespace from numpy and matplotlib


In [3]:
# Read each audio file's name and its label from meta.txt (tab-separated, no header row).
# The separator is written as an explicit '\t' escape: a literal tab inside the
# string is invisible and is easily converted to spaces by an editor.
data = pd.read_csv('./meta/meta.txt', sep='\t', header=None)
data.columns = ['sound', '1', '2', '3', 'label']

In [4]:
# Discard the three unused middle columns, keeping only 'sound' and 'label'
data = data.drop(labels=['1', '2', '3'], axis='columns')

In [5]:
# Loads one training file and extracts its features with librosa.feature.mfcc
def parser(row):
    """Return the MFCC-based feature vector and the label for one metadata row.

    Parameters
    ----------
    row : object with ``sound`` and ``label`` attributes
        ``row.sound`` is the file name inside ``./audio/``; ``row.label`` is
        passed through unchanged.

    Returns
    -------
    list
        ``[feature, label]``, where ``feature`` is the mean of the 64 MFCCs
        taken along axis 0 of the transposed MFCC matrix. ``[None, None]`` is
        returned when the file cannot be loaded or processed.
    """
    file_name = os.path.join('./audio/', row.sound)
    # Guard against files that cannot be opened or decoded
    try:
        # kaiser_fast is used for faster feature extraction
        X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=64).T, axis=0)
    except Exception as e:
        # Best effort: report the failing file and the reason, then carry on.
        # (The message previously referenced an undefined name and raised NameError.)
        print("Error encountered while parsing file: ", file_name, e)
        return [None, None]

    return [mfccs, row.label]

# Run parser over every metadata row; %time reports how long the pass takes.
# NOTE(review): the column assignment below needs apply() to return a DataFrame.
# pandas >= 0.23 presumably returns a Series of lists for list results unless
# result_type='expand' is passed — confirm against the installed pandas version.
%time temp = data.apply(parser, axis=1)
temp.columns = ['feature', 'label']

Wall time: 24min 43s


In [6]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Convolution2D, MaxPooling2D
from keras.optimizers import Adam
from sklearn import metrics 

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [7]:
# Stack the per-file feature vectors into a single array
X = np.array(temp['feature'].tolist())
# Turn the string labels into integer codes, then one-hot encode them
lb = LabelEncoder()
label_codes = lb.fit_transform(np.array(temp['label'].tolist()))
y = np_utils.to_categorical(label_codes)

In [47]:
# score = 87.10, 87.31 (author's note, presumably from earlier runs)
num_labels = y.shape[1]
filter_size = 2  # not used in this cell; kept in case another cell reads it

# Seed NumPy *before* the layers are created so that NumPy-driven randomness
# during model construction and training is covered by the seed. Use the
# explicitly imported `np` rather than the bare `numpy` name, which is only
# present because of %pylab.
np.random.seed(0)

# Build the model: two hidden dense layers with dropout, softmax output
model = Sequential()

# Input width follows the feature matrix instead of a hard-coded 64
model.add(Dense(256, input_shape=(X.shape[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))

model.add(Dense(num_labels))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

# Train the model, holding out 5% of the samples for validation
model.fit(X, y, batch_size=48, epochs=52, validation_split=0.05)

Train on 10741 samples, validate on 566 samples
Epoch 1/52
Epoch 2/52
Epoch 3/52
Epoch 4/52
Epoch 5/52
Epoch 6/52
Epoch 7/52
Epoch 8/52
Epoch 9/52
Epoch 10/52
Epoch 11/52
Epoch 12/52
Epoch 13/52
Epoch 14/52
Epoch 15/52
Epoch 16/52
Epoch 17/52
Epoch 18/52
Epoch 19/52
Epoch 20/52
Epoch 21/52
Epoch 22/52
Epoch 23/52
Epoch 24/52
Epoch 25/52
Epoch 26/52
Epoch 27/52
Epoch 28/52
Epoch 29/52
Epoch 30/52
Epoch 31/52
Epoch 32/52
Epoch 33/52
Epoch 34/52
Epoch 35/52
Epoch 36/52
Epoch 37/52
Epoch 38/52
Epoch 39/52
Epoch 40/52
Epoch 41/52
Epoch 42/52
Epoch 43/52
Epoch 44/52
Epoch 45/52
Epoch 46/52
Epoch 47/52
Epoch 48/52
Epoch 49/52
Epoch 50/52
Epoch 51/52
Epoch 52/52


<keras.callbacks.History at 0xec07a4eeb8>

In [9]:
# Testing: collect the file names in ./test.
# os.listdir() returns entries in arbitrary order, so sort them explicitly:
# the accuracy check slices this list (sounds[:473]) and therefore depends on
# a stable, name-ordered listing.
sounds = sorted(os.listdir('./test'))

In [10]:
# One-column frame of test file names. Naming the column at construction also
# works when the listing is empty, whereas assigning .columns afterwards
# fails with a length mismatch in that case.
test_data = pd.DataFrame(sounds, columns=['sound'])

In [11]:
# Counterpart of parser for the test data (no labels)
def parser_for_test_data(row):
    """Return the MFCC-based feature vector for one test file.

    Parameters
    ----------
    row : object with a ``sound`` attribute
        ``row.sound`` is the file name inside ``./test/``.

    Returns
    -------
    list
        ``[feature]``, where ``feature`` is the mean of the 64 MFCCs taken
        along axis 0 of the transposed MFCC matrix. ``[None]`` is returned
        when the file cannot be loaded or processed.
    """
    file_name = os.path.join('./test/', row.sound)
    try:
        X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=64).T, axis=0)
    except Exception as e:
        # Best effort: report the failing file and the reason, then carry on.
        # (The message previously referenced an undefined name and raised NameError.)
        print("Error encountered while parsing file: ", file_name, e)
        return [None]

    return [mfccs]

# Run parser_for_test_data over every test file; %time reports the duration.
# NOTE(review): the column assignment below needs apply() to return a DataFrame.
# pandas >= 0.23 presumably returns a Series of lists for list results unless
# result_type='expand' is passed — confirm against the installed pandas version.
%time temp_test = test_data.apply(parser_for_test_data, axis=1)
temp_test.columns = ['feature']

Wall time: 2min 6s


In [12]:
# Stack the per-file test feature vectors into a single array
X_test = np.array(temp_test['feature'].tolist())

In [48]:
# Softmax output of the model for every test file: one score per class
predict_score = model.predict(X_test)
predict_score

array([[8.4851861e-01, 1.7919061e-04, 1.3573247e-03, ..., 9.8784734e-03,
        1.7258745e-02, 3.3158789e-04],
       [9.1939563e-01, 1.9319938e-05, 1.4819815e-04, ..., 5.5358279e-03,
        7.4662943e-03, 8.2316365e-05],
       [8.6767685e-01, 9.3208408e-05, 6.7306345e-04, ..., 8.1697004e-03,
        1.0805587e-02, 2.1842681e-04],
       ...,
       [4.4197063e-03, 7.7396169e-02, 1.1272083e-05, ..., 6.9621056e-01,
        2.7869370e-02, 5.5615660e-02],
       [2.2648174e-14, 1.9923943e-11, 7.5207193e-12, ..., 4.2411199e-09,
        9.5558918e-01, 5.2280379e-03],
       [8.2951415e-09, 1.0796146e-08, 2.0492763e-08, ..., 1.1443246e-06,
        3.8398722e-01, 6.7535607e-04]], dtype=float32)

In [49]:
# Predicted class index for each file: the position of the largest softmax output.
# Sequential.predict_classes() is not available in recent Keras/TensorFlow
# releases; for a multi-class softmax output it is this same argmax.
pred = np.argmax(model.predict(X_test), axis=-1)

In [50]:
# Display the predicted class indices
pred

array([0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 6, 6, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 1, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 6, 6, 2,
       2, 2, 2, 2, 6, 2, 6, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       0, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 2, 0, 0, 4, 4, 4, 6, 4, 4, 6, 6, 4, 4, 2,
       6, 6, 4, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,

In [51]:
# Keep the integer class indices under a separate name; the accuracy check reads them
encoded_pred = pred

In [52]:
# Map the integer class indices back to their original label names
decoded_pred = lb.inverse_transform(pred)

  if diff:


In [53]:
# Display the predicted label names
decoded_pred

array(['background', 'background', 'background', 'background',
       'background', 'background', 'background', 'keyboard', 'keyboard',
       'keyboard', 'keyboard', 'keyboard', 'background', 'door',
       'background', 'background', 'background', 'background', 'door',
       'background', 'background', 'background', 'background',
       'background', 'background', 'background', 'background',
       'background', 'background', 'background', 'background',
       'background', 'background', 'background', 'speech', 'background',
       'background', 'background', 'speech', 'speech', 'background',
       'bags', 'bags', 'bags', 'bags', 'bags', 'bags', 'bags', 'bags',
       'bags', 'bags', 'bags', 'bags', 'bags', 'bags', 'bags', 'bags',
       'bags', 'bags', 'bags', 'bags', 'bags', 'bags', 'bags', 'bags',
       'bags', 'bags', 'bags', 'keyboard', 'bags', 'keyboard', 'door',
       'door', 'door', 'door', 'bags', 'bags', 'bags', 'bags', 'bags',
       'bags', 'bags', 'bags', 'bags', 'ba

In [54]:
# Assemble the result table: file name, highest class score, predicted label.
# Columns are added one at a time so their order is FileName, Score, Class.
df = pd.DataFrame({'FileName': sounds})
df['Score'] = predict_score.max(axis=1)
df['Class'] = decoded_pred
df

Unnamed: 0,FileName,Score,Class
0,background_0001.wav,0.848519,background
1,background_0004.wav,0.919396,background
2,background_0009.wav,0.867677,background
3,background_0012.wav,0.888409,background
4,background_0016.wav,0.902466,background
5,background_0018.wav,0.600227,background
6,background_0022.wav,0.768001,background
7,background_0028.wav,0.435835,keyboard
8,background_0031.wav,0.735383,keyboard
9,background_0036.wav,0.662257,keyboard


In [58]:
# Write the results to result.txt as space-separated rows without header or index.
# Overwrite instead of appending so that re-running the cell does not leave
# duplicated rows in the file.
df.to_csv('./result.txt', header=False, index=False, sep=' ', mode='w')

In [55]:
# Compute accuracy on the open (labelled) part of the data; there are 473 such records
y_pred = encoded_pred#[:473]
y_pred

array([0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 0, 2, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 6, 6, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 3, 1, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 6, 6, 2,
       2, 2, 2, 2, 6, 2, 6, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       0, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 6, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 2, 0, 0, 4, 4, 4, 6, 4, 4, 6, 6, 4, 4, 2,
       6, 6, 4, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,

In [56]:
# Ground-truth class indices for the open data, taken from the first four
# characters of each file name
encod = {
    'back': 0,
    'bags': 1,
    'door': 2,
    'keyb': 3,
    'knoc': 4,
    'ring': 5,
    'spee': 6,
    'tool': 7,
}
check = sounds[:473]
y_true = [encod[name[:4]] for name in check]
print(y_true)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 

In [57]:
# Accuracy: share of the first 473 predictions that match the true class
right_number = sum(1 for i in range(473) if y_pred[i] == y_true[i])
score = right_number / 473
score

0.8900634249471459

P.S. Так как в обучении модели используется validation_split (а не validation_data) и разбиение происходит случайно, то итоговый скор варьируется от 84 до 89.