# Extracting features

In this section we are opening the files

In [13]:
import tarfile
import numpy as np
import pandas as pd

In [14]:
# open the zip file
tar_file = tarfile.open('wav.tgz', 'r')

In [15]:
# an array with the order of wav files in the feat.npy array
path = np.load('path.npy')
# an array with Mel-frequency cepstral coefficients extracted from each wav file. 
feat = np.load('feat.npy', allow_pickle=True)

In [16]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Preparing MFCC features

In this section we put the different files together. 

In [17]:
# Create a dictionary from the files path and feat
zipped = zip(path, feat)
zipped_dict = dict(zipped)

In [18]:
# 1. Convert train and test dataset into numpy array -> train.values, test.valeus
# 2. Create a dictionary from 1.
train_dict = dict(train.values)

In [19]:
# Create mapping between both dictionaries
map_dict = [(k, zipped_dict[k], v) for k, v in train_dict.items()]

In [20]:
# Create two empty lists before split
y_train_bs = []
X_train_bs = []

In [21]:
# Creating the training set
for i in range (len(map_dict)):
    y_train_bs.append(map_dict[i][2])
    X_train_bs.append(map_dict[i][1])

# Baseline model: K-NN 

In this section we create our first model. A K-nearest neighbours with the standard parameter settings, without cross validation, and with just the mean of the MFCC features. We will use this accuracy as the baseline. 

In [22]:
# Creating empty numpy array for putting in the features in the right format
X = np.zeros((len(X_train_bs), 99, 13))
X.shape

(94824, 99, 13)

In [23]:
# Putting the features in the array
row = 0
for i in X_train_bs:
    X[row, 0:len(i),] = i
    row += 1

In [24]:
# Taking mean of axis 1 to get two dimensional data
print(X.shape)
X = X.mean(axis = 1)
print(X.shape)

(94824, 99, 13)
(94824, 13)


In [25]:
# Putting the labels also in a numpy array
y = np.array(y_train_bs)

In [26]:
# Splitting the data
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1)

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(75859, 13)
(18965, 13)
(75859,)
(18965,)


In [27]:
# Base Model => Defining and running the model
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn.score(X_val, y_val)

0.31231215396783546

# Feature manipulation/engineering

To improve our accuracy we want do not just want to use the mean of of the MFCC, but use every piece of data we have. For this we flatten the three dimenstional dataframe to a two dimensional dataframe. Furthermore we scale our data.

In [28]:
X = np.array(X_train_bs)

In [29]:
# a list which contains horizontally distributed 99x13 features for each audio
lis = []
for i in range(X.shape[0]):
    lis.append(X[i])
lis_array = np.array(lis)

In [30]:
# flatten the 99x13 into 1287
M = []
for i in range (len(lis)):
    M.append(lis[i].ravel())

In [31]:
# put zeros if the lenght is less than 1287
M_pad = []
for i in range (len(M)):
    M_pad.append(np.pad(M[i], (0, 1287 - len(M[i])), 'constant'))

In [32]:
# Check if the code above did what we want
for i in range (len(M_pad)):
    if len(M_pad[i]) != 1287:
        print(len(M_pad[i]))

In [33]:
# vertically stack the features of all audios
M_pad_stack = []
for i in range (len(M_pad)):
    M_pad_stack.append(np.vstack(M_pad[i]))

In [34]:
M_pad_stack_arr = np.array(M_pad_stack)

In [35]:
X_features = M_pad_stack_arr.mean(axis = 2)

In [36]:
X_features.shape

(94824, 1287)

In [37]:
# Scale the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X = scaler.fit_transform(X_features)

# Feature extraction and engineering for Test data

In [38]:
test_list = list(test.values)
test_path = []
for k in range (len(test_list)):
    test_path.append(test_list[k][0])
    
map_test = [(k, zipped_dict[k]) for k in test_path]
X_test = []

for i in range (len(map_test)):
    X_test.append(map_test[i][1])

In [39]:
T = np.array(X_test)

# a list which contains horizontally distributed 99x13 features for each audio
lis_T = []
for i in range(T.shape[0]):
    lis_T.append(T[i])
lis_T_array = np.array(lis_T)

In [40]:
# flatten the 99x13 into 1287
M = []
for i in range (len(lis_T)):
    M.append(lis_T[i].ravel())
    
# put zeros if the lenght is less than 1287
M_pad = []
for i in range (len(M)):
    M_pad.append(np.pad(M[i], (0, 1287 - len(M[i])), 'constant'))
    
# Check if the code above did what we want
for i in range (len(M_pad)):
    if len(M_pad[i]) != 1287:
        print(len(M_pad[i]))

# vertically stack the features of all audios
M_pad_stack = []
for i in range (len(M_pad)):
    M_pad_stack.append(np.vstack(M_pad[i]))
    
M_pad_stack_arr = np.array(M_pad_stack)

T_features = M_pad_stack_arr.mean(axis = 2)

print(T_features.shape)

(11005, 1287)


In [49]:
# Scale the features
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
T = scaler.fit_transform(T_features)

# Convolutional Neural Network

In [42]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten ,LSTM
from keras.layers import Convolution2D, MaxPooling2D, Conv2D , GlobalAveragePooling2D,BatchNormalization
from keras.optimizers import Adam
from keras.utils import np_utils
from sklearn import metrics 
from datetime import datetime 
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

Using TensorFlow backend.


In [43]:
#Converting Categorical classes 

le = LabelEncoder()
yy = to_categorical(le.fit_transform(y_train_bs)) 

In [44]:
# split the train data into train and validation. 20% of the train data is now val data

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, yy, test_size=0.2, random_state=1)

print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(75859, 1287)
(18965, 1287)
(75859, 35)
(18965, 35)


In [45]:
#Reshape Traina and validation data for CNN
num_rows = 13
num_columns = 99
num_channels = 1
num_labels = yy.shape[1]

X_train = X_train.reshape(X_train.shape[0],  num_columns,num_rows, num_channels)
X_val = X_val.reshape(X_val.shape[0], num_columns,num_rows, num_channels)

In [74]:
#reference for CNN model -> 

def cnn_model():
    M = Sequential()
    M.add(Conv2D(filters=16, kernel_size=(3,3), input_shape=(99,13,1), activation='relu', padding="same"))
    M.add(MaxPooling2D(pool_size=1))
    M.add(Dropout(0.2))

    M.add(Conv2D(filters=32, kernel_size=(3,3), activation='relu',padding="same"))
    M.add(MaxPooling2D(pool_size=2))
    M.add(Dropout(0.2))

    M.add(Conv2D(filters=64, kernel_size=(3,3), activation='relu',padding="same"))
    M.add(MaxPooling2D(pool_size=2))
    M.add(Dropout(0.2))

    M.add(Conv2D(filters=128, kernel_size=(3,3), activation='relu',padding="same"))
    M.add(MaxPooling2D(pool_size=2))
    M.add(Dropout(0.2))

    M.add(Conv2D(filters=256, kernel_size=(3,3), activation='relu',padding="same"))
    M.add(MaxPooling2D(pool_size=1))
    M.add(Dropout(0.2))
    M.add(GlobalAveragePooling2D())

    M.add(Dense(35, activation='softmax'))
    M.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')
    
    return M

In [None]:
from sklearn.model_selection import KFold

#reference for kfold validation -> https://androidkt.com/k-fold-cross-validation-with-tensorflow-keras/
 
n_split=5
num_rows = 13
num_columns = 99
num_channels = 1
num_labels = yy.shape[1]
 
for train_index,test_index in KFold(n_split).split(X):
    X_train,x_test=X[train_index],X[test_index]
    y_train,y_test=yy[train_index],yy[test_index]
    
    X_train = X_train.reshape(X_train.shape[0],  num_columns,num_rows, num_channels)
    x_test = x_test.reshape(x_test.shape[0], num_columns,num_rows, num_channels)
    M=cnn_model()
    M.fit(X_train, y_train,epochs=20)
    print('Model evaluation ',M.evaluate(x_test,y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model evaluation  [56.61002503716117, 0.10313735902309418]
Epoch 1/20
Epoch 2/20
Epoch 3/20

# Test

In this section happens the following:  firstly the cnn is traint on all the training data. Secondly the predicitons are made on the test data and saved. 

In [46]:
num_rows = 13
num_columns = 99
num_channels = 1

X = X.reshape(X.shape[0],  num_columns,num_rows, num_channels)
print(X.shape)

(94824, 99, 13, 1)


In [50]:
X_test = T.reshape(T.shape[0],num_columns,num_rows,num_channels)

In [51]:
y_test = M.predict_classes(X_test)

AttributeError: 'list' object has no attribute 'predict_classes'

In [169]:
y_test = le.inverse_transform(y_test)

In [170]:
test['word'] = y_test

In [172]:
test.to_csv("result.csv", index=False)