### Import libraries

In [1]:
import  glob
import  os
# os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# os.environ["CUDA_VISIBLE_DEVICES"]="0"

import  librosa
import  librosa.display
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import specgram
%matplotlib inline
plt.style.use('ggplot')

from pydub import AudioSegment
from pydub.silence import split_on_silence


In [2]:
import tensorflow as tf
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Input, Convolution2D, MaxPooling2D, Dense, Dropout, Flatten
from keras.layers.embeddings import Embedding
from keras.utils.np_utils import to_categorical
from keras.layers import Input, LSTM, GRU, Dense
from keras.layers import Bidirectional


Using TensorFlow backend.


### Path to Directories

In [7]:
ROOT_DIRECTORY = 'dataset/'


# declarative_CLASS_train = 'NLP_Declarative/NLP_train_16k/*.wav'
# question_CLASS_train = 'NLP_Question/NLP_train_16k/*.wav'

# declarative_CLASS_test = 'NLP_Declarative/NLP_test_16k/*.wav'
# question_CLASS_test = 'NLP_Question/NLP_test_16k/*.wav'


# declarative_CLASS_val = 'NLP_Declarative/NLP_val_16k/*.wav'
# question_CLASS_val = 'NLP_Question/NLP_val_16k/*.wav'

declarative_CLASS_train='declarative/*.wav'
question_CLASS_train='question/*.wav'
train_percentile = 0.8
test_percentile = 0.2

### Load file paths

In [8]:
def filenames_list(directory_path):
    filename_list=[]
    for filename in glob.glob(directory_path):
        (filename_list.append(filename))
    return filename_list

In [9]:
dec_files_list=filenames_list(ROOT_DIRECTORY+declarative_CLASS_train)
ques_files_list=filenames_list(ROOT_DIRECTORY+question_CLASS_train)


In [5]:
dec_files_list_train=filenames_list(ROOT_DIRECTORY+declarative_CLASS_train)
ques_files_list_train=filenames_list(ROOT_DIRECTORY+question_CLASS_train)



In [6]:
dec_files_list_test=filenames_list(ROOT_DIRECTORY+declarative_CLASS_test)
ques_files_list_test=filenames_list(ROOT_DIRECTORY+question_CLASS_test)

dec_files_list_val=filenames_list(ROOT_DIRECTORY+declarative_CLASS_val)
ques_files_list_val=filenames_list(ROOT_DIRECTORY+question_CLASS_val)

### Extracting MFCC

In [10]:
def extract_mfcc(file_name):
    X, sample_rate = librosa.load(file_name)
#     print ("Features :",len(X), X.shape, "sampled at ", sample_rate, "hz")
#     stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
#     mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
#     chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
#     mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
#     contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
#     tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    mfccs=np.array(mfccs, ndmin=2)
    return mfccs

In [11]:
Max_RNN=25


### load audio files and convert into feature vector and divide into train and test data

In [14]:
def parse_audio_files(percentiles, filename_list, label, Max_RNN):
    max_len=-999999
    Xs_train=[]
    Xs_test=[]
    probability = [float(v) for v in percentiles.values()]

    for e,fn in enumerate(filename_list):
        chunks_fnames=[]
        print(e)
        sound_file = AudioSegment.from_wav(fn)
        audio_chunks = split_on_silence(sound_file, min_silence_len=100, silence_thresh=sound_file.dBFS-5)
        for i, chunk in enumerate(audio_chunks):
            out_file = "chunks/chunk{0}.wav".format(i)
            chunks_fnames.append(out_file)
            chunk.export(out_file, format="wav")
        x=[]
        for chunk_names in chunks_fnames:
            
            mfccs = extract_mfcc(chunk_names)
            
            portion =  np.random.choice(2, 1, p=probability)
            
            if len(x)==0:
                x = mfccs.T
            else:
                x= np.concatenate( (x,mfccs.T), axis = 1)
        

        x = pad_sequences(x, maxlen=25, padding='post', dtype = 'float')
        if portion == 0:
            Xs_train.append(x.T)
        elif portion == 1:
            Xs_test.append(x.T)

        print(mfccs.shape)
#             Xs_train.append(mfccs.T)
#         Xs_train.append(x.T)
    if label == 'declarative':
        Ys_train = to_categorical(np.ones(len(Xs_train)),2)
        Ys_test = to_categorical(np.ones(len(Xs_test)),2)
        
    elif label == 'question':
        Ys_train = to_categorical(np.zeros(len(Xs_train)),2)
        Ys_test = to_categorical(np.zeros(len(Xs_test)),2)

#     if label == 'declarative':
#         Ys_train = to_categorical(np.ones(len(Xs_train)),2)
#     elif label == 'question':
#         Ys_train = to_categorical(np.zeros(len(Xs_train)),2)
    
    Xs_train = np.array(Xs_train)
    Xs_test = np.array(Xs_test)

    return Xs_train, Xs_test, Ys_train, Ys_test, max_len 
        


In [10]:
def parse_audio_files_test(filename_list, label, Max_RNN):
    max_len=-999999
    Xs_train=[]
    Xs_test=[]
    for e,fn in enumerate(filename_list):
        chunks_fnames=[]
        print(e)
        sound_file = AudioSegment.from_wav(fn)
        audio_chunks = split_on_silence(sound_file, min_silence_len=100, silence_thresh=sound_file.dBFS-5)
        for i, chunk in enumerate(audio_chunks):
            out_file = "chunks/chunk{0}.wav".format(i)
            chunks_fnames.append(out_file)
            chunk.export(out_file, format="wav")
        x=[]
        for chunk_names in chunks_fnames:
            
            mfccs = extract_mfcc(chunk_names)
            if len(x)==0:
                x = mfccs.T
            else:
                x= np.concatenate( (x,mfccs.T), axis = 1)
        x = pad_sequences(x, maxlen=25, padding='post', dtype = 'float')
#         with open(file_name, 'wb') as fs:
# #         for val in vector_name:
# #             print(val)
# #             sn, sr=librosa.load(fname_list[i], sr=None)
# #             Sn=librosa.stft(sn, n_fft=1024, hop_length=512)
# # #             mag_Sn=np.abs(Sn)
# #     #         trn_arr=np.concatenate((trn_arr, mag_Sn), axis=1)
#             np.savetxt(fs, x.T, fmt='%.5f')
#             fs.write(b'\n')
#         fs.close()            

        Xs_train.append(x.T)
    if label == 'declarative':
        Ys_train = to_categorical(np.ones(len(Xs_train)),2)
    elif label == 'question':
        Ys_train = to_categorical(np.zeros(len(Xs_train)),2)
    
#     Xs_train = np.array(Xs_train)
    return Xs_train, Ys_train, max_len 
        


### Load declarative sentences

In [18]:
# dec_Xs_train, dec_Ys_train, dec_max_len=parse_audio_files(dec_files_list_train, "declarative", Max_RNN)

In [17]:
# dec_Xs_test, dec_Ys_test, dec_max_len_test=parse_audio_files(dec_files_list_test, "declarative", Max_RNN)
# dec_Xs_val, dec_Ys_val, dec_max_len_val=parse_audio_files(dec_files_list_val, "declarative", Max_RNN)

In [15]:
percentiles = {'train':train_percentile, 'test':test_percentile}
dec_Xs_train, dec_Xs_test, dec_Ys_train, dec_Ys_test, dec_max_len=parse_audio_files(percentiles, dec_files_list, "declarative", Max_RNN)

print(len(dec_Xs_train), len(dec_Ys_train), len(dec_Xs_test), len(dec_Ys_test), "Max Len: ", dec_max_len)

0
(1, 40)
1
(1, 40)
2
(1, 40)
3
(1, 40)
4
(1, 40)
5
(1, 40)
6
(1, 40)
7
(1, 40)
8
(1, 40)
9
(1, 40)
10
(1, 40)
11
(1, 40)
12
(1, 40)
13
(1, 40)
14
(1, 40)
15
(1, 40)
16
(1, 40)
17
(1, 40)
18
(1, 40)
19
(1, 40)
20
(1, 40)
21
(1, 40)
22
(1, 40)
23
(1, 40)
24
(1, 40)
25
(1, 40)
26
(1, 40)
27
(1, 40)
28
(1, 40)
29
(1, 40)
30
(1, 40)
31
(1, 40)
32
(1, 40)
33
(1, 40)
34
(1, 40)
35
(1, 40)
36
(1, 40)
37
(1, 40)
38
(1, 40)
39
(1, 40)
40
(1, 40)
41
(1, 40)
42
(1, 40)
43
(1, 40)
44
(1, 40)
45
(1, 40)
46
(1, 40)
47
(1, 40)
48
(1, 40)
49
(1, 40)
50
(1, 40)
51
(1, 40)
52
(1, 40)
53
(1, 40)
54
(1, 40)
55
(1, 40)
56
(1, 40)
57
(1, 40)
58
(1, 40)
59
(1, 40)
60
(1, 40)
61
(1, 40)
62
(1, 40)
63
(1, 40)
64
(1, 40)
65
(1, 40)
66
(1, 40)
67
(1, 40)
68
(1, 40)
69
(1, 40)
70
(1, 40)
71
(1, 40)
72
(1, 40)
73
(1, 40)
74
(1, 40)
75
(1, 40)
76
(1, 40)
77
(1, 40)
78
(1, 40)
79
(1, 40)
80
(1, 40)
81
(1, 40)
82
(1, 40)
83
(1, 40)
84
(1, 40)
85
(1, 40)
86
(1, 40)
87
(1, 40)
88
(1, 40)
89
(1, 40)
90
(1, 40)
91
(1, 40

(1, 40)
694
(1, 40)
695
(1, 40)
696
(1, 40)
697
(1, 40)
698
(1, 40)
699
(1, 40)
700
(1, 40)
701
(1, 40)
702
(1, 40)
703
(1, 40)
704
(1, 40)
705
(1, 40)
706
(1, 40)
707
(1, 40)
708
(1, 40)
709
(1, 40)
710
(1, 40)
711
(1, 40)
712
(1, 40)
713
(1, 40)
714
(1, 40)
715
(1, 40)
716
(1, 40)
717
(1, 40)
718
(1, 40)
719
(1, 40)
720
(1, 40)
721
(1, 40)
722
(1, 40)
723
(1, 40)
724
(1, 40)
725
(1, 40)
726
(1, 40)
727
(1, 40)
728
(1, 40)
729
(1, 40)
730
(1, 40)
731
(1, 40)
732
(1, 40)
733
(1, 40)
734
(1, 40)
735
(1, 40)
736
(1, 40)
737
(1, 40)
738
(1, 40)
739
(1, 40)
740
(1, 40)
741
(1, 40)
742
(1, 40)
743
(1, 40)
744
(1, 40)
745
(1, 40)
746
(1, 40)
747
(1, 40)
748
(1, 40)
749
(1, 40)
750
(1, 40)
751
(1, 40)
752
(1, 40)
753
(1, 40)
754
(1, 40)
755
(1, 40)
756
(1, 40)
757
(1, 40)
758
(1, 40)
759
(1, 40)
760
(1, 40)
761
(1, 40)
762
(1, 40)
763
(1, 40)
764
(1, 40)
765
(1, 40)
766
(1, 40)
767
(1, 40)
768
(1, 40)
769
(1, 40)
770
(1, 40)
771
(1, 40)
772
(1, 40)
773
(1, 40)
774
(1, 40)
775
(1, 40)
776
(1, 

In [19]:
# for dec_xs in dec_Xs_train:
#     print(dec_xs.shape)
# # dec_Xs[0].shape

### Load Questions

In [20]:
# ques_Xs_train, ques_Ys_train, ques_max_len_train=parse_audio_files(ques_files_list_train, "question", Max_RNN)

In [21]:
# ques_Xs_test, ques_Ys_test, ques_max_len_test=parse_audio_files_test(ques_files_list_test, "question", Max_RNN)


In [22]:
# ques_Xs_test=np.array(ques_Xs_test, ndmin=3)
# len(ques_Xs_test)

In [24]:
# ques_Xs_test[0].shape

In [25]:
# ques_Xs_val, ques_Ys_val, ques_max_len_val=parse_audio_files(ques_files_list_val, "question", Max_RNN)

In [26]:
# ques_Xs_val[0].shape

In [27]:
ques_Xs_train, ques_Xs_test, ques_Ys_train, ques_Ys_test, ques_max_len=parse_audio_files(percentiles, ques_files_list, "question", Max_RNN)

print(len(ques_Xs_train), len(ques_Ys_train), len(ques_Xs_test), len(ques_Ys_test), "Max Len: ", ques_max_len)

0
(1, 40)
1
(1, 40)
2
(1, 40)
3
(1, 40)
4
(1, 40)
5
(1, 40)
6
(1, 40)
7
(1, 40)
8
(1, 40)
9
(1, 40)
10
(1, 40)
11
(1, 40)
12
(1, 40)
13
(1, 40)
14
(1, 40)
15
(1, 40)
16
(1, 40)
17
(1, 40)
18
(1, 40)
19
(1, 40)
20
(1, 40)
21
(1, 40)
22
(1, 40)
23
(1, 40)
24
(1, 40)
25
(1, 40)
26
(1, 40)
27
(1, 40)
28
(1, 40)
29
(1, 40)
30
(1, 40)
31
(1, 40)
32
(1, 40)
33
(1, 40)
34
(1, 40)
35
(1, 40)
36
(1, 40)
37
(1, 40)
38
(1, 40)
39
(1, 40)
40
(1, 40)
41
(1, 40)
42
(1, 40)
43
(1, 40)
44
(1, 40)
45
(1, 40)
46
(1, 40)
47
(1, 40)
48
(1, 40)
49
(1, 40)
50
(1, 40)
51
(1, 40)
52
(1, 40)
53
(1, 40)
54
(1, 40)
55
(1, 40)
56
(1, 40)
57
(1, 40)
58
(1, 40)
59
(1, 40)
60
(1, 40)
61
(1, 40)
62
(1, 40)
63
(1, 40)
64
(1, 40)
65
(1, 40)
66
(1, 40)
67
(1, 40)
68
(1, 40)
69
(1, 40)
70
(1, 40)
71
(1, 40)
72
(1, 40)
73
(1, 40)
74
(1, 40)
75
(1, 40)
76
(1, 40)
77
(1, 40)
78
(1, 40)
79
(1, 40)
80
(1, 40)
81
(1, 40)
82
(1, 40)
83
(1, 40)
84
(1, 40)
85
(1, 40)
86
(1, 40)
87
(1, 40)
88
(1, 40)
89
(1, 40)
90
(1, 40)
91
(1, 40

In [28]:
# for dec_xs in ques_Xs_train:
#     print(dec_xs.shape)


In [29]:
def write_file(file_name, vector_name):
    ### Writing training data S

    with open(file_name, 'wb') as fs:
        for val in vector_name:
#             print(val)
#             sn, sr=librosa.load(fname_list[i], sr=None)
#             Sn=librosa.stft(sn, n_fft=1024, hop_length=512)
# #             mag_Sn=np.abs(Sn)
#     #         trn_arr=np.concatenate((trn_arr, mag_Sn), axis=1)
            np.savetxt(fs, val, fmt='%.5f')
            fs.write(b'\n')
    fs.close()            



In [30]:
# write_file("dec_Xs_val_chunk_mfcc.txt", dec_Xs_val)
# write_file("ques_Xs_val_chunk_mfcc.txt", ques_Xs_val)
# # write_file("train_n.txt", fname_trn)
# write_file("train_x.txt", fname_trx)

In [31]:
# write_file("dec_Xs_test_chunk_mfcc.txt", dec_Xs_test)
# write_file("ques_Xs_test_chunk_mfcc.txt", ques_Xs_test)

In [32]:
# write_file("dec_Xs_train_chunk_mfcc.txt", dec_Xs_train)


In [33]:
# write_file("ques_Xs_train_chunk_mfcc.txt", ques_Xs_train)


In [34]:
# write_file("ques_Xs_test_chunk_mfcc.txt", ques_Xs_test)

In [35]:
### Function for Reading file

def read_file(file_name):
    with open(file_name) as f:
        lines=f.readlines()
        print(len(lines))
        sentence_full=[]
        count = 0
        sentence=[]
        for line in lines:

            if count < 25:
                if count ==0:
                    sentence=np.array(np.fromstring(line, dtype=float, sep=' '), ndmin=2)
                    count+=1
                else:
                    myarray = np.array(np.fromstring(line, dtype=float, sep=' '), ndmin=2)
                    sentence=np.concatenate((sentence, myarray), axis=0)
                    count+=1
            else:
                sentence_full.append(sentence) 
                count=0
                sentence=[]
        return sentence_full


In [36]:
# dec_Xs_val_file = read_file("dec_Xs_val_chunk_mfcc.txt")
# ques_Xs_val_file = read_file("ques_Xs_val_chunk_mfcc.txt")



In [37]:
# dec_Xs_test_file = read_file("dec_Xs_test_chunk_mfcc.txt")
# ques_Xs_test_file = read_file("ques_Xs_test_chunk_mfcc.txt")


In [38]:
# ques_Xs_train_file.shape

In [39]:
# dec_Xs_train_file = read_file("dec_Xs_train_chunk_mfcc.txt")


In [40]:
# ques_Xs_train_file = read_file("ques_Xs_train_chunk_mfcc.txt")
# 

In [41]:
# dec_Ys_train_file = to_categorical(np.ones(len(dec_Xs_train_file)),2)
# dec_Ys_test_file = to_categorical(np.ones(len(dec_Xs_test_file)),2)
# dec_Ys_val_file = to_categorical(np.ones(len(dec_Xs_val_file)),2)

In [42]:
# ques_Ys_train_file = to_categorical(np.zeros(len(ques_Xs_train_file)),2)
# ques_Ys_test_file = to_categorical(np.zeros(len(ques_Xs_test_file)),2)
# ques_Ys_val_file = to_categorical(np.zeros(len(ques_Xs_val_file)),2)

### Training and Testing data

In [43]:
bx = np.concatenate( (dec_Xs_train,ques_Xs_train), axis = 0)
by = np.concatenate( (dec_Ys_train,ques_Ys_train), axis = 0)

# bx = np.concatenate( (dec_Xs_train_file,ques_Xs_train_file), axis = 0)
# by = np.concatenate( (dec_Ys_train_file,ques_Ys_train_file), axis = 0)


In [44]:
bx.shape

(905, 25, 40)

In [45]:
# bx_val = np.concatenate( (dec_Xs_val_file,ques_Xs_val_file), axis = 0)
# by_val = np.concatenate( (dec_Ys_val_file,ques_Ys_val_file), axis = 0)


In [46]:
n = bx.shape[0]
suffle_n = np.random.permutation(n)

bx = bx[suffle_n]
by = by[suffle_n]

In [47]:
# n = bx_val.shape[0]
# suffle_n = np.random.permutation(n)

# bx_val = bx_val[suffle_n]
# by_val = by_val[suffle_n]

### Model

In [48]:
# Max_RNN = 1500

# create the model

model = Sequential()
model.add(Bidirectional(GRU(Max_RNN, return_sequences=False), input_shape=(Max_RNN,40)))
model.add(Dropout(0.2))

model.add(Dense(2, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

model.fit(bx, by,validation_split = 0.05, shuffle=True, nb_epoch=20, batch_size=64)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional_1 (Bidirection (None, 50)                9900      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 102       
Total params: 10,002
Trainable params: 10,002
Non-trainable params: 0
_________________________________________________________________
None




Train on 859 samples, validate on 46 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f1eed6726d8>

### Evaluate the model

In [49]:
bx_test = np.concatenate( (dec_Xs_test,ques_Xs_test), axis = 0)
by_test = np.concatenate( (dec_Ys_test,ques_Ys_test), axis = 0)




In [None]:
dec_Ys_test_file[0].shape
len(dec_Ys_test_file)

In [50]:
scores = model.evaluate(bx_test, by_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 97.25%
