In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import  transforms,datasets
from torch.utils import data
import librosa
import torch.optim as optim

In [2]:
# Capture the sorted, de-duplicated listing of the training directory as a
# list of strings (IPython shell capture; needs a POSIX shell with ls/sort).
categories = !ls Data/speech_commands/train | sort -u 
# Drop the first sorted entry.
# NOTE(review): presumably a non-class folder that sorts first; confirm
# against the actual directory listing.
categories=categories[1:]
# Map each remaining category name to its position in the sorted listing.
category_dict={cat:i for i,cat in enumerate(categories)}
print(category_dict)

{'bed': 0, 'bird': 1, 'cat': 2, 'dog': 3, 'down': 4, 'eight': 5, 'five': 6, 'four': 7, 'go': 8, 'happy': 9, 'house': 10, 'left': 11, 'marvin': 12, 'nine': 13, 'no': 14, 'off': 15, 'on': 16, 'one': 17, 'right': 18, 'seven': 19, 'sheila': 20, 'six': 21, 'stop': 22, 'three': 23, 'tree': 24, 'two': 25, 'up': 26, 'wow': 27, 'yes': 28, 'zero': 29}


In [3]:
# Collect file names and labels for each split into one dictionary.
# `.values` is used instead of `Series.get_values()`, which was deprecated in
# pandas 0.25 and removed in 1.0; `.values` yields the same ndarray on every
# pandas version.
partition={}
df=pd.read_csv('Data/speech_commands/csvs/train.csv')
partition['train']=df['file_name'].values
partition['train_labels']=df['label'].values

df=pd.read_csv('Data/speech_commands/csvs/valid.csv')
partition['validation']=df['file_name'].values
partition['validation_labels']=df['label'].values

In [4]:
class AudioDataset(data.Dataset):
    """Dataset that turns audio files into stacked MFCC-based feature maps.

    Args:
        filenames: indexable collection of audio file paths.
        labels: indexable collection of labels, aligned with ``filenames``.
        sr: sampling rate used when loading; every clip is also fitted to
            exactly ``sr`` samples. Defaults to 16000.
        n_mfcc: number of MFCC coefficients to compute. Defaults to 13.
    """

    def __init__(self,filenames,labels,sr=16000,n_mfcc=13):
        self.filenames=filenames
        self.labels=labels
        self.sr=sr
        self.n_mfcc=n_mfcc

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.filenames)

    def __getitem__(self,index):
        """Load one file and return ``(features, label)``.

        ``features`` is a float tensor with a leading singleton dimension.
        With the default settings its first feature axis has 3 * (n_mfcc - 2)
        = 33 rows: the MFCCs, their first differences and their second
        differences, each trimmed to the same length and stacked on axis 0.
        """
        file = self.filenames[index]
        # Keyword arguments are required by recent librosa releases and are
        # also accepted by older ones.
        wav,_ = librosa.load(file,sr=self.sr)
        wav = librosa.util.normalize(wav)
        # pad_center raises when the signal is longer than the target size,
        # so over-long clips are cut to `sr` samples first; shorter clips are
        # unaffected by the slice and get zero-padded on both sides.
        wav = librosa.util.pad_center(wav[:self.sr],size=self.sr)
        mfcc_feat = librosa.feature.mfcc(y=wav,sr=self.sr,n_mfcc=self.n_mfcc)

        # Differences between neighbouring rows along axis 0.
        # NOTE(review): axis 0 of the MFCC matrix is the coefficient axis, so
        # these are differences across coefficients, not across time frames.
        # Time deltas (e.g. librosa.feature.delta) would change the feature
        # shape that downstream code expects, so the behaviour is kept as is.
        delta_feat = mfcc_feat[:-1]-mfcc_feat[1:]
        deltadelta_feat = delta_feat[:-1]-delta_feat[1:]

        # Trim leading rows so that all three blocks have the same height.
        mfcc_feat = mfcc_feat[2:]
        delta_feat = delta_feat[1:]

        full_input = np.concatenate((mfcc_feat,delta_feat,deltadelta_feat), axis=0)

        # Add a channel dimension in front: (1, rows, columns).
        mfcc = torch.from_numpy(full_input).float().unsqueeze(0)

        y = self.labels[index]
        return mfcc,y

In [5]:
# Wrap the train / validation file lists in datasets.
train_dataset=AudioDataset(partition['train'],partition['train_labels'])
valid_dataset=AudioDataset(partition['validation'],partition['validation_labels'])

# Training batches are reshuffled every epoch.
train_loader = data.DataLoader(train_dataset,batch_size=64,shuffle=True)
# Validation does not benefit from shuffling; a fixed order keeps the
# batch composition, and therefore the batch-averaged loss, reproducible.
valid_loader = data.DataLoader(valid_dataset,batch_size=64,shuffle=False)

In [9]:
# Pull a single batch from the training loader and show the shape of the
# inputs and of the labels, one per line.
x, y = next(iter(train_loader))
print(x.shape, y.shape, sep="\n")

torch.Size([64, 1, 33, 32])
torch.Size([64])


In [26]:
class Network(nn.Module):
    """Small convolutional classifier that outputs log-probabilities for 30 classes.

    One 3x3 convolution (1 -> 3 channels) followed by 2x2 max-pooling feeds a
    three-layer fully connected head. The first linear layer expects 768
    inputs, which corresponds to 3 channels of 16x16 values, i.e. a
    (1, 33, 32) input after pooling.

    ``conv2`` is registered but not applied in ``forward``.
    """

    def __init__(self):
        super().__init__()
        # Convolutional part.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=3, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=3, out_channels=5, kernel_size=3, stride=1, padding=1)

        # Parameter-free layers shared by every stage.
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d((2, 2))
        self.dropout = nn.Dropout(p=0.3)

        # Fully connected head.
        self.fc1 = nn.Linear(in_features=768, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=256)
        self.fc3 = nn.Linear(in_features=256, out_features=30)
        self.out = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Map a batch of feature maps to per-class log-probabilities."""
        # conv -> pool -> dropout -> ReLU
        features = self.relu(self.dropout(self.pool(self.conv1(x))))

        # Flatten everything except the batch dimension: (batch, features).
        hidden = features.view(features.shape[0], -1)

        # Two hidden layers, each followed by dropout and then ReLU.
        for dense in (self.fc1, self.fc2):
            hidden = self.relu(self.dropout(dense(hidden)))

        return self.out(self.fc3(hidden))

In [27]:
# Create the network instance that the training cells below operate on.
Model=Network()

In [28]:
# Print the registered sub-modules and their settings.
print(Model)

Network(
  (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(3, 5, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (relu): ReLU()
  (pool): MaxPool2d(kernel_size=(2, 2), stride=(2, 2), padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout(p=0.3)
  (fc1): Linear(in_features=768, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=256, bias=True)
  (fc3): Linear(in_features=256, out_features=30, bias=True)
  (out): LogSoftmax()
)


In [13]:
def train(Model,trainloader,testloader,criterion,optimizer,epochs):
    """Train ``Model`` for ``epochs`` epochs, validating after each one.

    Args:
        Model: the module to optimise; called as ``Model(images)``.
        trainloader: iterable of ``(images, labels)`` training batches.
        testloader: iterable of ``(images, labels)`` batches passed to
            ``validation`` after every epoch.
        criterion: loss callable taking ``(logits, labels)``.
        optimizer: optimiser already bound to the model's parameters.
        epochs: number of passes over ``trainloader``.

    Returns:
        ``(train_losses, test_losses, accuracy)``: per-epoch mean training
        loss, per-epoch mean validation loss, and the validation accuracy of
        the last epoch (``None`` when ``epochs`` is 0).
    """
    test_losses,train_losses=[],[]
    accuracy=None  # stays None if no epoch is run
    for e in range(epochs):
        running_loss=0

        Model.train()
        for images,labels in trainloader:
            optimizer.zero_grad()

            # Call the module itself rather than .forward() so that any
            # registered hooks run.
            logits=Model(images)
            loss_t=criterion(logits,labels)
            loss_t.backward()
            optimizer.step()

            # Detach before accumulating: summing the raw loss tensors would
            # chain the autograd history of every batch into the total.
            running_loss+=loss_t.detach()

        # Runs once per epoch after the batch loop (the original used a
        # for/else, which is equivalent here because the loop never breaks).
        test_loss,accuracy=validation(Model,testloader,criterion)

        print("Epoch: {}/{}.. ".format(e+1, epochs),
                  "Training Loss: {:.3f}.. ".format(running_loss/len(trainloader)),
                  "Test Loss: {:.3f}.. ".format(test_loss/len(testloader)),
                  "Test Accuracy: {:.3f}".format(accuracy))

        test_losses.append(test_loss/len(testloader))
        train_losses.append(running_loss/len(trainloader))

    return train_losses,test_losses,accuracy

def validation(Model,testloader,criterion):
    """Evaluate ``Model`` on ``testloader`` without tracking gradients.

    The model is switched to evaluation mode and left in that mode.

    Args:
        Model: the module to evaluate; called as ``Model(images)``.
        testloader: iterable of ``(images, labels)`` batches.
        criterion: loss callable taking ``(logits, labels)``.

    Returns:
        ``(test_loss, overall_acc)``: the sum of the per-batch criterion
        values (not divided by the number of batches) and the fraction of
        correctly classified samples. Both are zero-valued tensors when the
        loader yields no samples.
    """
    Model.eval()
    test_loss=0
    correct=0
    images_num=0
    with torch.no_grad():
        for images,labels in testloader:
            images_num+=images.shape[0]
            # Call the module itself rather than .forward() so that any
            # registered hooks run.
            logits=Model(images)
            test_loss+=criterion(logits,labels)

            # Predicted class = index of the largest score in each row.
            pred_labels=logits.argmax(dim=1)
            correct+=torch.sum(labels==pred_labels.view(*labels.shape))

    if images_num==0:
        # Nothing was evaluated: the counters are still plain ints, so
        # return well-defined tensors instead of failing on `.float()`.
        return torch.tensor(0.0),torch.tensor(0.0)

    overall_acc=correct.float()/images_num
    return test_loss,overall_acc

In [29]:
# Adam with its default hyper-parameters over all parameters of the model.
optimizer = optim.Adam(Model.parameters())
# Negative log-likelihood loss for the training and validation loops.
criterion = nn.NLLLoss()

In [30]:
# Run five epochs of training, validating on the validation loader after each.
train_loss, test_loss, accuracy = train(
    Model, train_loader, valid_loader, criterion, optimizer, epochs=5
)

Epoch: 1/5..  Training Loss: 3.033..  Test Loss: 2.175..  Test Accuracy: 0.363
Epoch: 2/5..  Training Loss: 2.269..  Test Loss: 1.651..  Test Accuracy: 0.532
Epoch: 3/5..  Training Loss: 1.943..  Test Loss: 1.375..  Test Accuracy: 0.607
Epoch: 4/5..  Training Loss: 1.750..  Test Loss: 1.189..  Test Accuracy: 0.664
Epoch: 5/5..  Training Loss: 1.613..  Test Loss: 1.074..  Test Accuracy: 0.690
