In [151]:
%matplotlib inline

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import  transforms,datasets
from torch.utils import data
import librosa
import torch.optim as optim

In [150]:
categories = !ls Data/speech_commands/train | sort -u 
categories=categories[1:]
category_dict={cat:i for i,cat in enumerate(categories)}
print(category_dict)

{'bed': 0, 'bird': 1, 'cat': 2, 'dog': 3, 'down': 4, 'eight': 5, 'five': 6, 'four': 7, 'go': 8, 'happy': 9, 'house': 10, 'left': 11, 'marvin': 12, 'nine': 13, 'no': 14, 'off': 15, 'on': 16, 'one': 17, 'right': 18, 'seven': 19, 'sheila': 20, 'six': 21, 'stop': 22, 'three': 23, 'tree': 24, 'two': 25, 'up': 26, 'wow': 27, 'yes': 28, 'zero': 29}


In [36]:
# Collect file paths and labels for each split from the pre-built CSV indexes.
# `Series.get_values()` was deprecated in pandas 0.25 and removed in 1.0;
# `.values` yields the same NumPy array and works on every pandas version.
partition = {}

df = pd.read_csv('Data/speech_commands/csvs/train.csv')
partition['train'] = df['file_name'].values
partition['train_labels'] = df['label'].values

df = pd.read_csv('Data/speech_commands/csvs/valid.csv')
partition['validation'] = df['file_name'].values
partition['validation_labels'] = df['label'].values

In [145]:
class AudioDataset(data.Dataset):
    """Map-style dataset that turns audio files into stacked MFCC feature maps.

    Every item is loaded at ``sr`` Hz, normalised with ``librosa.util.normalize``,
    forced to exactly ``sr`` samples, and converted into one float tensor
    made of three stacked blocks (MFCCs plus two successive differences).
    The notebook's recorded batch shape suggests ``(1, 33, 32)`` per item with
    the default ``sr`` -- this depends on librosa's default framing.

    Args:
        filenames: indexable collection of audio file paths.
        labels: indexable collection of labels, aligned with ``filenames``.
        sr: target sample rate; also the number of samples every clip is
            padded/truncated to. Defaults to 16000.
    """

    def __init__(self,filenames,labels,sr=16000):
        self.filenames=filenames
        self.labels=labels
        self.sr=sr

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.filenames)

    def __getitem__(self,index):
        """Load item ``index`` and return ``(features, label)``.

        Returns:
            tuple: a float32 tensor with a leading singleton axis, and the
            label stored at the same index (returned unchanged).
        """
        file = self.filenames[index]
        # Keyword arguments throughout: librosa >= 0.10 made these parameters
        # keyword-only, and the keyword form also works on older releases.
        wav,_ = librosa.load(file,sr=self.sr)
        # pad_center raises when the clip is longer than the target size, so
        # cut over-long clips first (a no-op for clips that already fit).
        wav = wav[:self.sr]
        wav = librosa.util.normalize(wav)
        wav = librosa.util.pad_center(wav,size=self.sr)
        mfcc_feat = librosa.feature.mfcc(y=wav,sr=self.sr,n_mfcc=13)

        # NOTE(review): these differences run along axis 0. The recorded
        # 33 x 32 output (3 blocks of 11 rows) suggests axis 0 is the
        # coefficient axis, not time, so these are not conventional temporal
        # deltas. Left unchanged because the model's input size depends on
        # this exact shape -- confirm intent before changing.
        delta_feat = mfcc_feat[:-1]-mfcc_feat[1:]
        deltadelta_feat = delta_feat[:-1]-delta_feat[1:]
        # Trim leading rows so all three blocks have the same number of rows.
        mfcc_feat = mfcc_feat[2:]
        delta_feat = delta_feat[1:]

        full_input = np.concatenate((mfcc_feat,delta_feat,deltadelta_feat), axis=0)

        # Add a leading singleton (channel-like) axis and cast to float32.
        mfcc = torch.from_numpy(full_input).float().unsqueeze(0)

        y = self.labels[index]
        return mfcc,y

In [146]:
# One dataset per split, each pairing the split's file paths with its labels.
train_dataset = AudioDataset(
    filenames=partition['train'],
    labels=partition['train_labels'],
)
valid_dataset = AudioDataset(
    filenames=partition['validation'],
    labels=partition['validation_labels'],
)

In [156]:
# Training batches are reshuffled every epoch.
train_loader = data.DataLoader(train_dataset,batch_size=64,shuffle=True)
# Evaluation does not depend on sample order, so the validation loader is not
# shuffled: this keeps the batch order deterministic across runs.
valid_loader = data.DataLoader(valid_dataset,batch_size=64,shuffle=False)

In [148]:
# Pull a single batch from the training loader as a quick sanity check.
aa=iter(train_loader)
x,y=next(aa)

In [149]:
# Display the shape of the sampled feature batch.
# NOTE(review): the recorded output shows a batch of 128 while train_loader is
# built with batch_size=64 -- the output looks stale (cells executed out of
# order); re-run from a fresh kernel to confirm.
x.shape

torch.Size([128, 1, 33, 32])

In [157]:
# Network dimensions: flattened 33 x 32 feature map in, 30 outputs.
n_input = 33 * 32
n_hidden = [512, 256, 64]
n_output = 30

# Every hidden stage is Linear -> ReLU -> Dropout(0.2); build them in order.
layers = []
fan_in = n_input
for width in n_hidden:
    layers += [nn.Linear(fan_in, width), nn.ReLU(), nn.Dropout(p=0.2)]
    fan_in = width

# The head emits log-probabilities, which is the input NLLLoss expects.
layers += [nn.Linear(fan_in, n_output), nn.LogSoftmax(dim=1)]

Model = nn.Sequential(*layers)
criterion = nn.NLLLoss()
optimizer = optim.Adam(Model.parameters())

In [158]:
# Print the layer stack to check the layer sizes.
print(Model)

Sequential(
  (0): Linear(in_features=1056, out_features=512, bias=True)
  (1): ReLU()
  (2): Dropout(p=0.2)
  (3): Linear(in_features=512, out_features=256, bias=True)
  (4): ReLU()
  (5): Dropout(p=0.2)
  (6): Linear(in_features=256, out_features=64, bias=True)
  (7): ReLU()
  (8): Dropout(p=0.2)
  (9): Linear(in_features=64, out_features=30, bias=True)
  (10): LogSoftmax()
)


In [159]:
def train(Model,trainloader,testloader,criterion,optimizer,epochs):
    """Train ``Model`` for ``epochs`` epochs, evaluating after every epoch.

    Each batch is flattened to ``(batch, -1)`` before the forward pass. After
    every epoch the module-level ``validation`` function is called on
    ``testloader`` and a one-line progress summary is printed.

    Args:
        Model: the ``nn.Module`` to optimise (updated in place).
        trainloader: sized iterable of ``(inputs, labels)`` training batches.
        testloader: sized iterable of ``(inputs, labels)`` evaluation batches.
        criterion: loss callable taking ``(model_output, labels)``.
        optimizer: optimizer already bound to ``Model``'s parameters.
        epochs: number of passes over ``trainloader``.

    Returns:
        tuple: ``(train_losses, test_losses, accuracy)`` -- two lists holding
        one per-batch-averaged loss (a plain ``float``) per epoch, and the
        accuracy reported by the last ``validation`` call (``None`` when
        ``epochs`` is 0).
    """
    test_losses,train_losses=[],[]
    accuracy=None  # only stays None when no epoch is run

    for e in range(epochs):
        running_loss=0.0

        Model.train()
        for images,labels in trainloader:
            images_t=images.view(images.shape[0],-1)
            optimizer.zero_grad()

            # Call the module itself (not .forward) so registered hooks run.
            logits=Model(images_t)
            loss_t=criterion(logits,labels)
            loss_t.backward()
            optimizer.step()

            # Accumulate a Python float: summing the loss tensors keeps
            # autograd history alive and yields tensors that require grad.
            running_loss+=loss_t.item()

        test_loss,accuracy=validation(Model,testloader,criterion)

        train_loss_avg=running_loss/len(trainloader)
        # float() accepts both a 0-dim tensor and a plain number.
        test_loss_avg=float(test_loss)/len(testloader)

        print("Epoch: {}/{}.. ".format(e+1, epochs),
                  "Training Loss: {:.3f}.. ".format(train_loss_avg),
                  "Test Loss: {:.3f}.. ".format(test_loss_avg),
                  "Test Accuracy: {:.3f}".format(accuracy))

        test_losses.append(test_loss_avg)
        train_losses.append(train_loss_avg)

    return train_losses,test_losses,accuracy

def validation(Model,testloader,criterion):
    """Evaluate ``Model`` on ``testloader`` without tracking gradients.

    Switches the model to eval mode (and leaves it there), flattens each batch
    to ``(batch, -1)``, and accumulates the loss and the number of correct
    predictions.

    Args:
        Model: the ``nn.Module`` to evaluate.
        testloader: iterable of ``(inputs, labels)`` batches.
        criterion: loss callable taking ``(model_output, labels)``.

    Returns:
        tuple: ``(test_loss, overall_acc)`` -- the SUM of the per-batch losses
        (the caller divides by the number of batches) and the fraction of
        correctly classified samples, both as 0-dim tensors.

    Raises:
        ValueError: if ``testloader`` yields no samples.
    """
    test_loss=0
    correct=0
    images_num=0

    Model.eval()
    with torch.no_grad():
        for images,labels in testloader:
            images_num+=images.shape[0]
            images_t=images.view(images.shape[0],-1)
            # Call the module itself (not .forward) so registered hooks run.
            logits=Model(images_t)
            test_loss=test_loss+criterion(logits,labels)

            # Predicted class = index of the largest output per sample.
            pred_labels=logits.argmax(dim=1)
            correct=correct+(pred_labels.view(*labels.shape)==labels).sum()

    if images_num==0:
        # Previously failed with an opaque AttributeError on int.float().
        raise ValueError("validation() received no samples from testloader")

    overall_acc=correct.float()/images_num
    return test_loss,overall_acc

In [160]:
# First training run: 5 epochs, evaluating on the validation loader after each.
train_loss,test_loss,accuracy=train(Model,train_loader,valid_loader,criterion,optimizer,5)

Epoch: 1/5..  Training Loss: 3.128..  Test Loss: 2.375..  Test Accuracy: 0.270
Epoch: 2/5..  Training Loss: 2.325..  Test Loss: 1.796..  Test Accuracy: 0.445
Epoch: 3/5..  Training Loss: 1.961..  Test Loss: 1.547..  Test Accuracy: 0.537
Epoch: 4/5..  Training Loss: 1.750..  Test Loss: 1.370..  Test Accuracy: 0.575
Epoch: 5/5..  Training Loss: 1.619..  Test Loss: 1.284..  Test Accuracy: 0.622


In [161]:
# Train for 5 more epochs with the same Model and optimizer objects, so this
# continues from the current weights; the returned histories overwrite the
# previous train_loss/test_loss/accuracy values.
train_loss,test_loss,accuracy=train(Model,train_loader,valid_loader,criterion,optimizer,5)

Epoch: 1/5..  Training Loss: 1.534..  Test Loss: 1.186..  Test Accuracy: 0.652
Epoch: 2/5..  Training Loss: 1.465..  Test Loss: 1.166..  Test Accuracy: 0.653
Epoch: 3/5..  Training Loss: 1.404..  Test Loss: 1.155..  Test Accuracy: 0.656
Epoch: 4/5..  Training Loss: 1.360..  Test Loss: 1.121..  Test Accuracy: 0.677
Epoch: 5/5..  Training Loss: 1.326..  Test Loss: 1.086..  Test Accuracy: 0.691
