In [0]:
import numpy as np
import pickle as pkl
import time
import json
import random
import torch
from torch import nn
import editdistance
import os
# Suppress NumPy's divide-by-zero warning for the whole notebook: the HMM cells
# below take np.log of model parameters, where log(0) = -inf can occur.
# np.seterr returns the previous settings, which the notebook echoes as this
# cell's output.
np.seterr(divide='ignore') # masks log(0) errors

{'divide': 'warn', 'invalid': 'warn', 'over': 'warn', 'under': 'ignore'}

In [0]:
!pip install -q soundfile

In [0]:
# Mount Google Drive at /content/drive; every data, code and checkpoint path
# used later in this notebook lives under that mount point.
# force_remount=True requests a fresh mount even if one is already present.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [0]:
# Put the tutorials folder on the import path so the project-local `hmm`
# package (presumably located under this folder on Drive) can be imported.
hmm_path='/content/drive/My Drive/MLSS2019/tutorials/'
import sys
sys.path.append(hmm_path)
# Project-local HMM implementations used by the two HMM cells below.
from hmm.multiple import FullGaussianHMM
from hmm.single import GaussianHMM

In [0]:
"""
Single Digit HMM
"""
data_single_digit = np.load("/content/drive/My Drive/MLSS2019/tutorials/hmm/data/mfccs_single.npz",allow_pickle=True)

# Indexing an .npz archive re-reads (and un-pickles) the array on every access,
# so pull each array out once instead of once per digit / per loop.
Xtrain_single = data_single_digit["Xtrain"]
Ytrain_single = data_single_digit["Ytrain"]
Xtest_single = data_single_digit["Xtest"]
Ytest_single = data_single_digit["Ytest"]

n_states = 15   # hidden states per digit model
n_dims = 13     # feature dimension passed to GaussianHMM
n_iter = 0      # training passes after initialisation (0 = initialisation only)
model = dict()  # digit -> GaussianHMM
digits = range(10)

# --- Training: one HMM per digit, fit only on that digit's utterances -------
for digit in digits:
    print("Training HMM for digit %d" % digit)
    Xtrain_digit = [x for x, y in zip(Xtrain_single, Ytrain_single) if y == digit]
    model[digit] = GaussianHMM(n_states=n_states, n_dims=n_dims)
    model[digit].init_gaussian_params(Xtrain_digit)
    model[digit].init_hmm_params()
    for i in range(n_iter):
        print("starting iteration {}...".format(i+1))
        model[digit].train(Xtrain_digit)

# --- Testing: score every utterance under every digit model ------------------
print("Testing HMM")

# The log initial/transition parameters do not depend on the utterance, so
# compute them once per digit rather than once per (utterance, digit) pair.
log_params = {digit: (np.log(model[digit].pi), np.log(model[digit].A)) for digit in digits}

# confusion[true_digit, predicted_digit] counts test utterances.
confusion = np.zeros((len(digits), len(digits)))
for x, y in zip(Xtest_single, Ytest_single):
    scores = []
    for digit in digits:
        log_pi, log_A = log_params[digit]
        log_B = model[digit].get_emissions(x)
        # Only the score is needed; the first returned value is bound to a real
        # name because assigning `_` in a notebook clobbers IPython's last-output variable.
        best_path, log_prob = model[digit].viterbi(log_pi, log_A, log_B)
        scores.append(log_prob)

    # Predict the digit whose model gives the highest score
    # (np.argmax returns the first maximum, i.e. the lowest digit on ties).
    top_digit = digits[int(np.argmax(scores))]
    confusion[y, top_digit] += 1.

# Per-digit accuracy = correct / total for that true digit.
# NOTE: a digit with no test utterances yields 0/0 = nan here.
accuracy = np.diag(confusion) / confusion.sum(axis=1)

print("accuracy ({:.4f}): {}".format(accuracy.mean(), accuracy))

# Persist the per-digit models; the multiple-digit cell loads this file.
with open("/content/drive/My Drive/MLSS2019/tutorials/hmm/single_digit_model.pkl", "wb") as f:
    pkl.dump(model, f)

Training HMM for digit 0
Training HMM for digit 1
Training HMM for digit 2
Training HMM for digit 3
Training HMM for digit 4
Training HMM for digit 5
Training HMM for digit 6
Training HMM for digit 7
Training HMM for digit 8
Training HMM for digit 9
Testing HMM
accuracy (0.9833): [1.         0.95833333 0.95833333 1.         0.95833333 1.
 1.         1.         0.95833333 1.        ]


In [0]:
"""
Multiple Digit HMM
"""
# Load the multi-digit features and build the composite model from the
# per-digit models pickled by the single-digit cell.
data_multiple_digit = np.load(
    "/content/drive/My Drive/MLSS2019/tutorials/hmm/data/mfccs_multiple.npz",
    allow_pickle=True,
)
full_model = FullGaussianHMM(
    data_multiple_digit["Xtrain"],
    "/content/drive/My Drive/MLSS2019/tutorials/hmm/single_digit_model.pkl",
)

n_iter = 0

print("Training HMM")
# One full pass over the training utterances per iteration (none when n_iter is 0).
for iteration in range(1, n_iter + 1):
    print("starting iteration {}...".format(iteration))
    full_model.train(data_multiple_digit["Xtrain"], data_multiple_digit["Ytrain"])

print("Testing HMM")
Xtest_multi = data_multiple_digit["Xtest"]
Ytest_multi = data_multiple_digit["Ytest"]
test_wer = full_model.test(Xtest_multi, Ytest_multi)
print("{:.2f}% WER".format(test_wer * 100.))

Training HMM
Testing HMM
56.56% WER


In [0]:
# Make the tutorials folder importable for the project-local `rnn` package.
# The HMM cell above already appended the same folder, so only append when it
# is missing; this keeps sys.path free of duplicates and the cell idempotent.
rnn_path='/content/drive/My Drive/MLSS2019/tutorials/'
if rnn_path not in sys.path:
    sys.path.append(rnn_path)

from hmm.multiple import FullGaussianHMM
from hmm.single import GaussianHMM

from rnn.loader import make_loader, Preprocessor
from rnn.model import Seq2Seq
from rnn.model import LinearND #Hint: this is useful when defining the modified attention mechanism

In [0]:
import torch
from torch import nn

class Attention(nn.Module):
    """
    Attention over encoder states for one decoder time step.

    Two scoring modes are supported:
      * default (attn_dim is None, enc_dim == dec_dim): dot-product scores
        between the decoder state and every encoder state; no parameters.
      * learned (attn_dim given): additive scores
        v^T tanh(W1 * eh + W2 * dhx), which also allows enc_dim != dec_dim.
    """

    def __init__(self, enc_dim, dec_dim, attn_dim=None):
        """
        Initialize Attention.
        ----
        enc_dim: encoder hidden state dimension
        dec_dim: decoder hidden state dimension
        attn_dim: attention feature dimension; None selects the
            parameter-free dot-product mode (requires enc_dim == dec_dim)
        ----
        Raises ValueError if attn_dim is None and enc_dim != dec_dim.
        """
        super(Attention, self).__init__()
        self.enc_dim = enc_dim
        self.dec_dim = dec_dim
        self.attn_dim = attn_dim
        if attn_dim is not None:
            self.use_default = False
            # NOTE(review): LinearND is assumed to apply a linear map over the
            # last dimension of an N-D tensor -- confirm against rnn.model.
            self.v = LinearND(self.attn_dim, 1, bias=False)
            self.W1 = LinearND(self.enc_dim, self.attn_dim, bias=False)
            self.W2 = nn.Linear(self.dec_dim, self.attn_dim, bias=False)
        elif enc_dim == dec_dim:
            self.use_default = True
        else:
            raise ValueError("invalid args (enc_dim={}, dec_dim={}, attn_dim={})".format(enc_dim, dec_dim, attn_dim))

    def forward(self, eh, dhx, ax=None):
        """
        Forward Attention method.
        ----
        eh (FloatTensor): the encoder hidden state with
            shape (batch size, time, hidden dimension).
        dhx (FloatTensor): one time step of the decoder hidden
            state with shape (batch size, hidden dimension) or
            (batch size, 1, hidden dimension).
        ax (FloatTensor): one time step of the attention vector.
            Accepted for interface compatibility; not used.
        ----
        Returns the context vectors (sx), shape (batch size, 1, hidden
        dimension), and the corresponding attention alignment (ax),
        shape (batch size, time).
        """
        # A 2-D decoder state would broadcast against the wrong axes of eh
        # (or fail outright), so give it an explicit singleton time axis.
        if dhx.dim() == 2:
            dhx = dhx.unsqueeze(1)

        if self.use_default:
            # Inner product of the decoder slice with every encoder slice.
            pax = torch.sum(eh * dhx, dim=2)
        else:
            # Additive scoring with the learned projections; W2(dhx) has a
            # singleton time axis and broadcasts over the encoder time steps.
            pax = self.v(torch.tanh(self.W1(eh) + self.W2(dhx))).squeeze(2)

        # Normalise the scores over time, then take the weighted sum of eh.
        ax = nn.functional.softmax(pax, dim=1)
        sx = torch.sum(eh * ax.unsqueeze(2), dim=1, keepdim=True)

        return sx, ax

In [0]:
def compute_wer(results):
    """
    Compute the word-error-rate (WER).
    ----
    results: iterable of (label, pred) pairs; each element is a sequence
        accepted by editdistance.eval. Consumed in a single pass, so a
        generator works as well as a list.
    ----
    Returns the summed edit distance divided by the summed label length.
    When there are no reference tokens at all the ratio is undefined:
    returns 0.0 if there were also no errors, otherwise float("inf").
    """
    dist = 0.
    total = 0
    # Accumulate both sums in one pass so `results` is only iterated once.
    for label, pred in results:
        dist += editdistance.eval(label, pred)
        total += len(label)
    if total == 0:
        # Avoid ZeroDivisionError on empty input / all-empty references.
        return 0. if dist == 0 else float("inf")
    return dist / total

def train(model, optimizer, ldr, max_grad_norm=5):
    """
    Train the model for an epoch (one pass over the training data)
    ----
    model: Seq2Seq model instance
    optimizer: torch.optim optimizer instance
    ldr: data loader instance yielding (inputs, labels) batches
    max_grad_norm: gradients are rescaled so that their total norm does
        not exceed this value before each optimizer step
    ----
    Returns the average loss over an epoch
    """
    model.train()
    # Scheduled sampling is only switched on when the model has a non-zero
    # sampling probability configured.
    model.scheduled_sampling = model.sample_prob != 0

    losses = []

    for inputs, labels in ldr:
        optimizer.zero_grad()
        x, y = model.collate(inputs, labels)
        loss = model.loss(x, y)
        loss.backward()
        # Clip before stepping to keep exploding gradients in check.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=max_grad_norm)
        optimizer.step()
        losses.append(loss.item())

    return np.mean(losses)

def evaluate(model, ldr, preproc):
    """
    Score the model on a held-out split (dev or test).
    ----
    model: Seq2Seq model instance
    ldr: data loader instance yielding (inputs, labels) batches
    preproc: preprocessor instance used to decode label sequences
    ----
    Returns (mean batch loss, WER) for the given dataset
    """
    # Inference mode: evaluation behaviour, and never sample during decoding.
    model.eval()
    model.scheduled_sampling = False

    batch_losses = []
    hypotheses = []
    references = []

    with torch.no_grad():
        for batch_inputs, batch_labels in ldr:
            x, y = model.collate(batch_inputs, batch_labels)
            # Loss first, then the decoded predictions for the same batch.
            batch_losses.append(model.loss(x, y).item())
            hypotheses += model.infer(x, y)
            references += batch_labels

    # Pair every reference with its hypothesis, both decoded by preproc.
    decoded_pairs = []
    for ref, hyp in zip(references, hypotheses):
        decoded_pairs.append((preproc.decode(ref), preproc.decode(hyp)))

    return np.mean(batch_losses), compute_wer(decoded_pairs)

In [16]:
!pip install -q simplejson
import simplejson as json
import random
"""
Use the development set to tune your model.
------
With the default config, can get <10% dev WER within 15 epochs.
"""

# Load the experiment configuration (seed, data, model and optimizer settings).
with open("/content/drive/My Drive/MLSS2019/tutorials/rnn/config.json", "r") as fid:
    config = json.load(fid)

# Seed every RNG in play (Python, NumPy, PyTorch) from the configured seed.
random.seed(config["seed"])
np.random.seed(config["seed"])
torch.manual_seed(config["seed"])

use_cuda = torch.cuda.is_available()
if use_cuda:
    torch.backends.cudnn.deterministic = True

print("Training RNN")
data_cfg = config["data"]
model_cfg = config["model"]
opt_cfg = config["optimizer"]

# The preprocessor is built from the training set and reused for every split.
preproc = Preprocessor(data_cfg["train_set"], start_and_end=data_cfg["start_and_end"])

train_ldr = make_loader(data_cfg["train_set"], preproc, opt_cfg["batch_size"])
dev_ldr = make_loader(data_cfg["dev_set"], preproc, opt_cfg["batch_size"])

# No attn_dim is passed, so Attention runs in its default mode, which
# requires equal encoder and decoder hidden sizes.
attention = Attention(model_cfg["encoder"]["hidden_size"], model_cfg["decoder"]["hidden_size"])
model = Seq2Seq(preproc.input_dim, preproc.vocab_size, attention, model_cfg)
model = model.cuda() if use_cuda else model.cpu()

optimizer = torch.optim.SGD(model.parameters(), lr=opt_cfg["learning_rate"], momentum=opt_cfg["momentum"])

log="epoch {:4} | train_loss={:6.2f}, dev_loss={:6.2f} with {:6.2f}% WER ({:6.2f}s elapsed)"

# torch.save does not create missing directories, so make sure the checkpoint
# folder exists before the first epoch finishes.
if config["save_path"]:
    os.makedirs(config["save_path"], exist_ok=True)

best_so_far = float("inf")  # lowest dev WER seen so far
for ep in range(opt_cfg["max_epochs"]):
    start = time.time()

    train_loss = train(model, optimizer, train_ldr)
    dev_loss, dev_wer = evaluate(model, dev_ldr, preproc)

    print(log.format(ep + 1, train_loss, dev_loss, dev_wer * 100., time.time() - start))

    # Checkpoint every epoch (file named by the 0-based epoch index). The whole
    # model object is pickled; the test cell loads it back with torch.load.
    torch.save(model, os.path.join(config["save_path"], str(ep)))

    # Keep a separate copy of the model with the best dev WER.
    if dev_wer < best_so_far:
        best_so_far = dev_wer
        torch.save(model, os.path.join(config["save_path"], "best"))

Training RNN




epoch    1 | train_loss=  5.38, dev_loss=  4.56 with  80.71% WER (  9.78s elapsed)


  "type " + obj.__name__ + ". It won't be checked "


epoch    2 | train_loss=  4.05, dev_loss=  3.81 with  64.82% WER (  9.83s elapsed)
epoch    3 | train_loss=  3.53, dev_loss=  3.39 with  59.82% WER (  9.94s elapsed)
epoch    4 | train_loss=  3.11, dev_loss=  2.89 with  49.29% WER (  9.93s elapsed)
epoch    5 | train_loss=  2.63, dev_loss=  3.77 with  60.00% WER ( 10.16s elapsed)
epoch    6 | train_loss=  2.48, dev_loss=  2.51 with  41.07% WER ( 10.09s elapsed)
epoch    7 | train_loss=  2.08, dev_loss=  2.22 with  32.32% WER (  9.94s elapsed)
epoch    8 | train_loss=  1.80, dev_loss=  1.07 with  16.79% WER ( 10.01s elapsed)
epoch    9 | train_loss=  0.97, dev_loss=  0.83 with  13.57% WER ( 10.03s elapsed)
epoch   10 | train_loss=  0.77, dev_loss=  0.78 with  11.25% WER ( 10.02s elapsed)
epoch   11 | train_loss=  0.72, dev_loss=  1.05 with  16.07% WER ( 10.26s elapsed)
epoch   12 | train_loss=  0.68, dev_loss=  0.77 with  11.25% WER ( 10.06s elapsed)
epoch   13 | train_loss=  0.70, dev_loss=  0.77 with  13.57% WER ( 10.02s elapsed)
epoc

In [0]:
# TODO: tune on the dev set
# may want to set up function or chunk of code here to perform tuning
# call train on training set, call evaluate on dev, save/plot/compare results

In [17]:
print("Testing RNN")
# Load the checkpoint with the best dev WER written by the training cell.
# NOTE: torch.load unpickles a full model object, which can execute arbitrary
# code -- only load checkpoints produced by this notebook.
test_model = torch.load(os.path.join(config["save_path"], "best"))
test_ldr = make_loader(data_cfg["test_set"], preproc, opt_cfg["batch_size"])

# Bind the loss to a real name instead of `_`: assigning `_` at notebook top
# level overrides IPython's "last output" variable for the rest of the session.
test_loss, test_wer = evaluate(test_model, test_ldr, preproc)

print("{:.2f}% WER (test)".format(test_wer * 100.))

Testing RNN




6.19% WER (test)
