In [1]:
!pip install torchaudio==0.4.0 torch==1.4.0 comet-ml==3.0.2

Collecting torchaudio==0.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/6e/bc/3ebc127162d27bed33dc914606f10117d106680baae7ce83603ea09985fd/torchaudio-0.4.0-cp36-cp36m-manylinux1_x86_64.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 2.8MB/s 
[?25hCollecting torch==1.4.0
[?25l  Downloading https://files.pythonhosted.org/packages/24/19/4804aea17cd136f1705a5e98a00618cb8f6ccc375ad8bfa437408e09d058/torch-1.4.0-cp36-cp36m-manylinux1_x86_64.whl (753.4MB)
[K     |████████████████████████████████| 753.4MB 19kB/s 
[?25hCollecting comet-ml==3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/99/c6/fac88f43f2aa61a09fee4ffb769c73fe93fe7de75764246e70967d31da09/comet_ml-3.0.2-py3-none-any.whl (170kB)
[K     |████████████████████████████████| 174kB 48.9MB/s 
[?25hCollecting netifaces>=0.10.7
  Downloading https://files.pythonhosted.org/packages/0c/9b/c4c7eb09189548d45939a3d3a6b3d53979c67d124459b27a094c365c347f/netifaces-0.10.9-cp36-cp36m-manylinux1_x86

In [17]:
import os
from comet_ml import Experiment
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np

def avg_wer(wer_scores, combined_ref_len):
  """Average word error rate over a collection of utterances.

  Args:
    wer_scores: iterable of per-utterance word-level edit distances.
    combined_ref_len: total number of reference words across those utterances.

  Returns:
    float: sum of the scores divided by the combined reference length.

  Raises:
    ZeroDivisionError: if combined_ref_len is 0.
  """
  # the original read `wer_Scores`, which is not the parameter name (NameError)
  return float(sum(wer_scores)) / float(combined_ref_len)

def _levenshtein_distance(ref, hyp):
  """Levenshtein (edit) distance between two sequences.

  Works on any pair of indexable sequences whose elements can be compared
  with ==, e.g. two strings (character level) or two lists of words.

  Args:
    ref: reference sequence.
    hyp: hypothesis sequence.

  Returns:
    int: minimum number of substitutions, insertions and deletions needed
    to turn one sequence into the other.
  """
  soln = len(ref)
  resp = len(hyp)

  # trivial cases: identical sequences, or one of them empty
  if ref == hyp:
    return 0
  if soln == 0:
    return resp
  if resp == 0:
    return soln

  # the distance is symmetric, so put the shorter sequence along the columns;
  # this keeps the two rolling rows as short as possible
  if soln < resp:
    ref, hyp = hyp, ref
    soln, resp = resp, soln

  # only the previous and the current row are kept: O(min(m, n)) space
  distance = np.zeros((2, resp + 1), dtype=np.int32)

  # row 0: turning an empty prefix into hyp[:j] costs j insertions
  for j in range(0, resp + 1):
    distance[0][j] = j

  for i in range(1, soln + 1):
    prev_row = (i - 1) % 2
    cur_row = i % 2
    distance[cur_row][0] = i
    # columns run over hyp, so the bound is resp (not soln)
    for j in range(1, resp + 1):
      if ref[i - 1] == hyp[j - 1]:
        # matching elements cost nothing: carry the diagonal cell over
        distance[cur_row][j] = distance[prev_row][j - 1]
      else:
        s_num = distance[prev_row][j - 1] + 1  # substitution
        i_num = distance[cur_row][j - 1] + 1   # insertion
        d_num = distance[prev_row][j] + 1      # deletion
        distance[cur_row][j] = min(s_num, i_num, d_num)

  return int(distance[soln % 2][resp])


# The function was originally defined with two leading underscores while the
# callers in this file use the single-underscore name; keep the old name alive.
__levenshtein_distance = _levenshtein_distance

def word_errors(ref, hyp, delimiter=' '):
  """Word-level edit distance between a reference and a hypothesis sentence.

  Both sentences are lower-cased first, so the comparison is not case
  sensitive, and are then split into words on `delimiter`.

  Returns:
    tuple: (edit distance as float, number of words in the reference).
  """
  ref_tokens = ref.lower().split(delimiter)
  hyp_tokens = hyp.lower().split(delimiter)

  edit_count = _levenshtein_distance(ref_tokens, hyp_tokens)
  return float(edit_count), len(ref_tokens)

def char_errors(ref, hyp):
  """Character-level edit distance between a reference and a hypothesis.

  Both strings are lower-cased (the comparison is not case sensitive) and
  runs of spaces are collapsed to a single space before comparing.

  Args:
    ref: reference string.
    hyp: hypothesis string.

  Returns:
    tuple: (edit distance as float, number of characters in the normalised
    reference).
  """
  join_char = ' '

  # splitting on ' ' and dropping empty pieces removes repeated, leading
  # and trailing spaces
  reference = join_char.join(filter(None, ref.lower().split(' ')))
  hypothesis = join_char.join(filter(None, hyp.lower().split(' ')))

  distance = _levenshtein_distance(reference, hypothesis)

  # the original called float(distance, len(reference)), which is a TypeError;
  # cer() unpacks two values, so return the pair
  return float(distance), len(reference)

def wer(ref, hyp, delimieter = ' '):
  """Word error rate of a hypothesis against a reference.

  WER = (words substituted + deleted + inserted) / words in the reference.

  Args:
    ref: reference sentence.
    hyp: hypothesis sentence.
    delimieter: word separator passed on to word_errors. The misspelling is
      part of the existing signature and is kept so keyword callers still work.

  Returns:
    float: the word error rate.
  """
  # the original passed the undefined name `delimiter` here (NameError)
  distance, ref_len = word_errors(ref, hyp, delimieter)

  return float(distance) / ref_len

def cer(ref, hyp):
  """Character error rate of a hypothesis against a reference.

  CER = (characters substituted + deleted + inserted) / characters in the
  reference, using the counts reported by char_errors.
  """
  edit_count, ref_chars = char_errors(ref, hyp)
  return float(edit_count) / ref_chars

class txtTransform:
  """Converts text to integer label sequences and back.

  The vocabulary is the apostrophe (0), the space (1) and the lower-case
  letters a-z (2-27).
  """

  def __init__(self):
    # The original spelled this `_init_`, so Python never ran it as the
    # constructor and char_map / index_map were never created.
    # mapping characters to integers for converting text to integer sequence, and vice versa
    char_map_str = """
     ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
    self.char_map = {}    # character (or '<SPACE>') -> integer label
    self.index_map = {}   # integer label -> character
    for line in char_map_str.strip().split('\n'):
      ch, index = line.split()
      self.char_map[ch] = int(index)
      self.index_map[int(index)] = ch
    # decode label 1 directly to a real space instead of the '<SPACE>' token
    self.index_map[1] = ' '

  def text_to_int(self, text):
    """Convert text to a list of integer labels.

    Raises:
      KeyError: if `text` contains a character outside the vocabulary
        (upper-case letters included; callers are expected to lower-case).
    """
    int_sequence = []
    for char in text:
      if char == ' ':
        ch = self.char_map['<SPACE>']
      else:
        # the original indexed with the undefined name `c` (NameError)
        ch = self.char_map[char]
      int_sequence.append(ch)

    return int_sequence

  def int_to_text(self, labels):
    """Convert an iterable of integer labels back to text.

    Raises:
      KeyError: if a label is not in the vocabulary.
    """
    text = []
    for i in labels:
      text.append(self.index_map[i])

    return ''.join(text).replace('<SPACE>', ' ')

# Training pipeline: waveform -> mel spectrogram, followed by random
# frequency and time masking used as data augmentation.
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30), #data augmentation
    torchaudio.transforms.TimeMasking(time_mask_param=100)) #data augmentation

# Validation pipeline: the same mel spectrogram features but WITHOUT the
# random masking. Augmenting validation data makes the reported metrics
# noisy and pessimistic, so augmentation is applied to training data only.
valid_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128))

# shared converter between transcripts and integer label sequences
text_transform = txtTransform()

def data_processing(data, data_type = "train"):
  """Turn a batch of dataset samples into model inputs and label tensors.

  Args:
    data: iterable of 6-tuples; only element 0 (the audio waveform) and
      element 2 (the transcript) are used.
    data_type: "train" to use train_audio_transforms, "valid" to use
      valid_audio_transforms.

  Returns:
    tuple (spectrograms, labels, input_lengths, label_lengths):
      spectrograms: zero-padded tensor with the batch dimension first.
        Assuming each transform output is (1, n_mels, time), this is
        (batch, max_time, n_mels) - TODO confirm against the dataset.
      labels: zero-padded tensor (batch, max_label_len) of integer labels
        stored as floats.
      input_lengths: list with spec.shape[0] // 2 for every sample.
      label_lengths: list with the unpadded label length of every sample.

  Raises:
    ValueError: if data_type is neither "train" nor "valid".
  """
  spectrograms = []
  labels = []
  input_lengths = []
  label_lengths = []
  for (mp3, _, utterance, _, _, _) in data:
    if data_type == 'train':
      spec = train_audio_transforms(mp3).squeeze(0).transpose(0,1)
    elif data_type == 'valid':
      spec = valid_audio_transforms(mp3).squeeze(0).transpose(0, 1)
    else:
      # previously `spec` would simply be unbound for any other value
      raise ValueError("data_type should be 'train' or 'valid', got %r" % (data_type,))
    spectrograms.append(spec)
    label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
    labels.append(label)
    # NOTE(review): the halving presumably mirrors a stride-2 time reduction
    # in the model; confirm it still applies to the network actually trained.
    input_lengths.append(spec.shape[0]//2)
    label_lengths.append(len(label))

  # samples differ in length, so pad them to a common length before batching
  spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
  labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

  # the original built the lists above and then fell off the end (returned None)
  return spectrograms, labels, input_lengths, label_lengths

def Decoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
  """Greedy (best-path) decoding of a batch of per-frame class scores.

  Args:
    output: tensor indexed as (batch, time, classes); the arg-max over the
      last dimension is taken for every frame.
    labels: per-sample label tensors (possibly padded).
    label_lengths: unpadded length of every entry in `labels`.
    blank_label: class index treated as the blank symbol and dropped.
    collapse_repeated: if True, a frame whose class equals the previous
      frame's class is skipped.

  Returns:
    tuple (decoded, targets): two lists of strings with one entry per
    sample - the decoded prediction and the reference transcript.
  """
  arg_maxes = torch.argmax(output, dim=2)
  decoded = []
  targets = []
  for i, args in enumerate(arg_maxes):
    # per-sample buffer; the original re-used the name `decoded` here, which
    # wiped the result list, and then appended to an undefined `decode`
    decode = []
    targets.append(text_transform.int_to_text(labels[i][:label_lengths[i]].tolist()))
    frame_classes = args.tolist()
    for j, index in enumerate(frame_classes):
      if index != blank_label:
        if collapse_repeated and j != 0 and index == frame_classes[j-1]:
          continue
        decode.append(index)
    # one decoded string per sample, added after all its frames were read
    decoded.append(text_transform.int_to_text(decode))
  # return only after the whole batch has been processed
  return decoded, targets


In [None]:
class speechRNN(nn.Module):
  """LSTM followed by a linear classifier applied to the last time step.

  Args:
    input_size: number of features per time step.
    hidden_size: size of the LSTM hidden state.
    num_classes: size of the classifier output.
    num_layers: number of stacked LSTM layers. Defaults to 1, which is what
      the original single-layer initial states assumed.
  """

  def __init__(self, input_size, hidden_size, num_classes, num_layers=1):
    super(speechRNN, self).__init__()
    self.hidden_size = hidden_size
    # `num_layers` was referenced but never defined in the original (NameError)
    self.num_layers = num_layers
    self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True)
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    """Run the network on x of shape (batch, seq_len, input_size).

    Returns:
      tensor of shape (batch, num_classes).
    """
    # Initial states must be (num_layers, batch, hidden_size); new_zeros
    # creates them on x's device and with x's dtype.
    h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
    c0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
    # Forward propagate the LSTM
    out, _ = self.rnn(x, (h0, c0))
    # Pass the output of the last time step to the classifier
    # NOTE(review): Decoder in this notebook takes argmax over dim=2, i.e. it
    # expects per-frame (batch, time, classes) scores; this returns a single
    # (batch, classes) prediction per sequence - confirm which is intended.
    out = self.fc(out[:, -1, :])

    return out


In [None]:
class IterMeter(object):
  """Simple counter for the number of iterations seen so far."""

  def __init__(self):
    # nothing counted yet
    self.val = 0

  def step(self):
    """Advance the counter by one iteration."""
    self.val = self.val + 1

  def get(self):
    """Return the current iteration count."""
    return self.val
    
      

In [13]:
# Read the Comet API key from the environment instead of hard-coding it in the
# notebook (set COMET_API_KEY before starting the kernel). A key that was
# previously committed here should be treated as leaked and revoked.
comet_api_key = os.environ.get("COMET_API_KEY", "")
project_name = "aps360-speech-recognition"
experiment_name = "aps360roughcode-colab"

if comet_api_key:
  experiment = Experiment(api_key=comet_api_key, project_name=project_name, parse_args=False)
  experiment.set_name(experiment_name)
  experiment.display()
else:
  # no key available: create a disabled experiment so the rest of the
  # notebook can still use the `experiment` object
  experiment = Experiment(api_key='dummy_key', disabled=True)

COMET INFO: old comet version (3.0.2) detected. current: 3.1.12 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/abox3/aps360-speech-recognition/b8bacc1da44948d68c6375f58056c18b

