In [None]:
!pip install torchaudio==0.4.0 torch==1.4.0 comet-ml==3.0.2

In [3]:
import os
from comet_ml import Experiment
import torch
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
import torch.nn.functional as F
import torchaudio
import numpy as np
from openpyxl import load_workbook

def avg_wer(wer_scores, combined_ref_len):
  """Return the corpus-level word error rate.

  Args:
    wer_scores: iterable of per-utterance word edit distances.
    combined_ref_len: total number of reference words over all utterances.

  Returns:
    float: sum of the edit distances divided by the combined reference length.

  Raises:
    ZeroDivisionError: if combined_ref_len is 0.
  """
  # The original body referenced `wer_Scores` (wrong case), a NameError.
  return float(sum(wer_scores)) / float(combined_ref_len)

def __levenshtein_distance(ref, hyp):
  # measures difference between two sequences
  soln = len(ref)
  resp = len(hyp)

  # if sequences are the same/null
  if ref == hyp:
    return 0
  if soln == 0:
    return resp
  if resp ==0:
    return soln

  # distance calculation
  if soln < resp:
    ref, hyp = hyp, ref
    soln, resp = resp, soln

  #starting with 0 space
  distance = np.zeros((2, resp+1), dtype=np.int32)

  #distance matrix initialization
  for j in range(0, resp+1):
    distance[0][j] = j

  # calculate levenshtein distance
  for i in range(1, soln + 1):
    prev_row = (i - 1) % 2
    cur_row = i % 2
    distance[cur_row][0] = i
    for j in range(1, soln + 1):
      if ref[i-1] == hyp[j-1]:
        distance[cur_row][j] = distance[prev_row]
      else:
        s_num = distance[prev_row][j-1] + 1 #words substituted
        i_num = distance[cur_row][j-1] + 1 #words inserted
        d_num = distance[prev_row][j] + 1 #words deleted
        distance[cur_row][j] = min(s_num, i_num, d_num)

  return distance[soln % 2][resp]

def word_errors(ref, hyp, delimiter=' '):
  """Word-level edit distance between a reference and a hypothesis.

  Both strings are lower-cased (the comparison is case-insensitive) and
  split on `delimiter` before the distance is computed.

  Returns:
    tuple: (edit distance as float, number of words in the reference).
  """
  ref_words = ref.lower().split(delimiter)
  hyp_words = hyp.lower().split(delimiter)

  edit_distance = _levenshtein_distance(ref_words, hyp_words)

  return float(edit_distance), len(ref_words)

def char_errors(ref, hyp):
  """Character-level edit distance between a reference and a hypothesis.

  Both strings are lower-cased (the comparison is case-insensitive) and runs
  of spaces are collapsed to a single space before the distance is computed.

  Args:
    ref: reference transcript.
    hyp: hypothesis transcript.

  Returns:
    tuple: (edit distance as float, number of characters in the normalized
    reference).
  """
  reference = ref.lower()
  hypothesis = hyp.lower()

  join_char = ' '

  # drop empty tokens so leading/trailing/repeated spaces do not count as errors
  reference = join_char.join(filter(None, reference.split(' ')))
  hypothesis = join_char.join(filter(None, hypothesis.split(' ')))

  distance = _levenshtein_distance(reference, hypothesis)

  # The original called float() with two arguments (TypeError); the intended
  # result is the (distance, reference length) pair that cer() unpacks.
  return float(distance), len(reference)

def wer(ref, hyp, delimiter=' '):
  """Compute the word error rate (WER) of a hypothesis against a reference.

  WER is the number of words substituted, deleted or inserted divided by the
  number of words in the reference.

  Args:
    ref: reference transcript.
    hyp: hypothesis transcript.
    delimiter: string the transcripts are split on to obtain words.

  Returns:
    float: word edit distance divided by the reference word count.
  """
  # The parameter was previously misspelled `delimieter` while the body used
  # `delimiter`, so every call raised NameError.
  distance, ref_len = word_errors(ref, hyp, delimiter)

  return float(distance) / ref_len

def cer(ref, hyp):
  """Character error rate of `hyp` against `ref`.

  This is the number of characters substituted, deleted or inserted divided
  by the number of characters in the reference, both as reported by
  char_errors().
  """
  edit_distance, ref_len = char_errors(ref, hyp)
  return float(edit_distance) / ref_len

class txtTransform:
  """Maps transcript characters to integer labels and back.

  The vocabulary is the apostrophe, the space (written as <SPACE> in the
  table) and the lowercase letters a-z, numbered 0-27.
  """

  def __init__(self):
    # mapping characters to integers for converting text to integer sequence, and vice versa
    # (the constructor was previously spelled `_init_`, so it never ran and
    # instances had no char_map / index_map)
    char_map_str = """
        ' 0
        <SPACE> 1
        a 2
        b 3
        c 4
        d 5
        e 6
        f 7
        g 8
        h 9
        i 10
        j 11
        k 12
        l 13
        m 14
        n 15
        o 16
        p 17
        q 18
        r 19
        s 20
        t 21
        u 22
        v 23
        w 24
        x 25
        y 26
        z 27
        """
    self.char_map = {}
    self.index_map = {}
    for line in char_map_str.strip().split('\n'):
      ch, index = line.split()
      self.char_map[ch] = int(index)
      self.index_map[int(index)] = ch
    # decode label 1 straight to a real space instead of the <SPACE> token
    self.index_map[1] = ' '

  def text_to_int(self, text):
    """Convert text to a list of integer labels.

    Raises:
      KeyError: if `text` contains a character outside the vocabulary
        (for example an uppercase letter or a digit).
    """
    int_sequence = []
    for char in text:
      if char == ' ':
        ch = self.char_map['<SPACE>']
      else:
        # the original looked up an undefined name `c` here
        ch = self.char_map[char]
      int_sequence.append(ch)

    return int_sequence

  def int_to_text(self, labels):
    """Convert a sequence of integer labels back to text."""
    text = []
    for i in labels:
      text.append(self.index_map[i])

    return ''.join(text).replace('<SPACE>', ' ')

# create spectrogram from audio signal
# Training pipeline: mel spectrogram followed by random frequency/time masking
# (data augmentation).
train_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=30),
    torchaudio.transforms.TimeMasking(time_mask_param=100))

# Validation and test pipelines: spectrogram only. Random masking is a
# training-time augmentation; applying it during evaluation corrupts the
# inputs and makes the reported loss/CER/WER noisy and pessimistic.
val_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128))

test_audio_transforms = nn.Sequential(
    torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128))


# shared character <-> integer label mapper used by data_processing and Decoder
text_transform = txtTransform()

def data_processing(data, data_type = "train"):
  """Turn a list of dataset items into one padded batch.

  Args:
    data: iterable of 6-tuples whose first element is the audio tensor and
      third element is the transcript string (the other fields are ignored).
      # presumably torchaudio COMMONVOICE-style items - TODO confirm
    data_type: 'train' (spectrogram + masking augmentation), 'valid'/'val'
      or 'test'; selects which module-level transform pipeline is applied.

  Returns:
    tuple: (spectrograms, labels, input_lengths, label_lengths) where
      spectrograms and labels are zero-padded tensors and the two length
      lists hold the per-item (unpadded) lengths.

  Raises:
    ValueError: if data_type is not one of the supported values.
  """
  # The original compared against an undefined `data_Type` and called an
  # undefined `valid_audio_transforms`; unknown types silently fell through.
  if data_type == 'train':
    audio_transforms = train_audio_transforms
  elif data_type in ('valid', 'val'):
    audio_transforms = val_audio_transforms
  elif data_type == 'test':
    audio_transforms = test_audio_transforms
  else:
    raise ValueError("data_type should be 'train', 'valid' or 'test', got %r" % (data_type,))

  spectrograms = []
  labels = []
  input_lengths = []
  label_lengths = []
  for (mp3, _, utterance, _, _, _) in data:
    # assumes the transform output is (1, n_mels, time) so this becomes
    # (time, n_mels) - TODO confirm for multi-channel audio
    spec = audio_transforms(mp3).squeeze(0).transpose(0, 1)
    spectrograms.append(spec)
    label = torch.Tensor(text_transform.text_to_int(utterance.lower()))
    labels.append(label)
    # halved length: presumably matches a stride-2 front end in the model
    # ("stride": 2 in main's hparams) - verify against the model definition
    input_lengths.append(spec.shape[0]//2)
    label_lengths.append(len(label))

  # Pad to the longest item in the batch. The original built the lists but
  # never padded or returned them, so the function returned None.
  # NOTE(review): the layout below, (batch, 1, n_mels, time) under the shape
  # assumption above, targets a convolutional front end; drop the
  # unsqueeze/transpose if the model expects (batch, time, n_mels).
  spectrograms = nn.utils.rnn.pad_sequence(spectrograms, batch_first=True).unsqueeze(1).transpose(2, 3)
  labels = nn.utils.rnn.pad_sequence(labels, batch_first=True)

  return spectrograms, labels, input_lengths, label_lengths

def Decoder(output, labels, label_lengths, blank_label=28, collapse_repeated=True):
  """Greedy CTC decoding of a batch of model outputs.

  Args:
    output: tensor indexed as (batch, time, classes); the argmax over the
      last axis is taken as the predicted label per time step.
    labels: padded target labels, one row per batch item.
    label_lengths: true (unpadded) length of each target row.
    blank_label: index of the CTC blank symbol, skipped when decoding.
    collapse_repeated: if True, consecutive identical predictions are merged.

  Returns:
    tuple: (decodes, targets), two lists of strings with one entry per batch
    item: the decoded prediction and the reference transcript.
  """
  arg_maxes = torch.argmax(output, dim=2)
  decodes = []
  targets = []
  for i, args in enumerate(arg_maxes):
    # per-item label buffer (the original reset `decodes` here instead,
    # discarding earlier results and leaving `decode` undefined)
    decode = []
    targets.append(text_transform.int_to_text(labels[i][:label_lengths[i]].tolist()))
    for j, index in enumerate(args):
      if index != blank_label:
        if collapse_repeated and j != 0 and index == args[j-1]:
          continue
        decode.append(index.item())
    # one decoded string per item, appended after all time steps are read
    decodes.append(text_transform.int_to_text(decode))
  # return only after the whole batch is processed (the original returned
  # from inside the loop, i.e. after the first item)
  return decodes, targets


In [14]:
# Sanity check: show the notebook's current working directory and its contents.
cwd = os.getcwd()
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in %r: %s" % (cwd, files))

Files in '/content': ['.config', 'sample_data']


In [None]:
# Colab only: mount Google Drive at /content/drive so the project files under
# 'My Drive' can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

In [26]:
# Open the 'Validated Samples' workbook from the mounted Drive; its
# 'Validated_Samples' sheet is read when building the data split.
samples = load_workbook (r'/content/drive/My Drive/APS360 - AI Fundamentals/Project/Validated Samples.xlsx')

In [None]:
# Find the number of pre-processed data in train, val, and test files
# (each count is the number of directory entries, whatever their type)
fpath_train = r'/content/drive/My Drive/APS360 - AI Fundamentals/Project/train/'
fpath_val = r'/content/drive/My Drive/APS360 - AI Fundamentals/Project/val/'
fpath_test =r'/content/drive/My Drive/APS360 - AI Fundamentals/Project/test/'

num_train = len(os.listdir(fpath_train))
num_val = len(os.listdir(fpath_val))
num_test = len(os.listdir(fpath_test))

In [None]:
# Obtain original row index used in spectrogram code
# Fixed seed so the shuffle below is repeatable.
np.random.seed(50)
lastrow = samples['Validated_Samples'].max_row
# Rows 2..lastrow inclusive - presumably row 1 is a header; TODO confirm.
range_array = np.array(range(2,lastrow + 1))
np.random.shuffle(range_array)

# First num_train shuffled rows -> train, next num_val -> val, and every
# remaining row -> test.
# NOTE(review): the test slice is not capped at num_test - confirm intended.
range_train = range_array[:num_train]
range_val = range_array[num_train:num_train + num_val]
range_test = range_array[num_train + num_val:]

In [None]:
class speechRNN(nn.Module):
  """LSTM followed by a linear classifier applied to the last time step.

  Args:
    input_size: number of features per time step.
    hidden_size: LSTM hidden state size.
    num_classes: size of the classifier output.
    num_layers: number of stacked LSTM layers (default 1). The original
      constructor referenced `num_layers` without defining it (NameError).
  """

  def __init__(self, input_size, hidden_size, num_classes, num_layers=1):
    super(speechRNN, self).__init__()
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.rnn = nn.LSTM(input_size, hidden_size, num_layers, batch_first = True)
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    """Run the network on x of shape (batch, time, input_size).

    Returns:
      Tensor of shape (batch, num_classes).
    """
    # Initial states need one slice per LSTM layer and must live on the same
    # device (and dtype) as the input, otherwise the forward pass fails on GPU.
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype)
    c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device, dtype=x.dtype)
    # Forward propagate the LSTM
    out, _ = self.rnn(x, (h0, c0))
    # Pass the output of the last time step to the classifier
    # NOTE(review): this yields one prediction per sequence; a CTC training
    # loop that applies log_softmax over dim=2 needs per-time-step outputs.
    out = self.fc(out[:, -1, :])

    return out


In [None]:
class IterMeter(object):
  """Counts iterations; `val` holds the number of step() calls so far."""

  def __init__(self):
    self.val = 0

  def step(self):
    """Advance the counter by one."""
    self.val = self.val + 1

  def get(self):
    """Return the current count."""
    return self.val

 def train(model, device, train_loader, criterion, optimizer, epoch, iter_meter, experiment):
   model.train()
   data_len = len(train_loader.dataset)
   with experiment.train():
     for batch_idx, data in enumerate(train_loader):
       spectrograms, labels, input_lengths, label_lengths = data
       spectrograms, labels = spectrograms.to(device), labels.to(device)

       optimizer.zero_grad()

       output = model(spectrograms)
       output = F.log_softmax(output, dim=2)
       output = output.transpose(0,1)

       loss = criterion(output, labels, input_lengths, label_lengths)
       loss.backward()

       experiment.log_metric('loss', loss.item(), step+iter_meter.get())
       experiment.log_metric('learning_rate', step=iter_meter.get())

       optimizer.step()
       iter_meter.step()

  def val(model, device, test_loader, criterion, epoch, iter_meter, experiment):
    model.eval()
    val_loss = 0
    val_cer, val_wer = [], []
    with experiment.val()
      with torch.no_grad():
        for i, data in enumerate(val_loader):
          spectrograms, labels, input_lengths, label_lengths = data
          spectrograms, labels = spectrograms.to(device), labels.to(device)

          output = model(spectrograms)
          output = F.log_softmax(output, dim=2)
          output = output.transpose(0,1)

          loss = criterion(output, labels, input_lengths, label_lengths)
          val_loss += loss.item() / len(val_loader)

          decoded_preds, decoded_targets = Decoder(output.transpose(0,1), labels, label_lengths)
          for j in range(len(Decoded_preds)):
            val_cer.append(cer(Decoded_targets[j], decoded_preds[j]))
            val_wer.append(wer(decoded_targets[j], decoded_preds[j]))
  avg_cer = sum(val_cer)/len(val_cer)
  avg_wer = sum(val_wer)/len(val_wer)
  experiment.log_metric('val_loss', val_loss, step=iter_meter.get())
  experiment.log_metric('cer', avg_cer, step=iter_meter.get())
  experiment.log_metric('wer', avg_wer, step=iter_meter.get())

  print('Val set: average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(val_loss, avg_cer, avg_wer))

  def main(learning_Rate = xxxxx, batch_size = xxx, epochs = xxx, t)
    
    hparams = {
        "n_cnn_layers": 3,
        "n_rnn_layers": 5,
        "rnn_dim": 512,
        "n_class": 29,
        "n_feats": 128,
        "stride":2,
        "dropout": 0.1,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "epochs": epochs
    }

    experiment.log_parameters(hparams)

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(7)
    device = torch.device("cuda" if use_cuda else "cpu")
    
    if not os.path.isdir("./data"):
        os.makedirs("./data")

    model = SpeechRecognitionModel(
        hparams['n_cnn_layers'], hparams['n_rnn_layers'], hparams['rnn_dim'],
        hparams['n_class'], hparams['n_feats'], hparams['stride'], hparams['dropout']
        ).to(device)

    optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
    criterion = nn.CTCLoss(blank=28).to(device)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'], 
                                            steps_per_epoch=int(len(train_loader)),
                                            epochs=hparams['epochs'],
                                            anneal_strategy='linear')

    iter_meter = IterMeter()
    for epoch in range(1, epochs + 1):
        train(model, device, train_loader, criterion, optimizer, scheduler, epoch, iter_meter, experiment)
        test(model, device, test_loader, criterion, epoch, iter_meter, experiment)    


In [13]:
# Read the Comet API key from the environment instead of hard-coding it in the
# notebook (set COMET_API_KEY before running this cell). A key that has been
# committed to a shared notebook should be treated as leaked and revoked.
comet_api_key = os.environ.get("COMET_API_KEY", "")
project_name = "aps360-speech-recognition"
experiment_name = "aps360roughcode-colab"

if comet_api_key:
  experiment = Experiment(api_key=comet_api_key, project_name=project_name, parse_args=False)
  experiment.set_name(experiment_name)
  experiment.display()
else:
  # no key available: create a disabled experiment so logging calls are no-ops
  experiment = Experiment(api_key='dummy_key', disabled=True)

COMET INFO: old comet version (3.0.2) detected. current: 3.1.12 please update your comet lib with command: `pip install --no-cache-dir --upgrade comet_ml`
COMET INFO: Experiment is live on comet.ml https://www.comet.ml/abox3/aps360-speech-recognition/b8bacc1da44948d68c6375f58056c18b

