<a href="https://colab.research.google.com/github/neelpawarcmu/deep-learning-course-projects/blob/main/Utterance_to_Phoneme_Mapping_using_Seq2Seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary

**Task: Unaligned Phoneme Recognition / Utterance to Phoneme Mapping.**

We are given unaligned labels, which means the correspondence between feature frames and labels is not given explicitly and our model has to figure it out by itself. The data has a list of phonemes for each utterance, but not which frames correspond to which phonemes. The main task is to predict the phonemes contained in the utterances of the test set. We are not given aligned phonemes in the training data, and we do not produce an alignment for the test data.

\
To solve this problem we build a sequence-to-sequence SpeechNet model from scratch, following the methodology and architectural details of sequence-to-sequence modeling described in [this research paper](https://arxiv.org/pdf/2105.03070.pdf). We train with CTC loss and decode the predictions with a CTC beam-search decoder to handle the unaligned features and labels.


\
Additional details can be found [here](https://www.kaggle.com/competitions/11785-fall2021-hw3p2/data)

# Setup

## imports and configs 


#### Google Drive, Kaggle Preprocessing

In [None]:
# Google drive setup
# Mounts Google Drive under /content/gdrive; later cells read and write
# checkpoints and pickled datasets below /content/gdrive/MyDrive/.
from google.colab import drive

# force_remount=True re-mounts even if a previous mount already exists.
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [None]:
import os
import json

# dont load dataset if already loaded
def setupNeeded():
  """Return True when no local 'data' directory exists, i.e. setup still has to run."""
  return not os.path.exists('data')

if setupNeeded():
  print('\n-----------Installing kaggle----------\n')
  # make kaggle dir and initialize the api token for downloading to the dir 
  # if this step gives a 401 error: go to kaggle and generate new api token from account settings
  api_token = {"username":"neelpawarcmu","key":"6b3a4829599c17cc4fac80794b01329d"} 
  !mkdir .kaggle
  !mkdir ~/.kaggle
  with open('/content/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)
  !cp /content/.kaggle/kaggle.json ~/.kaggle/kaggle.json
  !chmod 600 /root/.kaggle/kaggle.json
  
  # kaggle install
  !pip install --upgrade --force-reinstall --no-deps kaggle
  !kaggle --version
  
  # download data from the kaggle competition
  print('\n-----------Downloading data----------\n')
  !kaggle competitions download -c 11785-fall2021-hw3p2 # <- change this to download from a different kaggle competition
  !mkdir data
  print('\n-----------Unzipping data----------\n')
  !unzip -qo './11785-fall2021-hw3p2.zip' -d data 
  !ls data/

  # clone ctcdecoder (pip install ctcdecode causes installation errors)
  print('\n-----------Downloading ctcdecode----------\n')
  !git clone --recursive https://github.com/parlance/ctcdecode.git
  !pip install wget
  %cd ctcdecode

  !pip install .
  %cd ..

  # install Levenshtein
  print('\n-----------Downloading Levenshtein----------\n')
  !pip install python-Levenshtein
  print('\n-----------All prerequisites downloaded----------\n')

#### Libraries & Setup

In [None]:
import os
import sys
import time

import Levenshtein

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pdb
import gc
from tqdm.notebook import trange, tqdm

import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

# cudnn autotuner to speed up cnns
torch.backends.cudnn.benchmark = True

In [None]:
# Pick the compute device once; the training/validation code reads `device`
# and the DataLoaders read NUM_WORKERS.
cuda = torch.cuda.is_available()
if cuda:
  device, NUM_WORKERS = torch.device("cuda"), 8
else:
  device, NUM_WORKERS = torch.device("cpu"), 0

print("Cuda = ", str(cuda), " with num_workers = ", str(NUM_WORKERS),  " system version = ", sys.version)

Cuda =  True  with num_workers =  8  system version =  3.7.13 (default, Apr 24 2022, 01:04:09) 
[GCC 7.5.0]


## Data loaders

### Load raw data from unzipped files

In [None]:
# Folder the competition archive was unzipped into.
_data_root = 'data/HW3P2_Data'

# Training and dev splits (features + labels). allow_pickle=True is passed
# because the .npy files are loaded as object arrays (one entry per utterance).
train_data = np.load(f'{_data_root}/train.npy', allow_pickle=True)
train_labels = np.load(f'{_data_root}/train_labels.npy', allow_pickle=True)

dev_data = np.load(f'{_data_root}/dev.npy', allow_pickle=True)
dev_labels = np.load(f'{_data_root}/dev_labels.npy', allow_pickle=True)

# Test split: features only, no labels.
test_data = np.load(f'{_data_root}/test.npy', allow_pickle=True)

In [None]:
# Outer shape of every split: one entry per utterance.
print(f'Train data: {train_data.shape}')
print(f'Train labels {train_labels.shape}')

print(f'Dev data: {dev_data.shape}')
print(f'Dev labels {dev_labels.shape}')

print(f'Test data: {test_data.shape}')

Train data: (14542,)
Train labels (14542,)
Dev data: (2200,)
Dev labels (2200,)
Test data: (2561,)


In [None]:
# Per-utterance shape of the first ten training items.
for i in range(10):
  print(train_data[i].shape)

(1504, 40)
(1560, 40)
(566, 40)
(1453, 40)
(795, 40)
(488, 40)
(1261, 40)
(1544, 40)
(1314, 40)
(1479, 40)


### Create Toy Data

In [None]:
# Small prefixes of every split for quick experiments:
# first 1000 training utterances, first 200 dev and test utterances.
train_data_toy = train_data[:1000]
train_labels_toy = train_labels[:1000]

dev_data_toy = dev_data[:200]
dev_labels_toy = dev_labels[:200]

test_data_toy = test_data[:200]

In [None]:
# Shapes of the toy subsets; each label names the array that is actually printed.
print(f'Train data toy: {train_data_toy.shape}')
print(f'Train labels toy: {train_labels_toy.shape}')

print(f'Dev data toy: {dev_data_toy.shape}')
print(f'Dev labels toy: {dev_labels_toy.shape}')

print(f'Test data toy: {test_data_toy.shape}')

Train data toy: (1000,)
Train labels toy: (1000,)
Dev data toy: (200,)
Dev data toy: (200,)
Test labels toy: (200,)


In [None]:
# save training and dev data
# TODO: Edit path to where you want to save data
# train_data = np.load('/content/gdrive/MyDrive/.../train_data_toy.npy', allow_pickle=True)
# train_labels = np.load('/content/gdrive/MyDrive/.../train_labels_toy.npy', allow_pickle=True)

# dev_data = np.load('/content/gdrive/MyDrive/.../dev_data_toy.npy', allow_pickle=True)
# dev_labels = np.load('/content/gdrive/MyDrive/.../dev_labels_toy.npy', allow_pickle=True)

### Create Custom Dataset Classes

In [None]:
# Define dataset class
class MyDataSet(Dataset):
  '''
  Train/validation dataset served to the torch DataLoader.

  Holds two parallel indexable collections: `x` with one feature sequence per
  item and `y` with the matching label sequence. Items are converted to
  tensors lazily in `__getitem__`.
  '''
  # keep references to the raw features and labels
  def __init__(self, x, y):
    self.X = x
    self.Y = y

  # get number of items/rows in dataset
  def __len__(self):
    return len(self.Y)

  # get row item at some index as (float features, long labels)
  def __getitem__(self, index):
    x = torch.FloatTensor(self.X[index])
    y = torch.LongTensor(self.Y[index])
    return x, y

  @staticmethod
  def collate_fn(batch):
    '''
    Merge a list of (x, y) items into one zero-padded batch.

    Declared as a staticmethod because it takes no `self`; it can be used as
    `MyDataSet.collate_fn` (as the DataLoader setup does) or through an instance.

    Returns:
      padded_batch_x: features padded along the first (sequence) axis, batch first
      padded_batch_y: labels padded the same way
      lengths_x: tensor with the unpadded length of every feature sequence
      lengths_y: tensor with the unpadded length of every label sequence
    '''
    batch_x = [x for x, _ in batch]
    batch_y = [y for _, y in batch]
    lengths_x = torch.as_tensor([len(x) for x in batch_x])
    lengths_y = torch.as_tensor([len(y) for y in batch_y])
    padded_batch_x = pad_sequence(batch_x, batch_first=True)
    padded_batch_y = pad_sequence(batch_y, batch_first=True)
    return padded_batch_x, padded_batch_y, lengths_x, lengths_y

In [None]:
# Define dataset class
class TestDataSet(Dataset):
  '''
  Label-free dataset for inference: serves one feature sequence per item.
  '''
  # keep a reference to the raw features
  # TODO: replace x and y with dataset path and load data from here -> more efficient
  def __init__(self, x):
    self.X = x

  # get number of items/rows in dataset
  def __len__(self):
    return len(self.X) 

  # get row item at some index as a float tensor
  def __getitem__(self, index):
    x = torch.FloatTensor(self.X[index])
    return x

  @staticmethod
  def collate_fn(batch_x):
    '''
    Zero-pad a list of feature tensors into one batch-first tensor.

    Declared as a staticmethod because it takes no `self`; usable both as
    `TestDataSet.collate_fn` and through an instance.

    Returns:
      padded_batch_x: features padded along the first (sequence) axis
      lengths_x: tensor with the unpadded length of every sequence
    '''
    lengths_x = torch.as_tensor([len(x) for x in batch_x])
    padded_batch_x = pad_sequence(batch_x, batch_first=True, padding_value=0)
    return padded_batch_x, lengths_x


## 2.3 Data Loaders

In [None]:
# Number of utterances per batch for all three loaders.
BATCH_SIZE = 64

# Switch between the small toy subset and the full dataset.
dataset_in_use = 'toy'
# dataset_in_use = 'main'

# NOTE: in toy mode the full-size arrays are rebound to the toy slices, so the
# full data is no longer reachable under these names until it is reloaded.
if dataset_in_use == 'toy':
  train_data, train_labels, dev_data, dev_labels, test_data = train_data_toy, train_labels_toy, dev_data_toy, dev_labels_toy, test_data_toy


# training data (shuffled every epoch)
train = MyDataSet(train_data, train_labels)
train_args = dict(shuffle = True, batch_size = BATCH_SIZE, num_workers=NUM_WORKERS, collate_fn=MyDataSet.collate_fn)
train_loader = DataLoader(train, **train_args)

# validation data (fixed order)
dev = MyDataSet(dev_data, dev_labels)
dev_args = dict(shuffle = False, batch_size = BATCH_SIZE, num_workers=NUM_WORKERS, collate_fn=MyDataSet.collate_fn) 
dev_loader = DataLoader(dev, **dev_args)

# test data (fixed order so predictions line up with the input order)
test = TestDataSet(test_data)
test_args = dict(shuffle = False, batch_size = BATCH_SIZE, num_workers=NUM_WORKERS, collate_fn=TestDataSet.collate_fn)
test_loader = DataLoader(test, **test_args)

# separate loader over the toy test slice, built with the same arguments
test_toy = TestDataSet(test_data_toy)
test_loader_toy = DataLoader(test_toy, **test_args)

In [None]:
# # save test data:
# torch.save(test_loader_toy, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/test_loader.pt")
# torch.save(test_toy, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/test_dataset.pt")

In [None]:
# save data
# Each flag below gates one optional step; only `loadloader` is enabled.
# torch.save / torch.load pickle whole Python objects here.
# NOTE(review): loading them back presumably requires the dataset classes to be
# defined in the session, and torch.load should only be used on trusted files.
savedata = False
if savedata:
  torch.save(train, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_dataset.pt")
  torch.save(train_labels, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_labels_dataset.pt")
  torch.save(dev, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_dataset.pt")
  torch.save(dev_labels, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_labels_dataset.pt")
  torch.save(test, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/test_dataset.pt")

saveloader = False
if saveloader:
  torch.save(train_loader, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_loader.pt")
  torch.save(dev_loader, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_loader.pt")
  torch.save(test_loader, "/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/test_loader.pt")

#load data
# NOTE: the files read into train_data / dev_data / test_data here are the ones
# written from the `train` / `dev` / `test` dataset objects above, not raw arrays.
loaddata = False
if loaddata:
  train_data = torch.load('/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_dataset.pt')
  train_labels = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_labels_dataset.pt")
  dev_data = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_dataset.pt")
  dev_labels = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_labels_dataset.pt")
  test_data = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/test_dataset.pt")


# NOTE: when enabled this replaces the loaders built in the previous cell with
# whatever was last saved to Drive, regardless of `dataset_in_use`.
loadloader = True
if loadloader:
  train_loader = torch.load('/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_loader.pt')
  dev_loader = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_loader.pt")
  test_loader = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/test_loader.pt")

In [None]:
# Check which data has been loaded 
# NOTE: the example counts come from the in-memory arrays, while the batch
# count comes from `train_loader`; if the loader was loaded from Drive the two
# may describe different datasets.

print('------------------------------------')
print('Active dataset:\n')
print(f'number of examples in train data: {len(train_data)}')
print(f'number of examples in val data: {len(dev_data)}')
print(f'number of examples in test data: {len(test_data)}')

print('train_loader length (in batches)', len(train_loader))
print('------------------------------------')

------------------------------------
Active dataset:

number of examples in train data: 1000
number of examples in val data: 200
number of examples in test data: 200
train_loader length (in batches) 16
------------------------------------


In [None]:
#test one batch of dataloader:
trial_x = trial_y = None
# Pull only the first batch: the loop body breaks immediately, leaving
# batch, x, y, len_x and len_y bound to that batch.
for batch, (x,y, len_x, len_y) in enumerate(train_loader): #batch, (x,y) in enumerate(train_loader):
  break

print('\n------------------------------------\n')
print(f'batches in :\ttrain_loader:{len(train_loader)},\tdev_loader:{len(dev_loader)},\ttest_loader:{len(test_loader)}\n')
print(f'batch {batch} of train_loader:\n')
print(f'x: {x.shape}')
print(f'y: {y.shape}')
# len(...) of the length tensors is the number of sequences in the batch
print(f'len_x: {len(len_x)}')
print(f'len_y: {len(len_y)}')
print('\n------------------------------------\n')




------------------------------------

batches in :	train_loader:16,	dev_loader:4,	test_loader:4

batch 0 of train_loader:

x: torch.Size([64, 1723, 40])
y: torch.Size([64, 198])
len_x: 64
len_y: 64

------------------------------------



In [None]:
x[0]

tensor([[-4.2428,  1.9185,  0.6261,  ...,  2.7999, -9.4368,  0.6930],
        [ 1.7173,  1.2406, -0.6536,  ...,  1.9749, -1.1662,  0.1333],
        [-1.9295, -0.1151,  0.3308,  ...,  2.6052,  0.3467,  0.6405],
        ...,
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]])

# 2 Model Building

## 2.1 Model Creation

In [None]:
# create building blocks


# Convolutional - Batchnorm - ReLU block
class CBR(nn.Sequential):
  """
  Conv1d -> BatchNorm1d -> ReLU block.

  The convolution uses kernel_size=3 with padding=1, so the sequence length of
  a (batch, channels, length) input is preserved while the channel count goes
  from `input_size` to `output_size`.
  """
  def __init__(self, input_size, output_size):
    super().__init__(
        nn.Conv1d(input_size, output_size, kernel_size=3, padding=1),
        nn.BatchNorm1d(output_size),
        # nn.ReLU's only argument is the `inplace` flag, not a channel count.
        # The previous nn.ReLU(output_size) therefore enabled in-place mode by
        # accident (any non-zero size is truthy); request it explicitly.
        nn.ReLU(inplace=True)
    )


# Convolutional - Batchnorm - ELU block
class CBE(nn.Sequential):
  """
  Conv1d -> BatchNorm1d -> ELU block.

  Same length-preserving convolution as the ReLU variant (kernel 3, padding 1)
  but without a conv bias and with an ELU activation.
  """
  def __init__(self, input_size, output_size):
    conv = nn.Conv1d(input_size, output_size, kernel_size=3, padding=1, bias=False)
    norm = nn.BatchNorm1d(output_size)
    activation = nn.ELU()
    super().__init__(conv, norm, activation)


# Convolutional Block (design 1) by stacking CBR blocks
class ConvBlock(nn.Sequential):
  """
  Five stacked CBR blocks: input_size -> narrow -> narrow -> wide -> wide -> wide.
  """
  def __init__(self, input_size, narrow, wide):
    # channel count before/after each of the five stages
    widths = [input_size, narrow, narrow, wide, wide, wide]
    stages = [CBR(c_in, c_out) for c_in, c_out in zip(widths[:-1], widths[1:])]
    super().__init__(*stages)


# Convolutional Block (design 2): a single wide CBE block
class ConvBlockWide(nn.Sequential):
  """Single-stage convolutional front end: one CBE block from input_size to output_size channels."""
  def __init__(self, input_size, output_size):
    wide_stage = CBE(input_size, output_size)
    super().__init__(wide_stage)


# LSTM Block
class LSTMModel(nn.Sequential):
  """
  nn.Sequential wrapper around a single nn.LSTM.

  Because the LSTM is the only child, calling the wrapper returns exactly what
  the LSTM returns: (output, (h_n, c_n)).
  """
  def __init__(self, embedding_size, lstm_output_size, num_lstm_layers, bi, dropout):
    recurrent = nn.LSTM(
        input_size=embedding_size,
        hidden_size=lstm_output_size,
        num_layers=num_lstm_layers,
        batch_first=True,
        dropout=dropout,
        bidirectional=bi,
    )
    super().__init__(recurrent)


# Final Linear MLP Block
class LinearBlock(nn.Sequential):
  """Two-layer MLP head: Linear -> in-place ReLU -> Linear."""
  def __init__(self, input_size, hidden_size, output_size):
    layers = [
        nn.Linear(input_size, hidden_size),
        nn.ReLU(inplace=True),
        nn.Linear(hidden_size, output_size),
    ]
    super().__init__(*layers)

## My Seq2Seq model

In [None]:
# Wide model
class WideSeq2SeqModel(nn.Module):
  """
  Wide conv front end -> LSTM stack -> linear head producing per-frame log-probabilities.

  Shape flow in `forward` (B = batch, T = frames, C = channels):
    input x                      (B, T, C_in)
    conv input / output          (B, C, T)   channels-first, as Conv1d expects
    pack_padded_sequence input   (T, B, C)   time-major (batch_first not set)
    pad_packed_sequence output   (B, T, C)   batch_first=True
    linear + log_softmax         (B, T, classes)
    returned log-probabilities   (T, B, classes), the layout nn.CTCLoss takes
  """
  def __init__(self, input_size, conv_output, lstm_units, lstm_layers, bi, linear_hidden, output_size, dropout=0.2):
    super().__init__()
    # a bidirectional LSTM concatenates both directions, doubling the feature size
    linear_input = lstm_units * (2 if bi else 1)

    self.conv = ConvBlockWide(input_size, conv_output)
    self.lstm = LSTMModel(conv_output, lstm_units, num_lstm_layers=lstm_layers, bi=bi, dropout=dropout)
    self.linear = LinearBlock(linear_input, linear_hidden, output_size)

  def forward(self, x, x_lengths):
    """Return (log_probs of shape (T, B, classes), x_lengths unchanged)."""
    conv_features = self.conv(x.transpose(1, 2))  # (B, C_out, T)
    time_major = conv_features.permute(2, 0, 1)  # (T, B, C_out)
    # packing lets the LSTM skip the zero-padded frames of shorter sequences
    packed_in = pack_padded_sequence(time_major, x_lengths, enforce_sorted=False)
    packed_out, _ = self.lstm(packed_in)
    padded_out, _ = pad_packed_sequence(packed_out, batch_first=True)  # (B, T, C)
    # nn.CTCLoss expects log-probabilities, hence log_softmax over the class axis
    log_probs = self.linear(padded_out).log_softmax(2)
    return log_probs.transpose(0, 1), x_lengths

In [None]:
# Narrow model
class Seq2SeqModel(nn.Module):
  """
  Narrow model: five-stage ConvBlock -> LSTM stack -> linear head.

  Args:
    input_size: number of feature channels per input frame
    conv_hiddens: pair (narrow, wide) of channel widths for the ConvBlock
    lstm_units: hidden size of each LSTM direction
    lstm_layers: number of stacked LSTM layers
    bi: whether the LSTM is bidirectional
    linear_hidden: hidden width of the linear head
    output_size: number of output classes
    dropout: dropout passed to the LSTM
  """
  def __init__(self, input_size, conv_hiddens, lstm_units, lstm_layers, bi, linear_hidden, output_size, dropout=0.2):
    directions = 2 if bi else 1
    linear_input = lstm_units * directions
    super().__init__()
    # use the caller's input_size; it was previously ignored in favour of a hard-coded 40
    self.conv = ConvBlock(input_size, conv_hiddens[0], conv_hiddens[1])
    self.lstm = LSTMModel(conv_hiddens[1], lstm_units, num_lstm_layers=lstm_layers, bi=bi, dropout=dropout)    
    self.linear = LinearBlock(linear_input, linear_hidden, output_size)

  def forward(self, x, xLens): # x dim (B, T_in, C_in)
    """Return (log_probs of shape (T, B, output_size), xLens unchanged)."""
    x_cnn_input = x.permute(0, 2, 1) # (B, C_in, T_in)
    x_post_cnn = self.conv(x_cnn_input) # (B, C_out, T_out)
    x_rnn_in = x_post_cnn.permute(2, 0, 1) # (T, B, C_out)
    # packing lets the LSTM skip the zero-padded frames of shorter sequences
    x_packed = pack_padded_sequence(x_rnn_in, xLens, enforce_sorted=False)
    out_packed, _ = self.lstm(x_packed)
    out, _ = pad_packed_sequence(out_packed, batch_first=True) # (B, T, C)
    
    # Log softmax after output layer is required since nn.CTCLoss expect log prob
    out_prob = self.linear(out).log_softmax(2) # (B, T, output_size)
    
    # Permute to fit for input format of CTCLoss
    out_prob = out_prob.permute(1, 0, 2) # (T, B, output_size)
    
    # the lengths are returned as received; this method does not recompute them
    return out_prob, xLens

## 2.2 Model Initialization

In [None]:
# create model
input_size = 40
conv_hiddens = [128, 256]
lstm_units = 512
lstm_layers = 4
bidirectional = True
linear_hidden = 512
output_size = 42



wide_model = WideSeq2SeqModel(input_size, 1024, lstm_units,  lstm_layers, bidirectional, linear_hidden, output_size, dropout=0.5)
narrow_model = Seq2SeqModel(input_size, conv_hiddens, lstm_units,  lstm_layers, bidirectional, linear_hidden, output_size, dropout=0.2) 

# SELECT MODEL
model = wide_model
model_path = '/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_models/test_best_model.pt'
##########

# Resuming from a checkpoint is best-effort: on any failure training starts
# from the freshly initialised weights, but the reason is reported instead of
# being silently swallowed (a missing file and a key/shape mismatch need
# different fixes).
try:
  # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime
  model.load_state_dict(torch.load(model_path, map_location=device))
  print('√√√√√√√√√√√√√√ model keys matched √√√√√√√√√√√√√√\n\n')
except Exception as load_error:
  print('XXXXXXXXXXXXX didnt load any model XXXXXXXXXXXXX\n\n')
  print(f'reason: {type(load_error).__name__}: {load_error}\n')
model = model.to(device)
print(model)

√√√√√√√√√√√√√√ model keys matched √√√√√√√√√√√√√√


WideSeq2SeqModel(
  (conv): ConvBlockWide(
    (0): CBE(
      (0): Conv1d(40, 1024, kernel_size=(3,), stride=(1,), padding=(1,), bias=False)
      (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ELU(alpha=1.0)
    )
  )
  (lstm): LSTMModel(
    (0): LSTM(1024, 512, num_layers=4, batch_first=True, dropout=0.5, bidirectional=True)
  )
  (linear): LinearBlock(
    (0): Linear(in_features=1024, out_features=512, bias=True)
    (1): ReLU(inplace=True)
    (2): Linear(in_features=512, out_features=42, bias=True)
  )
)


# 4 Model Training

## 4.0 Set Hyperparameters

In [None]:
# Hyperparams
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 5e-5


# CTC loss with its default settings
criterion = nn.CTCLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
# multiply the learning rate by 0.8 once the monitored value (the validation
# loss passed to scheduler.step) has not improved for more than 2 epochs
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=2, verbose=True)

# scheduler alternative (Adam gives decent results in most cases)
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.5)

## 4.1 Train Epoch

In [None]:
# Train the model
def train_epoch(model, train_loader, criterion, optimizer):
  """
  Run one optimisation pass over `train_loader` and return the mean batch loss.

  Args:
    model: callable as model(x, len_x) -> (output, output_lengths)
    train_loader: iterable of (x, y, len_x, len_y) batches
    criterion: loss called as criterion(output, y, output_lengths, len_y)
    optimizer: optimizer stepping the model parameters

  Returns:
    The average of the per-batch losses as a Python float (nan if the loader
    yields no batches). Earlier versions returned the last batch's loss tensor
    because of a stray early `return`; a float is what the epoch-level logging
    and loss arrays need and it does not keep the autograd graph alive.

  Reads the module-level `device` to place the batch tensors.
  """
  # put model in train mode
  model.train()

  total_loss = 0.0
  num_batches = 0
  epoch_start = time.time()
  # separate timer for the 10-batch progress lines
  window_start = epoch_start

  for batch, (x, y, len_x, len_y) in enumerate(train_loader):
    # reset gradients to zero, required before starting backpropagation
    optimizer.zero_grad()
    # move the padded tensors to the compute device (the length tensors stay where they are)
    x = x.to(device)
    y = y.to(device)
    # the model returns the log-probabilities and the matching sequence lengths
    output, len_output = model(x, len_x)
    # backprop
    loss = criterion(output, y, len_output, len_y)
    loss.backward()
    optimizer.step()

    # accumulate as a plain float so no graph is retained across batches
    batch_loss = loss.item()
    total_loss += batch_loss
    num_batches += 1

    # annotate time and loss every 10 batches (skipping batch 0)
    if batch and batch % 10 == 0:
      now = time.time()
      print(f"batch: {batch} \t lr : {optimizer.param_groups[0]['lr']} \t training loss : {batch_loss} \t time taken : {(now-window_start)*10//1/10} sec")
      window_start = now

    # drop the references first so empty_cache can actually release the memory
    del x, y, len_x, len_y, output, len_output, loss
    torch.cuda.empty_cache()

  # timing and loss for entire epoch
  avg_loss = total_loss / num_batches if num_batches else float('nan')
  print(f'Training loss: {avg_loss} Time: {time.time() - epoch_start}')
  return avg_loss

## 4.2 CTC Decoding

In [None]:
import sys
# make phoneme_list.py importable from either the unzipped data folder or Drive
sys.path.append("data/HW3P2_Data")
sys.path.append('/content/gdrive/MyDrive/IDL-Kaggle/hw3')

# PHONEME_LIST holds the phoneme names and PHONEME_MAP the one-character symbol
# used for each when building decoded strings
from phoneme_list import PHONEME_MAP, PHONEME_LIST
print(PHONEME_LIST)
print(PHONEME_MAP)

[' ', 'SIL', 'SPN', 'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER', 'EY', 'F', 'G', 'H', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH']
[' ', '.', '!', 'a', 'A', 'h', 'o', 'w', 'y', 'b', 'c', 'd', 'D', 'e', 'r', 'E', 'f', 'g', 'H', 'i', 'I', 'j', 'k', 'l', 'm', 'n', 'N', 'O', 'Y', 'p', 'R', 's', 'S', 't', 'T', 'u', 'U', 'v', 'W', '?', 'z', 'Z']


In [None]:
from ctcdecode import CTCBeamDecoder


class Decoder(CTCBeamDecoder):
  """
  Beam-search decoder turning model log-probabilities into phoneme strings.

  NOTE(review): the class keeps CTCBeamDecoder as its base for backward
  compatibility, but all work is delegated to the wrapped `self.decoder`
  instance; the parent initialiser is intentionally not called, as before.
  """
  def __init__(self, beam_width=100):
    # log_probs_input=True because the models emit log_softmax outputs
    self.decoder = CTCBeamDecoder(labels=PHONEME_MAP, beam_width=beam_width, log_probs_input=True)

  def decode(self, output, len_x): 
    """
    Decode a batch and return the best-beam string for every item.

    Args:
      output: model output with time on axis 0 and batch on axis 1
      len_x: per-item sequence lengths handed to the beam decoder

    Returns:
      list of strings built from PHONEME_MAP, one per batch item; an item whose
      best beam is empty yields "".
    """
    # the wrapped decoder is given batch-first input
    output = torch.transpose(output, 0, 1)
    beam_results, beam_scores, timesteps, len_out = self.decoder.decode(output, len_x)

    decodedListShort = []
    for b in range(beam_results.size(0)):
      best_len = len_out[b][0]
      # start from an empty string for every item so an empty best beam can
      # neither reuse the previous item's text nor hit an undefined name
      currDecodeShort = ""
      if best_len != 0:
        currDecodeShort = "".join([PHONEME_MAP[i] for i in beam_results[b, 0, :best_len]])
      decodedListShort.append(currDecodeShort)
    return decodedListShort

  def get_edit_dist(self, output, output_lens, target, target_lens):
    """
    Sum of Levenshtein distances between decoded strings and target strings.

    This used to be defined after the `return` inside `decode`, where it could
    never be reached, and relied on helpers the wrapped decoder does not
    provide; it is now a regular method built on `decode` and PHONEME_MAP.

    Args:
      output, output_lens: as for `decode`
      target: batch of label index sequences (padding allowed)
      target_lens: number of valid labels in each target sequence
    """
    output, target = output.cpu(), target.cpu()
    phonome_preds = self.decode(output, output_lens)
    # cut every target at its true length so padding does not count as labels
    phonomes = ["".join([PHONEME_MAP[i] for i in seq[:seq_len]]) for seq, seq_len in zip(target, target_lens)]
    edit_dist = np.sum([Levenshtein.distance(phonome_pred, phonome) for (phonome_pred, phonome) in zip(phonome_preds, phonomes)])
    return edit_dist
  
# Initialize decoder here
decoder = Decoder()
# In CTCBeamDecoder beam_width=1 (greedy search); beam_width>1 (beam search)

## 4.3 Validate Epoch

In [None]:
def idx_to_phoneme(target):
    """Map a sequence of label indices to the string of their PHONEME_MAP symbols."""
    symbols = []
    for idx in target:
        symbols.append(PHONEME_MAP[idx])
    return "".join(symbols)

def calculateLevScore(w1, w2):
    """Levenshtein distance between the two strings after removing every space from both."""
    first = w1.replace(" ", "")
    second = w2.replace(" ", "")
    return Levenshtein.distance(first, second)

In [None]:
def validate_model(model, data_loader, epoch, decode=False):
    """
    Evaluate `model` on `data_loader` without gradient tracking.

    Args:
      model: callable as model(data, dataLens) -> (output, output_lengths)
      data_loader: iterable of (data, target, dataLens, targetLens) batches
      epoch: accepted for call compatibility; not used by the computation
      decode: when True, also beam-decode every batch and accumulate the
        Levenshtein distance to the target strings (uses the module-level
        `decoder`, `idx_to_phoneme` and `calculateLevScore`)

    Returns:
      (loss_per_sample, dist_per_sample) as floats. Both are averaged over the
      number of samples actually seen in `data_loader` (each batch loss is
      weighted by its batch size), not over a global dataset, so the function
      works for any loader. dist_per_sample is 0.0 when decode is False. Both
      values are nan when the loader yields no samples.

    Reads the module-level `device` and `criterion`. Leaves the model in eval mode.
    """
    # put model in eval mode
    model.eval()
    running_loss = 0.0
    running_charErr = 0.0
    totalSampleCnt = 0
    print('validating', end="")
    with torch.no_grad(): # no grad reqd, nothing is backpropagated here
        for batch_idx, (data, target, dataLens, targetLens) in enumerate(data_loader):
            print(" –", end="")
            data, target = data.to(device), target.to(device)
            output, dataLens_new = model(data, dataLens)
            loss = criterion(output,
                             target,
                             dataLens_new,
                             targetLens)

            # weight the batch loss by the batch size so the final average is
            # per sample even when the last batch is smaller
            batch_size = len(data)
            running_loss += loss.item() * batch_size
            totalSampleCnt += batch_size
            if decode:
                decodedStringsShort = decoder.decode(output, dataLens_new)
                # padded target positions are removed later, when calculateLevScore strips spaces
                targetStrings = [idx_to_phoneme(i) for i in target]

                for predicted, expected in zip(decodedStringsShort, targetStrings):
                    running_charErr += calculateLevScore(predicted, expected)

            # drop the references first so empty_cache can actually release the memory
            del data, target, dataLens, targetLens, output, loss
            torch.cuda.empty_cache()

    if totalSampleCnt == 0:
        return float('nan'), float('nan')

    loss_per_sample = running_loss / totalSampleCnt
    dist_per_sample = running_charErr / totalSampleCnt
    return loss_per_sample, dist_per_sample

## 4.4 Run Epochs

In [None]:
torch.cuda.empty_cache()

In [None]:
# Define training settings
epochs = 100
epoch_array = np.arange(epochs)
# per-epoch loss history; entries stay 0 for epochs that have not run yet
train_loss_array = np.zeros(epochs)
val_loss_array = np.zeros(epochs)

best_loss = float('inf')
print('Start...')
for epoch in range(epochs):
  start_time = time.time()
  print('Epoch: ', epoch+1)

  training_loss = train_epoch(model, train_loader, criterion, optimizer)
  # float() accepts both a plain number and a 0-dim tensor, so the history
  # array below never receives a device/graph-attached tensor
  training_loss = float(training_loss)
  val_loss, running_dist = validate_model(model, dev_loader, epoch, decode=True)
  
  print(f'\nEpoch {epoch+1} \tval_loss {val_loss} \tdistance {running_dist}')

  # save the best model (lowest validation loss so far)
  if val_loss < best_loss:
    print('Best loss: {}, epoch: {}'.format(val_loss, epoch + 1))
    # report the training loss here; this line used to print val_loss again
    print('training loss: {}, epoch: {}'.format(training_loss, epoch + 1))
    # update and save
    # NOTE: this rebinds the module-level model_path to the checkpoint directory
    model_path = '/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_models'
    torch.save(model.state_dict(), os.path.join(model_path, 'test_best_model.pt'))
    best_loss = val_loss

  train_loss_array[epoch] = training_loss
  val_loss_array[epoch] = val_loss

  # step scheduler based on val loss
  scheduler.step(val_loss)
  # seconds -> minutes, truncated to one decimal place
  print(f"Epoch completed: {(time.time() - start_time)//6/10} min")
  print('LR = ', optimizer.param_groups[0]['lr'])
  
  print('='*40)
print('Done...')

Start...
Epoch:  1
batch: 10 	 lr : 0.0005 	 training loss : 4.948569297790527 	 time taken : 48.1 sec
batch: 20 	 lr : 0.0005 	 training loss : 3.776538372039795 	 time taken : 40.8 sec
batch: 30 	 lr : 0.0005 	 training loss : 3.476513385772705 	 time taken : 43.0 sec
batch: 40 	 lr : 0.0005 	 training loss : 3.379917860031128 	 time taken : 39.7 sec
batch: 50 	 lr : 0.0005 	 training loss : 3.3863964080810547 	 time taken : 42.6 sec
batch: 60 	 lr : 0.0005 	 training loss : 3.344802141189575 	 time taken : 41.1 sec
batch: 70 	 lr : 0.0005 	 training loss : 3.311263084411621 	 time taken : 42.9 sec
batch: 80 	 lr : 0.0005 	 training loss : 3.2967472076416016 	 time taken : 38.5 sec
batch: 90 	 lr : 0.0005 	 training loss : 3.2500076293945312 	 time taken : 39.3 sec
batch: 100 	 lr : 0.0005 	 training loss : 3.2239456176757812 	 time taken : 41.6 sec
batch: 110 	 lr : 0.0005 	 training loss : 3.218719959259033 	 time taken : 40.7 sec
batch: 120 	 lr : 0.0005 	 training loss : 3.194938

In [None]:
# keep only the epochs that actually ran (unvisited entries are still 0)
mask = np.where(val_loss_array != 0)
epoch_array = epoch_array[mask]
train_loss_array = train_loss_array[mask]
val_loss_array = val_loss_array[mask]

plt.figure(dpi=300)
plt.yscale('log')
plt.plot(epoch_array, train_loss_array, color = 'r', label='Training Loss')
plt.plot(epoch_array, val_loss_array, color = 'g', label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss (log scale)')
plt.title('Training vs. validation loss')
plt.legend()
# save before show: once the figure has been shown, a later savefig can write
# an empty image
plt.savefig(model_path+'.jpg')
plt.show()

In [None]:
print(torch.cuda.memory_summary(device=None, abbreviated=False))

In [None]:



cell inserted to break code flow




# 5 Test Data

### 5.0 some extra code for simultaneous testing from different account

In [None]:
#### delete next two cells later

In [None]:
#test prereqs
# This cell repeats the environment setup so the test section can be run on
# its own in a separate session.
# Google drive setup
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

# ctcdecode
# installed from a recursive git clone rather than from PyPI
print('\n-----------Downloading ctcdecode----------\n')
!git clone --recursive https://github.com/parlance/ctcdecode.git
!pip install wget
%cd ctcdecode

!pip install .
%cd ..
print('\n-----------Done----------\n')

#Levenshtein
print('\n-----------Downloading Levenshtein----------\n')
!pip install python-Levenshtein
print('\n-----------Done----------\n')



#imports
import os
import sys
import time

import Levenshtein

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import pdb
import gc
from tqdm.notebook import trange, tqdm

import torch
import torch.nn as nn

from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')


# Define dataset class
class MyDataSet(Dataset):
  '''
  Train/validation dataset: parallel feature (`x`) and label (`y`) sequences,
  converted to tensors lazily in `__getitem__`.
  '''
  # keep references to the raw features and labels
  def __init__(self, x, y):
    self.X = x
    self.Y = y

  # get number of items/rows in dataset
  def __len__(self):
    return len(self.Y)

  # get row item at some index as (float features, long labels)
  def __getitem__(self, index):
    x = torch.FloatTensor(self.X[index])
    y = torch.LongTensor(self.Y[index])
    return x, y

  @staticmethod
  def collate_fn(batch):
    '''
    Merge a list of (x, y) items into one zero-padded, batch-first batch.

    Declared as a staticmethod because it takes no `self`.

    Returns (padded_batch_x, padded_batch_y, lengths_x, lengths_y), where the
    length tensors hold the unpadded length of every sequence.
    '''
    batch_x = [x for x, _ in batch]
    batch_y = [y for _, y in batch]
    lengths_x = torch.as_tensor([len(x) for x in batch_x])
    lengths_y = torch.as_tensor([len(y) for y in batch_y])
    padded_batch_x = pad_sequence(batch_x, batch_first=True)
    padded_batch_y = pad_sequence(batch_y, batch_first=True)
    return padded_batch_x, padded_batch_y, lengths_x, lengths_y

    
# Define dataset class
class TestDataSet(Dataset):
  '''
  Label-free dataset for inference: serves one feature sequence per item.
  '''
  # keep a reference to the raw features
  # TODO: replace x and y with dataset path and load data from here -> more efficient
  def __init__(self, x):
    self.X = x

  # get number of items/rows in dataset
  def __len__(self):
    return len(self.X) 

  # get row item at some index as a float tensor
  def __getitem__(self, index):
    x = torch.FloatTensor(self.X[index])
    return x

  @staticmethod
  def collate_fn(batch_x):
    '''
    Zero-pad a list of feature tensors into one batch-first tensor.

    Returns (padded_batch_x, lengths_x), where lengths_x holds the unpadded
    length of every sequence.
    '''
    # torch.as_tensor must be called with (...); the missing parenthesis here
    # made the whole cell a syntax error
    lengths_x = torch.as_tensor([len(x) for x in batch_x])
    padded_batch_x = pad_sequence(batch_x, batch_first=True, padding_value=0)
    return padded_batch_x, lengths_x



#data
# train = torch.load('/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_dataset.pt')
# train_labels = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_labels_dataset.pt")
# dev = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_dataset.pt")
# dev_labels = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_labels_dataset.pt")
# test = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/test_dataset.pt")

#loaders
# Restore the pickled DataLoader objects from Drive.
# NOTE(review): unpickling presumably needs the dataset classes defined above
# in this cell; only load files you trust.
train_loader = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/train_loader.pt")
dev_loader = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/dev_loader.pt")
test_loader = torch.load("/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_datasets/test_loader.pt")

In [None]:
# Pick the compute device once for the test section.
cuda = torch.cuda.is_available()
if cuda:
  device, NUM_WORKERS = torch.device("cuda"), 8
else:
  device, NUM_WORKERS = torch.device("cpu"), 0

print("Cuda = ", str(cuda), " with num_workers = ", str(NUM_WORKERS),  " system version = ", sys.version)

In [None]:
# TODO: Create model    

class CBR(nn.Sequential):
  """
  Conv1d (kernel 3, padding 1, length-preserving) -> BatchNorm1d -> ReLU block
  mapping `input_size` channels to `output_size` channels.
  """
  def __init__(self, input_size, output_size):
    super().__init__(
        nn.Conv1d(input_size, output_size, kernel_size=3, padding=1),
        nn.BatchNorm1d(output_size),
        # nn.ReLU takes an `inplace` flag, not a size; nn.ReLU(output_size)
        # only enabled in-place mode by accident, so state it explicitly
        nn.ReLU(inplace=True)
    )
  

class ConvBlock(nn.Sequential):
  """Stack of five CBR blocks widening the channels: input_size -> narrow -> narrow -> wide -> wide -> wide."""
  def __init__(self, input_size, narrow, wide):
    # (in_channels, out_channels) of each stage, in order
    channel_plan = (
        (input_size, narrow),
        (narrow, narrow),
        (narrow, wide),
        (wide, wide),
        (wide, wide),
    )
    super().__init__(*[CBR(c_in, c_out) for c_in, c_out in channel_plan])


class LSTMModel(nn.Sequential):
  """
  nn.Sequential wrapper around a single nn.LSTM; calling it returns the LSTM's
  (output, (h_n, c_n)) tuple.

  Args:
    embedding_size: feature size of each input step
    lstm_output_size: hidden size of each direction
    num_lstm_layers: number of stacked layers
    bi: whether the LSTM is bidirectional
    dropout: dropout between LSTM layers; defaults to 0.2, the value that was
      previously hard-coded, so existing calls behave the same
  """
  def __init__(self, embedding_size, lstm_output_size, num_lstm_layers, bi=False, dropout=0.2):
    super().__init__(
        nn.LSTM(embedding_size, lstm_output_size, num_lstm_layers, bidirectional=bi, batch_first=True, dropout=dropout)
    )


class LinearBlock(nn.Sequential):
  """Two-layer classifier head: Linear -> ReLU -> Linear.

  Args:
    input_size: feature size of the incoming tensor's last dimension.
    hidden_size: width of the intermediate layer.
    output_size: number of output units.
  """
  def __init__(self, input_size, hidden_size, output_size):
    project_in = nn.Linear(input_size, hidden_size)
    activation = nn.ReLU(inplace=True)
    project_out = nn.Linear(hidden_size, output_size)
    super().__init__(project_in, activation, project_out)


class Seq2SeqModel(nn.Module):
  """CNN -> LSTM -> MLP model that emits per-frame log-probabilities for CTC.

  Args:
    input_size: number of feature channels per input frame.
    conv_hiddens: pair `(narrow, wide)` of channel widths for the ConvBlock.
    lstm_units: LSTM hidden size per direction.
    lstm_layers: number of stacked LSTM layers.
    bi: use a bidirectional LSTM when True.
    linear_hidden: hidden width of the classifier head.
    output_size: number of output classes.
  """
  def __init__(self, input_size, conv_hiddens, lstm_units, lstm_layers, bi, linear_hidden, output_size):
    super().__init__()
    # A bidirectional LSTM concatenates both directions on the feature axis.
    directions = 2 if bi else 1
    linear_input = lstm_units * directions
    # Honour the caller's feature size instead of a hard-coded 40.
    self.conv = ConvBlock(input_size, conv_hiddens[0], conv_hiddens[1])
    self.lstm = LSTMModel(conv_hiddens[1], lstm_units, num_lstm_layers=lstm_layers, bi=bi)
    self.linear = LinearBlock(linear_input, linear_hidden, output_size)

  def forward(self, x, xLens):
    """Compute CTC-ready log-probabilities for a padded batch.

    Args:
      x: padded features, (B, T_in, C_in).
      xLens: unpadded length of every sequence in the batch; forwarded to
        pack_padded_sequence (NOTE(review): PyTorch expects these on the
        CPU — confirm against the collate function).

    Returns:
      Tuple `(out_prob, xLens)`: `out_prob` has layout (T, B, output_size)
      and holds log-softmax scores; `xLens` is returned unchanged.
    """
    x_cnn_input = x.permute(0, 2, 1) # (B, C_in, T_in)
    x_post_cnn = self.conv(x_cnn_input) # (B, C_out, T_out)
    x_rnn_in = x_post_cnn.permute(2, 0, 1) # (T, B, C_out)
    # Time-major layout matches pack_padded_sequence's default batch_first=False.
    x_packed = pack_padded_sequence(x_rnn_in, xLens, enforce_sorted=False)
    out_packed, hidden = self.lstm(x_packed)
    out, out_lens = pad_packed_sequence(out_packed, batch_first=True) # (B, T, C)

    # Log softmax after output layer is required since nn.CTCLoss expects log probs
    out_prob = self.linear(out).log_softmax(2) # (B, T, output_size)

    # Permute to fit the input format of CTCLoss
    out_prob = out_prob.permute(1, 0, 2) # (T, B, output_size)

    # Every conv stage uses kernel_size=3 with padding=1, so the time axis is
    # not shortened and the input lengths are still valid for the output.
    return out_prob, xLens


  # def forward(self, x_padded, x_lens):
  #   x_padded = self.conv(x_padded.permute(0, 2, 1)).permute(0, 2, 1)
  #   x_packed = pack_padded_sequence(x_padded, x_lens, batch_first=True, enforce_sorted=False)
  #   output_packed, _ = self.lstm(x_packed)
  #   output_padded, output_lens = pad_packed_sequence(output_packed, batch_first=True)
  #   # Log softmax after output layer is required since`nn.CTCLoss` expects log probabilities.
  #   output = self.linear(output_padded).log_softmax(2)
  #   return output, output_lens


  # def forward(self, x, len_x):
  #   # x is already padded, prepare for cnn
  #   x = x.permute(0,2,1) #batch_size * channels * 
  #   x = self.conv(x)
  #   # pack x and prepare for lstm
  #   x = x.permute(2,0,1) #
  #   x = pack_padded_sequence(x, len_x, batch_first=True, enforce_sorted=False)
  #   out, len_out_packed = self.lstm(x)
  #   # pad the packed output
  #   out, len_out = pad_packed_sequence(out, batch_first=True)
  #   # linear layers
  #   out = self.linear(out)
  #   # log loss as required by CTC loss function
  #   out = out.log_softmax(2)
  #   return out, len_out

In [None]:
# create model
input_size = 40
conv_hiddens = [128, 256]
lstm_units = 512
lstm_layers = 4
bidirectional = True
linear_hidden = 512
output_size = 42

model_path = '/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_models/test_best_model_10_lronp.pt'


model = Seq2SeqModel(input_size, conv_hiddens, lstm_units,  lstm_layers, bidirectional, linear_hidden, output_size)
try:
  # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
  # runtime instead of failing and silently falling back to random weights.
  model.load_state_dict(torch.load(model_path, map_location=device))
  print('model keys matched\n\n')
except Exception as err:
  # Loading stays best-effort (start from scratch when no usable checkpoint
  # exists), but report why, and do not swallow KeyboardInterrupt/SystemExit
  # the way a bare `except:` would.
  print('didnt load any model')
  print(f'reason: {type(err).__name__}: {err}')
model = model.to(device)
print(model)

In [None]:
# Optimisation hyper-parameters
LEARNING_RATE = 2e-3
WEIGHT_DECAY = 5e-5

# CTC loss over the model's log-probabilities
criterion = nn.CTCLoss()

# Adam with L2 weight decay
optimizer = optim.Adam(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# Multiply the learning rate by 0.8 once the monitored value has stopped
# decreasing for more than `patience` epochs.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.8,
    patience=2,
    verbose=True,
)

# ReduceLRONP
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

In [None]:
import sys
# Make phoneme_list.py importable: look in the local data folder and in the
# Google Drive project folder.
sys.path.append("data/HW3P2_Data")
sys.path.append('/content/gdrive/MyDrive/IDL-Kaggle/hw3')

# PHONEME_MAP is indexed by class id further down to turn predictions into text.
# NOTE(review): PHONEME_LIST is presumably the matching list of full phoneme
# names — confirm in phoneme_list.py.
from phoneme_list import PHONEME_MAP, PHONEME_LIST
print(PHONEME_LIST)
print(PHONEME_MAP)

In [None]:
def idx2phonemes(target):
    """Translate an iterable of phoneme indices into one concatenated string.

    Each index is looked up in PHONEME_MAP and the resulting symbols are
    joined without a separator.
    """
    symbols = (PHONEME_MAP[idx] for idx in target)
    return "".join(symbols)

def calculateLevScore(w1, w2):
    """Return the Levenshtein distance between two strings, ignoring spaces."""
    first = w1.replace(" ", "")
    second = w2.replace(" ", "")
    return Levenshtein.distance(first, second)

In [None]:
from ctcdecode import CTCBeamDecoder


class Decoder(CTCBeamDecoder):
  """Beam-search CTC decoder returning the best hypothesis of each utterance as a string.

  Args:
    beam_width: number of beams kept during the search.
  """

  def __init__(self, beam_width=100):
    # Initialise the base class too, so this object is not a half-constructed
    # CTCBeamDecoder. NOTE(review): inherited methods/cleanup presumably rely
    # on state set in the base __init__ — confirm against ctcdecode.
    super().__init__(labels=PHONEME_MAP, beam_width=beam_width, log_probs_input=True)
    # `decode` is overridden below, so the raw beam search is delegated to a
    # wrapped instance.
    self.decoder = CTCBeamDecoder(labels=PHONEME_MAP, beam_width=beam_width, log_probs_input=True)

  def decode(self, output, len_x):
    """Decode a batch of log-probabilities into phoneme strings.

    Args:
      output: model output whose first two dimensions are swapped before
        decoding (assumes (T, B, C) in, (B, T, C) for the beam decoder —
        TODO confirm against the model's forward).
      len_x: per-utterance sequence lengths handed to the beam decoder.

    Returns:
      List with one string per batch entry: the first beam, mapped through
      PHONEME_MAP. An empty hypothesis yields "".
    """
    output = torch.transpose(output, 0, 1)
    beam_results, beam_scores, timesteps, len_out = self.decoder.decode(output, len_x)

    decodedListShort = []
    for b in range(beam_results.size(0)):
        best_len = int(len_out[b][0])
        # Slicing with the hypothesis length naturally produces "" for an
        # empty result. The previous code left the string unset in that case
        # and appended a stale (or undefined) value.
        best_path = beam_results[b, 0, :best_len].tolist()
        decodedListShort.append("".join(PHONEME_MAP[i] for i in best_path))
    return decodedListShort

  def get_edit_dist(self, output, output_lens, target, target_lens):
    """Sum of Levenshtein distances between decoded output and references.

    Args:
      output, output_lens: forwarded to `decode`.
      target: reference phoneme indices, indexed as target[b, :length]
        (assumes a padded (B, L) tensor — TODO confirm against caller).
      target_lens: number of valid entries of each row of `target`.

    Returns:
      Total edit distance over the batch, computed with calculateLevScore.
    """
    predicted = self.decode(output, output_lens)
    total = 0
    for b, prediction in enumerate(predicted):
        reference = idx2phonemes(target[b, :int(target_lens[b])].tolist())
        total += calculateLevScore(prediction, reference)
    return total

# TODO: Initialize decoder here
decoder = Decoder()
# In CTCBeamDecoder beam_width=1 (greedy search); beam_width>1 (beam search)

## 5.1 Make Predictions

In [None]:
#tclo
predictions = []


def test_model(model, data_loader, output_path='/content/gdrive/MyDrive/IDL-Kaggle/hw3/predictions.csv'):
    """Decode every batch of `data_loader` and write the predictions to a CSV file.

    Args:
      model: network called as `model(x, len_x)` returning `(log_probs, lengths)`.
      data_loader: iterable of `(x, len_x)` batches to run inference on.
      output_path: destination of the CSV with columns `id` and `label`.

    Returns:
      List of decoded strings, one per utterance, in loader order. The
      module-level `predictions` is rebound to this list so later cells that
      read it keep working.
    """
    global predictions
    start = time.time()
    model.eval()
    # Collect into a fresh list: extending the global made repeated runs
    # accumulate stale rows (and failed once it had been turned into an array).
    decoded = []
    num_batches = len(data_loader)
    with torch.no_grad():
        print('testing starting ...')
        # Iterate the loader that was passed in, not the global test_loader.
        for batch, (x, len_x) in enumerate(data_loader):
            x = x.to(device)
            out, len_out = model(x, len_x)
            # Decode with the lengths reported by the model for its own output.
            decoded.extend(decoder.decode(out, len_out))
            # `//6/10` turns elapsed seconds into minutes with one decimal.
            print(f'saved batch:\t {batch}/{num_batches}\t time elapsed = {(time.time()-start)//6/10} min')

    print('Testing done, predictions updated')
    predictions = decoded
    df = pd.DataFrame({'id': np.arange(len(decoded)), 'label': decoded})
    df.to_csv(output_path, header=True, index=False)
    print('predictions complete')
    return decoded

In [None]:
# Drop the first 2561 entries and persist the remainder as a NumPy array.
# NOTE(review): 2561 is presumably the number of stale predictions left over
# from an earlier run — confirm. This cell is not idempotent: it rebinds
# `predictions` to an ndarray and trims again every time it is re-run.
predictions = np.array(predictions[2561:])
np.save('predictions.npy', predictions)

In [None]:
print(len(test_loader))
# map_location keeps the restore working when the checkpoint was saved on a
# different device than the one the current runtime provides.
model.load_state_dict(torch.load('/content/gdrive/MyDrive/IDL-Kaggle/hw3/saved_models/test_best_model.pt', map_location=device))
test_model(model, test_loader)

In [None]:
# Build the submission table (row number as `id`, decoded string as `label`)
# from the current contents of `predictions` and write it to Google Drive.
indices = np.arange(len(predictions))
d = {'id': indices, 'label': np.array(predictions)}
df = pd.DataFrame(data=d)
# Keep the header row, omit the DataFrame index column.
df.to_csv('/content/gdrive/MyDrive/IDL-Kaggle/hw3/predictions.csv', header=True, index=False)
print('predictions complete')

In [None]:
#tclo
predictions = []

def test_model(model, data_loader, output_path='/content/gdrive/MyDrive/IDL-Kaggle/hw3/predictions.csv'):
    """Decode every batch of `data_loader` and write the predictions to a CSV file.

    Args:
      model: network called as `model(x, len_x)` returning `(log_probs, lengths)`.
      data_loader: iterable of `(x, len_x)` batches to run inference on.
      output_path: destination of the CSV with columns `id` and `label`.

    Returns:
      List of decoded strings, one per utterance, in loader order. The
      module-level `predictions` is rebound to this list so later cells that
      read it keep working.
    """
    global predictions
    start = time.time()
    model.eval()
    # Collect into a fresh list: extending the global made repeated runs
    # accumulate stale rows (and failed once it had been turned into an array).
    decoded = []
    num_batches = len(data_loader)
    with torch.no_grad():
        print('testing starting ...')
        # Iterate the loader that was passed in, not the global test_loader.
        for batch, (x, len_x) in enumerate(data_loader):
            x = x.to(device)
            out, len_out = model(x, len_x)
            # Decode with the lengths reported by the model for its own output.
            decoded.extend(decoder.decode(out, len_out))
            # `//6/10` turns elapsed seconds into minutes with one decimal.
            print(f'saved batch:\t {batch}/{num_batches}\t time elapsed = {(time.time()-start)//6/10} min')

    print('Testing done, predictions updated')
    predictions = decoded
    df = pd.DataFrame({'id': np.arange(len(decoded)), 'label': decoded})
    df.to_csv(output_path, header=True, index=False)
    print('predictions complete')
    return decoded

## 5.2 Save Predictions to csv File

## 5.3 Submit Predictions