# HW4 P2 IDL

## Downloading the data

In [2]:
!pip install kaggle



In [3]:
! mkdir ~/.kaggle

In [4]:
! cp kaggle.json ~/.kaggle/

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle competitions download -c 11-785-fall-20-homework-4-part-2

Downloading 11-785-fall-20-homework-4-part-2.zip to /home/ubuntu/AWS_HW4_P2
100%|█████████████████████████████████████▉| 3.72G/3.72G [00:59<00:00, 34.5MB/s]
100%|██████████████████████████████████████| 3.72G/3.72G [00:59<00:00, 67.2MB/s]


In [7]:
!unzip 11-785-fall-20-homework-4-part-2.zip

Archive:  11-785-fall-20-homework-4-part-2.zip
  inflating: hw4p2/dev.npy           
  inflating: hw4p2/dev_transcripts.npy  
  inflating: hw4p2/sample.csv        
  inflating: hw4p2/test.npy          
  inflating: hw4p2/train.npy         
  inflating: hw4p2/train_transcripts.npy  


## Imports and data loading

In [1]:
import os
import numpy as np
from PIL import Image

import torch
import torchvision   
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
import numpy as np
import torch
from torch.utils.data import Dataset 


'''
Loading all the numpy files containing the utterance information and text information
'''
def load_data():
    speech_train = np.load('./hw4p2/train.npy', allow_pickle=True, encoding='bytes')
    speech_valid = np.load('./hw4p2/dev.npy', allow_pickle=True, encoding='bytes')
    speech_test = np.load('./hw4p2/test.npy', allow_pickle=True, encoding='bytes')

    transcript_train = np.load('./hw4p2/train_transcripts.npy', allow_pickle=True,encoding='bytes')
    transcript_valid = np.load('./hw4p2/dev_transcripts.npy', allow_pickle=True,encoding='bytes')

    return speech_train, speech_valid, speech_test, transcript_train, transcript_valid


'''
Transforms alphabetical input to numerical input, replace each letter by its corresponding 
index from letter_list
'''

def transform_letter_to_index(transcript, letter_list):
    '''
    :param transcript :(N, ) Transcripts are the text input
    :param letter_list: Letter list defined above
    :return letter_to_index_list: Returns a list for all the transcript sentence to index
    '''
    letter_to_index_list = []
    for transcript_train in transcript:
        char_list = []
        for i,word in enumerate(transcript_train):
            if i == 0:
                char_list.append(letter_list.index('<sos>'))

            for char in word.decode('utf-8'):
                char_list.append(letter_list.index(char))

            if i == len(transcript_train)-1:
                char_list.append(letter_list.index('<eos>'))
            else:
                char_list.append(letter_list.index(' '))
        letter_to_index_list.append(char_list)
    return letter_to_index_list


'''
Optional, create dictionaries for letter2index and index2letter transformations
'''
def create_dictionaries(letter_list):
    letter2index = dict()
    index2letter = dict()
    return letter2index, index2letter


class Speech2TextDataset(Dataset):
    '''
    Dataset class for the speech to text data, this may need some tweaking in the
    getitem method as your implementation in the collate function may be different from
    ours. 
    '''
    def __init__(self, speech, text=None, isTrain=True):
        self.speech = speech
        self.isTrain = isTrain
        if (text is not None):
            self.text = text

    def __len__(self):
        return self.speech.shape[0]

    def __getitem__(self, index):
        if (self.isTrain == True):
            return torch.tensor(self.speech[index].astype(np.float32)),torch.tensor(len(self.speech[index])), torch.tensor(self.text[index]),torch.tensor(len(self.text[index]))
        else:
            return torch.tensor(self.speech[index].astype(np.float32)),torch.tensor(len(self.speech[index]))


def collate_train(seq_list):
    ### Return the padded speech and text data, and the length of utterance and transcript ###
    X=[]
    X_len = []
    Y=[]
    Y_len = []
    for i in range(len(seq_list)):
        X.append(seq_list[i][0])
        X_len.append(seq_list[i][1])
        Y.append(seq_list[i][2])
        Y_len.append(seq_list[i][3])
        
    inputs = torch.nn.utils.rnn.pad_sequence(X,padding_value=0.0)
    targets = torch.nn.utils.rnn.pad_sequence(Y,batch_first=True,padding_value=0.0)
    return inputs,torch.tensor(X_len),targets,torch.tensor(Y_len)


def collate_test(seq_list):
    ### Return padded speech and length of utterance ###
    X=[]
    X_len = []
    for i in range(len(seq_list)):
        X.append(seq_list[i][0])
        X_len.append(seq_list[i][1])
        
    inputs = torch.nn.utils.rnn.pad_sequence(X,padding_value=0.0)
    return inputs,torch.tensor(X_len)

In [3]:
speech_train, speech_valid, speech_test, transcript_train, transcript_valid = load_data()

# Util.py

In [4]:
from matplotlib.lines import Line2D
import matplotlib.pyplot as plt
import numpy as np
def plot_attn_flow(attn_mask, path):
    plt.imsave(path, attn_mask, cmap='hot')
    return plt

def plot_grad_flow(named_parameters, path):
    ave_grads = []
    max_grads = []
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
            if(p is not None):
                layers.append(n)
                ave_grads.append(p.grad.abs().mean())
                max_grads.append(p.grad.abs().max())
    plt.bar(np.arange(len(max_grads)), max_grads, alpha=0.1, lw=1, color="c")
    plt.bar(np.arange(len(max_grads)), ave_grads, alpha=0.1, lw=1, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, lw=2, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(left=0, right=len(ave_grads))
    plt.ylim(bottom = -0.001, top=0.02) # zoom in on the lower gradient regions
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    #plt.tight_layout()
    plt.grid(True)
    plt.legend([Line2D([0], [0], color="c", lw=4),
                Line2D([0], [0], color="b", lw=4),
                Line2D([0], [0], color="k", lw=4)], ['max-gradient', 'mean-gradient', 'zero-gradient'])
    plt.show()
    plt.savefig(path)
    return plt, max_grads


# Models.py

In [86]:
import torch
import torch.nn as nn
import torch.nn.utils as utils
from torch.nn.utils.rnn import pack_padded_sequence as pack
from torch.nn.utils.rnn import pad_packed_sequence as unpack

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

class Attention(nn.Module):
    '''
    Attention is calculated using key, value and query from Encoder and decoder.
    Below are the set of operations you need to perform for computing attention:
        energy = bmm(key, query)
        attention = softmax(energy)
        context = bmm(attention, value)
    '''
    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, key, value, lens):
        '''query, context, lengths):
        :param query :(batch_size, hidden_size) Query is the output of LSTMCell from Decoder
        :param keys: (batch_size, max_len, encoder_size) Key Projection from Encoder
        :param values: (batch_size, max_len, encoder_size) Value Projection from Encoder
        :return context: (batch_size, encoder_size) Attended Context
        :return attention_mask: (batch_size, max_len) Attention mask that can be plotted 
        '''
                
        key = key.permute(1,0,2)
        value = value.permute(1,0,2)

        attention = torch.bmm(key, query.unsqueeze(2)).squeeze(2)
        
        mask = torch.arange(key.size(1)).unsqueeze(0) >= lens.unsqueeze(1)
        
        # Set attention logits at padding positions to negative infinity.
        attention.masked_fill_(mask.cuda(), -1e9)
        
        # Take softmax over the "source length" dimension.
        attention = nn.functional.softmax(attention, dim=1)
        
        out = torch.bmm(attention.unsqueeze(1), value).squeeze(1)
        
        # attention vectors are returned for visualization
        return out, attention

class pBLSTM(nn.Module):
    '''
    Pyramidal BiLSTM
    The length of utterance (speech input) can be hundereds to thousands of frames long.
    The Paper reports that a direct LSTM implementation as Encoder resulted in slow convergence,
    and inferior results even after extensive training.
    The major reason is inability of AttendAndSpell operation to extract relevant information
    from a large number of input steps.
    '''
    def __init__(self, input_dim, hidden_dim):
        super(pBLSTM, self).__init__()
        self.blstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)

    def forward(self, x):
        '''
        :param x :(N, T) input to the pBLSTM
        :return output: (N, T, H) encoded sequence from pyramidal Bi-LSTM 
        '''
        
        x,lens = unpack(x)
        x = x.permute(1,0,2)
        if x.shape[1] % 2 != 0:
            x = x[:,:-1,:]
        x = x.reshape(x.shape[0],int(x.shape[1]/2),x.shape[2]*2)
        x = x.permute(1,0,2)
        x = pack(x,lengths=lens/2,enforce_sorted=False)
        x,_ = self.blstm(x)
        return x

class Encoder(nn.Module):
    '''
    Encoder takes the utterances as inputs and returns the key and value.
    Key and value are nothing but simple projections of the output from pBLSTM network.
    '''
    def __init__(self, input_dim, hidden_dim, value_size=128,key_size=128):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, bidirectional=True)
        
        ### Add code to define the blocks of pBLSTMs! ###
        self.pblstm1 = pBLSTM(hidden_dim*4,hidden_dim)
        self.pblstm2 = pBLSTM(hidden_dim*4,hidden_dim)
        self.pblstm3 = pBLSTM(hidden_dim*4,hidden_dim)
        self.pblstm4 = pBLSTM(hidden_dim*4,hidden_dim)
        
        self.key_network = nn.Linear(hidden_dim*2, value_size)
        self.value_network = nn.Linear(hidden_dim*2, key_size)

    def forward(self, x, lens):
        rnn_inp = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        outputs, _ = self.lstm(rnn_inp)
        outputs = self.pblstm1(outputs)
        outputs = self.pblstm2(outputs)
        outputs = self.pblstm3(outputs)
        outputs = self.pblstm4(outputs)
        ### Use the outputs and pass it through the pBLSTM blocks! ###
        
        linear_input, lengths = utils.rnn.pad_packed_sequence(outputs)
        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)
        return keys, value, lengths 


class Decoder(nn.Module):
    '''
    As mentioned in a previous recitation, each forward call of decoder deals with just one time step, 
    thus we use LSTMCell instead of LSLTM here.
    The output from the second LSTMCell can be used as query here for attention module.
    In place of value that we get from the attention, this can be replace by context we get from the attention.
    Methods like Gumble noise and teacher forcing can also be incorporated for improving the performance.
    '''
    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=0)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)

        self.isAttended = isAttended
        if (isAttended == True):
            self.attention = Attention()

        self.character_prob = nn.Linear(key_size + value_size, vocab_size)

    def forward(self, key, values, text=None, isTrain=True,lens=None):
        '''
        :param key :(T, N, key_size) Output of the Encoder Key projection layer
        :param values: (T, N, value_size) Output of the Encoder Value projection layer
        :param text: (N, text_len) Batch input of text with text_length
        :param isTrain: Train or eval mode
        :return predictions: Returns the character perdiction probability 
        '''
        batch_size = key.shape[1]

        if (isTrain == True):
            #print(text)
            max_len =  text.shape[1]
            embeddings = self.embedding(text)
        else:
            max_len = 250

        predictions = []
        hidden_states = [None, None]
        prediction = torch.zeros(batch_size,1).to(DEVICE)#(torch.ones(batch_size, 1)*33).to(DEVICE)
        context = values[0,:,:]
        attention_list = []
        for i in range(max_len):
            # * Implement Gumble noise and teacher forcing techniques 
            # * When attention is True, replace values[i,:,:] with the context you get from attention.
            # * If you haven't implemented attention yet, then you may want to check the index and break 
            #   out of the loop so you do not get index out of range errors. 
            
            if (isTrain):
                char_embed = embeddings[:,i,:]
            else:
                char_embed = self.embedding(prediction.argmax(dim=-1))
            
            inp = torch.cat([char_embed,context], dim=1)
            
            hidden_states[0] = self.lstm1(inp, hidden_states[0])

            inp_2 = hidden_states[0][0]
            hidden_states[1] = self.lstm2(inp_2, hidden_states[1])

            ### Compute attention from the output of the second LSTM Cell ###
            output = hidden_states[1][0]
            
            if self.isAttended==True:
                context, attention = self.attention(output,key,values,lens)
                attention_list.append(attention[0].detach().cpu().numpy())
                prediction = self.character_prob(torch.cat([output, context], dim=1))
                predictions.append(prediction.unsqueeze(1))
            else:
                prediction = self.character_prob(torch.cat([output, values[i,:,:]], dim=1))
                predictions.append(prediction.unsqueeze(1))
        if self.isAttended==True:
            return torch.cat(predictions, dim=1),attention_list
        else:
            return torch.cat(predictions, dim=1),None


class Seq2Seq(nn.Module):
    '''
    We train an end-to-end sequence to sequence model comprising of Encoder and Decoder.
    This is simply a wrapper "model" for your encoder and decoder.
    '''
    def __init__(self, input_dim, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=False):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder(input_dim, hidden_dim)
        self.decoder = Decoder(vocab_size, hidden_dim,isAttended=True)

    def forward(self, speech_input, speech_len, text_input=None, isTrain=True):
        key, value,lengths = self.encoder(speech_input, speech_len)
        if (isTrain == True):
            predictions,attention = self.decoder(key, value, text_input,lens=lengths)
        else:
            predictions,attention = self.decoder(key, value, text=None, isTrain=False,lens=lengths)
        return predictions,attention


# Train_test.py

In [87]:
import time
import torch
### Add Your Other Necessary Imports Here! ###

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

def train(model, train_loader, criterion, optimizer, epoch):
    model.train()
    model.to(DEVICE)
    loss_history = []
    start = time.time()
    all_attentions = []
    for j, (X,X_lens,Y,Y_lens) in enumerate(train_loader):
        X,Y = X.to(DEVICE), Y.to(DEVICE)
        X_lens,Y_lens= X_lens.to(DEVICE),Y_lens.to(DEVICE)
        optimizer.zero_grad()
        loss = 0

        out,attention_list = model(X,X_lens,Y[:,:-1])
        loss += criterion(out.reshape(out.shape[0]*out.shape[1],out.shape[-1]), Y[:,1:].reshape(Y[:,1:].shape[0]*Y[:,1:].shape[1]))
        
        if j % 15 == 0:
            print(f'Batch {j} has loss: {loss.item()}')
        
        loss.backward()
        optimizer.step()
        torch.cuda.empty_cache()
        del X
        del X_lens
        del Y
        del Y_lens
        del loss
    
    end = time.time()
    print(f'Time: {(end-start)/60}')
    return attention_list

# Main

In [88]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

LETTER_LIST = ['<pad>', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', \
               'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '-', "'", '.', '_', '+', ' ','<sos>','<eos>']


model = Seq2Seq(input_dim=40, vocab_size=len(LETTER_LIST), hidden_dim=128)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(reduction='mean',ignore_index=0)
nepochs = 25
batch_size = 64 if DEVICE == 'cuda' else 1 
character_text_train = transform_letter_to_index(transcript_train, LETTER_LIST)
character_text_valid = transform_letter_to_index(transcript_valid, LETTER_LIST)

train_dataset = Speech2TextDataset(speech_train, character_text_train)

test_dataset = Speech2TextDataset(speech_test, None, False)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_train)

test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_test)

for epoch in range(nepochs):
    attn_mask = train(model, train_loader, criterion, optimizer, epoch)
    plot_attn_flow(attn_mask, f'Epoch_{epoch}.png')
    test(model, test_loader, epoch)
    print(f'EPOCH {epoch}')


Batch 0 has loss: 3.5653891563415527
Batch 15 has loss: 2.84572696685791
Batch 30 has loss: 2.6806483268737793
Batch 45 has loss: 2.4591357707977295
Batch 60 has loss: 2.274397611618042
Batch 75 has loss: 2.166311740875244
Batch 90 has loss: 2.0933098793029785
Batch 105 has loss: 2.040273904800415
Batch 120 has loss: 2.015489339828491
Batch 135 has loss: 2.0003626346588135
Batch 150 has loss: 1.9396114349365234
Batch 165 has loss: 1.9316234588623047
Batch 180 has loss: 1.9001574516296387
Batch 195 has loss: 1.86001455783844
Batch 210 has loss: 1.8456873893737793
Batch 225 has loss: 1.830087661743164
Batch 240 has loss: 1.8065807819366455
Batch 255 has loss: 1.7985484600067139
Batch 270 has loss: 1.7689565420150757
Batch 285 has loss: 1.7374876737594604
Batch 300 has loss: 1.7554261684417725
Batch 315 has loss: 1.753662109375
Batch 330 has loss: 1.726441740989685
Batch 345 has loss: 1.7080533504486084
Batch 360 has loss: 1.7228367328643799
Batch 375 has loss: 1.7124234437942505
Batch 39

Batch 420 has loss: 1.327275276184082
Batch 435 has loss: 1.2881848812103271
Time: 6.282207385698954
EPOCH 6
Batch 0 has loss: 1.3423268795013428
Batch 15 has loss: 1.3109989166259766
Batch 30 has loss: 1.3062008619308472
Batch 45 has loss: 1.3411693572998047
Batch 60 has loss: 1.3247040510177612
Batch 75 has loss: 1.3381891250610352
Batch 90 has loss: 1.3324694633483887
Batch 105 has loss: 1.330281138420105
Batch 120 has loss: 1.3548778295516968
Batch 135 has loss: 1.3468801975250244
Batch 150 has loss: 1.3455208539962769
Batch 165 has loss: 1.3536370992660522
Batch 180 has loss: 1.3098607063293457
Batch 195 has loss: 1.313861608505249
Batch 210 has loss: 1.3218474388122559
Batch 225 has loss: 1.3366203308105469
Batch 240 has loss: 1.3015645742416382
Batch 255 has loss: 1.3520522117614746
Batch 270 has loss: 1.331685185432434
Batch 285 has loss: 1.3114471435546875
Batch 300 has loss: 1.342331886291504
Batch 315 has loss: 1.3250279426574707
Batch 330 has loss: 1.3207647800445557
Batch 

Batch 375 has loss: 1.2562503814697266
Batch 390 has loss: 1.2899831533432007
Batch 405 has loss: 1.2629534006118774
Batch 420 has loss: 1.277705192565918
Batch 435 has loss: 1.2471891641616821
Time: 6.285720956325531
EPOCH 13
Batch 0 has loss: 1.268730640411377
Batch 15 has loss: 1.2647590637207031
Batch 30 has loss: 1.2819359302520752
Batch 45 has loss: 1.291440486907959
Batch 60 has loss: 1.2652580738067627
Batch 75 has loss: 1.2442091703414917
Batch 90 has loss: 1.239975929260254
Batch 105 has loss: 1.2456506490707397
Batch 120 has loss: 1.264572262763977
Batch 135 has loss: 1.2538275718688965
Batch 150 has loss: 1.2668341398239136
Batch 165 has loss: 1.285172462463379
Batch 180 has loss: 1.2615360021591187
Batch 195 has loss: 1.265581488609314
Batch 210 has loss: 1.2676359415054321
Batch 225 has loss: 1.275172233581543
Batch 240 has loss: 1.2755517959594727
Batch 255 has loss: 1.2781726121902466
Batch 270 has loss: 1.2707631587982178
Batch 285 has loss: 1.2889281511306763
Batch 30

KeyboardInterrupt: 

### Training for a few more epochs to see if it starts "Paying Attention"

In [94]:
for epoch in range(5):
    attn_mask = train(model, train_loader, criterion, optimizer, epoch)
    plot_attn_flow(attn_mask, f'Epoch_{10+epoch}.png')
    # val()
    test(model, test_loader, epoch)
    print(f'EPOCH {epoch}')

Batch 0 has loss: 1.2587796449661255
Batch 15 has loss: 1.2202048301696777
Batch 30 has loss: 1.2254769802093506
Batch 45 has loss: 1.2266666889190674
Batch 60 has loss: 1.243112564086914
Batch 75 has loss: 1.2216757535934448
Batch 90 has loss: 1.258890986442566
Batch 105 has loss: 1.2073585987091064
Batch 120 has loss: 1.246994972229004
Batch 135 has loss: 1.245572566986084
Batch 150 has loss: 1.2636767625808716
Batch 165 has loss: 1.2405892610549927
Batch 180 has loss: 1.220415472984314
Batch 195 has loss: 1.2707115411758423
Batch 210 has loss: 1.2565006017684937
Batch 225 has loss: 1.2359211444854736
Batch 240 has loss: 1.2797437906265259
Batch 255 has loss: 1.2618598937988281
Batch 270 has loss: 1.2338792085647583
Batch 285 has loss: 1.2452365159988403
Batch 300 has loss: 1.2694497108459473
Batch 315 has loss: 1.238133430480957
Batch 330 has loss: 1.2468219995498657
Batch 345 has loss: 1.2857352495193481
Batch 360 has loss: 1.259308934211731
Batch 375 has loss: 1.2313255071640015
B

KeyboardInterrupt: 

# Submission 

In [95]:
model.eval()
Final = []
for i, (X,X_len) in enumerate(test_loader):
    X,X_len = X.to(DEVICE),X_len.to(DEVICE)
    out,_ = model(X,X_len,isTrain=False)
    _,out2 = torch.max(out,dim=2)
    out2 = out2.detach().cpu().numpy()
    for batch in out2:
        out = []
        for word in batch:
            if word == 33: # 33 --> <sos>
                None
            elif word == 34: # 34 --> <eos>
                break
            else:
                out.append(LETTER_LIST[word])
        Final.append(''.join(out))

In [96]:
Final[300]

'es and the street was a great deal of the street and the street was a great deal of the street and the street was a great deal of the street and the street was a great deal of the street and the street was a great deal of the street and the street wa'

In [98]:
import pandas as pd
ID = [i for i in range(len(Final))]
df_pred = pd.DataFrame(data={'Id':ID,'label':Final})
df_pred.to_csv(f'Prediction2.csv',index=False)
!kaggle competitions submit -c 11-785-fall-20-homework-4-part-2 -f Prediction2.csv -m "Submission last"

100%|█████████████████████████████████████████| 652k/652k [00:03<00:00, 206kB/s]
Successfully submitted to 11-785-Fall-20-Homework 4 Part 2