In [1]:
import numpy as np
import torch 
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.nn.utils as utils
from torch.nn.utils.rnn import *
import pickle as pk
from torchnlp.nn import WeightDrop , LockedDropout
from torch.utils.data import DataLoader, Dataset 
from torch.distributions.gumbel import Gumbel
from torchnlp.nn import LockedDropout
import time
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a global by the model, collate and training cells below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# **Load data**

Loading all the numpy files containing the utterance information and text information

In [2]:
# Speech features for the train / dev / test splits.
# NOTE(review): allow_pickle=True unpickles arbitrary Python objects - only load trusted files.
# presumably each entry is one utterance's feature array - TODO confirm against the data files.
speech_train = np.load('train_new.npy', allow_pickle=True, encoding='bytes')
speech_valid = np.load('dev_new.npy', allow_pickle=True, encoding='bytes')
speech_test = np.load('test_new.npy', allow_pickle=True, encoding='bytes')

# Word-level transcripts for train / dev; no transcript file is loaded for the test split.
transcript_train = np.load('./train_transcripts.npy', allow_pickle=True,encoding='bytes')
transcript_valid = np.load('./dev_transcripts.npy', allow_pickle=True,encoding='bytes')
print("Data Loading Sucessful.....")

Data Loading Sucessful.....


In [3]:
# Inspect the raw validation transcripts: each entry is an array of byte-string words.
print(transcript_valid)

[array([b'THE', b'FEMALE', b'PRODUCES', b'A', b'LITTER', b'OF', b'TWO',
       b'TO', b'FOUR', b'YOUNG', b'IN', b'NOVEMBER', b'AND', b'DECEMBER'],
      dtype='|S8')
 array([b'NUMEROUS', b'WORKS', b'OF', b'ART', b'ARE', b'BASED', b'ON',
       b'THE', b'STORY', b'OF', b'THE', b'SACRIFICE', b'OF', b'ISAAC'],
      dtype='|S9')
 array([b'THEIR', b'SOLUTION', b'REQUIRES', b'DEVELOPMENT', b'OF', b'THE',
       b'HUMAN', b'CAPACITY', b'FOR', b'SOCIAL', b'INTEREST'],
      dtype='|S11')
 ...
 array([b'AND', b'IT', b'HAPPENED', b'PERIOD', b'DOUBLE-QUOTE'],
      dtype='|S12')
 array([b'THE', b'HOUSE', b'SO', b'FAR', b'HAS', b'PROPOSED', b'A',
       b'LOAN', b'HYPHEN', b'GUARANTEE', b'BUDGET', b'OF', b'ONLY',
       b'TWO', b'POINT', b'FOUR', b'BILLION', b'DOLLARS', b'COMMA',
       b'BUT', b'BACKERS', b'ARE', b'PRESSING', b'FOR', b'MORE',
       b'PERIOD'], dtype='|S9')
 array([b'DOUBLE-QUOTE', b"HAVEN'T", b'WE', b'ALREADY', b'GONE',
       b'OVERBOARD', b'IN', b'S.', b'B.', b'A.', b'BUDGET'

In [None]:
# Shape of the feature array of one validation utterance (index 4).
print(speech_valid[4].shape)

# **Transform Text Data**

The `transform_letter_to_index` function converts alphabetical input to numerical input: each letter is replaced by its position in `letter_list` plus one, so index 0 never appears in an encoded transcript.

In [3]:
# Output vocabulary. Positions 0 and 1 hold the start / end-of-sentence markers.
# transform_letter_to_index encodes a symbol as (position + 1), so index 0 is never
# produced by the encoding; the collate function pads transcripts with zeros.
letter_list = ['<sos>','<eos>','A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',\
             'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '-', "'", '.', '_', '+', ' ']

In [4]:
def transform_letter_to_index(transcript, letter_list):
    '''
    Convert word-level transcripts into lists of character indices.

    The words of every sentence are joined with single spaces and each character
    is replaced by its position in ``letter_list`` plus one (index 0 is never
    produced). Every encoded sentence starts with 1 and ends with 2, i.e. the
    shifted positions of ``letter_list[0]`` and ``letter_list[1]``, which are
    expected to be the '<sos>' and '<eos>' markers.

    :param transcript: (N, ) Iterable of sentences; each sentence is an iterable
                       of words given as ``str`` or ``bytes``
    :param letter_list: Letter list defined above
    :return letter_to_index_list: List of N lists of plain Python ints
    :raises ValueError: If a character is not present in ``letter_list``
    '''
    # Build the lookup table once instead of scanning letter_list for every
    # character. setdefault keeps the first occurrence, like list.index does.
    char_to_index = {}
    for position, symbol in enumerate(letter_list):
        char_to_index.setdefault(symbol, position + 1)

    sos_index, eos_index = 1, 2

    output_list = []
    for words in transcript:
        # np.bytes_ is a subclass of bytes, so both raw and numpy byte strings decode here.
        decoded_words = [w.decode() if isinstance(w, bytes) else str(w) for w in words]
        text = " ".join(decoded_words)

        sentence_list = [sos_index]
        for character in text:
            index = char_to_index.get(character)
            if index is None:
                raise ValueError(
                    "character %r of transcript %r is not in letter_list" % (character, text)
                )
            sentence_list.append(index)
        sentence_list.append(eos_index)

        output_list.append(sentence_list)

    return output_list

In [5]:
# Encode the train / dev transcripts as lists of character indices
# (one list per sentence, wrapped in the start / end markers).
character_text_train = transform_letter_to_index(transcript_train, letter_list)
character_text_valid = transform_letter_to_index(transcript_valid, letter_list)

print("Transformed data sucessfully.....")

Transformed data sucessfully.....


In [11]:
# Sanity check: the first encoded validation sentence starts with 1 and ends with 2.
print(character_text_valid[0])

[1, 22, 10, 7, 34, 8, 7, 15, 3, 14, 7, 34, 18, 20, 17, 6, 23, 5, 7, 21, 34, 3, 34, 14, 11, 22, 22, 7, 20, 34, 17, 8, 34, 22, 25, 17, 34, 22, 17, 34, 8, 17, 23, 20, 34, 27, 17, 23, 16, 9, 34, 11, 16, 34, 16, 17, 24, 7, 15, 4, 7, 20, 34, 3, 16, 6, 34, 6, 7, 5, 7, 15, 4, 7, 20, 2]



# **Pyramidal BiLSTM**
 

*   An utterance (speech input) can be hundreds to thousands of frames long.
*   The paper reports that using a plain LSTM directly as the encoder resulted in slow convergence and inferior results, even after extensive training.
*   The major reason is inability of `AttendAndSpell` operation to extract relevant information from a large number of input steps.

In [6]:

class pBLSTM(nn.Module):
    """
    One pyramidal bidirectional LSTM layer.

    Adjacent time steps of the (packed) input are concatenated pairwise along the
    feature axis, halving the number of time steps and doubling the feature size,
    before the result is fed to a single-layer bidirectional LSTM.
    """

    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.blstm = nn.LSTM(input_dim, hidden_dim, num_layers=1, bidirectional=True)
        self.do1 = nn.Dropout(0.2)

    def forward(self, x, new_length_list, train):
        """
        :param x: PackedSequence holding the output of the previous layer
        :param new_length_list: per-sequence lengths after halving, used to re-pack
        :param train: when truthy, dropout is applied to the LSTM output
        :return output: PackedSequence with the bidirectional LSTM output
        """
        padded, _ = utils.rnn.pad_packed_sequence(x)
        total_steps, batch, features = padded.shape

        # Drop the trailing frame when the step count is odd so frames pair up evenly.
        even_steps = total_steps - (total_steps % 2)
        padded = padded[:even_steps]

        # (T, N, F) -> (N, T, F) -> (N, T/2, 2F) -> (T/2, N, 2F)
        paired = (
            padded.permute(1, 0, 2)
            .reshape(batch, even_steps // 2, 2 * features)
            .permute(1, 0, 2)
        )

        packed = utils.rnn.pack_padded_sequence(paired, new_length_list, enforce_sorted=False)
        encoded, _ = self.blstm(packed)
        if not train:
            return encoded

        # Dropout works on padded tensors, so unpack, apply it and pack again.
        unpacked, _ = utils.rnn.pad_packed_sequence(encoded)
        return utils.rnn.pack_padded_sequence(
            self.do1(unpacked), new_length_list, enforce_sorted=False
        )

# **Encoder**

*    Encoder takes the utterances as inputs and returns the key and value.
*    Key and value are nothing but simple projections of the output from pBLSTM network.

In [7]:

class Encoder(nn.Module):
    """
    Listener: a bidirectional LSTM followed by three pyramidal BiLSTM layers.

    Each pyramidal layer halves the time resolution, so the encoder output has
    one eighth of the input time steps. Two linear layers project the output to
    the attention keys and values.
    """

    def __init__(self, input_dim, hidden_dim, value_size=128,key_size=128):
        super(Encoder, self).__init__()
        self.lstm = nn.LSTM(input_size=input_dim,hidden_size=hidden_dim,num_layers=1,bidirectional=True)

        # Each pBLSTM concatenates two bidirectional frames: 2 * (2 * hidden_dim) inputs.
        self.plstm1 = pBLSTM(hidden_dim*4,hidden_dim)
        self.plstm2 = pBLSTM(hidden_dim*4,hidden_dim)
        self.plstm3 = pBLSTM(hidden_dim*4,hidden_dim)

        # Kept for compatibility with existing code; not used in forward().
        self.do1=LockedDropout(0.2)
        self.do2=LockedDropout(0.2)
        self.do3=LockedDropout(0.2)

        # The key projection must produce key_size features and the value
        # projection value_size features (the two sizes were swapped before,
        # which only worked because both default to 128).
        self.key_network = nn.Linear(hidden_dim*2, key_size)
        self.value_network = nn.Linear(hidden_dim*2, value_size)

    def forward(self,x, lens,train):
        """
        :param x: padded speech batch, time-major (packed with batch_first=False)
        :param lens: length of every utterance in the batch
        :param train: forwarded to the pBLSTM layers to switch their dropout on
        :return keys: key projection of the encoder output
        :return value: value projection of the encoder output
        :return new_length_list1: utterance lengths after the 8x time reduction
        """
        rnn_inp = utils.rnn.pack_padded_sequence(x, lengths=lens, batch_first=False, enforce_sorted=False)
        outputs, _ = self.lstm(rnn_inp)

        length = np.array(list(lens))

        # Every pyramidal layer halves the sequence lengths.
        o_plstm1=self.plstm1(outputs,length//2 , train)
        o_plstm2=self.plstm2(o_plstm1,length//4 , train)
        o_plstm3=self.plstm3(o_plstm2,length//8 , train)

        linear_input, _ = utils.rnn.pad_packed_sequence(o_plstm3)

        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)

        new_length_list1 = length//8

        return keys, value, new_length_list1


# **Attention**

*    Attention is calculated using key, value and query from Encoder and decoder.

Below are the set of operations you need to perform for computing attention.

```
energy = bmm(key, query)
attention = softmax(energy)
context = bmm(attention, value)
```



In [8]:
class Attention(nn.Module):
  '''
  Dot-product attention over the encoder time steps, masked by utterance length.
  '''
  def __init__(self):
    super(Attention, self).__init__()
  def forward(self, query, key, value, lens):
    '''
    :param query :(N,context_size) Query is the output of LSTMCell from Decoder
    :param key: (T,N,key_size) Key Projection from Encoder per time step
    :param value: (T,N,value_size) Value Projection from Encoder per time step
    :param lens: (N,) number of valid encoder time steps of every batch element
    :return context: (N,value_size) Attended Context
    '''

    key = key.transpose(0, 1)      # (N, T, key_size)
    value = value.transpose(0, 1)  # (N, T, value_size)

    energy = torch.bmm(key, query.unsqueeze(2)).squeeze(2)  # (N, T)

    # Build the padding mask directly on the device of the inputs rather than
    # on the CPU followed by a copy to a global device.
    lens = torch.as_tensor(lens, dtype=torch.long, device=energy.device)
    steps = torch.arange(key.size(1), device=energy.device)
    mask = steps.unsqueeze(0) >= lens.unsqueeze(1)

    # Padded time steps get a very low energy so that softmax assigns them ~0 weight.
    energy = energy.masked_fill(mask, -1e9)

    attention = nn.functional.softmax(energy, dim=1)

    context = torch.bmm(attention.unsqueeze(1), value).squeeze(1)

    return context
    

# **Decoder**

*    As mentioned in Recitation-9, each forward call of the decoder deals with just one time step. Thus we use LSTMCell instead of LSTM here.
*    The output of the second LSTMCell can be used as the query for the attention module.
*    The `value` that would otherwise be fed to the decoder can be replaced by the context obtained from attention.
*    Methods like Gumbel noise and teacher forcing can also be incorporated to improve performance.

In [9]:

class Decoder(nn.Module):
  '''
  Speller: two stacked LSTMCells that emit one character distribution per step.

  At every step the embedding of the previous character is concatenated with the
  previous context, run through both cells, and the second cell's output is used
  as the attention query. The logits are computed from the cell output
  concatenated with the new context.
  '''
  def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128,  isAttended=False):
    super(Decoder, self).__init__()
    self.embedding = nn.Embedding(vocab_size, hidden_dim)

    self.lstm1 = nn.LSTMCell(input_size=hidden_dim+value_size, hidden_size=hidden_dim)

    self.do1= nn.Dropout(0.2)
    self.do2= nn.Dropout(0.2)

    # The second cell's output is the attention query, hence hidden_size=key_size.
    self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)

    self.isAttended = isAttended
    if(isAttended):
      self.attention = Attention()
    self.character_prob = nn.Linear(key_size+value_size,vocab_size)

  def forward(self, key, values,input_length,rate, text=None, train=True):
    '''
    :param key :(T,N,key_size) Output of the Encoder Key projection layer
    :param values: (T,N,value_size) Output of the Encoder Value projection layer
    :param input_length: (N,) valid encoder time steps per batch element (attention mask)
    :param rate: probability of feeding the model's own (Gumbel-perturbed) previous
                 prediction instead of the ground-truth character during training
    :param text: (N,text_len) Batch input of text with text_length
    :param train: Train or eval mode
    :return predictions: (N,steps,vocab_size) character logits; steps is
                         text_len-1 when training and 250 otherwise
    '''
    batch_size = key.shape[1]
    if(train):
      max_len =  text.shape[1]-1
      embeddings = self.embedding(text)
    else:
      max_len = 250

    predictions = []
    hidden_states = [None, None]
    prediction = None

    # Initial context: the first encoder value frame.
    x_att = values[0,:,:]

    gumbel_rate =0.1

    # One <sos> token (index 1) per batch element so that greedy decoding also
    # works for batch sizes larger than one.
    sos_tokens = torch.full((batch_size,), 1, dtype=torch.long, device=key.device)

    for i in range(max_len):
      if(train):
        # 0 -> feed the model's own previous prediction, 1 -> teacher forcing.
        teacher_forcing_choice = np.random.choice([0,1],p=[rate,1-rate])

        if(teacher_forcing_choice==0 and i > 0):
          # Perturb the previous logits with Gumbel noise before taking the argmax.
          # Previously the result of this branch was always overwritten by the
          # ground-truth embedding, so the sampling never took effect.
          noisy_logits = Gumbel(prediction.detach(), prediction.new_tensor([gumbel_rate])).sample()
          char_embed = self.embedding(noisy_logits.argmax(dim=-1))
        else:
          # The first step has no previous prediction and always uses the ground truth.
          char_embed = embeddings[:,i,:]
      else:
        if(i==0):
          char_embed = self.embedding(sos_tokens)
        else:
          char_embed = self.embedding(prediction.argmax(dim=-1))

      inp = torch.cat([char_embed,x_att], dim=1)

      hidden_states[0] = self.lstm1(inp,hidden_states[0])
      first_out = hidden_states[0][0]
      if(train):
        first_out = self.do1(first_out)

      hidden_states[1] = self.lstm2(first_out,hidden_states[1])
      output = hidden_states[1][0]
      if(train):
        output = self.do2(output)

      if(self.isAttended):
        x_att = self.attention(output,key,values,input_length)
      else:
        # Without attention fall back to the encoder value of the current step,
        # clamped to the last frame once the encoder output is exhausted.
        x_att = values[min(i, values.size(0)-1),:,:]

      prediction = self.character_prob(torch.cat([output, x_att], dim=1))
      predictions.append(prediction.unsqueeze(1))

    return torch.cat(predictions, dim=1)

# **Sequence to Sequence Model**

*    We train an end-to-end sequence-to-sequence model comprising an Encoder and a Decoder.

In [12]:
class Seq2Seq(nn.Module):
    """
    End-to-end model: the Encoder produces keys / values, the Decoder spells characters.
    """
    def __init__(self,input_dim,vocab_size,hidden_dim,value_size=128, key_size=128,isAttended=False):
        super(Seq2Seq,self).__init__()

        # Forward the projection sizes so the arguments are honoured; they were
        # previously accepted but ignored, leaving both sub-modules at their defaults.
        self.encoder = Encoder(input_dim, hidden_dim, value_size=value_size, key_size=key_size)
        # The decoder runs with twice the encoder hidden size.
        self.decoder = Decoder(vocab_size, hidden_dim*2, value_size=value_size, key_size=key_size, isAttended=isAttended)

    def forward(self,speech_input, speech_len, rate, text_input=None,train=True):
        """
        :param speech_input: padded speech batch passed to the encoder
        :param speech_len: length of every utterance in the batch
        :param rate: sampling rate forwarded to the decoder
        :param text_input: target character indices; only used when train is truthy
        :param train: selects teacher-forced training or free-running decoding
        :return predictions: character logits produced by the decoder
        """
        key, value,new_length_list = self.encoder(speech_input, speech_len , train=train)

        if(train):
              predictions = self.decoder(key, value, rate=rate, text=text_input,input_length=new_length_list)
        else:
              predictions = self.decoder(key, value, rate=rate, text=None, train=False, input_length=new_length_list)
        return predictions


# **DataLoader**

Below is the dataloader for the homework.

*    You are expected to fill in the collate function if you use this code skeleton.

In [13]:
class Speech2Text_Dataset(Dataset):
    """
    Dataset of speech feature arrays with optional encoded transcripts.

    In training mode every item is a ``(speech, text)`` pair of tensors, otherwise
    only the speech tensor is returned.
    """
    def __init__(self, speech, text=None, train=True):
        self.speech = speech
        self.train = train
        # Always define the attribute so a missing transcript is detectable later.
        self.text = text

    def __len__(self):
        return len(self.speech)

    def __getitem__(self, index):
        features = torch.tensor(self.speech[index].astype(np.float32))
        if not self.train:
            return features
        if self.text is None:
            raise ValueError("Speech2Text_Dataset was created with train=True but without text")
        return features, torch.tensor(self.text[index])

In [14]:
def collate_train(batch_data):
    '''
    Pad a batch of (speech, text) pairs to common lengths.

    :param batch_data: list of (speech, text) pairs, speech of shape (T_i, F)
                       and text of shape (L_i,)
    :return X: (T_max, N, F) float tensor, zero padded, time-major
    :return Y: (N, L_max) long tensor of character indices, zero padded
    :return X_length: tuple with the N utterance lengths
    :return Y_length: tuple with the N transcript lengths
    '''
    inputs, targets = zip(*batch_data)

    speech = [torch.as_tensor(x, dtype=torch.float32) for x in inputs]
    # Token ids are integers; keeping them as long avoids a float round trip.
    text = [torch.as_tensor(y, dtype=torch.long) for y in targets]

    X_length = tuple(x.shape[0] for x in speech)
    Y_length = tuple(y.shape[0] for y in text)

    # pad_sequence infers the feature size from the data, so nothing is
    # hard-coded to 40 features any more.
    X = pad_sequence(speech)
    Y = pad_sequence(text, batch_first=True)

    return X, Y , X_length, Y_length
 



In [16]:
def collate_test(batch_data):
  '''
  Pad a batch of speech tensors to a common length.

  The padded batch is returned on the CPU; the caller moves it to the compute
  device. This keeps the collate function usable from DataLoader worker
  processes and independent of a global device variable.

  :param batch_data: list of speech tensors of shape (T_i, F)
  :return inputs: (T_max, N, F) zero padded, time-major tensor
  :return input_lens: tuple with the N utterance lengths
  '''
  input_lens = tuple(len(seq) for seq in batch_data)
  # as_tensor avoids the extra copy torch.tensor() makes for tensor inputs.
  inputs = [torch.as_tensor(seq) for seq in batch_data]
  return pad_sequence(inputs), input_lens

In [17]:
# Training data carries encoded transcripts; the test and validation datasets are
# built without text (train=False), so they only yield speech tensors.
Speech2Text_train_Dataset = Speech2Text_Dataset(speech_train, character_text_train)
Speech2Text_test_Dataset = Speech2Text_Dataset(speech_test, None, False)
Speech2Text_valid_Dataset = Speech2Text_Dataset(speech_valid, None,False)

In [18]:
# Training batches are shuffled; validation and test utterances are decoded one
# at a time (batch_size=1) in their original order.
train_loader = DataLoader(Speech2Text_train_Dataset, batch_size=64, shuffle=True, collate_fn=collate_train)
valid_loader = DataLoader(Speech2Text_valid_Dataset, batch_size=1, shuffle=False, collate_fn=collate_test)
test_loader = DataLoader(Speech2Text_test_Dataset, batch_size=1, shuffle=False, collate_fn=collate_test)

# **Learning**

Defining the Sequence to Sequence model, optimizer and criterion for learning.

Train routine is also provided here which can be referenced while writing validation and test routine.

In [19]:
def init_weights(m):
    """
    Xavier-normal initialise every weight matrix of ``m`` and its sub-modules.

    Parameters whose name contains 'weight' but that have fewer than two
    dimensions (e.g. normalisation scales) are left untouched, because Xavier
    initialisation is undefined for them and would raise.

    :param m: module whose parameters are initialised in place
    """
    for name, param in m.named_parameters():

      if 'weight' in name and param.dim() >= 2:
         nn.init.xavier_normal_(param)

In [20]:
# vocab_size is len(letter_list)+1 because character indices are shifted by one
# (index 0 is the padding value used by the collate function).
model = Seq2Seq(input_dim=40,vocab_size=len(letter_list)+1,hidden_dim=256,isAttended=True)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
# reduction='none' keeps the per-token losses so that padding can be masked out in train().
criterion = nn.CrossEntropyLoss(reduction='none').to(device)
# init_weights already walks all sub-module parameters, so one call on the root
# is enough; model.apply() re-initialised every weight once per ancestor module.
init_weights(model)
start_epoch=0

In [21]:
# NOTE(review): hard-coded absolute path - adjust to the local checkpoint directory.
path = '/home/ubuntu/hw4part2/'
# map_location lets a checkpoint saved on the GPU be restored on a CPU-only machine.
checkpoint = torch.load(path + "model_params_256gumbel15.tar", map_location=device)
# Use the device selected at the top of the notebook instead of forcing CUDA.
model.to(device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Training resumes with the epoch after the one stored in the checkpoint.
start_epoch = checkpoint['epoch']+1

In [121]:
path = '/home/ubuntu/hw4part2/'
def train(model,train_loader, num_epochs, criterion, optimizer):
  """
  Train the model and save a checkpoint after every epoch.

  Reads the globals ``start_epoch`` (first epoch to run), ``device`` and ``path``
  (checkpoint directory).

  :param model: sequence-to-sequence model called with speech_input, speech_len,
                text_input and rate
  :param train_loader: yields (speech, text, speech_len, text_len) batches with
                       text of shape (B, L)
  :param num_epochs: epoch index at which training stops (exclusive)
  :param criterion: loss returning one value per token (reduction='none')
  :param optimizer: optimizer updating the model parameters
  """
  model.train()

  # The sampling rate starts at 0.105 and grows by 5% per epoch while it is
  # below 0.4. Replay the schedule for the epochs that were already trained so
  # that resuming from a checkpoint continues with the right rate.
  rate=0.1*(1.05)
  for _ in range(start_epoch):
    if(rate < 0.4):
      rate = rate*1.05

  current_loss = float('nan')

  for epochs in range(start_epoch,num_epochs):
    print("Epochs:",epochs)

    for (batch_num, collate_output) in enumerate(train_loader):
      speech_input, text_input, speech_len, text_len = collate_output
      speech_input = speech_input.to(device)
      # text_input is expected as (Batch_size, Sequence_length) i.e. (B,L)
      text_input = text_input.long().to(device)

      predictions = model(speech_input= speech_input, speech_len=speech_len ,text_input=text_input, rate=rate)

      # Step i predicts character i+1, so the targets are the text without <sos>.
      targets = text_input[:,1:]

      # A target position j is valid when j < text_len - 1 (length without <sos>).
      lengths = torch.as_tensor(text_len, device=device)
      positions = torch.arange(targets.size(1), device=device)
      mask = (positions.unsqueeze(0) < (lengths.unsqueeze(1) - 1)).float()

      predictions = predictions.contiguous().view(-1, predictions.size(-1))
      loss = criterion(predictions, targets.contiguous().view(-1))

      # Average the loss over the real (non-padding) tokens only.
      masked_loss = torch.sum(loss*mask.reshape(-1))
      masked_loss1 = masked_loss / mask.sum()

      optimizer.zero_grad()
      masked_loss1.backward()

      # clip_grad_norm_ is the in-place replacement of the deprecated clip_grad_norm.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 2)
      optimizer.step()

      current_loss = masked_loss1.item()

      if  batch_num % 25 == 1:
        print('train_loss', current_loss)
        print("perplexity",np.exp(current_loss))

    if(rate < 0.4):
        rate = rate*1.05

    # 'loss' holds the loss of the last batch of the epoch.
    torch.save({
        'epoch': epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': current_loss,
        },path + "model_params_gumbel" + str(epochs) + ".tar" )

In [None]:
# Train up to (but not including) epoch 30, starting from the global start_epoch.
train(model, train_loader, 30, criterion, optimizer)

In [23]:
# NOTE(review): hard-coded absolute paths - adjust to the local checkpoint directories.
path = '/home/ubuntu/hw4part2/'
path1= '/home/ubuntu/hw4part2/withoutgumbel/'
# map_location lets a checkpoint saved on the GPU be restored on a CPU-only machine.
checkpoint = torch.load(path + "model_params_256gumbel15.tar", map_location=device)
# Use the device selected at the top of the notebook instead of forcing CUDA.
model.to(device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Training loss stored with the checkpoint.
print(checkpoint['loss'])

0.08902769749237045


In [None]:
def test(model,test_loader,criterion, optimizer):
    """
    Greedily decode every utterance of ``test_loader`` into a string.

    Reads the globals ``letter_list`` and ``device``. Decoding runs without
    gradient tracking, and the model's training flag is restored afterwards.

    :param model: sequence-to-sequence model called with train=False
    :param test_loader: yields (speech, speech_len) batches
    :param criterion: unused, kept for interface compatibility
    :param optimizer: unused, kept for interface compatibility
    :return output_list: one decoded sentence per utterance, in loader order
    """
    # Characters are encoded as (position in letter_list) + 1; index 0 is padding.
    sos_index = letter_list.index('<sos>') + 1
    eos_index = letter_list.index('<eos>') + 1

    output_list = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for (batch_num, collate_output) in enumerate(test_loader):
                speech_input, speech_len = collate_output
                speech_input = speech_input.to(device)

                predictions = model(speech_input, speech_len, train=False, rate=0.1)
                # (N, steps) most likely character index per step
                out_tokens = torch.argmax(predictions, dim=-1).cpu().tolist()

                for token_row in out_tokens:
                    characters = []
                    for token in token_row:
                        if token == eos_index:
                            break
                        # Padding would wrap around to letter_list[-1] through
                        # negative indexing, and <sos> is not part of a
                        # transcript, so both are skipped.
                        if token == 0 or token == sos_index:
                            continue
                        characters.append(letter_list[token - 1])
                    sentence = "".join(characters)

                    print(sentence)
                    output_list.append(sentence)
    finally:
        model.train(was_training)

    return output_list
            

    
    
# Decode the test set; output_list holds one predicted sentence per utterance.
output_list = test(model,test_loader,criterion,optimizer)  

In [25]:
# NOTE(review): hard-coded absolute path - adjust to the local output directory.
path ='/home/ubuntu/hw4part2/'
# Size the output from the predictions instead of a hard-coded 523 rows, so the
# cell also works when the number of decoded utterances differs.
num_predictions = len(output_list)
ids=np.arange(num_predictions).reshape(num_predictions,1)
string_list = np.array(output_list).reshape(num_predictions,1)
# One decoded sentence per line.
np.savetxt(path + "outputsubmissiongumbel15_epoch.csv", string_list, delimiter=',',fmt="%s")