In [0]:
from google.colab import drive
drive.mount('/content/drive')

# Importing Libraries

In [0]:
import numpy as np
import torch
import random
from torch import nn
from torch.nn.utils.rnn import *
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tqdm import tqdm
from operator import itemgetter
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

In [0]:
!pip install python-Levenshtein
import Levenshtein

# Loading Datasets

In [0]:
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load these files from a trusted source.
train_x = np.load('/content/drive/My Drive/train_new.npy', allow_pickle=True, encoding='bytes')
valid_x = np.load('/content/drive/My Drive/dev_new.npy', allow_pickle=True, encoding='bytes')
test_x = np.load('/content/drive/My Drive/test_new.npy', allow_pickle=True, encoding='bytes')

train_y = np.load('/content/drive/My Drive/train_transcripts.npy', allow_pickle=True,encoding='bytes')
valid_y = np.load('/content/drive/My Drive/dev_transcripts.npy', allow_pickle=True,encoding='bytes')

# Character vocabulary: '<sos>' first, then every character that occurs in the
# training transcripts in sorted order, then ' ', '<eos>' and '<pad>'.
# NOTE(review): other cells hard-code 32 / 33 / 34 for ' ' / '<eos>' / '<pad>';
# that only holds when the training transcripts contain exactly 31 distinct
# characters -- confirm against the data.
train_chars = set()
for transcript in train_y:
    for word in transcript:
        train_chars.update(word.decode("utf-8"))
vocab = ['<sos>'] + sorted(train_chars) + [' ', '<eos>', '<pad>']
letter2index = dict(zip(vocab, range(len(vocab))))
index2letter = dict(zip(range(len(vocab)),vocab))


def encode_transcript(words):
    """Turn one transcript (a sequence of byte-string words) into a LongTensor.

    The words are decoded as UTF-8, joined with single spaces, mapped character
    by character through ``letter2index`` and wrapped in '<sos>' ... '<eos>'.
    Raises KeyError for a character that is not in the vocabulary.
    """
    sentence = ' '.join(word.decode("utf-8") for word in words)
    indices = [letter2index['<sos>']]
    indices.extend(letter2index[ch] for ch in sentence)
    indices.append(letter2index['<eos>'])
    return torch.LongTensor(indices)


train_Y = [encode_transcript(words) for words in train_y]
valid_Y = [encode_transcript(words) for words in valid_y]

In [0]:
class Speech2TextDataset(Dataset):
    """Dataset of utterance feature tensors with optional target sequences.

    Args:
        speech: iterable of per-utterance feature arrays; each one is converted
            to a ``torch.FloatTensor`` when the dataset is built.
        text: optional indexable of targets aligned with ``speech``. The
            ``text`` attribute is only created when this is not None.
        isTrain: when equal to True, items are ``(speech, text)`` pairs;
            otherwise items are the speech tensor alone.
    """

    def __init__(self, speech, text=None, isTrain=True):
        self.isTrain = isTrain
        # Convert every utterance once, up front, instead of on each access.
        self.speech = list(map(torch.FloatTensor, speech))
        if text is None:
            return
        self.text = text

    def __len__(self):
        """Number of utterances."""
        return len(self.speech)

    def __getitem__(self, index):
        """Return ``(speech, text)`` in training mode, else just ``speech``."""
        features = self.speech[index]
        if self.isTrain == True:
            return features, self.text[index]
        return features


def collate_train(seq_list):
    """Collate ``(speech, target)`` pairs into padded batch tensors.

    Returns:
        inputs: speech padded with zeros, time-major (``pad_sequence`` default).
        targets: targets padded with index 34, batch-first.
        X_lens: LongTensor with the unpadded length of every speech sequence.
        Y_lens: LongTensor with the unpadded length of every target sequence.
    """
    speech = [pair[0] for pair in seq_list]
    transcripts = [pair[1] for pair in seq_list]

    # Record the true lengths before padding hides them.
    X_lens = torch.LongTensor(list(map(len, speech)))
    Y_lens = torch.LongTensor(list(map(len, transcripts)))

    padded_speech = pad_sequence(speech)
    padded_text = pad_sequence(transcripts, batch_first=True, padding_value=34)

    return padded_speech, padded_text, X_lens, Y_lens


def collate_test(seq_list):
    """Collate bare speech tensors into a zero-padded, time-major batch.

    Returns:
        inputs: padded speech tensor (``pad_sequence`` default layout).
        X_lens: LongTensor with the unpadded length of every sequence.
    """
    utterances = list(seq_list)
    X_lens = torch.LongTensor([len(utt) for utt in utterances])
    return pad_sequence(utterances), X_lens

In [0]:
# Datasets: train / validation carry targets, the test split does not.
train_dataset = Speech2TextDataset(train_x, train_Y)
val_dataset = Speech2TextDataset(valid_x, valid_Y)
test_dataset = Speech2TextDataset(test_x, None, False)

# NOTE(review): num_workers=12 may exceed the CPU cores available on the
# host -- confirm it is appropriate for the runtime.
train_loader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
    collate_fn=collate_train,
    num_workers=12,
    pin_memory=True,
)
val_loader = DataLoader(
    val_dataset,
    batch_size=256,
    shuffle=False,
    collate_fn=collate_train,
    num_workers=12,
    pin_memory=True,
)
# One utterance per batch: the beam-search decoding cell handles a single
# utterance at a time.
test_loader = DataLoader(
    test_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_test,
    num_workers=12,
    pin_memory=True,
)

# Model

## Attention

In [0]:
class Attention(nn.Module):
    """Dot-product attention over a length-masked, time-major memory."""

    def __init__(self):
        super(Attention, self).__init__()

    def forward(self, query, keys, values,lengths):
        """Attend over ``values`` using the similarity of ``query`` and ``keys``.

        Args:
            query: (batch, key_dim) tensor.
            keys: (time, batch, key_dim) tensor.
            values: (time, batch, value_dim) tensor.
            lengths: (batch,) integer tensor; positions at or beyond a
                sequence's length receive (effectively) zero weight.

        Returns:
            (batch, value_dim) tensor: the attention-weighted sum of values.
        """
        # Work on whatever device the query lives on rather than on a
        # module-level global, so the layer is usable on any device.
        device = query.device
        keys = torch.transpose(keys, 0, 1).to(device)          # (batch, time, key_dim)
        lengths = lengths.to(device)

        energy = torch.bmm(keys, query.unsqueeze(2)).squeeze(2)  # (batch, time)

        # True wherever the time index is past the end of the sequence.
        positions = torch.arange(keys.size(1), device=device).unsqueeze(0)
        mask = positions >= lengths.unsqueeze(1)
        # Out-of-place fill: leaves the bmm result untouched for autograd.
        energy = energy.masked_fill(mask, -1e20)

        weights = nn.functional.softmax(energy, dim=1)
        values = torch.transpose(values, 0, 1)                   # (batch, time, value_dim)
        out = torch.bmm(weights.unsqueeze(1), values).squeeze(1)

        return out

## Encoder

In [0]:
class Encoder(nn.Module):
    """Convolutional + bidirectional-LSTM encoder producing attention keys/values.

    Three stride-2 convolutions (plus a stride-8 1x1 shortcut that is added to
    their output) shorten the time axis by a factor of 8 before two
    bidirectional LSTM stacks. Two linear layers project the LSTM outputs to
    keys (``key_size`` features) and values (``value_size`` features).

    Args:
        input_dim: number of features per input frame.
        hidden_dim: LSTM hidden size per direction.
        value_size: feature size of the returned values.
        key_size: feature size of the returned keys.
    """

    def __init__(self, input_dim, hidden_dim, value_size=128,key_size=128):
        super(Encoder, self).__init__()

        self.cnn1 = torch.nn.Conv1d(input_dim,hidden_dim//4 , 3, stride=2, padding=1,bias=False)
        self.cnn2 = torch.nn.Conv1d(hidden_dim//4, hidden_dim//2, 3, stride=2, padding=1,bias=False)
        self.cnn3 = torch.nn.Conv1d(hidden_dim//2, hidden_dim//2, 3, stride=2, padding=1,bias=False)
        # Shortcut branch: 1x1 convolution with stride 8 so its output length
        # matches the three stacked stride-2 convolutions.
        self.cnn4 = torch.nn.Conv1d(input_dim, hidden_dim//2, 1, stride=8, padding=0,bias=False)
        self.tanh = nn.Hardtanh()
        self.lstm1 = nn.LSTM(input_size=hidden_dim//2, hidden_size=hidden_dim, num_layers=3, dropout=0.5,bidirectional=True)
        self.lstm2 = nn.LSTM(input_size=hidden_dim*2, hidden_size=hidden_dim, bidirectional=True)

        # Keys are projected to key_size and values to value_size. (These two
        # sizes used to be swapped, which only worked when they were equal.)
        self.key_network = nn.Linear(hidden_dim*2, key_size)
        self.value_network = nn.Linear(hidden_dim*2, value_size)

    @staticmethod
    def _downsampled_lengths(lens):
        """Sequence lengths after one kernel-3 / stride-2 / padding-1 convolution."""
        return (torch.floor((((lens - 1)/2) + 1).float())).long()

    def forward(self, x, lens,istraining=True):
        """Encode a padded, time-major batch.

        Args:
            x: (time, batch, input_dim) padded features.
            lens: (batch,) integer tensor of unpadded lengths (any device).
            istraining: enables the functional dropout layers when True. Note
                that this flag, not ``self.training``, controls them.

        Returns:
            keys: (time', batch, key_size)
            value: (time', batch, value_size)
            lens: lengths of the encoded sequences as returned by
                ``pad_packed_sequence``.
            final_state: ``(h_n, c_n)`` of the last LSTM.
        """
        feats = x.transpose(0,1).transpose(1,2)        # (batch, input_dim, time)

        x1 = nn.functional.dropout(self.tanh(self.cnn1(feats)),0.2,training=istraining)
        lens = self._downsampled_lengths(lens)

        x1 = nn.functional.dropout(self.tanh(self.cnn2(x1)),0.2,training=istraining)
        lens = self._downsampled_lengths(lens)

        x1 = self.cnn3(x1)
        lens = self._downsampled_lengths(lens)
        x1 = x1.transpose(1,2).transpose(0,1)          # back to (time', batch, channels)
        shortcut = self.cnn4(feats).transpose(1,2).transpose(0,1)

        x = nn.functional.dropout(self.tanh(shortcut + x1),0.2,training=istraining)

        # pack_padded_sequence requires the lengths on the CPU in current
        # PyTorch releases; callers may hand them over on the GPU.
        rnn_inp = pack_padded_sequence(x, lengths=lens.cpu(), batch_first=False, enforce_sorted=False)
        outputs = self.lstm1(rnn_inp)[0]
        outputs,final_state = self.lstm2(outputs)

        linear_input, lens = pad_packed_sequence(outputs)

        keys = self.key_network(linear_input)
        value = self.value_network(linear_input)

        return keys, value, lens, final_state

## Decoder

In [0]:
class Decoder(nn.Module):
    """Single-step attention decoder built from two stacked LSTM cells.

    Args:
        vocab_size: number of output symbols (and embedding rows).
        hidden_dim: embedding size, hidden size of the first LSTM cell and
            width of the pre-output linear layer.
        value_size: feature size of the attention context / encoder values.
        key_size: hidden size of the second LSTM cell, i.e. the query size.
        isAttended: accepted for interface compatibility; not used.

    Embedding row 34 is the padding row (``padding_idx=34``), so
    ``vocab_size`` must be at least 35.
    """

    def __init__(self, vocab_size, hidden_dim, value_size=128, key_size=128, isAttended=True):
        super(Decoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=34)
        self.lstm1 = nn.LSTMCell(input_size=hidden_dim + value_size, hidden_size=hidden_dim)
        self.lstm2 = nn.LSTMCell(input_size=hidden_dim, hidden_size=key_size)
        self.linear = nn.Linear(key_size + value_size, hidden_dim)
        self.relu = nn.ReLU()
        self.character_prob = nn.Linear( hidden_dim, vocab_size)
        self.attention = Attention()

    def forward(self, x, context, key, values, context_lengths, hidden_states,istraining=True):
        """Run one decoding step.

        Args:
            x: (batch,) LongTensor with the input symbol of this step.
            context: (batch, value_size) attention context of the previous step.
            key, values, context_lengths: attention memory and its lengths,
                passed straight to the attention module.
            hidden_states: two-element sequence ``[state1, state2]`` holding
                the ``(h, c)`` state of each LSTM cell (``None`` for a fresh
                state). It is NOT modified; the updated states are returned.
            istraining: enables dropout before the output projection.

        Returns:
            prediction: (batch, vocab_size) unnormalised scores.
            hidden_states: new list ``[state1, state2]`` for the next step.
            context: (batch, value_size) attention context of this step.
        """
        char_embed = self.embedding(x)

        first_state = self.lstm1(torch.cat([char_embed, context], dim=1), hidden_states[0])
        second_state = self.lstm2(first_state[0], hidden_states[1])

        # The top cell's hidden state is the attention query.
        output = second_state[0]
        context = self.attention(output, key, values, context_lengths)

        features = self.relu(self.linear(torch.cat([output, context], dim=1)))
        prediction = nn.functional.dropout(features, 0.1, training=istraining)
        prediction = self.character_prob(prediction)

        # Return a fresh list so the caller's list is never mutated behind
        # its back (matters when several hypotheses share a state list).
        return prediction, [first_state, second_state], context

# Initializing Models

In [0]:
def init_weights(m):
    """Xavier-uniform initialise the weight of a Conv1d or Linear module.

    Intended for ``module.apply(init_weights)``; modules of any other exact
    type are left untouched.
    """
    if type(m) not in (nn.Conv1d, nn.Linear):
        return
    torch.nn.init.xavier_uniform_(m.weight.data)

In [0]:
encoder = Encoder(40, 512, value_size=128,key_size=128)
decoder = Decoder(len(vocab), 512*2, value_size=128, key_size=128, isAttended=True)

# Resume from the saved checkpoints. map_location lets checkpoints that were
# written on a GPU runtime load on a CPU-only runtime as well.
# To train from scratch instead, replace the two load_state_dict calls with
# encoder.apply(init_weights) and decoder.apply(init_weights).
encoder.load_state_dict(torch.load('/content/drive/My Drive/encoder3.pt', map_location=DEVICE))
decoder.load_state_dict(torch.load('/content/drive/My Drive/decoder3.pt', map_location=DEVICE))

# Separate Adam optimizers for the encoder and the decoder.
optimizer_encoder = torch.optim.Adam(encoder.parameters(), lr=0.001)
optimizer_decoder = torch.optim.Adam(decoder.parameters(), lr=0.001)#, weight_decay=1e-6)
# Summed (not averaged) loss; the training / validation loops divide by the
# number of target tokens themselves.
criterion = nn.CrossEntropyLoss(reduction='sum')

# Training

In [0]:
def train(encoder,decoder,train_loader,criterion, optimizer_encoder, optimizer_decoder,epoch,teacher):
    """Train the encoder/decoder pair for one epoch.

    Args:
        encoder, decoder: the two networks; moved to DEVICE and put in train mode.
        train_loader: yields ``(inputs, targets, xlens, ylens)`` batches, with
            targets batch-first.
        criterion: loss over ``(scores, target indices)``; its result is
            divided by the number of predicted tokens in the batch.
        optimizer_encoder, optimizer_decoder: optimizers stepped once per batch.
        epoch: zero-based epoch index, used only in the summary line.
        teacher: teacher-forcing probability. At each step a uniform draw is
            compared with it; when the draw is larger the decoder is fed its
            own previous prediction, otherwise the ground-truth character.

    Prints the loss / perplexity every 10 batches and an epoch summary.
    """
    encoder.train()
    encoder.to(DEVICE)
    decoder.train()
    decoder.to(DEVICE)
    total_loss = 0
    perplexity = 0

    outer = tqdm(total=(len(train_loader)), desc='Training Epoch', position=0)

    for batch_idx, (inputs,targets,xlens,ylens) in enumerate(train_loader):

        outer.update(1)
        optimizer_encoder.zero_grad()
        optimizer_decoder.zero_grad()

        targets = targets.to(DEVICE)
        batch_size = targets.size(0)
        keys, values, lens, hidden = encoder(inputs.to(DEVICE),xlens.to(DEVICE),True)
        # Flatten the (directions, batch, hidden) final state to (batch, directions*hidden).
        hidden = tuple(st.transpose(0, 1).reshape(inputs.size(1), -1) for st in hidden)

        # Every sequence predicts all of its tokens except the leading one.
        n_tokens = ylens.sum() - ylens.size(0)
        hidden_states = [hidden, None]
        # Token fed back in free-running steps; starts as index 0.
        feedback = torch.zeros(batch_size, dtype=torch.long, device=DEVICE)
        context = values[0,:,:].to(DEVICE)
        Y = targets.transpose(0,1)
        loss = 0

        for i in range(Y.size(0) - 1):

            free_running = np.random.random_sample() > teacher
            step_input = feedback if free_running else Y[i]
            pred, hidden_states, context = decoder(step_input, context, keys, values,lens, hidden_states,True)

            # Choose the token to feed back next. In half of the free-running
            # steps Gumbel noise perturbs the choice. The noisy sample is used
            # ONLY for this choice: it carries no gradient, so the loss below
            # must be computed on the real scores.
            if free_running and np.random.random_sample() > 0.5:
                noisy = torch.distributions.gumbel.Gumbel(
                    pred.detach(), torch.tensor([0.4], device=pred.device)).sample()
                feedback = noisy.argmax(dim=1)
            else:
                feedback = pred.argmax(dim=1)

            # Only sequences that still have a token at position i + 1 count.
            active = i + 1 < ylens
            loss += criterion(pred[active], Y[i + 1, active])

        loss /= n_tokens
        current_loss = loss.item()
        total_loss = total_loss + current_loss
        perplexity = perplexity + np.exp(current_loss)
        if batch_idx%10==0 and batch_idx>0:
            print("Batch: {:02d} \t Train loss: {:.2f} \t Train Perplexity : {:.2f}".format(batch_idx,current_loss,np.exp(current_loss)))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(encoder.parameters(), 2)
        torch.nn.utils.clip_grad_norm_(decoder.parameters(), 2)
        optimizer_encoder.step()
        optimizer_decoder.step()

        # Drop the references first so the cache can actually be released.
        del xlens,inputs,ylens,Y,active,loss,pred,feedback,hidden,lens,hidden_states,keys,values,targets
        torch.cuda.empty_cache()

    outer.close()
    print("Epoch: {:02d} \t Train loss: {:.2f} \t Train Perplexity : {:.2f}".format(epoch+1,total_loss/len(train_loader),perplexity/len(train_loader)))

In [0]:
def validate(encoder,decoder, val_loader,criterion, epoch):
    """Evaluate the model with greedy decoding on a labelled loader.

    Args:
        encoder, decoder: the two networks; moved to DEVICE and put in eval mode.
        val_loader: yields ``(inputs, targets, xlens, ylens)`` batches, with
            targets batch-first.
        criterion: loss over ``(scores, target indices)``; divided by the
            number of predicted tokens in the batch.
        epoch: zero-based epoch index, used only in the summary line.

    Returns:
        Mean Levenshtein distance between decoded and reference strings.
    """
    val_loss = 0
    val_per = 0
    dis = 0
    tot = 0
    outer = tqdm(total=(len(val_loader)), desc='Validation Epoch', position=0)

    with torch.no_grad():
        encoder.eval()
        encoder.to(DEVICE)
        decoder.eval()
        decoder.to(DEVICE)
        for batch_idx, (inputs,targets,xlens,ylens) in enumerate(val_loader):
            outer.update(1)
            targets = targets.to(DEVICE)
            batch_size = targets.size(0)
            keys, values, lens, hidden = encoder(inputs.to(DEVICE),xlens.to(DEVICE),False)
            hidden = tuple(st.transpose(0, 1).reshape(inputs.size(1), -1) for st in hidden)

            n_tokens = ylens.sum() - ylens.size(0)
            hidden_states = [hidden, None]
            # Input of the first step is index 0 for every sequence.
            tokens = torch.zeros(batch_size, dtype=torch.long, device=DEVICE)
            context = values[0,:,:].to(DEVICE)
            Y = targets.transpose(0,1)
            loss = 0
            step_tokens = []

            for i in range(Y.size(0) - 1):

                pred, hidden_states, context = decoder(tokens, context, keys, values,lens, hidden_states,False)
                active = i + 1 < ylens
                loss += criterion(pred[active], Y[i + 1, active])
                # Greedy choice: feed the best-scoring symbol back in.
                tokens = pred.argmax(dim=1)
                step_tokens.append(tokens)

            loss /= n_tokens
            current_loss = loss.item()
            val_loss = val_loss + current_loss
            val_per = val_per + np.exp(current_loss)

            # Move all indices to Python lists once instead of calling
            # argmax on device tensors for every character.
            predicted_ids = torch.stack(step_tokens, dim=1).tolist()
            target_ids = targets.tolist()

            for i in range(len(target_ids)):
                tar = ''.join(vocab[k] for k in target_ids[i] if k!=0 and k!=34 and k!=33)
                chars = []
                for token in predicted_ids[i]:
                    if token == 33:
                        # Index 33 ends the hypothesis.
                        break
                    if token == 34 or (token == 32 and chars[-1:] == [' ']):
                        # Skip index 34 and collapse repeated spaces (index 32).
                        continue
                    chars.append(vocab[token])

                dis = dis +  Levenshtein.distance(''.join(chars).strip(), tar)
                tot = tot + 1

            del xlens,inputs,ylens,Y,step_tokens,active,loss,pred,tokens,hidden,lens,hidden_states,keys,values,targets
            torch.cuda.empty_cache()

    outer.close()
    print("Epoch: {:02d} \t Valid loss: {:.2f} \t Valid Perplexity : {:.2f} \t Valid Score : {:.2f}".format(epoch+1,val_loss/len(val_loader),val_per/len(val_loader),dis/tot))
    return dis/tot

# Decoding

In [0]:
def greedyinference(encoder,decoder, test_loader,epoch):
    """Greedy-decode every utterance of an unlabelled loader.

    Args:
        encoder, decoder: the two networks; moved to DEVICE and put in eval mode.
        test_loader: yields ``(inputs, xlens)`` batches of any batch size.
        epoch: accepted for interface compatibility; not used.

    Returns:
        List with one decoded string per utterance, in loader order.
    """
    test_pred = []
    outer = tqdm(total=(len(test_loader)), desc='Test Epoch', position=0)

    with torch.no_grad():
        encoder.eval()
        encoder.to(DEVICE)
        decoder.eval()
        decoder.to(DEVICE)
        for batch_idx, (inputs,xlens) in enumerate(test_loader):
            outer.update(1)
            batch_size = len(xlens)
            keys, values, lens, hidden = encoder(inputs.to(DEVICE),xlens.to(DEVICE),False)
            hidden = tuple(st.transpose(0, 1).reshape(inputs.size(1), -1) for st in hidden)

            hidden_states = [hidden, None]
            # Input of the first step is index 0 for every sequence.
            tokens = torch.zeros(batch_size, dtype=torch.long, device=DEVICE)
            context = values[0,:,:].to(DEVICE)
            finished = torch.zeros(batch_size, dtype=torch.bool, device=DEVICE)
            step_tokens = []

            for i in range(250):

                pred, hidden_states, context = decoder(tokens, context, keys, values,lens, hidden_states,False)
                tokens = pred.argmax(dim=1)
                step_tokens.append(tokens)

                # Everything after the first index-33 symbol is discarded
                # below, so stop once every sequence has produced one.
                finished |= tokens == 33
                if finished.all():
                    break

            # Decode every utterance in the batch, not just the first one.
            for row in torch.stack(step_tokens, dim=1).tolist():
                chars = []
                for token in row:
                    if token == 33:
                        break
                    if token == 34 or (token == 32 and chars[-1:] == [' ']):
                        # Skip index 34 and collapse repeated spaces (index 32).
                        continue
                    chars.append(vocab[token])
                test_pred.append(''.join(chars).strip())

            if batch_idx%50==0 and batch_idx>0:
                print("Batch: {:02d} ".format(batch_idx))
            del xlens,inputs,step_tokens,pred,tokens,finished,hidden,lens,hidden_states,keys,values
            torch.cuda.empty_cache()

    outer.close()
    return test_pred

In [0]:
def randominference(encoder,decoder, test_loader, n_samples=100, max_batches=2):
    """Sample several noisy decodings per utterance and keep the best-scoring one.

    For every utterance ``n_samples`` hypotheses are decoded in parallel; in
    about half of the steps Gumbel noise perturbs the scores that select the
    next input symbol. Each hypothesis is then re-scored with teacher forcing
    (module-level ``criterion``, divided by the hypothesis length including
    '<sos>' and '<eos>') and the lowest-loss hypothesis is kept.

    Args:
        encoder, decoder: the two networks; moved to DEVICE and put in eval mode.
        test_loader: yields ``(inputs, xlens)`` batches. The re-scoring step
            assumes a batch size of 1.
        n_samples: number of hypotheses sampled per utterance.
        max_batches: positive number of batches to process before stopping,
            or None to process the whole loader. The default of 2 keeps the
            function's original early-exit behaviour.

    Returns:
        ``(test_pred, losses, predi)``: the chosen string per processed
        utterance, plus the losses and hypotheses of the last processed
        utterance (empty lists if the loader is empty).
    """
    test_pred = []
    losses = []
    predi = []

    outer = tqdm(total=(len(test_loader)), desc='Test Epoch', position=0)

    with torch.no_grad():
        encoder.eval()
        encoder.to(DEVICE)
        decoder.eval()
        decoder.to(DEVICE)

        for batch_idx, (inputs,xlens) in enumerate(test_loader):
            outer.update(1)

            # ---- 1. Decode n_samples noisy copies of the utterance in parallel.
            inputs1 = inputs.repeat(1,n_samples,1)
            xlens1 = xlens.repeat(n_samples)
            # istraining=False: no dropout at inference time.
            keys, values, lens, hidden = encoder(inputs1.to(DEVICE),xlens1.to(DEVICE),False)
            hidden = tuple(st.transpose(0, 1).reshape(inputs1.size(1), -1) for st in hidden)
            del inputs1,xlens1

            hidden_states = [hidden, None]
            pred = torch.zeros(n_samples,1).to(DEVICE)
            context = values[0,:,:].to(DEVICE)
            sampled = []

            for i in range(250):
                # One coin flip per step decides whether the scores that pick
                # the next input symbol are perturbed with Gumbel noise.
                if np.random.random_sample() > 0.5:
                    pred = torch.distributions.gumbel.Gumbel(
                        pred, torch.tensor([0.4], device=pred.device)).sample()
                pred, hidden_states, context = decoder(pred.argmax(dim=1), context, keys, values,lens, hidden_states,False)
                sampled.append(pred.argmax(dim=1))

            # ---- 2. Turn the index sequences into strings.
            predi = []
            for row in torch.stack(sampled, dim=1).tolist():
                chars = []
                for token in row:
                    if token == 33:
                        # '<eos>' ends the hypothesis.
                        break
                    if token == 34 or (token == 32 and chars[-1:] == [' ']):
                        # Skip '<pad>' and collapse repeated spaces.
                        continue
                    chars.append(vocab[token])
                predi.append(''.join(chars).strip())

            # ---- 3. Re-score every hypothesis with teacher forcing.
            # The encoder output is the same for every hypothesis, so it is
            # computed once per utterance.
            keys, values, lens, hidden = encoder(inputs.to(DEVICE),xlens.to(DEVICE),False)
            hidden = tuple(st.transpose(0, 1).reshape(inputs.size(1), -1) for st in hidden)
            first_context = values[0,:,:].to(DEVICE)

            losses = []
            for candidate in predi:
                # NOTE(review): a hypothesis containing the text of a special
                # symbol (e.g. '<sos>') has characters that may not be in
                # vocab, in which case vocab.index raises ValueError.
                token_ids = [vocab.index('<sos>')] + [vocab.index(k) for k in candidate] + [vocab.index('<eos>')]
                Y = torch.LongTensor(token_ids).unsqueeze(1).to(DEVICE)   # (length, 1)
                n_tokens = Y.size(0)

                hidden_states = [hidden, None]
                context = first_context
                loss = 0
                for i in range(Y.size(0) - 1):
                    pred, hidden_states, context = decoder(Y[i], context, keys, values,lens, hidden_states,False)
                    loss += criterion(pred, Y[i + 1])

                loss /= n_tokens
                losses.append(loss.item())

            test_pred.append(predi[int(np.argmin(np.array(losses)))])

            del xlens,inputs,sampled,pred,hidden,lens,hidden_states,keys,values
            torch.cuda.empty_cache()

            if max_batches is not None and batch_idx + 1 >= max_batches:
                print(batch_idx)
                break

    outer.close()
    return test_pred,losses,predi

In [0]:
def beamsearch(keys, values, maxlen, hidden, beamwidth, vocab, context_lengths=None, decoder_net=None):
    """Beam-search decode a single utterance.

    Args:
        keys, values: encoder outputs for ONE utterance (batch dimension 1).
        maxlen: maximum number of decoding steps.
        hidden: initial ``(h, c)`` state for the decoder's first LSTM cell.
        beamwidth: number of hypotheses kept after every step.
        vocab: index-to-symbol list used to build the returned string.
        context_lengths: lengths of the encoded sequences for the attention
            mask. When None, every time step of ``keys`` is treated as valid,
            which is what the encoder reports for an unpadded batch of one.
        decoder_net: decoder to step; when None the module-level ``decoder``
            is used.

    Returns:
        The string of the finished hypothesis with the best
        length-normalised log-probability ('' if ``maxlen`` is not positive).
    """
    step_fn = decoder if decoder_net is None else decoder_net
    if context_lengths is None:
        # Derive the lengths from the keys instead of reading a global that
        # happens to be left over from the calling cell.
        context_lengths = torch.LongTensor([keys.size(0)] * keys.size(1))

    # Each hypothesis carries its symbols, summed log-probability, decoder
    # state, attention context and the symbol to feed in next (0 to start).
    beams = [{'path':[],'score':0,'hidden':[hidden, None],'context':values[0,:,:].to(DEVICE),'current':torch.LongTensor([0]).to(DEVICE)}]
    finished = []

    for i in range(maxlen):
        candidates = []
        for path in beams:

            pred, hidden_states, context = step_fn(path['current'], path['context'], keys, values, context_lengths, path['hidden'],False)
            score, ind = torch.topk(nn.functional.log_softmax(pred, dim = 1),beamwidth, dim = 1)
            # Plain Python numbers keep the bookkeeping off the device.
            top_scores = score[0].tolist()
            top_ids = ind[0].tolist()

            for j in range(beamwidth):
                candidates.append({
                    'current': torch.LongTensor([top_ids[j]]).to(DEVICE),
                    'score': path['score'] + top_scores[j],
                    'path': path['path'] + [top_ids[j]],
                    'hidden': hidden_states[:],
                    'context': context,
                })

        candidates = sorted(candidates, key=itemgetter('score'),reverse=True)[:beamwidth]

        # On the last allowed step every surviving hypothesis is closed.
        last_step = i == maxlen-1
        beams = []

        for path in candidates:
            if not last_step and path['path'][-1] != 33:
                beams.append(path)
            else:
                # Hypothesis ended (index 33) or ran out of steps:
                # normalise its score by its length and set it aside.
                path['score'] = path['score']/len(path['path'])
                finished.append(path)

        if not beams:
            break

    if not finished:
        return ''

    return convert_to_chars(sorted( finished, key=itemgetter('score'),reverse=True)[0]['path'],vocab)

def convert_to_chars(path,vocab):
    """Join the symbols of an index path into a string.

    Indices 0, 33 and 34 are dropped; every other index is looked up in
    ``vocab``.
    """
    letters = []
    for k in path:
        if k == 0 or k == 34 or k == 33:
            continue
        letters.append(vocab[k])
    return ''.join(letters)

In [0]:
# Training the models
# Ten epochs with a fixed teacher-forcing probability; both checkpoints are
# overwritten after every epoch regardless of the validation score.
teacher = 0.3
for i in range(10):
    train(encoder, decoder, train_loader, criterion, optimizer_encoder, optimizer_decoder, i, teacher)
    score = validate(encoder, decoder, val_loader, criterion, i)
    for model, checkpoint in (
        (encoder, '/content/drive/My Drive/encoder3.pt'),
        (decoder, '/content/drive/My Drive/decoder3.pt'),
    ):
        torch.save(model.state_dict(), checkpoint)

In [0]:
# Decoding on Test Set
# Beam search (width 6, at most 250 steps) over every test utterance.
test_pred = []
with torch.no_grad():
    encoder.eval()
    encoder.to(DEVICE)
    decoder.eval()
    decoder.to(DEVICE)
    for batch_idx, (inputs,xlens) in enumerate(test_loader):
        # istraining=False: the encoder's functional dropout is controlled by
        # this flag (not by eval()), so it must be switched off explicitly.
        keys, values, lens, hidden = encoder(inputs.to(DEVICE),xlens.to(DEVICE),False)
        hidden = tuple(st.transpose(0, 1).reshape(inputs.size(1), -1) for st in hidden)
        path = beamsearch(keys, values, 250, hidden, 6, vocab)
        test_pred.append(path)
        del xlens,inputs,keys,values,lens,hidden
        torch.cuda.empty_cache()

        # Progress report every 50 utterances.
        if batch_idx%50==0 and batch_idx>0:
            print(batch_idx)
            

In [0]:
import pandas as pd
# Submission file: one row per test utterance, Id = position in test_pred.
dataframe = pd.DataFrame({
    'Id': list(range(len(test_pred))),
    'Predicted': test_pred,
})
dataframe.to_csv("/content/drive/My Drive/mbarman_beam.csv", index=False)