In [1]:
import os
import re
import sys
import urllib
from pathlib import Path
import pickle

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm


import torch.nn as nn
import torch
import torch.nn.functional as F
from torch import optim
import time
import random

In [2]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
def load_pkl(path):
    """
    Reads a pickle file and returns the object stored in it.

    Args:
        path: location of the pickle file; opened in binary read mode.

    Returns:
        Whatever object ``pickle.load`` reconstructs from the file.

    Note:
        Unpickling can execute arbitrary code, so only use this on files
        that come from a trusted source.
    """
    with open(path, mode='rb') as handle:
        return pickle.load(handle)

# Load the pickled audio sequences. batch_generator later calls .toarray() on
# every element, so these are presumably sparse matrices — TODO confirm.
audio_sequence_padded=load_pkl('preprocessed_np1/audio_seq_padded')

In [4]:
# Load the remaining preprocessed NumPy arrays from preprocessed_np1/.
# Judging by the file names these are the per-sample transcript lengths, the
# padded transcript token sequences and the per-sample audio lengths — TODO confirm.
txt_sequence_length=np.load('preprocessed_np1/txt_length.npy')
txt_sequence_padded=np.load('preprocessed_np1/txt_seq_padded.npy')
audio_sequence_length=np.load('preprocessed_np1/audio_length.npy')

In [5]:
# Load the two pickled vocabulary lookups. word2ind is indexed with token
# strings such as '<SOS>' further down; ind2word is presumably the inverse
# (index -> token) mapping — TODO confirm.
ind2word=load_pkl('preprocessed_np1/ind2word')
word2ind=load_pkl('preprocessed_np1/word2ind')

In [6]:
# Number of entries along the first axis of the padded transcript array
# (shown as the cell output).
len(txt_sequence_padded)

2348

In [7]:
# Mini-batch size plus the short aliases that the batching code reads.
batch_size = 64
X, y = audio_sequence_padded, txt_sequence_padded
X_len, y_len = audio_sequence_length, txt_sequence_length

# Ceiling division: a trailing partial batch still counts as one batch.
number_of_batches = -(-len(X) // batch_size)
sample_index = np.arange(len(X))

# 20% of the batch count, rounded down, is set aside as val_batches;
# the remainder is train_batches.
val_batches = int(number_of_batches * 0.2)
train_batches = number_of_batches - val_batches

def batch_generator(batch):
    """
    Builds the tensors for mini-batch number ``batch``.

    Reads the module-level ``X``, ``y``, ``X_len``, ``y_len``, ``sample_index``,
    ``batch_size`` and ``number_of_batches``.

    Args:
        batch: zero-based index of the mini-batch to assemble.

    Returns:
        Tuple ``(x_batch, y_batch, x_lengths, y_lengths)`` of torch tensors.
        ``x_batch`` stacks ``X[i].toarray()`` for every sample in the batch and
        is cast to float32; the other three keep the dtype torch infers.
    """
    first = batch_size * batch
    if batch == number_of_batches - 1:
        # Final batch: take whatever samples are left (may be fewer than
        # batch_size). The original test compared against
        # number_of_batches - batch_size, which this branch could never match.
        batch_index = sample_index[first:]
        print('end')
    else:
        batch_index = sample_index[first:first + batch_size]

    # X elements expose .toarray(), so densify each one before stacking.
    x_batch = np.array([X[i].toarray() for i in batch_index])
    y_batch = np.array([y[i] for i in batch_index])
    x_lengths = [X_len[i] for i in batch_index]
    y_lengths = [y_len[i] for i in batch_index]

    return (
        torch.tensor(x_batch).float(),
        torch.tensor(y_batch),
        torch.tensor(x_lengths),
        torch.tensor(y_lengths),
    )

In [8]:
#input_shape=(2,1628,494)
class EncoderRNN(nn.Module):
    """
    Bidirectional GRU encoder over padded, batch-first feature sequences.

    Args:
        hidden_size: width of every input frame and of the GRU hidden state
            (the GRU is built with input size == hidden size).
        n_layers: number of stacked GRU layers.
        dropout: dropout between GRU layers; forced to 0 when ``n_layers == 1``.
    """
    def __init__(self,hidden_size,n_layers=1,dropout=0):
        super(EncoderRNN,self).__init__()
        self.n_layers=n_layers
        self.hidden_size=hidden_size
        self.gru=nn.GRU(hidden_size,hidden_size,n_layers,dropout=(0 if n_layers == 1 else dropout), bidirectional=True)

    def forward(self,input_seq,input_lengths,hidden=None):
        """
        Encodes a padded batch.

        Args:
            input_seq: padded inputs laid out as (batch, max_len, hidden_size);
                they are packed with ``batch_first=True``.
            input_lengths: valid length of every sequence (tensor or list).
                Any order is accepted because packing uses
                ``enforce_sorted=False``.
            hidden: optional initial GRU state.

        Returns:
            outputs: (longest_len_in_batch, batch, hidden_size) — time-major,
                forward and backward directions summed, padding steps zero.
            hidden: final GRU state, (n_layers * 2, batch, hidden_size).
        """
        # pack_padded_sequence only accepts lengths that live on the CPU;
        # callers may already have moved them to the GPU.
        if isinstance(input_lengths, torch.Tensor):
            input_lengths = input_lengths.cpu()
        packed=nn.utils.rnn.pack_padded_sequence(input_seq,input_lengths,batch_first=True,enforce_sorted=False)
        outputs,hidden=self.gru(packed,hidden)
        # Unpacked with the default batch_first=False, hence time-major output.
        outputs, _ =nn.utils.rnn.pad_packed_sequence(outputs)
        # Sum bidirectional GRU outputs
        outputs=outputs[:,:,:self.hidden_size]+outputs[:,:,self.hidden_size:]
        return outputs,hidden

# encoder=EncoderRNN(494)
# op,hidden=encoder(x_in,xl)
# print(op.shape)
# print(hidden.shape)
# torch.sum(hidden*op,dim=2)

In [9]:
class Attn(nn.Module):
    """
    Attention scoring layer with three interchangeable score functions.

    Args:
        method: one of 'dot', 'general' or 'concat'.
        hidden_size: feature width of the decoder state and encoder outputs.

    Raises:
        ValueError: if ``method`` is not one of the three supported names.
    """
    def __init__(self,method,hidden_size):
        super(Attn,self).__init__()
        self.method=method
        if self.method not in ['dot','general','concat']:
            raise ValueError("{} is not an appropriate attention method.".format(self.method))
        self.hidden_size=hidden_size
        if self.method == 'general':
            self.attn=nn.Linear(self.hidden_size,self.hidden_size)
        elif self.method == 'concat':
            self.attn=nn.Linear(self.hidden_size * 2,hidden_size)
            # torch.FloatTensor(n) only allocates memory, so give v an explicit
            # init (same 1/sqrt(n) range nn.Linear uses) instead of garbage/NaN.
            self.v=nn.Parameter(torch.FloatTensor(hidden_size))
            bound=hidden_size ** -0.5
            nn.init.uniform_(self.v,-bound,bound)

    def dot_score(self,hidden,encoder_output):
        """Dot product of the decoder state with every encoder step."""
        return torch.sum(hidden*encoder_output,dim=2)

    def general_score(self,hidden,encoder_output):
        """Dot product of the decoder state with linearly projected encoder steps."""
        energy=self.attn(encoder_output)
        return torch.sum(hidden * energy,dim=2)

    def concat_score(self,hidden,encoder_output):
        """Score from tanh(Linear([hidden; encoder_output])) weighted by ``v``."""
        # Repeat the decoder state along the time axis so it can be concatenated
        # with every encoder step.
        energy=self.attn(torch.cat((hidden.expand(encoder_output.size(0),-1,-1),encoder_output),2)).tanh()
        return torch.sum(self.v*energy,dim=2)

    def forward(self,hidden,encoder_outputs):
        """
        Computes normalised attention weights.

        Args:
            hidden: current decoder output, broadcastable against
                ``encoder_outputs`` (the decoder in this file passes
                (1, batch, hidden_size)).
            encoder_outputs: time-major encoder outputs,
                (max_len, batch, hidden_size).

        Returns:
            Tensor of shape (batch, 1, max_len) whose last axis sums to 1.
        """
        # Calculate the attention weights (energies) based on the given method
        if self.method == 'general':
            attn_energies = self.general_score(hidden,encoder_outputs)
        elif self.method == 'concat':
            attn_energies = self.concat_score(hidden,encoder_outputs)
        elif self.method == 'dot':
            attn_energies = self.dot_score(hidden,encoder_outputs)

        # Scores come out as (max_len, batch); transpose to (batch, max_len)
        attn_energies = attn_energies.t()

        # Softmax over the time axis, then add a middle dimension -> (batch, 1, max_len)
        return F.softmax(attn_energies,dim=1).unsqueeze(1)

In [10]:
class LuongAttnDecoderRNN(nn.Module):
    """
    Attention decoder that emits one token distribution per call.

    Args:
        attn_model: score-function name forwarded to ``Attn``
            ('dot', 'general' or 'concat').
        hidden_size: width of the embeddings, GRU state and encoder outputs.
        output_size: vocabulary size; sizes both the embedding table and the
            output projection. (The embedding used to read the module-level
            ``word2ind``; the notebook passes ``len(word2ind)`` here, so the
            layer has the same size.)
        n_layers: number of stacked GRU layers.
        dropout: dropout on the embeddings and, when ``n_layers > 1``, between
            GRU layers.
    """
    def __init__(self,attn_model,hidden_size,output_size,n_layers=2,dropout=0.1):
        super(LuongAttnDecoderRNN,self).__init__()

        #keep for reference
        self.attn_model=attn_model
        self.hidden_size=hidden_size
        self.output_size=output_size
        self.n_layers=n_layers
        self.dropout=dropout

        #Define layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout=nn.Dropout(dropout)
        self.gru=nn.GRU(hidden_size,hidden_size,n_layers,dropout=(0 if n_layers==1 else dropout))
        self.concat=nn.Linear(hidden_size * 2,hidden_size)
        self.out=nn.Linear(hidden_size,output_size)
        self.attn=Attn(attn_model,hidden_size)

    def forward(self,input_step,last_hidden,encoder_outputs):
        """
        Runs a single decoding step (one token per sequence).

        Args:
            input_step: token indices for the current step, (1, batch).
            last_hidden: previous GRU state, (n_layers, batch, hidden_size).
            encoder_outputs: time-major encoder outputs,
                (max_len, batch, hidden_size).

        Returns:
            output: (batch, output_size) softmax probabilities over the
                vocabulary — already normalised, not logits.
            hidden: updated GRU state, same shape as ``last_hidden``.
        """
        #Get embedding of current input word
        embedded=self.embedding(input_step)
        embedded=self.embedding_dropout(embedded)
        #Forward through unidirectional GRU
        rnn_output,hidden=self.gru(embedded,last_hidden)
        #Calculate attention weights from current GRU output
        attn_weights=self.attn(rnn_output,encoder_outputs)
        #(batch,1,max_len) x (batch,max_len,hidden) -> (batch,1,hidden) weighted-sum context vector
        context=attn_weights.bmm(encoder_outputs.transpose(0,1))
        #Drop the singleton step/time dimensions before concatenating
        rnn_output=rnn_output.squeeze(0)
        context=context.squeeze(1)
        concat_input=torch.cat((rnn_output,context),1)
        concat_output=torch.tanh(self.concat(concat_input))
        #Predict next word using p(yt|y<t ,x) = softmax(Ws h~t)
        output=self.out(concat_output)
        output=F.softmax(output,dim=1)
        return output,hidden
    
# attn=Attn('dot',494)
# decoder_input=torch.LongTensor([[1 for _ in range(batch_size)]])
# Luong_attn=LuongAttnDecoderRNN('dot',494,len(word2ind))
# decoder_hidden=hidden[:Luong_attn.n_layers] 
# o,h=Luong_attn(decoder_input,decoder_hidden,op)
# print(o.shape)
# print(h.shape)

In [11]:
def train(x_batch,x_len,y_batch,y_len,encoder,decoder,word2ind,encoder_optimizer,decoder_optimizer,batch_size):
    """
    Runs one optimisation step on a single mini-batch.

    Reads the module-level ``device``, ``teacher_forcing_ratio`` and ``clip``.

    Args:
        x_batch: padded encoder inputs, batch-first.
        x_len: valid length of every input sequence. Left on the CPU because
            ``pack_padded_sequence`` in the encoder requires CPU lengths.
        y_batch: padded target token indices, (batch, max_target_len).
            Index 0 is treated as padding and excluded from the loss.
        y_len: valid length of every target sequence.
        encoder, decoder: the two models being trained.
        word2ind: vocabulary mapping; only ``word2ind['<SOS>']`` is read.
        encoder_optimizer, decoder_optimizer: optimisers stepped once each.
        batch_size: nominal batch size. The actual number of rows in
            ``y_batch`` is used so a short batch cannot index out of range.

    Returns:
        0-dim tensor: mean negative log-likelihood per non-padding target
        token, detached from the autograd graph.

    Raises:
        ValueError: if the batch contains no non-padding target token.
    """
    # Clear gradients left over from the previous step.
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    #Set device options
    x_batch=x_batch.to(device)
    y_batch=y_batch.to(device)

    batch_size=y_batch.size(0)
    max_target_len=int(max(y_len))

    #Forward pass through encoder
    encoder_outputs,encoder_hidden=encoder(x_batch,x_len)

    #Create initial decoder input (starting with SOS_token for each sentence)
    decoder_input=torch.LongTensor([[word2ind['<SOS>']]*batch_size]).to(device)

    #Seed the decoder with the first decoder.n_layers slices of the encoder's final hidden state
    decoder_hidden=encoder_hidden[:decoder.n_layers]

    #Decide once per batch whether the decoder is fed the ground truth
    use_teacher_forcing=random.random() < teacher_forcing_ratio

    loss=0
    num=0
    for t in range(max_target_len):
        decoder_output,decoder_hidden=decoder(decoder_input,decoder_hidden,encoder_outputs)
        targets=y_batch[:,t].long()

        if use_teacher_forcing:
            #Teacher forcing: next input is current target
            decoder_input=targets.unsqueeze(0)
        else:
            #No teacher forcing: next input is decoder's own current output
            _,topi=decoder_output.topk(1)
            decoder_input=topi.t().detach()

        # The decoder already returns softmax probabilities, so take their log
        # and use NLL. Feeding them to CrossEntropyLoss would apply softmax a
        # second time. ignore_index=0 skips padding positions.
        log_probs=torch.log(decoder_output.clamp(min=1e-12))
        loss=loss+F.nll_loss(log_probs,targets,ignore_index=0,reduction='sum')
        num+=int((targets != 0).sum())

    if num == 0:
        raise ValueError("batch contains no non-padding target tokens")
    loss=loss/num

    # Perform backpropagation
    loss.backward()

    # Clip gradients: gradients are modified in place
    _ = nn.utils.clip_grad_norm_(encoder.parameters(), clip)
    _ = nn.utils.clip_grad_norm_(decoder.parameters(), clip)

    # Adjust model weights
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.detach()

In [12]:
# Running averages of the training loss; appended to by trainIters and kept at
# module level so repeated calls extend the same curve.
plot_loss_list=[]
def trainIters(model_name,encoder,decoder,encoder_optimizer, decoder_optimizer, encoder_n_layers, decoder_n_layers,save_dir, clip, loadFilename,epoch,batch_size):
    """
    Trains the encoder/decoder pair for ``epoch`` passes and plots the loss.

    Reads the module-level ``val_batches``, ``train_batches``,
    ``batch_generator``, ``train``, ``word2ind`` and ``plot_loss_list``, and
    ``checkpoint`` when ``loadFilename`` is truthy.

    Args:
        encoder, decoder: models handed to ``train``.
        encoder_optimizer, decoder_optimizer: optimisers handed to ``train``.
        loadFilename: when truthy, the first epoch resumes after
            ``checkpoint['iteration']``.
        epoch: number of passes over the batches.
        batch_size: forwarded to ``train``.
        model_name, encoder_n_layers, decoder_n_layers, save_dir, clip:
            accepted but not used here (``train`` reads the module-level
            ``clip``, not this argument).
    """
    print_every=10
    plot_every=9
    # Track how many batches each running sum covers so the reported averages
    # stay correct across reporting windows and epoch boundaries.
    print_loss=0.0
    print_count=0
    plot_loss=0.0
    plot_count=0
    start=time.time()

    if loadFilename:
        print('started with : {}'.format(checkpoint['iteration']))
        # Resume after the last finished batch instead of starting from scratch.
        start_iteration = checkpoint['iteration'] + 1

    else:
        start_iteration=0
        print('STARTED')

    # The final batch is skipped.
    # NOTE(review): this range also covers the batches counted as val_batches,
    # so nothing is actually held out for validation — confirm this is intended.
    last_batch=val_batches+train_batches-1

    for _ in tqdm(range(epoch)):
        for train_batch in range(start_iteration,last_batch):
            x_train,y_train,x_len,y_len=batch_generator(train_batch)
            # float() turns the loss into a plain number, so the running sums
            # neither keep tensors alive nor hand them to matplotlib.
            train_loss=float(train(x_train,x_len,y_train,y_len,encoder,decoder,word2ind,encoder_optimizer,decoder_optimizer,batch_size))
            print_loss+=train_loss
            print_count+=1
            plot_loss+=train_loss
            plot_count+=1

            if train_batch % print_every == 0 and train_batch !=0:
                print_loss_avg = print_loss / print_count
                print("Iteration: {}; Average loss: {:.4f};time:{}".format(train_batch, print_loss_avg,time.time()-start))
                print('_'*30)
                print_loss = 0.0
                print_count = 0

            if train_batch % plot_every == 0 and train_batch !=0:
                plot_loss_list.append(plot_loss/plot_count)
                plot_loss = 0.0
                plot_count = 0

        # Only the first (resumed) epoch starts part-way through; every later
        # epoch must cover all batches again.
        start_iteration=0

    plt.plot(plot_loss_list)

In [13]:
# Model / run configuration
model_name='speech2text_model'
attn_model='dot'
hidden_size=494
encoder_n_layers=2
decoder_n_layers=2
dropout=0.1
n_iteration_val=False

# load_trained=input('enter True if using pretrained otherwise False')
load_trained='False'
# Default to "no checkpoint" so the checks below never hit an undefined name.
loadFilename=None
if load_trained == 'True':
    print('Using Trained Model')
    # Sketch of the restore logic that was never finished:
#     checkpoint_iter = 15000
#     loadFilename = os.path.join('/processed_np', model_name,
#                            '{}-{}_{}'.format(encoder_n_layers, decoder_n_layers, hidden_size),
#                            '{}_checkpoint.tar'.format(checkpoint_iter))
    #If loading on same machine the model was trained on
#     checkpoint = torch.load(loadFilename,device)
#     # If loading a model trained on GPU to CPU
#     #checkpoint = torch.load(loadFilename, map_location=torch.device('cpu'))
#     encoder_optimizer_sd = checkpoint['en_opt']
#     decoder_optimizer_sd = checkpoint['de_opt']
#     embedding_sd = checkpoint['embedding']
#     n_iteration_val=True
    # Fail with a clear message instead of a NameError further down:
    # nothing in this cell assigns loadFilename, encoder_sd or decoder_sd.
    raise NotImplementedError(
        "Checkpoint restore is not implemented: loadFilename, encoder_sd and "
        "decoder_sd are never assigned. Set load_trained='False' or complete "
        "the restore code above."
    )

print('Building encoder and decoder ...')
# Initialize encoder & decoder models
encoder = EncoderRNN(hidden_size, encoder_n_layers, dropout)
decoder = LuongAttnDecoderRNN(attn_model, hidden_size, len(word2ind), decoder_n_layers, dropout)
if loadFilename:
    # Only reachable once the restore code above defines encoder_sd / decoder_sd.
    encoder.load_state_dict(encoder_sd)
    decoder.load_state_dict(decoder_sd)
# Use appropriate device
encoder = encoder.to(device)
decoder = decoder.to(device)
print('Models built and ready to go!')

Building encoder and decoder ...
Models built and ready to go!


In [15]:
# Configure training/optimization
clip = 50.0                     # max gradient norm, read by train()
teacher_forcing_ratio = 0.4     # probability that a batch uses teacher forcing, read by train()
learning_rate = 0.00001
decoder_learning_ratio = 5.0    # decoder learning rate = learning_rate * this factor

# Ensure dropout layers are in train mode
encoder.train()
decoder.train()

# Initialize optimizers
print('Building optimizers ...')
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

if loadFilename:
    # Requires encoder_optimizer_sd / decoder_optimizer_sd from a restored checkpoint.
    encoder_optimizer.load_state_dict(encoder_optimizer_sd)
    decoder_optimizer.load_state_dict(decoder_optimizer_sd)

# Keep any (restored) optimizer state on the same device as the models.
# .to(device) also works on CPU-only machines, unlike an unconditional .cuda().
for optimizer in (encoder_optimizer, decoder_optimizer):
    for state in optimizer.state.values():
        for k, v in state.items():
            if isinstance(v, torch.Tensor):
                state[k] = v.to(device)

# Run training iterations
print("Starting Training!")

epoch=4

trainIters(model_name,encoder, decoder, encoder_optimizer, decoder_optimizer,
               encoder_n_layers, decoder_n_layers, 'processed_np/',
               clip, loadFilename,epoch,batch_size)


In [None]:
# def binaryMatrix(l,value=word2ind['<PAD>']):
#     m=[]
#     print(l)
#     for i,seq in enumerate(l):
#         m.append([])
#         for token in seq:
#             if token == value:
#                 m[i].append(0)
#             else:
#                 m[i].append(1)
#     return m 

# def maskNLLLoss(op,target):
#     mask=binaryMatrix(target)                 #mask is in form 1,0
#     mask=torch.BoolTensor(mask) 
#     nTotal=mask.sum()
#     crossEntropy=-torch.log(torch.gather(op,1,target.view(-1,1)).squeeze(1))                 #loss=summation(log(probabilities))
#     loss=crossEntropy.masked_select(mask).mean()
#     loss=loss.to(device)
#     return loss,nTotal.item()