In [445]:
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
from torch import optim
import os 
import numpy as np
import time
import math
import matplotlib.pyplot as plt b
import matplotlib.ticker as ticker

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

dataset_path = '../dataset_224/activitynet/m_data.pt'  # get caption for decoder.
resfeat_path = '../dataset_224/activitynet/resfeat.pt'  # get resnet feature for encoder.

batch_size = 1
n_iter = 1

SOS_token = 0
EOS_token = 1

In [446]:
# get dataset vocab size
dataset = torch.load(dataset_path)
data_set = set()
for data_kind in dataset:
    for _, (_, value) in enumerate(dataset[data_kind].items()):
        captions = list(value.keys())
        for caption in captions:
            tokens = caption.split()
            for token in tokens:
                data_set.add(token)

dict_size = len(data_set) + 2 # binary array size for each unique word in dataset, EOS, SOS token.

# create dictionary for tokens in dataset
sos_array = np.eye(dict_size, dtype=np.int)[SOS_token] # word_to_index dictionary value
eos_array = np.eye(dict_size, dtype=np.int)[EOS_token]  # word_to_index dictionary value

sos_array_str = np.array2string(sos_array)  # index_to_word dictionary key.
eos_array_str = np.array2string(eos_array)  # index_to_word dictionary key. 

word_to_index = {"SOS":sos_array, "EOS":eos_array}
index_to_word = {sos_array_str:"SOS", eos_array_str:"EOS"}
for i, token in enumerate(data_set):
    one_hot_vector = np.eye(dict_size, dtype=np.int)[i]
    word_to_index[token] = one_hot_vector
    
    one_hot_str = np.array2string(one_hot_vector)
    index_to_word[one_hot_str] = token

In [447]:
resnet_feat = torch.load(resfeat_path) # resnet_feat is dict with key:id, value: torch array.
video_ids = list(resnet_feat.keys())

In [448]:
class EncoderRNN(nn.Module):
    
    def __init__(self, input_size, hidden_size, output_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.BLstm = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=2, bidirectional=True)
        
        
    def forward(self, input, hidden):
        output, hidden = self.BLstm(input, hidden)
        return output, hidden

    
    def initHidden(self):
        return (torch.zeros(1, 1, self.hidden_size), torch.zeros(1, 1, self.hidden_size))
         

In [449]:
class DecoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.BLstm = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=2, bidirectional=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

        
    def forward(self, input, hidden):
        output, hidden = self.BLstm(input, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden


In [450]:
class Dataset_loader(Dataset):

    
    def __init__(self, data_kind):
        self.data_kind = data_kind  # train/valid/test
        
    
    def __len__(self):
        return len(resnet_feat)
  
    def word_to_array(self, word):
        return word_to_index[word]

    
    def __getitem__(self, idx):
        
        batch_start = idx*batch_size
        batch_end = min(batch_size*(idx+1), len(resnet_feat))
        vids = video_ids[batch_start:batch_end]
        
        # Get input 
        input = []
        for vid in vids:
            print(vid)
            value = dataset[self.data_kind][vid]
            time_list = list(value.values())
            for time in time_list:
                start_time = int(time[0])
                end_time = int(time[1])
                input.append(resnet_feat[vid][start_time:end_time])
                
        # Add zero padding 
        max_len = max([x.shape[0] for x in input])
        tensor = torch.zeros(len(input), max_len, input[0].shape[1])
        
        for i, item in enumerate(input):
            pack_array = np.zeros([max_len - item.shape[0], input[0].shape[1]], dtype=np.int8)
            item = np.concatenate((item,pack_array), axis=0)
            tensor[i] = torch.from_numpy(item)
                
        input_tensor = tensor
        
        # Get output
        output = []
        for vid in vids:
            value = dataset[self.data_kind][vid]
            for caption in value: # each sentence
                cap_array = np.zeros((len(caption),dict_size), dtype=np.int8)    
                words = caption.split()
                for i, word in enumerate(words):  # each word
                    word_arr = word_to_index[word]
                    cap_array[i] = word_arr
                output.append(cap_array)
        
        # Add zero padding 
        max_len = max([x.shape[0] for x in output])
        tensor = torch.zeros(len(output), max_len, output[0].shape[1])
        
        for i, item in enumerate(output):
            pack_array = np.zeros([max_len - item.shape[0], output[0].shape[1]])
            item = np.concatenate((item,pack_array), axis=0)
            tensor[i] = torch.from_numpy(item)
        
        output_tensor = tensor
        
        return input_tensor, output_tensor
        

In [451]:
# Training the model
teacher_forcing_ratio = 0.5

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    print('train started')
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
  
    encoder_output, encoder_hidden = encoder(input_tensor, encoder_hidden)
    
    loss = 0
    decoder_input = torch.from_numpy(word_to_index["SOS"])
    decoder_hidden = encoder_hidden
    decoder_output, _ = decoder(decoder_input, target_tensor)
    
    loss = criterion(decoder_output, target_tensor)
    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()  # loss for batch.

In [452]:
def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [453]:
plt.switch_backend('agg')

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [454]:
loader = Dataset_loader(data_kind='train')
dataloader = DataLoader(loader, batch_size=batch_size,shuffle=True)

def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        print('iter', iter)
        input_t, output_t = loader[0]
        loss = train(input_t, output_t, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        
        
#         for i, (input_tensor, output_tensor) in enumerate(dataloader):
#             print('i', i)
#             loss = train(input_tensor, output_tensor, encoder,
#                          decoder, encoder_optimizer, decoder_optimizer, criterion)
#             print_loss_total += loss
#             plot_loss_total += loss

#         if iter % print_every == 0:
#             print_loss_avg = print_loss_total / print_every
#             print_loss_total = 0
#             print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
#                                          iter, iter / n_iters * 100, print_loss_avg))

#         if iter % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0

#     showPlot(plot_losses)


In [455]:
enc_input_size = 2048

dec_input_size = dict_size
dec_output_size = dict_size

dec_hidden_size = dec_output_size
enc_hidden_size = dec_hidden_size
enc_output_size = dec_hidden_size

encoder1 = EncoderRNN(enc_input_size, enc_hidden_size, enc_output_size)
decoder1 = DecoderRNN(dec_input_size, dec_hidden_size, dec_output_size)

trainIters(encoder1, decoder1, 1, print_every=1)

iter 1
_0CqozZun3U
train started


AttributeError: 'tuple' object has no attribute 'size'

In [None]:
#     for index in range(num_actions):
    
#         for ei in range(input_length):
#             encoder_output, encoder_hidden = encoder(
#                 input_tensor[index][ei], encoder_hidden)
#         encoder_outputs = encoder_output[0, 0]

#         decoder_input = torch.from_numpy(word_to_index["SOS"]).to(device)

#         decoder_hidden = encoder_hidden

#         use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

#         if use_teacher_forcing:
#             # Teacher forcing: Feed the target as the next input
#             for di in range(target_length):
#                 decoder_output, decoder_hidden = decoder(
#                     decoder_input, decoder_hidden)
#                 loss += criterion(decoder_output, target_tensor[index][di])
#                 decoder_input = target_tensor[index][di]  # Teacher forcing

#         else:
#             # Without teacher forcing: use its own predictions as the next input
#             for di in range(target_length):
#                 decoder_output, decoder_hidden = decoder(
#                     decoder_input, decoder_hidden)
#                 topv, topi = decoder_output.topk(1)
#                 decoder_input = topi.squeeze().detach()  # detach from history as input

#                 loss += criterion(decoder_output, target_tensor[di])
#                 if decoder_input.item() == EOS_token:
#                     break