In [None]:
import torch
import torch.nn as nn 
from torch.utils.data import Dataset, DataLoader
from torch import optim
import torch.nn.utils.rnn as rnn_utils
import torch.nn.functional as F
import os 
import numpy as np
import time
import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import random
import re

# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Notebook-level hyper-parameters: batch_size is the mini-batch size handed to the
# DataLoader (and read by EncoderLSTM.initHidden); n_iter is the epoch count setting.
batch_size, n_iter = 10, 5

In [None]:
class DatasetInfo():
    """Loads the ActivityNet dataset files and derives simple statistics from them.

    Format of dataset files (as described by the original header comment):
        1. Resfeat: {id: resnet feature of entire video id as tensor of size ?,1024}
        2. Caption: {id: ['caption1', 'caption2']}
        3. Video Summary: {id: [[10, 20], [30, 40]]}
    """

    def __init__(self, data_dir='../dataset_224/activitynet'):
        """Load caption.pt, video_summary.pt and resfeat.pt from data_dir.

        Args:
            data_dir: directory containing the three files. The default is the
                location the notebook originally hard-coded.
        """
        # NOTE(review): torch.load unpickles its input -- only load trusted files.
        self.caption = torch.load(os.path.join(data_dir, 'caption.pt'))
        self.summary = torch.load(os.path.join(data_dir, 'video_summary.pt'))
        self.resnet = torch.load(os.path.join(data_dir, 'resfeat.pt'))

        # Vocabulary maps; they stay empty until word_id_dict() is called.
        self.word_to_id_dict = {}
        self.id_to_word_dict = {}

    def max_frame_in_action(self):
        """Return the largest inclusive length (end - start + 1) of any summary interval.

        Returns 0 when there are no intervals; videos with an empty interval
        list are skipped instead of raising.
        """
        longest = 0
        for intervals in self.summary.values():
            for interval in intervals:
                longest = max(longest, interval[1] - interval[0] + 1)
        return longest

    def max_word_caption(self):
        """Return the largest number of words in any caption (0 if there are none).

        Words are counted with str.split(), the same tokenisation word_id_dict()
        uses, so repeated or trailing spaces do not inflate the count.
        """
        longest = 0
        for captions in self.caption.values():
            for text in captions:
                longest = max(longest, len(text.split()))
        return longest

    def total_actions(self):
        """Return the total number of captions summed over all videos."""
        return sum(len(captions) for captions in self.caption.values())

    def word_id_dict(self):
        """Build and return the (word -> id, id -> word) vocabulary dictionaries.

        Ids 0 and 1 are reserved for "EOS" and "SOS"; every other word receives
        the next free id in order of first appearance. Both dictionaries are
        rebuilt from scratch on each call and stored on the instance.
        """
        self.word_to_id_dict = {"EOS": 0, "SOS": 1}
        self.id_to_word_dict = {0: "EOS", 1: "SOS"}
        for captions in self.caption.values():
            for text in captions:
                for word in text.split():
                    if word not in self.word_to_id_dict:
                        next_id = len(self.word_to_id_dict)
                        self.word_to_id_dict[word] = next_id
                        self.id_to_word_dict[next_id] = word

        return self.word_to_id_dict, self.id_to_word_dict
    

In [None]:
class MakeDataset(DatasetInfo):
    """Builds the padded input (frame features) and output (caption ids) tensors.

    Both tensors are assembled by walking the videos in the order of the
    caption dictionary, so row k of input_data() and row k of output_data()
    come from the same video.
    NOTE(review): within a video, interval j is paired with caption j purely by
    position -- this assumes each video has as many intervals as captions; confirm.
    """

    def __init__(self, *args, **kwargs):
        # Forward any constructor arguments unchanged to DatasetInfo.
        super(MakeDataset, self).__init__(*args, **kwargs)

    def action_features(self):
        """Return one feature slice per summary interval, as a list of tensors.

        For every video id in the caption dictionary, each [start, end] interval
        from the summary is cut out of that video's feature tensor (end inclusive).

        Raises:
            KeyError: if a captioned video has no entry in the summary or
                feature dictionaries.
        """
        action_list = []
        # Iterate in caption order (not feature-dict order) so that the rows line
        # up with output_data(), which also iterates the caption dictionary.
        for vid in self.caption:
            features = self.resnet[vid]
            for interval in self.summary[vid]:
                start_frame = interval[0]
                end_frame = interval[1]
                action_list.append(features[start_frame:end_frame + 1])
        return action_list

    def input_data(self):
        """Return all action feature slices zero-padded to a common length."""
        action_list = self.action_features()
        input_data = rnn_utils.pad_sequence(action_list, batch_first=True)
        return input_data  # tensor size (#actions, max frame in actions, frame_size)

    def output_data(self):
        """Return all captions as id sequences, each prefixed with SOS and zero-padded.

        The vocabulary is (re)built first. Padding uses 0, which is also the id
        assigned to "EOS", so shorter captions end in a run of EOS ids.
        """
        super(MakeDataset, self).word_id_dict()
        sos_id = self.word_to_id_dict["SOS"]
        caption_list = []
        for captions in self.caption.values():
            for text in captions:
                caption_ids = [sos_id] + [self.word_to_id_dict[word] for word in text.split()]
                caption_list.append(torch.tensor(caption_ids, dtype=torch.long))

        # pad_sequence pads with 0 == EOS id, which is how EOS gets appended.
        output_data = rnn_utils.pad_sequence(caption_list, batch_first=True)

        return output_data  # tensor size (#actions, max word in caption)

In [None]:
class Dataset_loader(Dataset):
    """Dataset that serves (input, output) pairs by index.

    Args:
        data_size: requested number of samples (acts as an upper bound).
        makedataset: object exposing input_data() and output_data(); both are
            called once here and their results are kept in memory.
    """

    def __init__(self, data_size, makedataset):
        self.data_size = data_size
        self.input_data = makedataset.input_data()
        self.output_data = makedataset.output_data()

    def __len__(self):
        # Never report more samples than actually exist in either tensor;
        # otherwise a DataLoader would index past the end and raise IndexError.
        return min(self.data_size, len(self.input_data), len(self.output_data))

    def __getitem__(self, idx):
        """Return the idx-th input row together with the idx-th output row."""
        return self.input_data[idx], self.output_data[idx]


In [None]:
class EncoderLSTM(nn.Module):
    """Single-layer (optionally bidirectional) batch-first LSTM encoder.

    Args:
        input_size: size of each input feature vector.
        hidden_size: LSTM hidden size per direction.
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self, input_size, hidden_size, bidirectional):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.num_layers = 1
        self.num_directions = 2 if bidirectional else 1
        self.Lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size,
                            num_layers=self.num_layers, batch_first=True,
                            bidirectional=bidirectional)

    def forward(self, input_state, hidden_state=None):
        """Run the LSTM over input_state.

        Args:
            input_state: batch-first input sequence tensor.
            hidden_state: optional (h0, c0) tuple; when None, nn.LSTM starts
                from an all-zero state.

        Returns:
            (output, (h_n, c_n)) exactly as produced by nn.LSTM.
        """
        output, hidden = self.Lstm(input_state, hidden_state)
        return output, hidden

    def initHidden(self, n_batch=None):
        """Return a zero (h0, c0) pair shaped (layers * directions, n_batch, hidden).

        Args:
            n_batch: batch size of the state. When omitted, the notebook-level
                ``batch_size`` global is used, as before.
        """
        if n_batch is None:
            n_batch = batch_size  # notebook-level default
        # Create the state next to the model's weights so it follows .to(device).
        reference = next(self.parameters())
        shape = (self.num_layers * self.num_directions, n_batch, self.hidden_size)
        h0 = torch.zeros(shape, device=reference.device, dtype=reference.dtype)
        # Separate tensor for the cell state: h0 and c0 must not share storage.
        c0 = torch.zeros_like(h0)
        return (h0, c0)


In [None]:
class AttnDecoderLSTM(nn.Module):
    """Embedding -> LSTM -> linear decoder over a vocabulary.

    Despite the name, no attention mechanism is implemented in this class.

    Args:
        input_size: vocabulary size; used both as the number of embedding rows
            and as the width of the output layer.
        hidden_size: LSTM hidden size per direction.
        embedding_size: size of each token embedding.
        bidirectional: whether the LSTM runs in both directions.
    """

    def __init__(self, input_size, hidden_size, embedding_size, bidirectional):

        super(AttnDecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.dec_output_size = input_size
        self.embedding = nn.Embedding(self.dec_output_size, self.embedding_size)
        self.num_layers = 1
        self.num_directions = 2 if bidirectional else 1
        self.Lstm = nn.LSTM(input_size=self.embedding_size,
                            hidden_size=self.hidden_size,
                            num_layers=self.num_layers, batch_first=True,
                            bidirectional=bidirectional)
        self.output_layer = nn.Linear(self.num_directions * self.hidden_size, self.dec_output_size)

    def forward(self, input_state, hidden):
        """Score every vocabulary entry at every position of input_state.

        Args:
            input_state: integer token-id tensor, batch first.
            hidden: (h0, c0) initial LSTM state.

        Returns:
            (scores, hidden): scores has a last dimension of vocabulary size and
            holds unnormalised logits. No softmax is applied here because
            nn.CrossEntropyLoss (used by the training loop) applies log-softmax
            itself; call F.softmax(scores, dim=2) if probabilities are needed.
        """
        embedded = self.embedding(input_state)
        output, hidden = self.Lstm(embedded, hidden)
        output = self.output_layer(output)

        return output, hidden

In [None]:
def train(input_tensor, output_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one teacher-forced optimisation step on a single batch.

    Args:
        input_tensor: batch-first encoder input.
        output_tensor: (batch, seq) integer token ids for the captions.
        encoder: module exposing initHidden() and forward(input, hidden).
        decoder: module whose forward(tokens, hidden) returns (scores, hidden)
            with scores shaped (batch, seq, vocab).
        encoder_optimizer, decoder_optimizer: optimisers for the two modules.
        criterion: loss taking (N, vocab) scores and (N,) target ids.

    Returns:
        The batch loss as a Python float.

    Raises:
        ValueError: if the captions have fewer than two positions, so there is
            nothing to predict.
    """
    if output_tensor.size(1) < 2:
        raise ValueError("output_tensor needs at least two time steps (input token + target token)")

    n_batch = input_tensor.size(0)
    encoder_hidden = encoder.initHidden()
    # initHidden() may be sized for a fixed batch size; the final batch of an
    # epoch can be smaller, so rebuild the zero state to match this batch.
    if encoder_hidden[0].size(1) != n_batch:
        encoder_hidden = tuple(
            state.new_zeros(state.size(0), n_batch, state.size(2)) for state in encoder_hidden
        )

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    _, encoder_hidden = encoder(input_tensor, encoder_hidden)

    # Teacher forcing with a one-step shift: the decoder reads tokens 0..T-2 and
    # is trained to predict tokens 1..T-1. Feeding and scoring the very same
    # tensor would only teach it to copy its input.
    decoder_input = output_tensor[:, :-1]
    target = output_tensor[:, 1:]

    decoder_output, _ = decoder(decoder_input, encoder_hidden)

    # reshape (not view): the shifted target is a non-contiguous slice.
    loss = criterion(decoder_output.reshape(-1, decoder_output.size(2)), target.reshape(-1))

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item()

In [None]:
def trainIters(encoder, decoder, dataloader, n_iters=1, print_every=1000,
               plot_every=100, learning_rate=0.01):
    """Train encoder and decoder for n_iters passes over dataloader.

    Args:
        encoder, decoder: modules to optimise (each gets its own Adam optimiser).
        dataloader: iterable yielding (input_tensor, output_tensor) batches.
        n_iters: number of full passes over dataloader.
        print_every, plot_every: accepted for interface compatibility; not used
            by the current loop, which prints every batch.
        learning_rate: learning rate for both Adam optimisers.

    Returns:
        List with the mean batch loss of each pass (passes that yielded no
        batches contribute no entry).
    """
    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)

    criterion = nn.CrossEntropyLoss()

    epoch_losses = []
    # Every pass runs to completion; there is no early exit, so n_iters is honoured.
    for epoch in range(1, n_iters + 1):
        running_loss = 0.0
        n_batches = 0
        for idx, (input_tensor, output_tensor) in enumerate(dataloader):
            print('Id ', idx, end=' ')
            loss = train(input_tensor, output_tensor, encoder, decoder, encoder_optimizer,
                         decoder_optimizer, criterion)
            print('loss', loss)
            running_loss += loss
            n_batches += 1
        if n_batches:
            epoch_losses.append(running_loss / n_batches)

    return epoch_losses


In [None]:
def asMinutes(s):
    """Format a duration of s seconds as '<minutes>m <seconds>s' (values truncated by %d)."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)


def timeSince(since, percent):
    """Return 'elapsed (- remaining)' for a task started at timestamp `since`.

    `percent` is the completed fraction of the task; the total duration is
    estimated as elapsed / percent and the remainder is total minus elapsed.
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

In [None]:
# Use the non-interactive 'agg' backend for all figures created from here on.
plt.switch_backend('agg')

def showPlot(points):
    """Plot points as a line chart with y-axis ticks every 0.2.

    Exactly one figure is created (no stray empty figure is left open).

    Args:
        points: sequence of values to plot against their index.

    Returns:
        The created matplotlib Figure, so the caller can save or close it.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    ax.yaxis.set_major_locator(ticker.MultipleLocator(base=0.2))
    ax.plot(points)
    return fig

In [None]:
if __name__ == "__main__":

    # MakeDataset inherits every DatasetInfo method, so a single instance is
    # enough; building both objects would read the three dataset files twice.
    make = MakeDataset()
    info = make  # kept so the name `info` stays bound for other cells
    data_size = info.total_actions()
    word2id_dict, id2word_dict = info.word_id_dict()

    loader = Dataset_loader(data_size, make)
    dataloader = DataLoader(loader, batch_size=batch_size, shuffle=True)

    # encoder dimensions
    enc_input_size = 1024
    enc_hidden_size = 256

    bidirectional = True
    num_directions = 2 if bidirectional else 1

    # decoder dimensions: the vocabulary size drives both the embedding table
    # and the output layer; the hidden size must match the encoder because the
    # encoder's final state is handed to the decoder as its initial state.
    dec_input_size = len(word2id_dict)
    dec_hidden_size = enc_hidden_size
    embedding_size = 100

    encoder = EncoderLSTM(enc_input_size, enc_hidden_size, bidirectional)
    decoder = AttnDecoderLSTM(dec_input_size, dec_hidden_size, embedding_size, bidirectional)

    trainIters(encoder, decoder, dataloader)

In [None]:
# WARNING: this cell OVERWRITES resfeat.pt in place. Every video's feature tensor
# is replaced by random noise of shape (b, 1024), with b drawn from [760, 800];
# only the dictionary keys of the existing file are kept.
resfeat_path = '../dataset_224/activitynet/resfeat.pt'
a = torch.load(resfeat_path)
d = {}
for key in a:
    b = random.randint(760, 800)
    d[key] = torch.randn((b, 1024))
torch.save(d, resfeat_path)

In [None]:
# s = torch.load('../dataset_224/activitynet/video_summary.pt')
# c = torch.load('../dataset_224/activitynet/caption.pt')

# d = {}
# for i, (key, value) in enumerate(s.items()):
#     time = [value[0]]
#     d[key] = time

# torch.save(d, '../dataset_224/activitynet/video_summary.pt')

In [None]:
# a ={"hello":0, "how":1}
# b = "hello how how hello"
# embedded = nn.Embedding(2,5)
# ten1 = torch.tensor([0,1,1,0], dtype=torch.long)
# ten2 = torch.tensor([1,1,0,0], dtype=torch.long)
# ten3 = torch.tensor([0,1,0,0], dtype=torch.long)
# e = list(np.asarray(ten1))
# print(type(e))
# tens = torch.tensor([list(np.asarray(ten1)), list(np.asarray(ten2)), list(np.asarray(ten1))], dtype=torch.long)

# d = embedded(tens)
# print(d)

In [None]:
#     for index in range(num_actions):
    
#         for ei in range(input_length):
#             encoder_output, encoder_hidden = encoder(
#                 input_tensor[index][ei], encoder_hidden)
#         encoder_outputs = encoder_output[0, 0]

#         decoder_input = torch.from_numpy(word_to_index["SOS"]).to(device)

#         decoder_hidden = encoder_hidden

#         use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

#         if use_teacher_forcing:
#             # Teacher forcing: Feed the target as the next input
#             for di in range(target_length):
#                 decoder_output, decoder_hidden = decoder(
#                     decoder_input, decoder_hidden)
#                 loss += criterion(decoder_output, target_tensor[index][di])
#                 decoder_input = target_tensor[index][di]  # Teacher forcing

#         else:
#             # Without teacher forcing: use its own predictions as the next input
#             for di in range(target_length):
#                 decoder_output, decoder_hidden = decoder(
#                     decoder_input, decoder_hidden)
#                 topv, topi = decoder_output.topk(1)
#                 decoder_input = topi.squeeze().detach()  # detach from history as input

#                 loss += criterion(decoder_output, target_tensor[di])
#                 if decoder_input.item() == EOS_token:
#                     break

In [None]:
#         for i, (input_tensor, output_tensor) in enumerate(dataloader):
#             print('i', i)
#             loss = train(input_tensor, output_tensor, encoder,
#                          decoder, encoder_optimizer, decoder_optimizer, criterion)
#             print_loss_total += loss
#             plot_loss_total += loss

#         if iter % print_every == 0:
#             print_loss_avg = print_loss_total / print_every
#             print_loss_total = 0
#             print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
#                                          iter, iter / n_iters * 100, print_loss_avg))

#         if iter % plot_every == 0:
#             plot_loss_avg = plot_loss_total / plot_every
#             plot_losses.append(plot_loss_avg)
#             plot_loss_total = 0

#     showPlot(plot_losses)