In [1]:
    import pickle
    import time

    import numpy as np
    import torch
    import torch.optim as optim
    import torchvision.transforms as transforms
    from nltk.translate.bleu_score import corpus_bleu
    from torch import nn
    from torch.nn.utils.rnn import pack_padded_sequence
    from torch.utils.data import DataLoader

    from models import Encoder, Decoder
    from dataset import *

In [2]:
#Parameters
emb_dim = 100
attention_dim = 1024
hidden_dim = 1024
dropout = 0.5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
decoder_lr = 0.01
epochs = 20
batch_size = 32
print_freq = 10

In [4]:
p_in = open('dicts/glove.pickle','rb')
glove = pickle.load(p_in)
p_in.close()

In [5]:
p_in = open('dicts/int_to_vocab.pickle','rb')
int_to_vocab = pickle.load(p_in)
p_in.close()

In [6]:
p_in = open('dicts/vocab_to_int.pickle','rb')
vocab_to_int = pickle.load(p_in)
p_in.close()

In [7]:
print(len(int_to_vocab),len(vocab_to_int),len(glove))

33159 33159 400000


In [8]:
weights = torch.zeros(len(int_to_vocab),100)

In [9]:
for i in range(len(weights)):
    word = int_to_vocab[i]
    if word in glove.keys():
        weights[i] = torch.from_numpy(glove[word])
    else:
        weights[i] = torch.from_numpy(np.random.randn(1,100))

In [10]:
encoder = Encoder()
# encoder_optim = optim.Adam(None)

decoder = Decoder(attention_dim=attention_dim,num_embedding=len(weights),
                  embedding_dim=emb_dim,hidden_dim=hidden_dim,n_layers=2)
decoder_optim = optim.Adam(decoder.parameters(),decoder_lr)

In [11]:
encoder = encoder.to(device)
decoder = decoder.to(device)

In [12]:
criterion = nn.CrossEntropyLoss().to(device)

In [13]:
# df = pd.read_csv('Encoded_captions.csv')
# shuffle_df = df.sample(frac=1)
# len(shuffle_df)
# train_df = pd.read_csv('train.csv')
# test_df = pd.read_csv('test.csv')
# len(train_df)+len(test_df)

In [14]:
# from sklearn.model_selection import train_test_split 
# train_df,test_df = train_test_split(shuffle_df,test_size = 0.3, shuffle=True)
# print("train_df {}".format(train_df['img_id'].describe()['unique']))
# print("Test_df {}".format(test_df['img_id'].describe()['unique']))

In [15]:
# train_df.to_csv('train.csv')
# test_df.to_csv('test.csv')

In [16]:
# df['enc_captions'][0]

In [17]:
import torchvision.transforms as transforms
transform = transforms.Compose([
#     transforms.RandomHorizontalFlip(), # randomly flip and rotate
#     transforms.RandomRotation(10),
    transforms.Resize((300,300)),
    transforms.ToTensor(),
    transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])


In [18]:
train_data = MemeDataset(img_paths='data/train_imgpaths.npy',caplens='data/train_caplens.npy',
                         enc_caps='data/train_enc_captions.npy',transform=transform)
train_loader = DataLoader(train_data,shuffle=True, batch_size=32)

In [19]:
test_data = MemeDataset(img_paths='data/test_imgpaths.npy',caplens='data/test_caplens.npy',
                         enc_caps='data/test_enc_captions.npy',transform=transform)
test_loader = DataLoader(train_data,shuffle=True, batch_size=32)

In [22]:
decoder.load_glove(weights,True)

In [None]:
train_accuracies=[]
train_losses=[]
val_accuracies=[]
val_losses=[]
for e in range(epochs):
    decoder.train()
    for i,(img,cap,l) in enumerate(train_loader):
        
        imgs = img.to(device)
        caps = cap.to(device)
        l = l.to(device)
        
        enc_imgs = encoder(imgs)
        preds,alphas,enc_cap,dec_l,ind = decoder(enc_imgs,
                                               caps,l)
        
        targets = enc_cap[:,1:]
        
        preds,_ = pack_padded_sequence(preds, dec_l, batch_first=True)
        targets,_ = pack_padded_sequence(targets, dec_l)
        
        loss = criterion(scores, targets)
        
        decoder_optim.zero_grad()
        loss.backward()
        
        decoder_optim.step()
        
        _, ind = preds.topk(5,1,True,True)
        equals = ind.eq(preds.view(-1,1).expand_as(ind))
        accuracy = equals.view(-1).float().sum() / targets.size(0)
        
        train_accuracies.append(accuracy/sum(dec_l))
        train_losses.append(loss.item/sum(dec_l))
        
        
        
        if i % print_freq == 0:
            decoder.eval()
            for i,(imgs,caps,l) in enumerate(test_loader):
                imgs = imgs.to(device)
                caps = caps.to(device)
                caplens = caplens.to(device)
                
                enc_imgs = encoder(imgs)
                preds,alphas,enc_cap,dec_l,ind = decoder(enc_imgs,caps,l)
                
                targets = enc_cap[:,1:]
                preds,_ = pack_padded_sequence(preds,dec_l,batch_first=True)
                targets,_ = pack_padded_sequence(targets,dec_l,batch_first=True)
                
                loss = criterion(preds,targets)
                
                _, ind = preds.topk(5,1,True,True)
                equals = ind.eq(preds.view(-1,1).expand_as(ind))
                val_accuracy = equals.view(-1).float().sum() / targets.size(0)
                
                val_accuracies.append(val_accuracy/sum(dec_l))
                val_losses.append(loss.item()/sum(dec_l))
                
                
            
            print(f'Epoch: {e}/{epochs}',
                 f'training loss: {train_losses[-1]:.4f}',
                 f'training accuracy: {train_accuracies[-1]:.4f}',
                 f'validation loss: {val_losses[-1]:.4f}',
                 f'validation accuracy: {val_accuracies[-1]:.4f}')
            
        