In [None]:
import sys

from google.colab import drive

# Folder on Google Drive that is put on the import path; presumably it holds the
# lab's local modules (models, datasets, solver) imported further down.
CODE_DIR = '/content/drive/MyDrive/Colab Notebooks/DL Lab/Lab6/code'

# Make Google Drive available under /content/drive (a no-op message is printed if already mounted).
drive.mount('/content/drive')

# Keep the cell idempotent: re-running it must not stack duplicate entries on sys.path.
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install Pillow
!pip install scipy==1.1.0



In [None]:
import torch
import torchvision.transforms as transforms
import torch.nn as nn
import torch.backends.cudnn as cudnn
from models import Encoder, DecoderWithRNN, DecoderWithAttention
from datasets import *
from solver import *

### Train model with attention

In [None]:
# Settings for the attention-decoder training run.
cfg = dict(
    # --- Data ---
    data_folder='/content/drive/MyDrive/Colab Notebooks/DL Lab/Lab6/data/coco2014',  # where create_input_files.py saved the data files
    data_name='coco_5_cap_per_img_5_min_word_freq',  # base name common to those data files
    # --- Model ---
    embed_dim=512,  # size of the word embeddings
    attention_dim=512,  # size of the attention linear layers
    decoder_dim=512,  # size of the decoder RNN
    dropout=0.5,  # dropout setting
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),  # device used for the model and tensors
    # --- Training ---
    start_epoch=0,
    epochs=10,  # upper bound on epochs (early stopping may end training sooner)
    epochs_since_improvement=0,  # epochs elapsed since validation BLEU last improved
    batch_size=32,
    workers=1,  # data-loading workers; only 1 currently works with h5py
    encoder_lr=1e-4,  # encoder learning rate (used only when fine-tuning)
    decoder_lr=4e-4,  # decoder learning rate
    grad_clip=5.,  # gradients are clipped at this absolute value
    alpha_c=1.,  # weight of the 'doubly stochastic attention' regulariser from the paper
    best_bleu4=0.,  # best BLEU-4 seen so far
    print_freq=100,  # report training/validation stats every this many batches
    fine_tune_encoder=False,  # whether the encoder is fine-tuned
    checkpoint=None,  # checkpoint path to resume from; None starts fresh
    attention=True,  # whether the decoder uses attention
    adaptive=False,  # adaptive-attention flag (off for this run)
)
# cuDNN autotuning pays off only when model inputs keep a fixed size; otherwise it adds a lot of overhead.
cudnn.benchmark = True

In [None]:
# Load the word map JSON that sits in the data folder and store its size in cfg['vocab_size'].
word_map_file = os.path.join(cfg['data_folder'], 'WORDMAP_' + cfg['data_name'] + '.json')
with open(word_map_file) as j:
    word_map = json.loads(j.read())
cfg.update(vocab_size=len(word_map))

In [None]:
# Create the encoder/decoder pair and their optimizers, either from scratch or
# by resuming from the checkpoint named in cfg['checkpoint'].
if cfg['checkpoint'] is None:
    encoder = Encoder()
    encoder.fine_tune(cfg['fine_tune_encoder'])
    # The encoder gets an optimizer only when it is being fine-tuned.
    encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                         lr=cfg['encoder_lr']) if cfg['fine_tune_encoder'] else None
    if not cfg['attention']:
        decoder = DecoderWithRNN(cfg)
    else:
        decoder = DecoderWithAttention(cfg)
    decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                         lr=cfg['decoder_lr'])
else:
    # SECURITY: the checkpoint stores whole pickled objects (models and optimizers);
    # unpickling executes code, so only load checkpoint files you trust.
    # map_location lets a checkpoint written on a GPU be restored on a CPU-only
    # runtime instead of failing during deserialization.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
    # which rejects pickled modules; pass weights_only=False there if loading fails.
    checkpoint = torch.load(cfg['checkpoint'], map_location=cfg['device'])
    cfg['start_epoch'] = checkpoint['epoch'] + 1
    cfg['epochs_since_improvement'] = checkpoint['epochs_since_improvement']
    cfg['best_bleu4'] = checkpoint['bleu-4']
    encoder = checkpoint['encoder']
    encoder_optimizer = checkpoint['encoder_optimizer']
    decoder = checkpoint['decoder']
    decoder_optimizer = checkpoint['decoder_optimizer']
    # Fine-tuning requested now but the checkpoint carried no encoder optimizer: create one.
    if cfg['fine_tune_encoder'] is True and encoder_optimizer is None:
        encoder.fine_tune(cfg['fine_tune_encoder'])
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=cfg['encoder_lr'])

In [None]:
# Put both networks on the configured device (GPU when one is available, CPU otherwise).
decoder, encoder = decoder.to(cfg['device']), encoder.to(cfg['device'])

In [None]:
# Cross-entropy training loss, placed on the same device as the models.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(cfg['device'])

In [None]:
# Training and validation loaders over CaptionDataset; per-channel normalisation
# is the only transform applied to the images.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
train_loader = torch.utils.data.DataLoader(
    CaptionDataset(
        cfg['data_folder'],
        cfg['data_name'],
        'TRAIN',
        transform=transforms.Compose([normalize]),
    ),
    batch_size=cfg['batch_size'],
    shuffle=True,
    num_workers=cfg['workers'],
    pin_memory=True,
)
val_loader = torch.utils.data.DataLoader(
    CaptionDataset(
        cfg['data_folder'],
        cfg['data_name'],
        'VAL',
        transform=transforms.Compose([normalize]),
    ),
    batch_size=cfg['batch_size'],
    shuffle=True,
    num_workers=cfg['workers'],
    pin_memory=True,
)

In [None]:
# Main loop: one pass of training, one of validation, then a checkpoint, per epoch.
for epoch in range(cfg['start_epoch'], cfg['epochs']):

    # Early stopping once 20 epochs in a row have brought no improvement.
    if cfg['epochs_since_improvement'] == 20:
        break

    # On every 8th consecutive epoch without improvement, rescale the learning rate(s) by 0.8.
    if 0 < cfg['epochs_since_improvement'] and cfg['epochs_since_improvement'] % 8 == 0:
        adjust_learning_rate(decoder_optimizer, 0.8)
        if cfg['fine_tune_encoder']:
            adjust_learning_rate(encoder_optimizer, 0.8)

    # Train for one epoch.
    train(
        train_loader=train_loader,
        encoder=encoder,
        decoder=decoder,
        criterion=criterion,
        encoder_optimizer=encoder_optimizer,
        decoder_optimizer=decoder_optimizer,
        epoch=epoch,
        cfg=cfg,
    )

    # Score the current model on the validation set.
    recent_bleu4 = validate(
        val_loader=val_loader,
        encoder=encoder,
        decoder=decoder,
        criterion=criterion,
        word_map=word_map,
        cfg=cfg,
    )

    # Track the best score and how long it has been since it last improved.
    is_best = recent_bleu4 > cfg['best_bleu4']
    cfg['best_bleu4'] = max(recent_bleu4, cfg['best_bleu4'])
    if is_best:
        cfg['epochs_since_improvement'] = 0
    else:
        cfg['epochs_since_improvement'] += 1
        print("\nEpochs since last improvement: %d\n" % (cfg['epochs_since_improvement'],))

    # Persist the state reached after this epoch.
    save_checkpoint(
        cfg['data_name'], epoch, cfg['epochs_since_improvement'],
        encoder, decoder, encoder_optimizer, decoder_optimizer,
        recent_bleu4, is_best,
    )

  alpha = F.softmax(e)


Epoch: [0][0/17702]	Batch Time 17.449 (17.449)	Data Load Time 16.488 (16.488)	Loss 10.1011 (10.1011)	Top-5 Accuracy 0.000 (0.000)
Epoch: [0][100/17702]	Batch Time 0.210 (1.798)	Data Load Time 0.000 (1.567)	Loss 6.2100 (6.6252)	Top-5 Accuracy 38.356 (34.354)
Epoch: [0][200/17702]	Batch Time 0.362 (1.083)	Data Load Time 0.155 (0.861)	Loss 5.5830 (6.1906)	Top-5 Accuracy 45.055 (38.133)
Epoch: [0][300/17702]	Batch Time 0.250 (0.827)	Data Load Time 0.027 (0.608)	Loss 5.4286 (5.9172)	Top-5 Accuracy 47.091 (41.177)
Epoch: [0][400/17702]	Batch Time 0.345 (0.683)	Data Load Time 0.138 (0.466)	Loss 5.1491 (5.7057)	Top-5 Accuracy 47.541 (43.800)
Epoch: [0][500/17702]	Batch Time 0.300 (0.594)	Data Load Time 0.097 (0.379)	Loss 4.9765 (5.5543)	Top-5 Accuracy 52.342 (45.697)
Epoch: [0][600/17702]	Batch Time 0.243 (0.535)	Data Load Time 0.029 (0.320)	Loss 4.7463 (5.4301)	Top-5 Accuracy 55.670 (47.296)
Epoch: [0][700/17702]	Batch Time 0.203 (0.492)	Data Load Time 0.000 (0.277)	Loss 4.5887 (5.3346)	Top-5

In [None]:
# Back up the two checkpoint files from the Colab working directory (/content)
# into the lab's code folder on Google Drive; presumably these are the files
# written by save_checkpoint during training — confirm.
!cp /content/BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar /content/drive/MyDrive/Colab\ Notebooks/DL\ Lab/Lab6/code
!cp /content/checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar /content/drive/MyDrive/Colab\ Notebooks/DL\ Lab/Lab6/code

#### Parameter tuning: attention model with `alpha_c` lowered from 1.0 to 0.1

In [None]:
# Settings for the parameter-tuning run of the attention decoder.
cfg = dict(
    # --- Data ---
    data_folder='/content/drive/MyDrive/Colab Notebooks/DL Lab/Lab6/data/coco2014',  # where create_input_files.py saved the data files
    data_name='coco_5_cap_per_img_5_min_word_freq',  # base name common to those data files
    # --- Model ---
    embed_dim=512,  # size of the word embeddings
    attention_dim=512,  # size of the attention linear layers
    decoder_dim=512,  # size of the decoder RNN
    dropout=0.5,  # dropout setting
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),  # device used for the model and tensors
    # --- Training ---
    start_epoch=0,
    epochs=10,  # upper bound on epochs (early stopping may end training sooner)
    epochs_since_improvement=0,  # epochs elapsed since validation BLEU last improved
    batch_size=32,
    workers=1,  # data-loading workers; only 1 currently works with h5py
    encoder_lr=1e-4,  # encoder learning rate (used only when fine-tuning)
    decoder_lr=4e-4,  # decoder learning rate
    grad_clip=5.,  # gradients are clipped at this absolute value
    alpha_c=.1,  # weight of the 'doubly stochastic attention' regulariser from the paper
    best_bleu4=0.,  # best BLEU-4 seen so far
    print_freq=100,  # report training/validation stats every this many batches
    fine_tune_encoder=False,  # whether the encoder is fine-tuned
    checkpoint=None,  # checkpoint path to resume from; None starts fresh
    attention=True,  # whether the decoder uses attention
    adaptive=False,  # adaptive-attention flag (off for this run)
)
# cuDNN autotuning pays off only when model inputs keep a fixed size; otherwise it adds a lot of overhead.
cudnn.benchmark = True

In [None]:
# Load the word map JSON that sits in the data folder and store its size in cfg['vocab_size'].
word_map_file = os.path.join(cfg['data_folder'], 'WORDMAP_' + cfg['data_name'] + '.json')
with open(word_map_file) as j:
    word_map = json.loads(j.read())
cfg.update(vocab_size=len(word_map))

In [None]:
# Create the encoder/decoder pair and their optimizers, either from scratch or
# by resuming from the checkpoint named in cfg['checkpoint'].
if cfg['checkpoint'] is None:
    encoder = Encoder()
    encoder.fine_tune(cfg['fine_tune_encoder'])
    # The encoder gets an optimizer only when it is being fine-tuned.
    encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                         lr=cfg['encoder_lr']) if cfg['fine_tune_encoder'] else None
    if not cfg['attention']:
        decoder = DecoderWithRNN(cfg)
    else:
        decoder = DecoderWithAttention(cfg)
    decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                         lr=cfg['decoder_lr'])
else:
    # SECURITY: the checkpoint stores whole pickled objects (models and optimizers);
    # unpickling executes code, so only load checkpoint files you trust.
    # map_location lets a checkpoint written on a GPU be restored on a CPU-only
    # runtime instead of failing during deserialization.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
    # which rejects pickled modules; pass weights_only=False there if loading fails.
    checkpoint = torch.load(cfg['checkpoint'], map_location=cfg['device'])
    cfg['start_epoch'] = checkpoint['epoch'] + 1
    cfg['epochs_since_improvement'] = checkpoint['epochs_since_improvement']
    cfg['best_bleu4'] = checkpoint['bleu-4']
    encoder = checkpoint['encoder']
    encoder_optimizer = checkpoint['encoder_optimizer']
    decoder = checkpoint['decoder']
    decoder_optimizer = checkpoint['decoder_optimizer']
    # Fine-tuning requested now but the checkpoint carried no encoder optimizer: create one.
    if cfg['fine_tune_encoder'] is True and encoder_optimizer is None:
        encoder.fine_tune(cfg['fine_tune_encoder'])
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=cfg['encoder_lr'])

In [None]:
# Put both networks on the configured device (GPU when one is available, CPU otherwise).
decoder, encoder = decoder.to(cfg['device']), encoder.to(cfg['device'])

In [None]:
# Cross-entropy training loss, placed on the same device as the models.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(cfg['device'])

In [None]:
# Training and validation loaders over CaptionDataset; per-channel normalisation
# is the only transform applied to the images.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
train_loader = torch.utils.data.DataLoader(
    CaptionDataset(
        cfg['data_folder'],
        cfg['data_name'],
        'TRAIN',
        transform=transforms.Compose([normalize]),
    ),
    batch_size=cfg['batch_size'],
    shuffle=True,
    num_workers=cfg['workers'],
    pin_memory=True,
)
val_loader = torch.utils.data.DataLoader(
    CaptionDataset(
        cfg['data_folder'],
        cfg['data_name'],
        'VAL',
        transform=transforms.Compose([normalize]),
    ),
    batch_size=cfg['batch_size'],
    shuffle=True,
    num_workers=cfg['workers'],
    pin_memory=True,
)

In [None]:
# Main loop: one pass of training, one of validation, then a checkpoint, per epoch.
for epoch in range(cfg['start_epoch'], cfg['epochs']):

    # Early stopping once 20 epochs in a row have brought no improvement.
    if cfg['epochs_since_improvement'] == 20:
        break

    # On every 8th consecutive epoch without improvement, rescale the learning rate(s) by 0.8.
    if 0 < cfg['epochs_since_improvement'] and cfg['epochs_since_improvement'] % 8 == 0:
        adjust_learning_rate(decoder_optimizer, 0.8)
        if cfg['fine_tune_encoder']:
            adjust_learning_rate(encoder_optimizer, 0.8)

    # Train for one epoch.
    train(
        train_loader=train_loader,
        encoder=encoder,
        decoder=decoder,
        criterion=criterion,
        encoder_optimizer=encoder_optimizer,
        decoder_optimizer=decoder_optimizer,
        epoch=epoch,
        cfg=cfg,
    )

    # Score the current model on the validation set.
    recent_bleu4 = validate(
        val_loader=val_loader,
        encoder=encoder,
        decoder=decoder,
        criterion=criterion,
        word_map=word_map,
        cfg=cfg,
    )

    # Track the best score and how long it has been since it last improved.
    is_best = recent_bleu4 > cfg['best_bleu4']
    cfg['best_bleu4'] = max(recent_bleu4, cfg['best_bleu4'])
    if is_best:
        cfg['epochs_since_improvement'] = 0
    else:
        cfg['epochs_since_improvement'] += 1
        print("\nEpochs since last improvement: %d\n" % (cfg['epochs_since_improvement'],))

    # Persist the state reached after this epoch.
    save_checkpoint(
        cfg['data_name'], epoch, cfg['epochs_since_improvement'],
        encoder, decoder, encoder_optimizer, decoder_optimizer,
        recent_bleu4, is_best,
    )

  alpha = F.softmax(e)


Epoch: [0][0/17702]	Batch Time 1.938 (1.938)	Data Load Time 1.687 (1.687)	Loss 9.2928 (9.2928)	Top-5 Accuracy 0.000 (0.000)
Epoch: [0][100/17702]	Batch Time 0.206 (0.580)	Data Load Time 0.000 (0.372)	Loss 5.1117 (5.8528)	Top-5 Accuracy 35.884 (34.314)
Epoch: [0][200/17702]	Batch Time 0.204 (0.421)	Data Load Time 0.000 (0.215)	Loss 4.9189 (5.3855)	Top-5 Accuracy 44.693 (38.443)
Epoch: [0][300/17702]	Batch Time 0.218 (0.361)	Data Load Time 0.000 (0.155)	Loss 4.6162 (5.1010)	Top-5 Accuracy 47.792 (41.703)
Epoch: [0][400/17702]	Batch Time 0.203 (0.324)	Data Load Time 0.000 (0.119)	Loss 4.3420 (4.8970)	Top-5 Accuracy 50.402 (44.051)
Epoch: [0][500/17702]	Batch Time 0.207 (0.301)	Data Load Time 0.000 (0.096)	Loss 3.8554 (4.7516)	Top-5 Accuracy 57.609 (45.844)
Epoch: [0][600/17702]	Batch Time 0.201 (0.286)	Data Load Time 0.000 (0.080)	Loss 3.6663 (4.6325)	Top-5 Accuracy 60.405 (47.398)
Epoch: [0][700/17702]	Batch Time 0.201 (0.274)	Data Load Time 0.000 (0.069)	Loss 4.0865 (4.5370)	Top-5 Accur

### Train model with adaptive attention

In [None]:
from models import EncoderWithGlobalFeature, DecoderWithAdaptiveAttention

In [None]:
# Settings for the adaptive-attention training run.
cfg = dict(
    # --- Data ---
    data_folder='/content/drive/MyDrive/Colab Notebooks/DL Lab/Lab6/data/coco2014',  # where create_input_files.py saved the data files
    data_name='coco_5_cap_per_img_5_min_word_freq',  # base name common to those data files
    # --- Model ---
    embed_dim=512,  # size of the word embeddings
    attention_dim=512,  # size of the attention linear layers
    decoder_dim=512,  # size of the decoder RNN
    dropout=0.5,  # dropout setting
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),  # device used for the model and tensors
    # --- Training ---
    start_epoch=0,
    epochs=10,  # upper bound on epochs (early stopping may end training sooner)
    epochs_since_improvement=0,  # epochs elapsed since validation BLEU last improved
    batch_size=80,
    workers=1,  # data-loading workers; only 1 currently works with h5py
    encoder_lr=1e-5,  # encoder learning rate (used only when fine-tuning)
    decoder_lr=5e-4,  # decoder learning rate
    grad_clip=5.,  # gradients are clipped at this absolute value
    alpha_c=1.,  # weight of the 'doubly stochastic attention' regulariser from the paper
    best_bleu4=0.,  # best BLEU-4 seen so far
    print_freq=100,  # report training/validation stats every this many batches
    fine_tune_encoder=False,  # whether the encoder is fine-tuned
    checkpoint=None,  # checkpoint path to resume from; None starts fresh
    attention=True,  # whether the decoder uses attention
    adaptive=True,  # whether the decoder uses adaptive attention
)
# cuDNN autotuning pays off only when model inputs keep a fixed size; otherwise it adds a lot of overhead.
cudnn.benchmark = True

In [None]:
# Load the word map JSON that sits in the data folder and store its size in cfg['vocab_size'].
word_map_file = os.path.join(cfg['data_folder'], 'WORDMAP_' + cfg['data_name'] + '.json')
with open(word_map_file) as j:
    word_map = json.loads(j.read())
cfg.update(vocab_size=len(word_map))

In [None]:
# Create the global-feature encoder, the decoder and their optimizers, either
# from scratch or by resuming from the checkpoint named in cfg['checkpoint'].
if cfg['checkpoint'] is None:
    encoder = EncoderWithGlobalFeature(cfg)
    encoder.fine_tune(cfg['fine_tune_encoder'])
    # The encoder gets an optimizer only when it is being fine-tuned.
    encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                         lr=cfg['encoder_lr']) if cfg['fine_tune_encoder'] else None
    # NOTE(review): the decoder choice is driven by cfg['attention'] only;
    # cfg['adaptive'] is not consulted here.
    if not cfg['attention']:
        decoder = DecoderWithRNN(cfg)
    else:
        decoder = DecoderWithAdaptiveAttention(cfg)
    decoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, decoder.parameters()),
                                         lr=cfg['decoder_lr'])
else:
    # SECURITY: the checkpoint stores whole pickled objects (models and optimizers);
    # unpickling executes code, so only load checkpoint files you trust.
    # map_location lets a checkpoint written on a GPU be restored on a CPU-only
    # runtime instead of failing during deserialization.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
    # which rejects pickled modules; pass weights_only=False there if loading fails.
    checkpoint = torch.load(cfg['checkpoint'], map_location=cfg['device'])
    cfg['start_epoch'] = checkpoint['epoch'] + 1
    cfg['epochs_since_improvement'] = checkpoint['epochs_since_improvement']
    cfg['best_bleu4'] = checkpoint['bleu-4']
    encoder = checkpoint['encoder']
    encoder_optimizer = checkpoint['encoder_optimizer']
    decoder = checkpoint['decoder']
    decoder_optimizer = checkpoint['decoder_optimizer']
    # Fine-tuning requested now but the checkpoint carried no encoder optimizer: create one.
    if cfg['fine_tune_encoder'] is True and encoder_optimizer is None:
        encoder.fine_tune(cfg['fine_tune_encoder'])
        encoder_optimizer = torch.optim.Adam(params=filter(lambda p: p.requires_grad, encoder.parameters()),
                                             lr=cfg['encoder_lr'])

In [None]:
# Put both networks on the configured device (GPU when one is available, CPU otherwise).
decoder, encoder = decoder.to(cfg['device']), encoder.to(cfg['device'])

In [None]:
# Cross-entropy training loss, placed on the same device as the models.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(cfg['device'])

In [None]:
# Training and validation loaders over CaptionDataset; per-channel normalisation
# is the only transform applied to the images.
normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
train_loader = torch.utils.data.DataLoader(
    CaptionDataset(
        cfg['data_folder'],
        cfg['data_name'],
        'TRAIN',
        transform=transforms.Compose([normalize]),
    ),
    batch_size=cfg['batch_size'],
    shuffle=True,
    num_workers=cfg['workers'],
    pin_memory=True,
)
val_loader = torch.utils.data.DataLoader(
    CaptionDataset(
        cfg['data_folder'],
        cfg['data_name'],
        'VAL',
        transform=transforms.Compose([normalize]),
    ),
    batch_size=cfg['batch_size'],
    shuffle=True,
    num_workers=cfg['workers'],
    pin_memory=True,
)

In [11]:
# Main loop: one pass of training, one of validation, then a checkpoint, per epoch.
for epoch in range(cfg['start_epoch'], cfg['epochs']):

    # Early stopping once 20 epochs in a row have brought no improvement.
    if cfg['epochs_since_improvement'] == 20:
        break

    # On every 8th consecutive epoch without improvement, rescale the learning rate(s) by 0.8.
    if 0 < cfg['epochs_since_improvement'] and cfg['epochs_since_improvement'] % 8 == 0:
        adjust_learning_rate(decoder_optimizer, 0.8)
        if cfg['fine_tune_encoder']:
            adjust_learning_rate(encoder_optimizer, 0.8)

    # Train for one epoch.
    train(
        train_loader=train_loader,
        encoder=encoder,
        decoder=decoder,
        criterion=criterion,
        encoder_optimizer=encoder_optimizer,
        decoder_optimizer=decoder_optimizer,
        epoch=epoch,
        cfg=cfg,
    )

    # Score the current model on the validation set.
    recent_bleu4 = validate(
        val_loader=val_loader,
        encoder=encoder,
        decoder=decoder,
        criterion=criterion,
        word_map=word_map,
        cfg=cfg,
    )

    # Track the best score and how long it has been since it last improved.
    is_best = recent_bleu4 > cfg['best_bleu4']
    cfg['best_bleu4'] = max(recent_bleu4, cfg['best_bleu4'])
    if is_best:
        cfg['epochs_since_improvement'] = 0
    else:
        cfg['epochs_since_improvement'] += 1
        print("\nEpochs since last improvement: %d\n" % (cfg['epochs_since_improvement'],))

    # Persist the state reached after this epoch.
    save_checkpoint(
        cfg['data_name'], epoch, cfg['epochs_since_improvement'],
        encoder, decoder, encoder_optimizer, decoder_optimizer,
        recent_bleu4, is_best,
    )



Epoch: [0][0/7081]	Batch Time 2.060 (2.060)	Data Load Time 0.576 (0.576)	Loss 9.7415 (9.7415)	Top-5 Accuracy 0.000 (0.000)
Epoch: [0][100/7081]	Batch Time 0.351 (0.385)	Data Load Time 0.000 (0.006)	Loss 5.2778 (6.0655)	Top-5 Accuracy 45.913 (36.825)
Epoch: [0][200/7081]	Batch Time 0.340 (0.375)	Data Load Time 0.000 (0.003)	Loss 4.8722 (5.4840)	Top-5 Accuracy 50.792 (43.420)
Epoch: [0][300/7081]	Batch Time 0.346 (0.370)	Data Load Time 0.000 (0.002)	Loss 4.0554 (5.1533)	Top-5 Accuracy 60.815 (47.507)
Epoch: [0][400/7081]	Batch Time 0.349 (0.367)	Data Load Time 0.000 (0.002)	Loss 3.9230 (4.9324)	Top-5 Accuracy 63.499 (50.260)
Epoch: [0][500/7081]	Batch Time 0.343 (0.366)	Data Load Time 0.000 (0.001)	Loss 4.0953 (4.7705)	Top-5 Accuracy 61.605 (52.278)
Epoch: [0][600/7081]	Batch Time 0.357 (0.364)	Data Load Time 0.000 (0.001)	Loss 3.9031 (4.6470)	Top-5 Accuracy 61.991 (53.822)
Epoch: [0][700/7081]	Batch Time 0.349 (0.364)	Data Load Time 0.000 (0.001)	Loss 4.0979 (4.5470)	Top-5 Accuracy 60.5

In [12]:
# Back up the two checkpoint files from the Colab working directory (/content)
# into the lab's code folder on Google Drive.
# NOTE(review): source and destination names are identical to the copy cell that
# follows the first (attention) training run, so this overwrites those files on
# Drive — rename one set if both models need to be kept.
!cp /content/BEST_checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar /content/drive/MyDrive/Colab\ Notebooks/DL\ Lab/Lab6/code
!cp /content/checkpoint_coco_5_cap_per_img_5_min_word_freq.pth.tar /content/drive/MyDrive/Colab\ Notebooks/DL\ Lab/Lab6/code