In [1]:
# Print the GPU model, driver/CUDA versions and current memory usage of this runtime.
!nvidia-smi

Sat Jun 26 08:13:05 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# The TensorBoard log directory used later in this notebook lives under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

In [4]:
!pip install kaggle -q
!kaggle datasets download -d aladdinpersson/flickr8kimagescaptions
!unzip -q flickr8kimagescaptions.zip

Downloading flickr8kimagescaptions.zip to /content
 99% 1.03G/1.04G [00:12<00:00, 127MB/s]
100% 1.04G/1.04G [00:12<00:00, 88.5MB/s]


In [5]:
# get the code form github
!git clone https://github.com/moaaztaha/Image-Captioning
py_files_path = 'Image-Captioning/'
import sys
sys.path.append(py_files_path)

Cloning into 'Image-Captioning'...
remote: Enumerating objects: 547, done.[K
remote: Counting objects: 100% (547/547), done.[K
remote: Compressing objects: 100% (248/248), done.[K
remote: Total 547 (delta 333), reused 505 (delta 291), pack-reused 0[K
Receiving objects: 100% (547/547), 38.27 MiB | 23.52 MiB/s, done.
Resolving deltas: 100% (333/333), done.


In [6]:
# Load (or reload) the autoreload extension.
%reload_ext autoreload
# Mode 2: re-import all modules before running each cell, so edits to the
# cloned project's .py files are picked up without restarting the kernel.
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [7]:
import time 
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision.transforms as transforms
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence
from models import Encoder, DecoderWithAttention
from dataset import *
from utils import *
from train import *
from torch.utils.tensorboard import SummaryWriter
from os import path as osp

In [8]:
# ---------------------------------------------------------------------------
# Runtime
# ---------------------------------------------------------------------------
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Only worthwhile when model inputs have a fixed size; with varying sizes it
# adds a lot of computational overhead.
cudnn.benchmark = True

# ---------------------------------------------------------------------------
# Model
# ---------------------------------------------------------------------------
encoder_dim = 2048    # encoder feature size (resnet101)
emb_dim = 512         # word-embedding size
attention_dim = 512   # size of the attention linear layers
decoder_dim = 512     # size of the decoder RNN
dropout = 0.5

# ---------------------------------------------------------------------------
# Training
# ---------------------------------------------------------------------------
epochs = 30           # upper bound on epochs (early stopping may end training sooner)
batch_size = 256
workers = 2
decoder_lr = 4e-4     # decoder learning rate
encoder_lr = 1e-4     # encoder learning rate, relevant when fine-tuning the encoder
fine_tune_encoder = False
pretrained_embeddings = False
fine_tune_embeddings = False
checkpoint = None     # checkpoint path, or None when there is none


In [10]:
# Dataset identifier.
DATA_NAME = "flickr8k_ar"

# Dataset locations for the Colab runtime. Equivalents for other environments:
#   local  : DATA_JSON_PATH = 'ar_data.json'
#            IMGS_PATH = 'flickr/Images/'
#   kaggle : DATA_JSON_PATH = '/kaggle/working/Image-Captioning/data.json'
#            IMGS_PATH = '../input/flickr8kimagescaptions/flickr8k/images/'
IMGS_PATH = "flickr8k/images/"
DATA_JSON_PATH = "Image-Captioning/ar_data.json"

In [11]:
# Build the vocabulary from the captions JSON.
# max_seq is forwarded to build_vocab; presumably a cap on caption length -- confirm in dataset/utils.
max_seq = 30
vocab = build_vocab(DATA_JSON_PATH, max_seq=max_seq)

vocab_len = len(vocab)
vocab_len  # echo the vocabulary size as the cell output

100%|██████████| 24000/24000 [00:00<00:00, 380040.76it/s]


5788

In [12]:
# Peek at the first ten index -> token entries of the vocabulary.
first_ids = list(vocab.itos.keys())[:10]
first_tokens = list(vocab.itos.values())[:10]
first_ids, first_tokens

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
 ['<pad>',
  '<sos>',
  '<eos>',
  '<unk>',
  'طفلة',
  'صغيرة',
  'تتسلق',
  'إلى',
  'كلب',
  'أسود'])

In [13]:
# Training configuration, passed to `fit` as `t_params`.
t_params = {
    'data_name': DATA_NAME,
    'imgs_path': IMGS_PATH,
    'df_path': DATA_JSON_PATH,
    'vocab': vocab,
    'epochs': epochs,
    'batch_size': batch_size,
    'workers': workers,
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
}

# Model dimensions, passed to `fit` as `m_params`.
m_params = {
    'attention_dim': attention_dim,
    'embed_dim': emb_dim,
    'decoder_dim': decoder_dim,
    'encoder_dim': encoder_dim,
    'dropout': dropout,
}

# Flat hyperparameter summary of the run. Presumably meant for experiment
# logging; it is not referenced by the cells visible in this notebook.
logger_dic = {
    'decoder_lr': decoder_lr,
    'encoder_lr': encoder_lr,
    'fine_tune_encoder': fine_tune_encoder,
    'pretrained_embeddings': pretrained_embeddings,
    'max_seq_length': max_seq,
    'vocab_size': vocab_len,
    'encoder': 'resnet101',  # key was previously misspelled as 'enocder'
    # Reuse the model dimensions instead of repeating them by hand, so the
    # logged values cannot drift from what the model is built with.
    **m_params,
}

In [14]:
# Display the assembled training configuration as a sanity check before training.
t_params

{'batch_size': 256,
 'data_name': 'flickr8k_ar',
 'decoder_lr': 0.0004,
 'df_path': 'Image-Captioning/ar_data.json',
 'encoder_lr': 0.0001,
 'epochs': 30,
 'fine_tune_encoder': False,
 'imgs_path': 'flickr8k/images/',
 'pretrained_embeddings': False,
 'vocab': <dataset.Vocabulary at 0x7f7e18aaefd0>,
 'workers': 2}

In [17]:
# Experiment name: the dataset name followed by an optional suffix (currently empty).
run_suffix = ""
name = f"{DATA_NAME}{run_suffix}"

# Root directory (on the mounted Google Drive) that holds all experiment logs.
log_dir = '/content/drive/MyDrive/ImageCaptioning/flickr8_ar/experiments'

# Each experiment writes its TensorBoard events to its own sub-directory.
run_dir = osp.join(log_dir, name)
logger = SummaryWriter(log_dir=run_dir)

In [None]:
# Launch training with the training config, model config and TensorBoard writer.
# `fit` is brought in by the project's star-imports (presumably from `train` -- confirm).
# Per the logged output, it loads the data and then runs training and validation
# phases per epoch, reporting loss, top-5 accuracy and BLEU-1..4 scores.
fit(t_params=t_params, m_params=m_params, logger=logger)

Downloading: "https://download.pytorch.org/models/resnet101-63fe2227.pth" to /root/.cache/torch/hub/checkpoints/resnet101-63fe2227.pth


HBox(children=(FloatProgress(value=0.0, max=178793939.0), HTML(value='')))


Loading Data
Dataset split: train
Unique images: 6000
Total size: 18000
Dataset split: val
Unique images: 1000
Total size: 3000
__________________________________________________
-------------------- Fitting --------------------
__________________________________________________
-------------------- Training --------------------


  return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


Epoch: [0][0/71]	Batch Time 8.471 (8.471)	Data Load Time 3.622 (3.622)	Loss 9.6074 (9.6074)	Top-5 Accuracy 0.144 (0.144)
Epoch train time 154.997 (epoch_time.avg:.3f)
-------------------- Validation --------------------
Validation: [0/12]	Batch Time 5.519 (5.519)	Loss 6.4465 (6.4465)	Top-5 Accuracy 36.128 (36.128)	
----- Bleu-n Scores -----
1: 34.28116163569975
2: 15.004966110980616
3: 5.752228240109723
4: 2.3468998100455627
-------------------------

 * LOSS - 6.468, TOP-5 ACCURACY - 35.838, BLEU-4 - 2.3468998100455627

Epoch validation time 32.159 (epoch_time.avg:.3f)
__________________________________________________
-------------------- Training --------------------
Epoch: [1][0/71]	Batch Time 5.749 (5.749)	Data Load Time 3.375 (3.375)	Loss 5.9978 (5.9978)	Top-5 Accuracy 40.631 (40.631)
