In [13]:
# # get the code for kaggle
# !git clone https://github.com/moaaztaha/Image-Captioning
# py_files_path = '/kaggle/working/Image-Captioning/'
# import sys
# sys.path.append(py_files_path)

fatal: destination path 'Image-Captioning' already exists and is not an empty directory.


In [1]:
# (Re)load the autoreload extension and reload imported modules before each
# cell executes, so edits to the local .py helpers are picked up live.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [67]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
import torchvision.transforms as transforms


import pandas as pd
import numpy as np
import spacy

from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import time, math, random

from datasets import build_vocab, get_loaders
from models import EncoderCNN
from model2 import Img2Seq, DecoderRNN
from utils import train, evaluate, epoch_time, print_examples, predict_test, print_scores
from utils import get_test_data

%matplotlib inline

In [68]:
# Make results reproducible: seed Python's `random`, NumPy and PyTorch
# (CPU and CUDA) generators with the same value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [69]:
# Kaggle paths, kept for reference (uncomment when running on Kaggle).
# MODEL_PATH = 'models/splits.pth'
# IMAGES_PATH = '../input/flickr8k/Images/'
# DF_PATH = '/kaggle/working/Image-Captioning/data.csv'
# TEST_DF_PATH = '/kaggle/working/Image-Captioning/test.csv'
# TEST_EXAMPLES_PATH = '/kaggle/working/Image-Captioning/test_examples/'

# Local paths, relative to the notebook's working directory.
# NOTE(review): MODEL_PATH has no local counterpart here; the training cell
# saves its checkpoint to the hard-coded 'gru.pth'.
IMAGES_PATH = 'flickr/Images/'
DF_PATH = 'data.csv'
TEST_DF_PATH = 'test.csv'
TEST_EXAMPLES_PATH = 'test_examples/'

In [70]:
# Build the vocabulary from the captions file and look up the index of the
# '<pad>' token; the loss uses it as `ignore_index`.
vocab = build_vocab(DF_PATH)
pad_idx = vocab.stoi['<pad>']

In [71]:
# Hyperparameters shared by the encoder, the decoder and the training cells.
HID_DIM = 256
EMB_DIM = 256
DROPOUT = .5
VOCAB_LENGTH = len(vocab)  # number of entries in the vocabulary
TRAIN_CNN = False  # read by the freezing loop: grads for non-fc CNN params follow this flag
bs = 16  # batch size handed to get_loaders
lr = 3e-3  # learning rate handed to Adam

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

enc = EncoderCNN(HID_DIM, DROPOUT)
dec = DecoderRNN(EMB_DIM, HID_DIM, VOCAB_LENGTH, DROPOUT)

# Combine encoder and decoder and move the whole model to `device`.
model = Img2Seq(enc, dec, device).to(device)

In [72]:
# Image preprocessing: resize, center-crop to 224x224, convert to a tensor and
# normalize each channel. The mean/std values match the ImageNet statistics
# commonly used with torchvision's pretrained models.
# NOTE(review): the encoder exposes an `inception` sub-module (see the freezing
# loop); torchvision documents Inception v3 for 299x299 inputs — confirm that
# a 224 crop is intended.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [73]:
# Cross-entropy over vocabulary entries; targets equal to pad_idx are ignored.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
# Adam is handed every model parameter, including the ones frozen below.
optimizer = optim.Adam(model.parameters(), lr=lr)

# Freeze the CNN unless TRAIN_CNN is set; parameters whose name contains
# "fc.weight" or "fc.bias" always stay trainable.
# NOTE(review): this is a substring match, so any parameter named like
# "<prefix>.fc.weight" is kept trainable as well — confirm that is intended.
for name, param in model.encoder.inception.named_parameters():
    if "fc.weight" in name or "fc.bias" in name:
        param.requires_grad = True
    else:
        param.requires_grad = TRAIN_CNN

In [74]:
def count_parameters(model):
    """Return the total element count of `model`'s trainable parameters.

    Only parameters whose `requires_grad` flag is set contribute to the sum;
    frozen parameters are skipped.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 5,688,429 trainable parameters


In [75]:
# Build the training and validation loaders (get_loaders prints a summary of
# each split, shown in the cell output).
train_loader, valid_loader = get_loaders(bs, IMAGES_PATH, DF_PATH, transform, vocab)

Dataset split: train
Unique Image: 6000
Size: 30000
Dataset split: val
Unique Image: 1000
Size: 5000


In [76]:
# Sanity check: pull a single validation batch and inspect the tensor shapes.
xb, yb = next(iter(valid_loader))
xb.shape, yb.shape

(torch.Size([16, 3, 224, 224]), torch.Size([18, 16]))

In [77]:
# Sanity check: run the encoder alone on the batch and inspect the output shape.
features = enc(xb.to(device))
features.shape

torch.Size([16, 256])

In [78]:
# NOTE(review): this only packs `features` and the caption batch into a tuple
# and unpacks them again — no decoder call is made, so `preds` / `hidden` are
# simply the encoder features and the captions. Presumably a call such as
# `dec(...)` was intended here; confirm or remove the cell.
preds, hidden = (features, yb.to(device))
preds.shape, hidden.shape

(torch.Size([16, 256]), torch.Size([18, 16]))

In [79]:
# Sanity check: full forward pass through the combined model; the result has
# one score per vocabulary entry for every caption position and batch item.
pp = model(xb.to(device), yb.to(device))
pp.shape

torch.Size([18, 16, 4461])

In [80]:
def train(model, iterator, optimizer, criterion, clip):
    """Train `model` for one pass over `iterator` and return the mean batch loss.

    This notebook-local definition replaces the `train` imported from `utils`
    in the imports cell.

    Args:
        model: callable as `model(imgs, captions)`; must expose `.device`,
            `.train()` and `.parameters()`.
        iterator: sized iterable yielding `(imgs, captions)` batches.
        optimizer: optimizer that is zeroed and stepped once per batch.
        criterion: loss called as `criterion(scores_2d, targets_1d)`.
        clip: maximum gradient norm handed to `clip_grad_norm_`.

    Returns:
        The sum of the per-batch losses divided by `len(iterator)`.
    """
    # NOTE(review): the loss covers every position of `captions`, including
    # index 0 — confirm against Img2Seq.forward that no target shift is needed.
    model.train()
    batch_losses = []

    progress = tqdm(iterator, total=len(iterator), position=0, leave=False, desc="training")
    for imgs, captions in progress:
        optimizer.zero_grad()

        imgs, captions = imgs.to(model.device), captions.to(model.device)
        scores = model(imgs, captions)  # [trg len, batch size, output dim]

        # Flatten the leading dimensions so every caption position is one sample.
        n_classes = scores.shape[2]
        loss = criterion(scores.reshape(-1, n_classes), captions.reshape(-1))
        loss.backward()

        # Limit the overall gradient norm before applying the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)


def evaluate(model, iterator, criterion):
    """Compute the mean batch loss of `model` over `iterator` without training.

    This notebook-local definition replaces the `evaluate` imported from
    `utils` in the imports cell.

    Args:
        model: callable as `model(images, captions)`; must expose `.device`
            and `.eval()`.
        iterator: sized iterable yielding `(images, captions)` batches.
        criterion: loss called as `criterion(scores_2d, targets_1d)`.

    Returns:
        The sum of the per-batch losses divided by `len(iterator)`.
    """
    model.eval()
    batch_losses = []

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        progress = tqdm(iterator, total=len(iterator), position=0, leave=False, desc="Evaluating")
        for images, captions in progress:
            images, captions = images.to(model.device), captions.to(model.device)
            scores = model(images, captions)  # [trg len, batch size, output dim]

            # Flatten the leading dimensions so every caption position is one sample.
            n_classes = scores.shape[2]
            loss = criterion(scores.reshape(-1, n_classes), captions.reshape(-1))
            batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)



In [81]:
# Training driver: run N_EPOCHS epochs, validating after each one and keeping
# the weights of the epoch with the lowest validation loss.
N_EPOCHS = 3
CLIP = 1  # max gradient norm passed to train()

# Lowest validation loss seen so far; starts at +inf so the first epoch always saves.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_loader, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves.
    # NOTE(review): the checkpoint path is hard-coded; the commented-out
    # MODEL_PATH in the paths cell is not used.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'gru.pth')
        
    # PPL (perplexity) is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

training:   0%|          | 2/1875 [00:02<50:39,  1.62s/it]  

[A[A                                                     


[A[A[A                                                   
[A                                                        



                                                          

KeyboardInterrupt: 