# Image Captioning using PyTorch

### 1. Auto-reload by default for modules.

In [65]:
%load_ext autoreload
%autoreload 2

### 2. Importing required modules

In [101]:
import pandas as pd
from PIL import Image
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from utils.data_loader import data_loader
from torch.utils.tensorboard import SummaryWriter
from utils.models import Captioner
import torch.optim as optim

### 3. Explore the data

In [71]:
captions = pd.read_csv("Data/caption_train.csv")

In [72]:
captions.head(5)

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...


In [73]:
captions[:500]

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,A child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,A girl going into a wooden building .
2,1000268201_693b08cb0e.jpg,A little girl climbing into a wooden playhouse .
3,1000268201_693b08cb0e.jpg,A little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,A little girl in a pink dress going into a woo...
...,...,...
495,1107471216_4336c9b328.jpg,A little girl is holding a cine camera in fron...
496,1107471216_4336c9b328.jpg,A young girl is looking through an old fashion...
497,1107471216_4336c9b328.jpg,A young girl steadies her aim with a camera
498,1107471216_4336c9b328.jpg,Girl with rosy cheeks and lips holding black t...


In [75]:
# Open one training image and render it inline in the notebook.
# A bare last expression uses the image's rich notebook display; Image.show()
# instead launches an external viewer on the machine running the kernel, which
# shows nothing on remote/headless kernels and is not saved with the notebook.
little_girl = Image.open("Data/Images/train/1000268201_693b08cb0e.jpg")
little_girl

### 4. Make transformation for the data

In [88]:
# Image preprocessing pipeline, applied in order:
#   1. resize to 356x356
#   2. take a random 299x299 crop
#   3. convert the PIL image to a tensor
#   4. normalise each of the three channels with mean 0.5 and std 0.5
img_transform = transforms.Compose([
    transforms.Resize(size=(356, 356)),
    transforms.RandomCrop(size=(299, 299)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

### 5. Getting the training data in batches

In [89]:
# Build the training loader together with its underlying dataset, using the
# preprocessing pipeline defined above for every image.
data_loader__training, train_dataset = data_loader(
    root_dir="./Data/Images/train",
    caption_file="./Data/caption_train.csv",
    transform=img_transform,
)

In [90]:
# Sanity check: pull only the first batch and report its index and tensor shapes
# (one value per line), then stop iterating.
for i, (image, caption) in enumerate(data_loader__training):
    print(i, image.shape, caption.shape, sep="\n")
    break

0
torch.Size([32, 3, 299, 299])
torch.Size([23, 32])


In [108]:
def train():
    """
    Train the captioner.

    Builds the train/test/validation loaders, a ``Captioner`` model, an Adam
    optimizer and a cross-entropy loss that ignores the ``<PAD>`` token, then
    iterates over the training loader for ``num_epochs`` epochs. The per-batch
    loss is written to TensorBoard (``runs/flickr``) and a checkpoint is handed
    to ``save_checkpoint`` after every 10th epoch.

    NOTE(review): ``load_checkpoint`` and ``save_checkpoint`` are not among the
    imports visible in this notebook - confirm they are in scope before relying
    on checkpointing.

    :return: None
    """
    print("Training")
    # Apply some transformation to our data
    transform = transforms.Compose(
        [
            transforms.Resize((356, 356)),
            transforms.RandomCrop((299, 299)),
            transforms.ToTensor(),
            transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ]
    )

    # get the data
    training_data, train_dataset = data_loader(root_dir="./Data/Images/train",
                                               caption_file="./Data/caption_train.csv",
                                               transform=transform, num_workers=6)

    # get the test data (loaded here but not consumed by the training loop below)
    test_data, test_dataset = data_loader(root_dir="./Data/Images/test",
                                          caption_file="./Data/caption_test.csv",
                                          transform=transform, num_workers=6)

    # get validation data (loaded here but not consumed by the training loop below)
    valid_data, valid_dataset = data_loader(root_dir="./Data/Images/valid",
                                            caption_file="./Data/caption_valid.csv",
                                            transform=transform, num_workers=6)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    load_model = False
    save_model = True

    # hyperparameters
    embed_size = 256
    hidden_size = 256
    vocabulary_size = len(train_dataset.vocabulary)
    num_layer = 1
    lr = 3e-4
    num_epochs = 101

    # Tensorboard
    writer = SummaryWriter("runs/flickr")
    step = 0

    # Initialize model
    model = Captioner(embed_size, hidden_size, vocabulary_size, num_layer).to(device)

    # loss function: padded positions do not contribute to the loss
    criterion = nn.CrossEntropyLoss(ignore_index=train_dataset.vocabulary.stoi["<PAD>"])

    # optimizer
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # load checkpoint if already saved; map_location lets a checkpoint written on
    # a GPU machine be restored on a CPU-only one (and vice versa)
    if load_model:
        step = load_checkpoint(torch.load("checkpoint.pth.tar", map_location=device), model, optimizer)

    model.train()

    # training process; the try/finally guarantees the TensorBoard writer is
    # flushed and closed even when the run is interrupted (e.g. KeyboardInterrupt)
    try:
        for epoch in range(num_epochs):
            running_loss = 0.0

            for index, (images, captions) in enumerate(training_data):
                images = images.to(device)
                captions = captions.to(device)
                # the last caption token is withheld from the model input
                output = model(images, captions[:-1])

                # flatten to (positions, vocabulary) vs. (positions,) for the loss
                loss = criterion(output.reshape(-1, output.shape[2]), captions.reshape(-1))

                writer.add_scalar("Training loss", loss.item(), global_step=step)
                step += 1
                optimizer.zero_grad()
                # Plain backward(): passing the loss itself as the `gradient`
                # argument would scale every gradient by the loss value.
                loss.backward()
                optimizer.step()

                # print statistics
                # accumulate the training loss
                running_loss += loss.item()

                print(f'going through batches the current epoch {epoch}/{num_epochs}')
                print(f"current batch {index} / {len(training_data)}")

            print(f'Epoch: {epoch + 1}/{num_epochs} ... Training loss: {running_loss / len(training_data)}')

            # save model each 10 epochs; the checkpoint dict is only built when
            # it is actually going to be written
            if save_model and (epoch + 1) % 10 == 0:
                checkpoint = {
                    "state_dict": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "step": step
                }
                save_checkpoint(checkpoint, epoch)
    finally:
        writer.close()


In [109]:
train()

Training
going through batches the current epoch 0/101
current batch 0 / 796
going through batches the current epoch 0/101
current batch 1 / 796
going through batches the current epoch 0/101
current batch 2 / 796
going through batches the current epoch 0/101
current batch 3 / 796
going through batches the current epoch 0/101
current batch 4 / 796
going through batches the current epoch 0/101
current batch 5 / 796
going through batches the current epoch 0/101
current batch 6 / 796
going through batches the current epoch 0/101
current batch 7 / 796
going through batches the current epoch 0/101
current batch 8 / 796
going through batches the current epoch 0/101
current batch 9 / 796
going through batches the current epoch 0/101
current batch 10 / 796
going through batches the current epoch 0/101
current batch 11 / 796
going through batches the current epoch 0/101
current batch 12 / 796
going through batches the current epoch 0/101
current batch 13 / 796
going through batches the current e

KeyboardInterrupt: 

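## Data Preparation (run once, before the sections above — it creates the train/test/valid caption files and image folders they read)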

In [10]:
import pandas as pd
import os

In [12]:
df = pd.read_csv("Data/captions.txt")

In [27]:
len(df)

40455

In [18]:
df_valid = df[35455:]

In [35]:
df_valid

Unnamed: 0,image,caption
35455,378453580_21d688748e.jpg,A dog is jumping over a log in a wooded area w...
35456,378453580_21d688748e.jpg,A dog with a stick in his mouth jumps over a f...
35457,378453580_21d688748e.jpg,Dog carries stick and jumps over a log .
35458,378453580_21d688748e.jpg,The dog carries a stick and jumps over a log i...
35459,378453580_21d688748e.jpg,The dog jumps over the log with a stick in its...
...,...,...
40450,997722733_0cb5439472.jpg,A man in a pink shirt climbs a rock face
40451,997722733_0cb5439472.jpg,A man is rock climbing high in the air .
40452,997722733_0cb5439472.jpg,A person in a red shirt climbing up a rock fac...
40453,997722733_0cb5439472.jpg,A rock climber in a red shirt .


In [42]:
valid_images_array = list(set(df_valid["image"].to_list()))

In [63]:
# Persist the validation split without a header row and without the index column.
# NOTE(review): no column names are written; confirm the code that reads this file
# (utils.data_loader) does not expect an `image`/`caption` header line.
df_valid.to_csv("Data/caption_valid.csv", header=False, index=False)

In [44]:
len(valid_images_array)

1000

In [48]:
# Move every validation image into its own sub-folder.
valid_dir = "Data/Images/valid"
# os.rename does not create missing directories, so make sure the target exists.
os.makedirs(valid_dir, exist_ok=True)
for image in valid_images_array:
    target_path = f"{valid_dir}/{image}"
    # Already moved by an earlier run of this cell: skip so the cell is re-runnable.
    if os.path.exists(target_path):
        continue
    os.rename(f"Data/Images/{image}", target_path)

In [19]:
df_test = df[25455:35455]

In [60]:
# Persist the test split without a header row and without the index column.
# NOTE(review): no column names are written; confirm the code that reads this file
# (utils.data_loader) does not expect an `image`/`caption` header line.
df_test.to_csv("Data/caption_test.csv", header=False, index=False)

In [21]:
len(df_test)

10000

In [49]:
test_images_array = list(set(df_test["image"].to_list()))

In [50]:
len(test_images_array)

2000

In [51]:
# Move every test image into its own sub-folder.
test_dir = "Data/Images/test"
# os.rename does not create missing directories, so make sure the target exists.
os.makedirs(test_dir, exist_ok=True)
for image in test_images_array:
    target_path = f"{test_dir}/{image}"
    # Already moved by an earlier run of this cell: skip so the cell is re-runnable.
    if os.path.exists(target_path):
        continue
    os.rename(f"Data/Images/{image}", target_path)

In [53]:
df_train = df[:25455]

In [54]:
len(df_train)

25455

In [62]:
# Persist the training split without a header row and without the index column.
# NOTE(review): the exploration cell reads this file with a plain pd.read_csv and its
# recorded output shows `image`/`caption` column names, which a header-less file
# would not produce - confirm whether header=False is intended here.
df_train.to_csv("Data/caption_train.csv", header=False, index=False)

In [55]:
train_images_array = list(set(df_train["image"].to_list()))

In [56]:
# Move every training image into its own sub-folder.
train_dir = "Data/Images/train"
# os.rename does not create missing directories, so make sure the target exists.
os.makedirs(train_dir, exist_ok=True)
for image in train_images_array:
    target_path = f"{train_dir}/{image}"
    # Already moved by an earlier run of this cell: skip so the cell is re-runnable.
    if os.path.exists(target_path):
        continue
    os.rename(f"Data/Images/{image}", target_path)