In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project files imported
# below (`dataset`, `models`, `utils`) take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [2]:
import glob
from dataset import ImageCaption
from sklearn.model_selection import train_test_split
import torch.nn as nn
import torch
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torchvision.transforms as transforms
from models import *
import numpy as np
from torchtext.data.metrics import bleu_score
from utils import *
from torchtext.data.utils import get_tokenizer

# ---- Preprocessing configuration -------------------------------------------
# Side length (in pixels) of the square crop produced by both pipelines below.
input_size = 224
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Per-channel mean / std handed to `transforms.Normalize`. Spelled once so the
# train and val pipelines cannot drift apart.
# NOTE(review): these look like the ImageNet statistics torchvision documents
# for its pretrained models — confirm that EncoderCNN wraps such a backbone.
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

data_transforms = {
    # Training pipeline: random resized crop + random horizontal flip as
    # augmentation, then tensor conversion and normalization.
    'train': transforms.Compose([
        transforms.RandomResizedCrop(input_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD)
    ]),
    # Validation pipeline: deterministic resize + center crop, then the same
    # tensor conversion and normalization as training.
    'val': transforms.Compose([
        transforms.Resize(input_size),
        transforms.CenterCrop(input_size),
        transforms.ToTensor(),
        transforms.Normalize(_NORMALIZE_MEAN, _NORMALIZE_STD)
    ]),
}


# ---- Caption vocabulary ----------------------------------------------------
# torchtext tokenizer backed by spaCy's `en_core_web_sm` model.
tokenizer = get_tokenizer('spacy', language='en_core_web_sm')
# `build_vocab` arrives through one of the star imports above; presumably it
# builds the token -> index vocabulary from the captions file — confirm in its
# definition. `len(captions_vocab)` is later passed to DecoderRNN.
captions_vocab = build_vocab("image_captions/image_mapping and captions/12/captions.txt", tokenizer)

In [11]:


# ---- Data ------------------------------------------------------------------
# Both splits read the same two files; the paths are spelled once here so the
# train and test datasets cannot silently point at different data.
CAPTION_DATA_DIR = "image_captions/image_mapping and captions/12"
IMAGE_NAMES_PATH = f"{CAPTION_DATA_DIR}/image_names.txt"
CAPTIONS_PATH = f"{CAPTION_DATA_DIR}/captions.txt"

# DataLoader settings shared by the train and test loaders.
LOADER_BATCH_SIZE = 2
LOADER_NUM_WORKERS = 10

# The last argument ("train" / "val") is presumably the split selector of the
# project-local ImageCaption dataset — confirm in dataset.py.
image_cap_dataset_train = ImageCaption(
    IMAGE_NAMES_PATH, CAPTIONS_PATH, data_transforms["train"], "train"
)
image_cap_dataset_test = ImageCaption(
    IMAGE_NAMES_PATH, CAPTIONS_PATH, data_transforms["val"], "val"
)

# Training batches are reshuffled; evaluation order is kept fixed.
image_cap_train_dataloader = DataLoader(
    image_cap_dataset_train,
    batch_size=LOADER_BATCH_SIZE,
    num_workers=LOADER_NUM_WORKERS,
    shuffle=True,
)
# The name says "capt" where the train loader says "cap"; it is kept as-is
# because cells outside this view may reference it.
image_capt_test_dataloader = DataLoader(
    image_cap_dataset_test,
    batch_size=LOADER_BATCH_SIZE,
    num_workers=LOADER_NUM_WORKERS,
    shuffle=False,
)

# ---- Hyperparameters -------------------------------------------------------
embed_size = 300   # passed to both EncoderCNN and DecoderRNN
hidden_size = 32   # second argument of DecoderRNN
lr = 3e-4          # Adam learning rate
MAX_EPOCHS = 50    # not used in this cell; presumably read by a later training loop

# ---- Model -----------------------------------------------------------------
encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, len(captions_vocab)).to(device)

# ---- Loss and optimizer ----------------------------------------------------
# NOTE(review): the sample batch printed further down shows captions padded
# with trailing 1s. Confirm the training loop masks or packs those positions,
# or pass `ignore_index` here, so padding does not contribute to the loss.
criterion = nn.CrossEntropyLoss()

# Only the decoder and the encoder's `linear` / `bn` layers are handed to the
# optimizer; any other encoder parameters are therefore never updated by it.
# NOTE(review): presumably the rest of the encoder is a frozen pretrained
# backbone — confirm in models.py.
params = list(decoder.parameters()) + list(encoder.linear.parameters()) + list(encoder.bn.parameters())

optimizer = torch.optim.Adam(params, lr=lr)

In [17]:
# Sanity check: take one batch from the training loader, print its token
# tensor, and stop. Each batch unpacks into three items; only `tokens` is
# inspected here. The loop variables remain bound as notebook globals.
# In the recorded output every row starts with 2 and has a 3 followed by
# trailing 1s — presumably start / end / padding indices; confirm against
# build_vocab.
for image,embedding_vector,tokens in image_cap_train_dataloader:
    print(tokens)
    break

tensor([[   2, 5612,  520,   15,    4,   14,   13,   41,   97,  179,  125,    4,
           92,    5,    3,    1,    1,    1,    1,    1],
        [   2,    6,   32,  253,   10,   26,  381,   96,  206,   11,   53,    9,
            4,   61,  390,    5,    3,    1,    1,    1]])
