In [1]:
!pip install torch
!pip install torchvision
!pip install kaggle
!pip install matplotlib
!pip install spacy
!pip install pandas
# English tokenizer data
!spacy download en_core_web_sm

[0m2024-05-18 00:36:14.595932: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 00:36:14.596005: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 00:36:14.597518: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-18 00:36:14.605765: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-18 00:36:17.400529: I external/local_xla/

Imports

In [2]:
# Used for model
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

# Used to format data
import torchvision
import torchvision.transforms as transforms
# Used to show results
import matplotlib.pyplot as plt

# Used to download data
import os
#import kaggle
import zipfile
from pathlib import Path

# Additional file imports
from image_captioning_from_scratch import CNNtoRNN
from dataset import Flickr8kDataset
from dataset import Collate

# Used to convert test images
from PIL import Image

2024-05-18 00:36:30.417160: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 00:36:30.417265: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 00:36:30.418888: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-18 00:36:30.428108: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-05-18 00:36:33.062615: I external/local_xla/xla/

Display Images

In [3]:
def displayImage(img, caption):
    """Show one image tensor in a new matplotlib figure, titled with its caption.

    Args:
        img: Channels-first image tensor, either ``(C, H, W)`` or a batch that
            holds exactly one image, ``(1, C, H, W)`` (the shape produced by
            ``unsqueeze(0)`` when preparing a model input).
        caption: Value handed to ``plt.title`` as the figure title.

    Note:
        Pixel values are drawn as they are; no de-normalisation is applied, so
        a tensor that went through ``transforms.Normalize`` will not show its
        original colours.
    """
    # A single-image batch carries a leading dimension that the 3-axis
    # permute below cannot handle; drop it.
    if img.dim() == 4 and img.shape[0] == 1:
        img = img[0]
    # matplotlib needs host memory and a tensor that is not tracking gradients.
    img = img.detach().cpu()

    plt.figure()
    # (C, H, W) -> (H, W, C), the layout imshow expects
    plt.imshow(img.permute(1, 2, 0))
    plt.title(caption)
    plt.axis("off")

# Dataset

In [4]:
# Preprocessing pipeline handed to the dataset: resize to 512x512, random
# horizontal / vertical flips (each with probability 0.5), conversion to a
# tensor, then per-channel normalisation.
transform = transforms.Compose([
    transforms.Resize((512, 512)),
    transforms.RandomHorizontalFlip(0.5),
    transforms.RandomVerticalFlip(0.5),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

In [5]:
def getData(directory):
    """Download the Flickr8k archive from Kaggle and unpack it into ``directory``.

    The function is safe to call repeatedly:

    * if ``<directory>/captions.txt`` already exists the dataset is treated as
      extracted and nothing else happens;
    * if ``flickr8k.zip`` is already present in the working directory it is
      reused instead of being downloaded again.

    Args:
        directory: Folder the archive content is extracted into. It is created
            when missing.

    Raises:
        FileNotFoundError: If the ``kaggle`` command-line tool is not installed.
        subprocess.CalledProcessError: If the Kaggle download fails (for
            example because no API credentials are configured).
    """
    # Local import: this helper is the only place that launches a subprocess.
    import subprocess

    archive = Path('flickr8k.zip')
    target = Path(directory)

    # Create dataset folder
    if not target.exists():
        os.makedirs(directory)
        print(f"Directory {directory} created successfully...")
    else:
        print(f"Directory {directory} already exists...")

    # The notebook reads `<directory>/captions.txt`, so its presence marks a
    # completed extraction and the download/unzip can be skipped.
    if (target / 'captions.txt').exists():
        print(f"Dataset already extracted in {directory}...")
        return

    # Download images and captions from kaggle (archive lands in the CWD).
    if not archive.exists():
        subprocess.run(
            ["kaggle", "datasets", "download", "-d", "adityajn105/flickr8k"],
            check=True,  # surface download failures here, not as a missing zip later
        )

    # Context manager guarantees the archive handle is closed after extraction.
    with zipfile.ZipFile(archive) as zip_file:
        zip_file.extractall(directory)
    print(f"Finished Extracting to {directory}...")


In [6]:
# Folder holding the extracted dataset; the dataset cell below reads
# `Images/` and `captions.txt` from it.
dataset_path = "./dataset/"
# Download images and captions. Left commented out because the data already
# exists locally; uncomment on a fresh checkout.
#getData(dataset_path)

In [7]:
def saveModel(model, name):
    """Serialise ``model`` to ``./models/<name>`` with ``torch.save``.

    The ``./models/`` folder is created on first use, and a message reporting
    whether it had to be created is printed on every call.

    Args:
        model: Object to serialise; it is passed to ``torch.save`` as-is
            (the whole object, not only its ``state_dict``).
        name: File name to use inside ``./models/``.
    """
    directory = './models/'

    # Report on (and if necessary create) the output folder.
    folder_present = os.path.exists(directory)
    if folder_present:
        print(f"Directory {directory} already exists...")
    else:
        os.makedirs(directory)
        print(f"Directory {directory} created successfully...")

    # Write the model next to any previously saved ones.
    torch.save(model, os.path.join(directory, name))

In [8]:
# DataLoader parameters
BATCH_SIZE = 4
num_workers = 8
# Seed for the train/validation split, so that the same samples end up in each
# subset on every run (otherwise a reloaded model could be validated on images
# it was trained on).
SPLIT_SEED = 42

# Initialize Dataset
root = os.path.join(dataset_path, 'Images')
annotations = os.path.join(dataset_path, 'captions.txt')
# Forwarded to Flickr8kDataset as its frequency-threshold argument
freq_threshold = 6
dataset = Flickr8kDataset(root, annotations, transform, freq_threshold)

# Fraction of the samples used for training; the remainder is for validation
ratio = 0.8

train_split = int(ratio * len(dataset))
valid_split = len(dataset) - train_split

train_dataset, valid_dataset = torch.utils.data.random_split(
    dataset,
    [train_split, valid_split],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# Get numerical index of padding
pad_idx = dataset.vocab.stoi["<PAD>"]
# Pad dataset and initialize dataloader
train_loader = DataLoader(
    train_dataset,
    batch_size = BATCH_SIZE,
    num_workers = num_workers,
    shuffle = True,
    collate_fn=Collate(pad_idx)
    )
# The validation loss is an average over all batches, so visiting them in a
# fixed order loses nothing and keeps the pass deterministic.
valid_loader = DataLoader(
    valid_dataset,
    batch_size = BATCH_SIZE,
    num_workers = num_workers,
    shuffle = False,
    collate_fn=Collate(pad_idx)
)

Training Loop

In [None]:
def train_epoch(model, dataloader, loss_fn, optimizer, device = 'cpu'):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Captioning model, called as ``model(images, captions[:-1])``.
        dataloader: Sized iterable yielding ``(images, captions)`` batches.
        loss_fn: Callable taking the flattened predictions and flattened
            captions and returning a scalar loss tensor.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to before the forward pass.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    # Put model into train mode
    model.train()
    total_loss = 0.0

    for images, captions in dataloader:
        # Get batch of images and captions
        images, captions = images.to(device), captions.to(device)
        # The last token is withheld from the input; the model is expected to
        # combine the image features with the caption embeddings so that the
        # prediction ends up as long as the full caption.
        preds = model(images, captions[:-1])
        # Preds: Sentence Length x Batch Size x Vocab Size -> (N, Vocab Size)
        # Captions: Sentence Length x Batch Size -> (N,)
        loss = loss_fn(preds.reshape(-1, preds.shape[2]), captions.reshape(-1))
        total_loss += loss.item()

        # Optimization
        optimizer.zero_grad()
        # Plain backward(): handing the loss in as the `gradient` argument
        # would multiply every gradient by the loss value.
        loss.backward()
        optimizer.step()

    # Calculate average loss
    return total_loss / len(dataloader)

Validation Loop

In [16]:
def valid_epoch(model, dataloader, loss_fn, device = 'cpu'):
    """Evaluate ``model`` on every batch of ``dataloader`` without updating it.

    Args:
        model: Captioning model, called as ``model(images, captions[:-1])``.
        dataloader: Sized iterable yielding ``(images, captions)`` batches.
        loss_fn: Callable taking the flattened predictions and flattened
            captions and returning a scalar loss tensor.
        device: Device the batches are moved to before the forward pass.

    Returns:
        float: Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    # Evaluation mode for the whole pass
    model.eval()
    running_loss = 0.0

    # No autograd bookkeeping is needed while validating
    with torch.inference_mode():
        for images, captions in dataloader:
            images, captions = images.to(device), captions.to(device)
            # Same input convention as training: the last token is withheld
            logits = model(images, captions[:-1])
            vocab_dim = logits.shape[2]
            batch_loss = loss_fn(logits.reshape(-1, vocab_dim), captions.reshape(-1))
            running_loss += batch_loss.item()

    # Mean over the number of batches
    return running_loss / len(dataloader)

Hyperparameters

In [11]:
# Arguments forwarded to the CNNtoRNN constructor
embed_size = 256
hidden_size = 256
vocab_size = len(dataset.vocab)  # taken from the vocabulary held by the dataset
num_layers = 1
# Initial learning rate handed to the optimizer
lr = 0.01
# Number of train + validation passes run by the training loop
EPOCHS = 50

Model and Optimization

In [21]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Build the captioning model and move it to the selected device
model = CNNtoRNN(embed_size, hidden_size, vocab_size, num_layers).to(device)
# File name under which saveModel() stores this run's model.
# NOTE(review): ':' and '|' are not valid in Windows file names — confirm the
# notebook is only run on platforms that accept them.
name = f"Iter:2|Layers:{num_layers}"

# Loading Model (saveModel writes to ./models/<name>, so load from there)
# model = torch.load(os.path.join("./models/", name)).to(device)

In [22]:
# Optimization setup: target device, loss, optimizer and learning-rate schedule
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Padding positions are excluded from the loss
loss_fn = nn.CrossEntropyLoss(ignore_index=dataset.vocab.stoi['<PAD>'])
optim = torch.optim.Adam(model.parameters(), lr=lr)
# Learning rate is multiplied by 0.85 on every scheduler step
lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optim, gamma=0.85)

Training

In [62]:
# Average loss of every epoch, kept for the plot at the end of the cell
train_loss_i = []
validation_loss_i = []
for epoch in range(EPOCHS):
    # Train for a single epoch
    train_loss = train_epoch(model, train_loader, loss_fn, optim, device)
    # Validation Epoch
    valid_loss = valid_epoch(model, valid_loader, loss_fn, device)
    # Store loss
    train_loss_i.append(train_loss)
    validation_loss_i.append(valid_loss)
    # Decrease learning rate
    lr_scheduler.step()
    print("Epoch {} | Train Loss: {} | Validation Loss: {}".format(epoch, train_loss, valid_loss))

saveModel(model, name)

# `plt.figure` without parentheses only names the function; call it so the
# curves are drawn on a fresh figure.
plt.figure()
plt.plot(train_loss_i, label = "Training Loss")
plt.plot(validation_loss_i, label = "Validation Loss")
plt.xlabel("Epochs")
plt.ylabel("Average Loss")
plt.title("Train vs Validation Loss")
# The `label=` values above are only rendered once a legend is requested.
plt.legend()
   

torch.int64
Pred Shape: torch.Size([18, 4, 3432]) | One-Hot Captions: torch.Size([18, 4, 3432])
torch.float32 + torch.int64


RuntimeError: Expected floating point type for target with class probabilities, got Long

Actual Predictions / Testing

In [9]:
# Get device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load trained model
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code — only load checkpoints that you created or otherwise trust.
test_model = torch.load("./models/Train:100|Layers:2:40").to(device)

# Test image transforms (no random augmentation at inference time).
# Bound to its own name so the training `transform` defined earlier is not
# overwritten when cells are re-run out of order.
# NOTE(review): the size and normalisation statistics differ from the training
# transform — confirm they match what the loaded checkpoint was trained with.
test_transform = transforms.Compose(
    [
        transforms.Resize((299, 299)),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)
# Get test image and add a batch dimension so it can be fed to the model;
# the context manager closes the image file once it has been converted.
with Image.open("test_examples/dog.jpg") as raw_image:
    test_image1 = test_transform(raw_image.convert("RGB")).unsqueeze(0)


test_model.eval()
# Predictions
caption = test_model.caption_image(test_image1.to(device), dataset.vocab)

# displayImage permutes a (C, H, W) tensor, so drop the batch dimension that
# was added for the model.
displayImage(test_image1.squeeze(0), caption)

UnboundLocalError: cannot access local variable 'predicted' where it is not associated with a value

Display Images

In [41]:
# Smoke test: fetch two training samples, then show each with its caption
fetched_samples = [train_dataset[sample_idx] for sample_idx in (7, 50)]
for sample_image, sample_caption in fetched_samples:
    displayImage(sample_image, sample_caption)


TypeError: 'dict' object is not callable

: 