In [1]:
pip install torch torchvision transformers pandas Pillow


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
import pandas as pd
from PIL import Image
import torch
from torchvision import transforms
from transformers import BertTokenizer, BertModel


In [4]:
# Load the data
data = pd.read_csv('data.csv')

# Define image transformation
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def load_image(image_id, image_folder='images'):
    image_path = os.path.join(image_folder, f"{image_id}.png")  # Adjust extension if needed
    if not os.path.exists(image_path):
        print(f"FileNotFoundError: Image {image_path} not found.")
        raise FileNotFoundError(f"Image {image_path} not found.")
    image = Image.open(image_path).convert("RGB")
    return transform(image)

# Initialize lists to store images, questions, and labels
images = []
questions = []
labels = []

# Create label mapping
unique_labels = data['answer'].unique()
label_map = {label: idx for idx, label in enumerate(unique_labels)}
print("Label mapping:", label_map)

# Load images, questions, and labels
for idx, row in data.iterrows():
    try:
        # Load and transform the image
        image_tensor = load_image(row['image_id'], image_folder='images')
        images.append(image_tensor)
        questions.append(row['englishquestion'])
        labels.append(label_map[row['answer']])
    except FileNotFoundError as e:
        print(e)
        continue

# Tokenize questions
question_tokens = tokenizer(questions, padding=True, truncation=True, return_tensors="pt")
print("Questions tokenized shape:", question_tokens['input_ids'].shape)

# Convert labels to a tensor
label_tensor = torch.tensor(labels)
print("Labels tensor shape:", label_tensor.shape)

# Stack images into a single tensor
image_stack = torch.stack(images)
print("Images stacked shape:", image_stack.shape)

print("Data preparation successful.")


Label mapping: {'shelves': 0, 'paper': 1, 'refridgerator': 2, 'clothes': 3, 'machine': 4, 'curtain': 5, 'door': 6, 'three': 7, 'bottle_of_liquid': 8, 'sofa': 9, 'table': 10, 'books': 11, 'cabinet': 12, 'orange': 13, 'pillow, paper, book': 14, 'blue': 15, 'two': 16, 'bed': 17, 'picture, decorative_item': 18, 'sink': 19, 'picture': 20, 'window': 21, 'floor_mat': 22, 'nine': 23, 'light': 24, 'five': 25, 'spoon_stand': 26, 'toys_basket': 27, 'one': 28, 'fire_extinguisher': 29, 'ladder': 30, 'tin_foil': 31, 'four': 32, 'whiteboard': 33, 'towel': 34, 'chair': 35, 'toy, telephone': 36, 'garbage_bin': 37, 'blinds': 38, 'hanger': 39, 'bag': 40, 'toilet': 41, 'toothpaste': 42, 'black': 43, 'orange, pink': 44, 'toy': 45, 'ten': 46, 'bottle': 47, 'television': 48, 'plant': 49, 'green': 50, 'white': 51, 'pillow, blanket, bed_sheets': 52, 'ornamental_plant': 53, 'pillow': 54, 'paper_holder': 55, 'lamp, photo, decorative_item': 56, 'container': 57, 'stove': 58, 'piano': 59, 'mirror': 60, 'notebook, d

In [5]:
import torch.nn as nn

class BertCNNModel(nn.Module):
    def __init__(self):
        super(BertCNNModel, self).__init__()
        # BERT model for text processing
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        # Simple CNN for image processing
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        )
        self.fc1 = nn.Linear(768 + 128 * 56 * 56, 256)
        self.fc2 = nn.Linear(256, len(set(labels)))  # Adjust output size based on the number of unique labels

    def forward(self, input_ids, attention_mask, images):
        # Process text with BERT
        text_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_embedding = text_output.last_hidden_state[:, 0, :]  # CLS token

        # Process image with CNN
        image_embedding = self.cnn(images)
        image_embedding = image_embedding.view(image_embedding.size(0), -1)  # Flatten

        # Concatenate text and image embeddings
        combined = torch.cat((text_embedding, image_embedding), dim=1)

        # Fully connected layers
        x = self.fc1(combined)
        x = nn.ReLU()(x)
        x = self.fc2(x)
        return x


In [8]:
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
import torch

# Prepare data for DataLoader
dataset = TensorDataset(question_tokens['input_ids'], question_tokens['attention_mask'], image_stack, label_tensor)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Initialize model, loss function, and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertCNNModel().to(device)
loss_fn = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-4)

# Training loop
for epoch in range(10):  # Adjust number of epochs as needed
    for batch in dataloader:
        input_ids, attention_mask, images, labels = [x.to(device) for x in batch]
        
        # Forward pass
        outputs = model(input_ids, attention_mask, images)
        
        # Compute loss
        loss = loss_fn(outputs, labels)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 4.80350923538208
Epoch 2, Loss: 4.963907241821289
Epoch 3, Loss: 4.974913120269775
Epoch 4, Loss: 3.0998356342315674
Epoch 5, Loss: 1.7286838293075562
Epoch 6, Loss: 1.9252345561981201
Epoch 7, Loss: 1.023680567741394
Epoch 8, Loss: 1.1746867895126343
Epoch 9, Loss: 0.19674935936927795
Epoch 10, Loss: 0.0795847475528717
