In [None]:
pip install torch torchvision transformers pandas gradio
import pandas as pd
import torch
import torch.nn as nn
from PIL import Image
from torchvision import transforms
from torchvision.models import resnet50
from transformers import BertTokenizer, BertModel, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import gradio as gr
import os


In [None]:
# Attach Google Drive to the Colab VM so files stored there become reachable
# through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Work from the project folder on the mounted Drive so the relative paths used
# later in this notebook ('data.csv', 'images') resolve against it.
%cd /content/drive/MyDrive/nlp_colab

In [None]:
from torch.utils.data import DataLoader, TensorDataset
# Question/answer table for the whole dataset, read from the working directory.
data = pd.read_csv('data.csv')

# Image preprocessing: rescale every picture to 224x224, then convert it to a
# tensor. Built step by step so each stage is easy to inspect or swap.
_resize_step = transforms.Resize((224, 224))
_to_tensor_step = transforms.ToTensor()
transform = transforms.Compose([_resize_step, _to_tensor_step])

# Tokenizer belonging to the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def load_image(image_id, image_folder='images'):
    """Load one dataset image and return it as a preprocessed tensor.

    Args:
        image_id: Identifier of the image; the file is expected at
            ``<image_folder>/<image_id>.png``.
        image_folder: Directory that holds the image files.

    Returns:
        The result of applying the module-level ``transform`` to the image
        after converting it to RGB.

    Raises:
        FileNotFoundError: If the expected image file does not exist.
    """
    image_path = os.path.join(image_folder, f"{image_id}.png")  # Adjust extension if needed
    if not os.path.exists(image_path):
        # The message travels with the exception, so it is not printed here as
        # well; callers that log the exception would otherwise show it twice.
        raise FileNotFoundError(f"Image {image_path} not found.")
    # Open inside a context manager so the file handle is released promptly,
    # even if decoding fails. convert() returns an independent in-memory copy,
    # so the result stays valid after the file is closed.
    with Image.open(image_path) as image_file:
        image = image_file.convert("RGB")
    return transform(image)

# Assign each distinct answer a class index, numbered in the order in which
# unique() returns the answers.
unique_labels = data['answer'].unique()
label_map = dict(zip(unique_labels, range(len(unique_labels))))
print("Label mapping:", label_map)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical from run to run.
train_data, val_data = train_test_split(data, random_state=42, test_size=0.2)

print("Training data size:", len(train_data))
print("Validation data size:", len(val_data))

# Function to prepare dataset
def prepare_dataset(data, label_map, tokenizer, image_folder):
    """Build a TensorDataset of (input_ids, attention_mask, image, label) samples.

    Rows whose image file is missing are reported on stdout and skipped.

    Args:
        data: DataFrame with 'image_id', 'englishquestion' and 'answer' columns.
        label_map: Mapping from answer value to integer class index.
        tokenizer: Callable invoked as ``tokenizer(questions, padding=True,
            truncation=True, return_tensors="pt")``; its result must provide
            'input_ids' and 'attention_mask' tensors.
        image_folder: Directory passed through to ``load_image``.

    Returns:
        TensorDataset whose samples are (input_ids, attention_mask, image, label).

    Raises:
        ValueError: If no row could be loaded (empty input, or every image missing).
        KeyError: If an answer is not present in ``label_map``.
    """
    images = []
    questions = []
    labels = []

    # Iterate column-wise instead of with iterrows(): iterrows() builds a Series
    # per row, which is slow and does not preserve the columns' dtypes.
    for image_id, question, answer in zip(
        data['image_id'], data['englishquestion'], data['answer']
    ):
        try:
            # Load the image first: if the file is missing nothing has been
            # appended yet, so the three lists stay aligned.
            image_tensor = load_image(image_id, image_folder)
        except FileNotFoundError as e:
            print(e)
            continue
        images.append(image_tensor)
        questions.append(question)
        labels.append(label_map[answer])

    if not images:
        # Fail with an actionable message instead of an opaque error from the
        # tokenizer or torch.stack on an empty list.
        raise ValueError(
            f"No samples could be loaded from '{image_folder}': the input is empty "
            "or every referenced image is missing."
        )

    # Tokenize questions
    question_tokens = tokenizer(questions, padding=True, truncation=True, return_tensors="pt")
    print(f"Questions tokenized shape: {question_tokens['input_ids'].shape}")

    # Convert labels to a tensor of int64 class indices
    label_tensor = torch.tensor(labels, dtype=torch.long)
    print(f"Labels tensor shape: {label_tensor.shape}")

    # Stack images into a single tensor
    image_stack = torch.stack(images)
    print(f"Images stacked shape: {image_stack.shape}")

    return TensorDataset(
        question_tokens['input_ids'],
        question_tokens['attention_mask'],
        image_stack,
        label_tensor,
    )

# Materialise both splits as tensor datasets, training split first; images are
# read from the 'images' folder.
train_dataset, val_dataset = (
    prepare_dataset(split_frame, label_map, tokenizer, image_folder='images')
    for split_frame in (train_data, val_data)
)

# Batches of 8 samples; only the training loader shuffles.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=8)

print("Data preparation successful.")

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class BertCNNModel(nn.Module):
    """Question/image classifier that fuses a BERT encoder with a small CNN.

    The question is encoded with pretrained 'bert-base-uncased' (its pooled
    output is treated as a 768-dim vector), the image with a three-layer CNN
    reduced to a 128-dim vector by global average pooling. The two vectors are
    concatenated and mapped to class logits by a single linear layer.

    Args:
        num_labels: Number of output classes. When omitted, falls back to
            ``len(label_map)`` looked up in the module's global namespace,
            which is what the zero-argument constructor has always used.
    """

    def __init__(self, num_labels=None):
        super().__init__()
        if num_labels is None:
            # Backward-compatible default: size the head from the global label map.
            num_labels = len(label_map)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            # Collapse the spatial dimensions so any input resolution yields 128 features.
            nn.AdaptiveAvgPool2d((1, 1))
        )
        # 768 text features + 128 image features -> one logit per label.
        self.fc = nn.Linear(768 + 128, num_labels)

    def forward(self, input_ids, attention_mask, images):
        """Return class logits of shape (batch, num_labels).

        Args:
            input_ids: Token ids forwarded to BERT.
            attention_mask: Attention mask forwarded to BERT.
            images: Image batch with 3 channels, laid out as (batch, 3, H, W).
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        # (batch, 128, 1, 1) -> (batch, 128)
        cnn_output = self.cnn(images).flatten(start_dim=1)
        combined_output = torch.cat((bert_output, cnn_output), dim=1)
        return self.fc(combined_output)


In [None]:
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Model placed on the chosen device, cross-entropy objective, and an Adam
# optimizer (lr=1e-4) over all model parameters.
model = BertCNNModel()
model = model.to(device)
loss_fn = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=1e-4)

def calculate_accuracy(predictions, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    ``predictions`` holds per-class scores along dim 1; ``labels`` holds the
    true class index for each row.
    """
    predicted_classes = torch.max(predictions, dim=1).indices
    num_correct = (predicted_classes == labels).sum().item()
    return num_correct / len(labels)

def _run_epoch(loader, training):
    """Run one pass over ``loader`` and return (mean loss, accuracy).

    When ``training`` is true the model is put in train mode and the optimizer
    is stepped after every batch; otherwise the model is put in eval mode and
    gradient tracking is disabled. Both metrics are weighted by batch size, so
    a smaller final batch does not skew them. Returns NaN for both metrics if
    the loader yields no samples.
    """
    model.train(training)  # train(False) is equivalent to eval()
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.set_grad_enabled(training):
        for batch in loader:
            input_ids, attention_mask, images, labels = [x.to(device) for x in batch]

            # Forward pass
            outputs = model(input_ids, attention_mask, images)

            # Compute loss
            loss = loss_fn(outputs, labels)

            if training:
                # Backward pass and optimization
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            batch_size = len(labels)
            total_loss += loss.item() * batch_size
            total_correct += calculate_accuracy(outputs, labels) * batch_size
            total_samples += batch_size

    if total_samples == 0:
        # An empty loader has no meaningful average; avoid ZeroDivisionError.
        return float("nan"), float("nan")
    return total_loss / total_samples, total_correct / total_samples


# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    train_loss, train_accuracy = _run_epoch(train_dataloader, training=True)
    print(f"Epoch {epoch+1}, Loss: {train_loss}, Training Accuracy: {train_accuracy}")

    # Evaluation on validation set
    val_loss, val_accuracy = _run_epoch(val_dataloader, training=False)
    print(f"Epoch {epoch+1}, Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}")


In [None]:
import gradio as gr

def predict(question, image):
    """Predict the answer label for a question about an image.

    Args:
        question: Question text.
        image: PIL image, as delivered by ``gr.Image(type="pil")``. May be
            ``None`` when no image was supplied.

    Returns:
        The key of ``label_map`` whose class index has the highest model score.

    Raises:
        ValueError: If no image was provided.
    """
    # Validate before touching the model so a missing image produces a clear
    # message instead of an obscure failure inside the transform.
    if image is None:
        raise ValueError("Please provide an image along with the question.")

    model.eval()
    with torch.no_grad():
        # Preprocess input
        question_tokens = tokenizer(question, padding=True, truncation=True, return_tensors="pt").to(device)
        # Force 3-channel RGB, matching how load_image prepares training images;
        # the CNN's first layer only accepts 3 input channels.
        image_tensor = transform(image.convert("RGB")).unsqueeze(0).to(device)

        # Make prediction
        outputs = model(question_tokens['input_ids'], question_tokens['attention_mask'], image_tensor)
        predicted_index = outputs.argmax(dim=1).item()

    # label_map assigns consecutive indices in insertion order, so the position
    # of a key in the dict equals its class index.
    return list(label_map.keys())[predicted_index]

# Gradio interface: a two-line text box for the question and an image input
# that hands predict() a PIL image; the predicted label is shown as text.
question_input = gr.Textbox(lines=2, placeholder="Enter the question here...")
image_input = gr.Image(type="pil")

iface = gr.Interface(fn=predict, inputs=[question_input, image_input], outputs="text")

# share=True additionally requests a shareable link for the running app.
iface.launch(share=True)
