# In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets

# Custom Dataset
class ThumbsUpDataset(Dataset):
    """Dataset pairing image paths with class + bounding-box labels.

    Each label is a 5-element sequence
    ``(class_label, x_min, y_min, x_max, y_max)``.

    Args:
        image_paths: Sequence of image locations; each entry is passed to
            ``load_image`` when the sample is requested.
        labels: Sequence of labels, index-aligned with ``image_paths``.
        transform: Optional callable applied to every loaded image.
    """

    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels  # [(class_label, x_min, y_min, x_max, y_max)]
        self.transform = transform

    def __len__(self):
        """Return the number of samples (one per image path)."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the sample at ``idx``.

        Returns:
            A tuple of the (optionally transformed) image and the label as a
            ``float32`` tensor.
        """
        # NOTE(review): load_image is not defined in this file; it must be
        # provided elsewhere before samples can be fetched.
        image = load_image(self.image_paths[idx])

        if self.transform is not None:
            image = self.transform(image)

        # Force float32: torch.tensor() would infer int64 for all-integer
        # labels, which the MSE/BCE losses used in training reject.
        # as_tensor also accepts labels that are already tensors.
        label = torch.as_tensor(self.labels[idx], dtype=torch.float32)
        return image, label

# Define MLP model for classification + localization
class ThumbsUpMLP(nn.Module):
    """MLP with a shared trunk and two heads: bounding box and class probability.

    Args:
        input_dim: Number of features per sample after flattening. Defaults
            to ``128 * 128 * 3``, i.e. a 128x128 RGB image.
    """

    def __init__(self, input_dim=128 * 128 * 3):
        super().__init__()
        # Shared trunk
        self.fc1 = nn.Linear(input_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        # Output heads
        self.fc4 = nn.Linear(128, 4)  # Bounding box: x_min, y_min, x_max, y_max
        self.fc5 = nn.Linear(128, 1)  # Classification output (0 or 1)

    def forward(self, x):
        """Run the network on a batch.

        Args:
            x: Batch tensor whose first dimension is the batch size; all
                remaining dimensions are flattened into ``input_dim`` features.

        Returns:
            ``(bbox, class_pred)`` where ``bbox`` has shape ``(batch, 4)`` and
            ``class_pred`` has shape ``(batch, 1)`` and holds sigmoid
            probabilities (not logits).
        """
        # reshape instead of view: view raises on non-contiguous input
        # (e.g. a permuted HWC -> CHW batch), reshape copies when needed.
        x = x.reshape(x.size(0), -1)
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        bbox = self.fc4(x)  # Bounding box prediction
        class_pred = torch.sigmoid(self.fc5(x))  # Probability of "thumbs up"
        return bbox, class_pred

# Data Augmentation Pipeline
# Only photometric augmentation is used. The bounding-box labels are not
# passed through this pipeline, so random rotations, flips and crops would
# move the hand in the image without updating its box (and a vertical flip
# turns a thumbs-up into a thumbs-down).
# NOTE(review): Resize keeps the boxes valid only if they are normalised or
# the source images are already 128x128 — confirm against the label format.
transform = transforms.Compose([
    transforms.Resize((128, 128)),  # fixed size expected by ThumbsUpMLP
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
    transforms.ToTensor(),
])

# Create DataLoader
# NOTE(review): train_image_paths and train_labels are not defined in this
# file; they must exist before this runs.
train_dataset = ThumbsUpDataset(image_paths=train_image_paths, labels=train_labels, transform=transform)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Model, Loss and Optimizer
model = ThumbsUpMLP()
# ThumbsUpMLP.forward already applies a sigmoid to the class output, so the
# loss must take probabilities. BCEWithLogitsLoss would apply a second
# sigmoid and squash the predictions.
criterion_class = nn.BCELoss()  # For binary classification (probability inputs)
criterion_bbox = nn.MSELoss()  # For bounding box regression
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training Loop
# NOTE(review): num_epochs is not defined in this file; it must be set
# before this loop runs.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    num_batches = 0
    for images, labels in train_loader:
        optimizer.zero_grad()

        bbox, class_pred = model(images)

        # Both losses need floating-point targets; integer labels would
        # otherwise raise a dtype error.
        labels = labels.float()

        # Split labels into class and bounding box
        class_labels = labels[:, 0]
        bbox_labels = labels[:, 1:]

        # Compute losses. squeeze(1) drops only the trailing singleton
        # dimension; a bare squeeze() would also collapse a batch of size 1
        # to a 0-d tensor that no longer matches class_labels.
        loss_class = criterion_class(class_pred.squeeze(1), class_labels)
        loss_bbox = criterion_bbox(bbox, bbox_labels)

        # Total loss
        loss = loss_class + loss_bbox

        # Backpropagation
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch instead of the last batch only;
    # this also avoids a NameError when the loader yields no batches.
    epoch_loss = running_loss / num_batches if num_batches else float("nan")
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")

# After training, you can use the model to predict thumbs-up locations


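# Cell output: ModuleNotFoundError: No module named 'torchvision'
# torchvision is not installed in this environment; install it (e.g. `pip install torchvision`) before running this cell.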