In [2]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, CLIPProcessor, CLIPModel, BertModel
from PIL import Image, ImageFile
import pandas as pd
from sklearn.model_selection import train_test_split

# Allow PIL to load truncated images instead of raising an error on them
ImageFile.LOAD_TRUNCATED_IMAGES = True

# ---------------------------
# 1. Data Preparation
# ---------------------------

csv_path = 'labels.csv'  # Update with your actual CSV file path
df = pd.read_csv(csv_path, encoding='utf-8')
# Strip stray whitespace from the header names so the column lookups below match.
df.columns = df.columns.str.strip()

# Ensure that text fields are strings (fill NaN with empty strings)
df['text_ocr'] = df['text_ocr'].fillna("")

# Define sentiment mapping (5 classes: 0 through 4)
sentiment_mapping = {
    'very_positive': 4,
    'positive': 3,
    'neutral': 2,
    'negative': 1,
    'very_negative': 0,
}
_raw_sentiment = df['overall_sentiment']
df['overall_sentiment'] = _raw_sentiment.map(sentiment_mapping)

# Series.map() turns every value that is not a key of the mapping (including
# missing cells) into NaN. Such a row has no usable class index, so stop here
# and list the offending values rather than failing somewhere inside training.
_unmapped = df['overall_sentiment'].isna()
if _unmapped.any():
    _unknown = ", ".join(sorted({repr(value) for value in _raw_sentiment[_unmapped]}))
    raise ValueError(
        f"{csv_path}: overall_sentiment contains values outside sentiment_mapping: {_unknown}"
    )
# All labels are valid at this point; keep them as integer class indices.
df['overall_sentiment'] = df['overall_sentiment'].astype('int64')

print("First few sentiment labels:")
print(df[['overall_sentiment']].head())

# Split data into train and validation sets (80% / 20%, fixed seed for repeatability)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# ---------------------------
# 2. Setup Tokenizers and Processors
# ---------------------------

# Text side: tokenizer that matches the uncased BERT base checkpoint.
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Image side: the CLIP preprocessing pipeline and the CLIP model itself.
# The model is put into evaluation mode as soon as it is loaded, since it is
# used here only to produce image features.
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()

# ---------------------------
# 3. Define the Dataset
# ---------------------------

class MultimodalDataset(Dataset):
    """Dataset yielding tokenized OCR text, CLIP image features and a label.

    Each item is a ``(text_inputs, image_features, label)`` tuple:

    * ``text_inputs`` -- dict of tokenizer outputs, padded/truncated to
      ``max_text_length`` tokens, with the leading batch dimension removed.
    * ``image_features`` -- result of ``clip_model.get_image_features`` for the
      row's image, with the leading batch dimension removed.
    * ``label`` -- 0-dim ``torch.long`` tensor taken from ``overall_sentiment``.

    NOTE: the CLIP forward pass runs inside ``__getitem__``, i.e. once per
    access of every item; nothing is cached between epochs.
    """

    def __init__(self, df, text_tokenizer, clip_processor, clip_model, image_folder, max_text_length=128):
        """Store the data frame and the components used to encode each row.

        Args:
            df: DataFrame with ``text_ocr``, ``image_name`` and
                ``overall_sentiment`` columns. Its index is reset so rows can
                be addressed by position.
            text_tokenizer: Callable tokenizer applied to the OCR text.
            clip_processor: Callable that turns a PIL image into model inputs.
            clip_model: Model exposing ``get_image_features(**inputs)``.
            image_folder: Directory that the ``image_name`` values are relative to.
            max_text_length: Token length every text is padded/truncated to.
        """
        self.df = df.reset_index(drop=True)
        self.text_tokenizer = text_tokenizer
        self.clip_processor = clip_processor
        self.clip_model = clip_model
        self.image_folder = image_folder
        self.max_text_length = max_text_length

    def __len__(self):
        """Return the number of rows in the underlying data frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode the row at position ``idx``.

        Returns:
            Tuple ``(text_inputs, image_features, label)`` as described on the class.

        Raises:
            ValueError: If the row's ``overall_sentiment`` cannot be converted
                to an integer (for example NaN from an unmapped label).
        """
        row = self.df.iloc[idx]

        # --- Process Label ---
        # Validated first so a row without a usable label fails immediately,
        # before any image decoding or CLIP inference is spent on it. A NaN
        # label would otherwise be cast to an undefined integer value.
        raw_label = row['overall_sentiment']
        try:
            label = torch.tensor(int(raw_label), dtype=torch.long)
        except (TypeError, ValueError) as err:
            raise ValueError(
                f"Row {idx} has an invalid overall_sentiment label: {raw_label!r}"
            ) from err

        # --- Process Text ---
        text = row['text_ocr']
        if not isinstance(text, str):
            # Missing or non-text OCR values are treated as empty text.
            text = ""
        encoded = self.text_tokenizer(
            text,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_text_length,
        )
        # Remove extra batch dimension for proper collation
        text_inputs = {k: v.squeeze(0) for k, v in encoded.items()}

        # --- Process Image ---
        image_path = os.path.join(self.image_folder, row['image_name'])
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the converted RGB image.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        image_inputs = self.clip_processor(images=image, return_tensors='pt')
        # CLIP only supplies features here, so no gradients are tracked.
        with torch.no_grad():
            image_features = self.clip_model.get_image_features(**image_inputs)
        image_features = image_features.squeeze(0)  # Remove batch dimension

        return text_inputs, image_features, label

# Set the folder where your images are stored
image_folder = 'images'  # Update with your actual images folder path

# Build one dataset per split; both share the same tokenizer, CLIP
# processor/model and image folder.
train_dataset, val_dataset = (
    MultimodalDataset(split_df, bert_tokenizer, clip_processor, clip_model, image_folder)
    for split_df in (train_df, val_df)
)

# Batches of 8 for both splits; only the training loader shuffles its samples.
train_loader = DataLoader(dataset=train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=8, shuffle=False)

# ---------------------------
# 4. Define the Multimodal Model
# ---------------------------

class MultimodalModel(nn.Module):
    """Classifier over concatenated text and image representations.

    BERT's pooled output and the image feature vector are each projected to
    ``hidden_dim`` dimensions, concatenated, and passed through a small
    feed-forward head that produces one logit per class.
    """

    def __init__(self, bert_model, image_dim, num_classes, hidden_dim=128, dropout=0.3):
        """Build the projection layers and the classification head.

        Args:
            bert_model: Text encoder; must expose ``config.hidden_size`` and
                return an object with ``pooler_output`` when called.
            image_dim: Size of the incoming image feature vectors
                (512 for clip-vit-base-patch32).
            num_classes: Number of output logits (5 for labels 0-4).
            hidden_dim: Width of each modality's projection and of the hidden
                layer in the head. Defaults to the previously fixed value 128.
            dropout: Dropout probability inside the head. Defaults to the
                previously fixed value 0.3.
        """
        super().__init__()
        self.bert_model = bert_model  # Pretrained BERT model
        # Project BERT's pooled output (typically 768) to hidden_dim dimensions
        self.text_fc = nn.Linear(bert_model.config.hidden_size, hidden_dim)
        # Project the image features to the same width as the text projection
        self.image_fc = nn.Linear(image_dim, hidden_dim)
        # Combine both projections and predict sentiment
        self.fc = nn.Sequential(
            nn.Linear(2 * hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, text_inputs, image_features):
        """Return class logits for a batch.

        Args:
            text_inputs: Dict of keyword arguments forwarded to ``bert_model``.
            image_features: Tensor whose last dimension is ``image_dim``.

        Returns:
            Tensor of logits with one column per class.
        """
        bert_outputs = self.bert_model(**text_inputs)
        pooled_text = bert_outputs.pooler_output  # [batch_size, hidden_size]
        text_out = self.text_fc(pooled_text)
        image_out = self.image_fc(image_features)
        # Concatenate along the feature axis so each sample keeps its own row.
        combined_features = torch.cat((text_out, image_out), dim=1)
        logits = self.fc(combined_features)
        return logits

# Load pretrained BERT model.
# NOTE: .eval() only switches the module to inference behaviour (e.g. dropout
# off) until the next .train() call; it does not freeze any weights. The
# training loop calls model.train() and the optimizer is given
# model.parameters(), so BERT is still fine-tuned with this setup.
bert_model = BertModel.from_pretrained("bert-base-uncased").eval()

# IMPORTANT: num_classes must match the sentiment mapping (5 classes, labels 0-4)!
num_classes = 5

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the classifier (image_dim=512 matches clip-vit-base-patch32 features)
# and move it to the chosen device.
model = MultimodalModel(
    bert_model=bert_model,
    image_dim=512,
    num_classes=num_classes,
).to(device)

# ---------------------------
# 5. Training Setup
# ---------------------------

# Cross-entropy over the class logits. Adam is handed every parameter of the
# combined model, which includes the wrapped BERT encoder.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-4)

# ---------------------------
# 6. Training Loop
# ---------------------------

epochs = 5
for epoch in range(epochs):
    # Switch the whole model (all submodules) into training mode for this epoch.
    model.train()
    total_loss = 0.0

    for text_inputs, image_features, labels in train_loader:
        # Move inputs to the device
        labels = labels.to(device)
        image_features = image_features.to(device)
        text_inputs = {name: tensor.to(device) for name, tensor in text_inputs.items()}

        # Standard optimisation step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(text_inputs, image_features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses seen in this epoch.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss:.4f}")

First few sentiment labels:
   overall_sentiment
0                  4
1                  4
2                  3
3                  3
4                  2




Epoch 1/5, Loss: 1.2933
Epoch 2/5, Loss: 1.2812
Epoch 3/5, Loss: 1.2701
Epoch 4/5, Loss: 1.2647
Epoch 5/5, Loss: 1.2586
