In [12]:
import pandas as pd
import re
from tqdm import tqdm
import ftfy
import html
import os, zipfile
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModel
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from sklearn.metrics import classification_report
from torchvision.models import resnet50
import torch.nn as nn
from torchvision import transforms,models
from PIL import Image
from torch.utils.data import WeightedRandomSampler
import numpy as np

In [5]:
CSV_PATH = "/kaggle/input/cleaned-fusion-with-text/cleaned_fusion_with_text.csv"
IMAGE_DIR = "/kaggle/input/data-image/data_image"
TEXT_MODEL_PATH = "/kaggle/input/text-agent-bertweet/text_agent_bertweet.pth"
VISION_MODEL_PATH = "/kaggle/input/vision-agent-resnet50/vision_agent_resnet50.pth"
FUSION_MODEL_PATH = "/kaggle/working/reasoning_agent_fusion.pth"
REPORT_PATH = "/kaggle/working/reasoning_agent_classification_report.txt"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 16
EPOCHS = 5
NUM_CLASSES = 6


In [13]:
vision_model = models.resnet50(weights=None)  # Don't load pretrained weights
vision_model.fc = nn.Linear(vision_model.fc.in_features, NUM_CLASSES)  # 6 classes as in your training
vision_model.load_state_dict(torch.load(VISION_MODEL_PATH, map_location=DEVICE, weights_only=False))
vision_model.to(DEVICE)
vision_model.eval()

# Create a feature extractor from the vision model (remove the final classification layer)
class VisionFeatureExtractor(nn.Module):
    def __init__(self, base_model):
        super().__init__()
        # Remove the final fc layer to get features
        self.features = nn.Sequential(*list(base_model.children())[:-1])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        
    def forward(self, x):
        x = self.features(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        return x

vision_agent = VisionFeatureExtractor(vision_model)
vision_agent.to(DEVICE)
vision_agent.eval()

# Load text model
text_agent = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=6)
checkpoint = torch.load(TEXT_MODEL_PATH, map_location=DEVICE, weights_only=False)
text_agent.load_state_dict(checkpoint["model_state_dict"])
text_agent.to(DEVICE)
text_agent.eval()

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

class FusionDataset(Dataset):
    def __init__(self, df, label_encoder=None):
        self.df = df.copy()
        
        # Create or use provided label encoder
        if label_encoder is None:
            from sklearn.preprocessing import LabelEncoder
            self.label_encoder = LabelEncoder()
            self.df['label'] = self.label_encoder.fit_transform(self.df['disaster_type'])
        else:
            self.label_encoder = label_encoder
            self.df['label'] = self.label_encoder.transform(self.df['disaster_type'])
        
        print(f"Dataset created with {len(self.df)} samples")
        print(f"Sample labels: {self.df['label'].head().tolist()}")
        print(f"Sample disaster_types: {self.df['disaster_type'].head().tolist()}")
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_path = os.path.join(IMAGE_DIR, row["image_name"])
        tweet = row["tweet_text_clean"]
        # Use the numeric label (guaranteed to exist now)
        label = row["label"]
        
        # Debug print to check label type (only for first few items)
        if idx < 3:
            print(f"Sample {idx} - Label: {label} (type: {type(label)}), Disaster type: {row['disaster_type']}")
        
        # Load and process image
        image = Image.open(img_path).convert('RGB')
        image = image_transform(image).unsqueeze(0).to(DEVICE)
        
        # Extract vision features (now 2048-dimensional from ResNet50 before final fc)
        with torch.no_grad():
            vision_feat = vision_agent(image).squeeze(0)
        
        # Process text
        encoded = tokenizer(tweet, return_tensors="pt", padding="max_length", 
                          truncation=True, max_length=128)
        input_ids = encoded['input_ids'].to(DEVICE)
        attention_mask = encoded['attention_mask'].to(DEVICE)
        
        # Extract text features (get hidden states from the model)
        with torch.no_grad():
            outputs = text_agent(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
            # Use the [CLS] token representation from the last hidden layer
            text_feat = outputs.hidden_states[-1][:, 0, :].squeeze(0)
        
        # Ensure label is numeric
        if isinstance(label, str):
            label = self.label_encoder.transform([label])[0]
        
        return vision_feat.cpu(), text_feat.cpu(), torch.tensor(int(label), dtype=torch.long)

class ReasoningAgent(nn.Module):
    def __init__(self, vision_dim=2048, text_dim=768, hidden_dim=512, num_classes=6):
        super().__init__()
        self.fc1 = nn.Linear(vision_dim + text_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(hidden_dim, num_classes)
    
    def forward(self, vision_feat, text_feat):
        x = torch.cat((vision_feat, text_feat), dim=1)
        x = self.dropout(self.relu(self.fc1(x)))
        return self.fc2(x)

def train_model():
    df = pd.read_csv(CSV_PATH)
    
    # Create and fit the label encoder
    from sklearn.preprocessing import LabelEncoder
    from torch.utils.data import WeightedRandomSampler
    import numpy as np
    
    label_encoder = LabelEncoder()
    
    print("Original disaster types:", df['disaster_type'].unique())
    
    # Fit the label encoder first
    label_encoder.fit(df['disaster_type'])
    
    print("Classes:", label_encoder.classes_)
    print("Label mapping:", dict(zip(label_encoder.classes_, range(len(label_encoder.classes_)))))
    
    # Create the dataset with the fitted label encoder
    dataset = FusionDataset(df, label_encoder)
    
    # Calculate class weights for balanced sampling
    labels = dataset.df['label'].values
    class_counts = np.bincount(labels)
    print("Class distribution:", dict(zip(range(len(class_counts)), class_counts)))
    
    # Calculate weights (inverse of class frequency)
    class_weights = 1.0 / class_counts
    sample_weights = class_weights[labels]
    
    print("Class weights:", class_weights)
    print("Using WeightedRandomSampler for balanced training")
    
    # Create sampler for balanced sampling
    sampler = WeightedRandomSampler(
        weights=sample_weights,
        num_samples=len(sample_weights),
        replacement=True
    )
    
    dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, sampler=sampler)
    
    model = ReasoningAgent().to(DEVICE)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    
    model.train()
    for epoch in range(EPOCHS):
        total_loss = 0
        for vision_feat, text_feat, labels in dataloader:
            vision_feat = vision_feat.to(DEVICE)
            text_feat = text_feat.to(DEVICE)
            labels = labels.to(DEVICE)
            
            outputs = model(vision_feat, text_feat)
            loss = criterion(outputs, labels)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}: Loss = {total_loss/len(dataloader):.4f}")
    
    return model, dataset

def evaluate_model(model, dataset):
    model.eval()
    y_true, y_pred = [], []
    
    with torch.no_grad():
        for vision_feat, text_feat, labels in DataLoader(dataset, batch_size=1):
            vision_feat = vision_feat.to(DEVICE)
            text_feat = text_feat.to(DEVICE)
            
            outputs = model(vision_feat, text_feat)
            preds = torch.argmax(outputs, dim=1).cpu().item()
            
            y_pred.append(preds)
            y_true.append(labels.item())
    
    report = classification_report(y_true, y_pred, digits=4)
    print(report)
    
    with open(REPORT_PATH, "w") as f:
        f.write(report)

# Main execution
model, dataset = train_model()
evaluate_model(model, dataset)
torch.save(model.state_dict(), FUSION_MODEL_PATH)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Original disaster types: ['hurricane_maria' 'california_wildfires' 'hurricane_harvey'
 'hurricane_irma' 'srilanka_floods' 'iraq_iran_earthquake']
Classes: ['california_wildfires' 'hurricane_harvey' 'hurricane_irma'
 'hurricane_maria' 'iraq_iran_earthquake' 'srilanka_floods']
Label mapping: {'california_wildfires': 0, 'hurricane_harvey': 1, 'hurricane_irma': 2, 'hurricane_maria': 3, 'iraq_iran_earthquake': 4, 'srilanka_floods': 5}
Dataset created with 8534 samples
Sample labels: [3, 0, 3, 1, 2]
Sample disaster_types: ['hurricane_maria', 'california_wildfires', 'hurricane_maria', 'hurricane_harvey', 'hurricane_irma']
Class distribution: {0: 942, 1: 2095, 2: 1944, 3: 2711, 4: 550, 5: 292}
Class weights: [0.00106157 0.00047733 0.0005144  0.00036887 0.00181818 0.00342466]
Using WeightedRandomSampler for balanced training
Sample 2 - Label: 3 (type: <class 'numpy.int64'>), Disaster type: hurricane_maria
Sample 1 - Label: 0 (type: <class 'numpy.int64'>), Disaster type: california_wildfires
Sam