## PART A

In [None]:
!pip install evaluate rouge_score
!pip install --upgrade nltk

In [None]:
import os
import json
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm
from PIL import Image
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from transformers import (
    AutoProcessor,
    AutoModelForVision2Seq,
    ViTModel,
    GPT2LMHeadModel,
    GPT2Tokenizer,
    ViTFeatureExtractor
)
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
from rouge_score import rouge_scorer
from nltk.translate.meteor_score import meteor_score
import nltk
import evaluate

In [None]:
!mkdir -p /kaggle/working/nltk_data/corpora

In [None]:
# Keep every NLTK resource in a writable Kaggle directory. The directory is
# registered on NLTK's search path first (and only once, so re-running the cell
# does not grow the path list).
nltk_data_dir = "/kaggle/working/nltk_data"
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# 'punkt_tab' is the tokenizer resource that nltk.word_tokenize loads on recent
# NLTK releases (this notebook upgrades NLTK above); 'punkt' is kept so older
# releases keep working as well.
for nltk_resource in ('wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, download_dir=nltk_data_dir)

In [None]:
!unzip -o /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora/
!unzip -o /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora/

In [None]:
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

In [None]:
def zero_shot_captioning(image_path, model_name="HuggingFaceTB/SmolVLM-256M-Instruct"):
    """
    Generate captions using the pre-trained SmolVLM model without training.

    The model and processor are loaded once and cached as attributes on this
    function (`.model`, `.processor`, `.model_name`); they are reloaded only
    when a different `model_name` is requested.

    Args:
        image_path (str): Path to the input image (a local file).
        model_name (str): Name of the pre-trained model (default: SmolVLM).

    Returns:
        str: The generated caption, or None if the image could not be loaded.
    """
    cache = zero_shot_captioning

    needs_loading = (
        getattr(cache, "model_name", None) != model_name
        or not hasattr(cache, "model")
        or not hasattr(cache, "processor")
    )
    if needs_loading:
        print("Loading model and processor...")
        cache.processor = AutoProcessor.from_pretrained(model_name)
        # device_map already places the weights on DEVICE, so no extra .to() call.
        cache.model = AutoModelForVision2Seq.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
            device_map=DEVICE,
            _attn_implementation="eager"
        )
        cache.model_name = model_name
        print("Model and processor loaded successfully!")

    try:
        # PIL is already imported by this notebook; convert to RGB so grayscale
        # or RGBA files are handled uniformly.
        with Image.open(image_path) as image_file:
            image = image_file.convert("RGB")
    except Exception as e:
        print(f"Loading image from {image_path} failed: {e}")
        return None

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image in detail."}
            ]
        }
    ]

    prompt = cache.processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = cache.processor(text=prompt, images=image, return_tensors="pt").to(DEVICE)

    with torch.no_grad():
        generated_ids = cache.model.generate(**inputs, max_new_tokens=100)
        # Drop the prompt tokens so only the newly generated text is decoded.
        generated_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
        generated_text = cache.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0].strip()

    return generated_text

In [None]:
def generate_and_save_captions(test_dir, output_csv_path):
    """
    Generate captions for all images in a directory and save them to a CSV file.

    Images for which no caption is produced (load failure or empty output) are
    left out of the CSV and reported in a summary line.

    Args:
        test_dir (str): Path to the directory containing test images
        output_csv_path (str): Path to save the output CSV file

    Returns:
        pd.DataFrame: One row per captioned image with the columns
        'filename' and 'generated_caption'.
    """
    image_files = sorted(f for f in os.listdir(test_dir) if f.lower().endswith('.jpg'))

    results = []
    skipped = []

    for image_file in tqdm(image_files, desc="Generating captions"):
        caption = zero_shot_captioning(os.path.join(test_dir, image_file))

        if caption:
            results.append({
                'filename': image_file,
                'generated_caption': caption
            })
        else:
            skipped.append(image_file)

    # Explicit columns keep the CSV header intact even when nothing was
    # captioned, so a later merge on 'filename' does not fail on a missing column.
    df = pd.DataFrame(results, columns=['filename', 'generated_caption'])
    df.to_csv(output_csv_path, index=True)
    print(f"Captions saved to {output_csv_path}")
    if skipped:
        print(f"No caption generated for {len(skipped)} of {len(image_files)} images")

    return df

In [None]:
def evaluate_model(test_csv_path, generated_csv_path):
    """
    Evaluate model performance using BLEU, ROUGE-L, METEOR.

    The two CSVs are inner-joined on 'filename'; rows whose reference or
    generated caption is missing are dropped before scoring.

    Args:
        test_csv_path (str): Path to the CSV with ground truth captions
            (columns 'filename' and 'caption')
        generated_csv_path (str): Path to the CSV with generated captions
            (columns 'filename' and 'generated_caption')

    Returns:
        dict: BLEU, ROUGE-L, METEOR scores for the test set, or None when no
        rows can be scored.
    """
    test_df = pd.read_csv(test_csv_path)
    generated_df = pd.read_csv(generated_csv_path)

    merged_df = pd.merge(test_df, generated_df, on='filename', how='inner')
    # A missing caption is read back as NaN (a float) and would crash .lower().
    merged_df = merged_df.dropna(subset=['caption', 'generated_caption'])

    if len(merged_df) == 0:
        print("No matching filenames found between test and generated captions.")
        return None

    reference_texts = merged_df['caption'].astype(str).tolist()
    generated_texts = merged_df['generated_caption'].astype(str).tolist()

    # Tokenize once; BLEU and METEOR share the same lower-cased tokens.
    references = [[nltk.word_tokenize(text.lower())] for text in reference_texts]
    hypotheses = [nltk.word_tokenize(text.lower()) for text in generated_texts]

    # BLEU score (corpus level, one reference per hypothesis)
    smooth = SmoothingFunction().method1
    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth)

    # ROUGE-L score (mean F-measure over the raw caption strings)
    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_score = np.mean([
        rouge.score(reference, generated)['rougeL'].fmeasure
        for reference, generated in zip(reference_texts, generated_texts)
    ])

    # METEOR score (mean over sentences)
    meteor_score_avg = np.mean([
        meteor_score(reference_list, hypothesis)
        for reference_list, hypothesis in zip(references, hypotheses)
    ])

    results = {
        'BLEU': bleu_score,
        'ROUGE-L': rouge_l_score,
        'METEOR': meteor_score_avg
    }

    return results

In [None]:
# Locations used by the zero-shot (SmolVLM) cells below.
test_dir = "/kaggle/input/dataset/Dataset/test"  # directory scanned for .jpg test images
generated_csv_path = "smolvlm_captions.csv"  # where the generated captions are written
test_csv_path = "/kaggle/input/dataset/Dataset/test.csv"  # ground-truth CSV read by evaluate_model

In [None]:
# Caption every .jpg in test_dir with SmolVLM and write the results to generated_csv_path.
print("Generating captions for test images...")
generate_and_save_captions(test_dir, generated_csv_path)

In [None]:
# Score the SmolVLM captions against the ground truth and persist the metrics.
print("Evaluating model performance...")
evaluation_results = evaluate_model(test_csv_path, generated_csv_path)
if evaluation_results:
    print("\nEvaluation Results:")
    print(f"BLEU Score: {evaluation_results['BLEU']:.4f}")
    print(f"ROUGE-L Score: {evaluation_results['ROUGE-L']:.4f}")
    print(f"METEOR Score: {evaluation_results['METEOR']:.4f}")
    df_results = pd.DataFrame([evaluation_results])
    df_results.to_csv("smolvlm_results.csv", index=False)
    print("Evaluation results saved as smolvlm_results.csv")
else:
    # evaluate_model returns None when nothing could be scored; do not write a
    # results file (or claim to have written one) in that case.
    df_results = pd.DataFrame(columns=['BLEU', 'ROUGE-L', 'METEOR'])
    print("No evaluation results to save.")

In [None]:
class ImageCaptionDataset(Dataset):
    """
    Pairs each image listed in a CSV with its tokenized caption (ViT + GPT2 setup).

    The CSV must provide the columns 'filename' and 'caption'.
    """
    def __init__(self, csv_file, img_dir, tokenizer, max_length=50, transform=None):
        self.df = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Default preprocessing: 224x224 resize, tensor conversion, normalization.
        self.transform = transform if transform is not None else transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return (image_tensor, caption_token_ids) for row `idx`."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        record = self.df.iloc[idx]
        img_name = os.path.join(self.img_dir, record['filename'])
        caption = record['caption']

        try:
            image = self.transform(Image.open(img_name).convert('RGB'))
        except Exception as e:
            # Best effort: report the failure and substitute an all-zero image.
            print(f"Error loading image {img_name}: {e}")
            image = torch.zeros((3, 224, 224))

        # tokenize captions (padded / truncated to max_length)
        encoded = self.tokenizer(
            caption,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer returns a batch of one; drop that leading dimension.
        return image, encoded.input_ids.squeeze(0)

In [None]:
class ImageCaptionModel(nn.Module):
    """
    Image Captioning Model combining ViT and GPT2.

    The ViT [CLS] embedding is projected to the GPT-2 hidden size and exposed to
    the decoder as a one-token `encoder_hidden_states` sequence, which GPT-2
    attends to through cross-attention layers.
    """
    def __init__(self, vit_model='WinKawaks/vit-small-patch16-224', gpt2_model='gpt2', freeze_vit=True, freeze_gpt2_partial=True):
        """
        Args:
            vit_model: Hugging Face id of the ViT encoder checkpoint.
            gpt2_model: Hugging Face id of the GPT-2 decoder checkpoint.
            freeze_vit: If True, the encoder weights are not trained.
            freeze_gpt2_partial: If True, the pretrained weights of all but the
                last two GPT-2 blocks are frozen.
        """
        super().__init__()

        self.encoder = ViTModel.from_pretrained(vit_model)
        self.encoder_dim = self.encoder.config.hidden_size

        if freeze_vit:
            for param in self.encoder.parameters():
                param.requires_grad = False

        # GPT-2 rejects `encoder_hidden_states` unless it is built with
        # cross-attention layers, so request them explicitly. These layers are
        # not part of the pretrained checkpoint and start randomly initialised.
        self.decoder = GPT2LMHeadModel.from_pretrained(gpt2_model, add_cross_attention=True)
        self.decoder_dim = self.decoder.config.hidden_size

        if freeze_gpt2_partial:
            # Freeze the pretrained weights of every block except the last two,
            # but keep the new cross-attention layers trainable everywhere --
            # freezing random weights would leave them useless.
            for block in self.decoder.transformer.h[:-2]:
                for name, param in block.named_parameters():
                    if "crossattention" in name or "ln_cross_attn" in name:
                        continue
                    param.requires_grad = False

        self.connect = nn.Linear(self.encoder_dim, self.decoder_dim)
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model)

        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        # Start token for generation: "<|img|>" if the vocabulary has it,
        # otherwise GPT-2's "<|endoftext|>".
        self.img_token_id = self.tokenizer.convert_tokens_to_ids("<|img|>") if "<|img|>" in self.tokenizer.get_vocab() else self.tokenizer.convert_tokens_to_ids("<|endoftext|>")

    def forward(self, images, captions=None):
        """
        Forward pass of the model.

        Args:
            images: Tensor of input images [batch_size, channels, height, width]
            captions: Optional tensor of tokenized captions for training

        Returns:
            With `captions`: the decoder logits, one position per input token
            (logits at position t score the token at position t + 1).
            Without `captions`: token ids produced by beam search.
        """
        encoder_outputs = self.encoder(images).last_hidden_state
        cls_output = encoder_outputs[:, 0, :]
        # One "encoder token" per image: [batch_size, 1, decoder_dim]
        img_features = self.connect(cls_output).unsqueeze(1)

        if captions is not None:
            # The loss is computed by the caller, so no `labels` are passed here.
            outputs = self.decoder(input_ids=captions, encoder_hidden_states=img_features)
            return outputs.logits

        batch_size = images.size(0)
        input_ids = torch.full(
            (batch_size, 1), self.img_token_id, dtype=torch.long, device=images.device
        )

        # NOTE(review): this relies on `generate` forwarding `encoder_hidden_states`
        # to the decoder at every step -- confirm with the installed transformers version.
        return self.decoder.generate(
            input_ids=input_ids,
            attention_mask=torch.ones_like(input_ids),
            max_length=50,
            num_beams=4,
            early_stopping=True,
            pad_token_id=self.tokenizer.pad_token_id,
            encoder_hidden_states=img_features,
        )

In [None]:
def create_dataloaders(train_csv, val_csv, train_img_dir, val_img_dir, tokenizer, batch_size=8, batch_szie=None):
    """
    Build the training and validation DataLoaders for the custom model.

    Args:
        train_csv: CSV with the training filenames and captions.
        val_csv: CSV with the validation filenames and captions.
        train_img_dir: Directory containing the training images.
        val_img_dir: Directory containing the validation images.
        tokenizer: Tokenizer handed to ImageCaptionDataset.
        batch_size: Number of samples per batch (default 8).
        batch_szie: Deprecated misspelling of `batch_size`, still accepted so
            existing keyword callers keep working; it wins when given.

    Returns:
        tuple: (train_loader, val_loader); only the training loader shuffles.
    """
    if batch_szie is not None:
        batch_size = batch_szie

    train_dataset = ImageCaptionDataset(csv_file=train_csv, img_dir=train_img_dir, tokenizer=tokenizer)
    val_dataset = ImageCaptionDataset(csv_file=val_csv, img_dir=val_img_dir, tokenizer=tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, val_loader

In [None]:
def _next_token_loss(logits, captions, criterion):
    """Language-modelling loss: logits at position t are scored against token t + 1."""
    vocab_size = logits.size(-1)
    # The slices are non-contiguous, so reshape (not view) is required.
    shifted_logits = logits[:, :-1, :].reshape(-1, vocab_size)
    shifted_targets = captions[:, 1:].reshape(-1)
    return criterion(shifted_logits, shifted_targets)


def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs=3, save_path="best_custom_model.pth"):
    """
    Train the encoder-decoder model.

    After every epoch the model is validated, and its state dict is saved to
    `save_path` whenever the average validation loss improves.

    Args:
        model (nn.Module): Custom image captioning model; `model(images, captions)`
            must return logits of shape [batch, seq_len, vocab].
        train_loader (DataLoader): Training data loader yielding (images, captions).
        val_loader (DataLoader): Validation data loader yielding (images, captions).
        optimizer: Optimizer (e.g., Adam).
        criterion (Loss): Loss function taking (logits [N, vocab], targets [N]).
        device (str): Device to use ('cuda' or 'cpu').
        epochs (int): Number of epochs.
        save_path (str): Path to save the best model.
    """
    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} [Training]")

        for images, captions in train_pbar:
            images = images.to(device)
            captions = captions.to(device)

            outputs = model(images, captions)

            # Scoring logits against the *same* position would teach the model
            # to copy its input, so the targets are shifted by one token.
            loss = _next_token_loss(outputs, captions, criterion)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_pbar.set_postfix({'loss' : loss.item()})

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_train_loss = train_loss / max(len(train_loader), 1)

        model.eval()
        val_loss = 0

        val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} [Validation]")

        with torch.no_grad():
            for images, captions in val_pbar:
                images = images.to(device)
                captions = captions.to(device)

                outputs = model(images, captions)

                loss = _next_token_loss(outputs, captions, criterion)

                val_loss += loss.item()
                val_pbar.set_postfix({'loss' : loss.item()})

        avg_val_loss = val_loss / max(len(val_loader), 1)

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), save_path)
            print(f"Best model saved to {save_path}")

In [None]:
def generate_captions_with_custom_model(model, image_path, tokenizer, device):
    """
    Caption one image file with the custom captioning model.

    Args:
        model: Trained custom model; called as `model(batch)` to obtain token ids
        image_path: Path to the image
        tokenizer: GPT2 tokenizer used to decode the generated ids
        device: Device to use ('cuda' or 'cpu')

    Returns:
        str: Generated caption, or None when the image cannot be loaded
    """
    model.eval()

    # Same preprocessing as the dataset default: resize, to-tensor, normalize.
    preprocess = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    try:
        pixels = preprocess(Image.open(image_path).convert('RGB'))
        # Add the batch dimension the model expects before moving to the device.
        batch = pixels.unsqueeze(0).to(device)
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        return None

    with torch.no_grad():
        output_ids = model(batch)

    return tokenizer.decode(output_ids[0], skip_special_tokens=True)

In [None]:
def generate_captions_for_test_set(model, test_dir, output_csv, tokenizer, device):
    """
    Generate captions for all images in test directory using custom model

    Images for which no caption is produced are left out of the CSV.

    Args:
        model: Trained custom model
        test_dir: Directory containing test images
        output_csv: Path to save output CSV
        tokenizer: GPT2 tokenizer
        device: Device to use

    Returns:
        pd.DataFrame: One row per captioned image with the columns
        'filename' and 'generated_caption'.
    """
    image_files = sorted(f for f in os.listdir(test_dir) if f.lower().endswith('.jpg'))

    results = []

    for image_file in tqdm(image_files, desc="Generating captions"):
        image_path = os.path.join(test_dir, image_file)
        caption = generate_captions_with_custom_model(model, image_path, tokenizer, device)

        if caption:
            results.append({
                'filename': image_file,
                'generated_caption': caption
            })

    # Explicit columns keep the CSV header intact even when nothing was
    # captioned, so the evaluation step can still read and merge the file.
    df = pd.DataFrame(results, columns=['filename', 'generated_caption'])
    df.to_csv(output_csv, index=False)
    print(f"Captions saved to {output_csv}")

    return df

In [None]:
# End-to-end run for the custom ViT + GPT-2 model: train, reload the best
# checkpoint, caption the test set and score the captions.
if __name__ == "__main__":
    # Paths (relative to the current working directory)
    train_csv = "train.csv"
    val_csv = "val.csv"
    test_csv = "test.csv"
    train_img_dir = "train"
    val_img_dir = "val"
    test_img_dir = "test"

    # Initialize model and tokenizer
    model = ImageCaptionModel()
    tokenizer = model.tokenizer
    model = model.to(DEVICE)

    # Create dataloaders. The batch size (16; adjust based on your GPU memory)
    # is passed positionally because create_dataloaders spells that parameter
    # `batch_szie`, so a `batch_size=` keyword would raise a TypeError.
    train_loader, val_loader = create_dataloaders(
        train_csv,
        val_csv,
        train_img_dir,
        val_img_dir,
        tokenizer,
        16,
    )

    # Initialize optimizer and criterion (only trainable parameters are optimized;
    # padded positions do not contribute to the loss)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=1e-4)
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

    # Train the model
    print("Starting model training...")
    train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=DEVICE,
        epochs=10,  # Adjust as needed
        save_path="best_custom_model.pth"
    )

    # Load the best model for inference; map_location keeps the load working
    # when the checkpoint was written on a different device.
    print("Loading best model for inference...")
    model.load_state_dict(torch.load("best_custom_model.pth", map_location=DEVICE))

    # Generate captions for test set
    print("Generating captions for test set...")
    generate_captions_for_test_set(
        model=model,
        test_dir=test_img_dir,
        output_csv="custom_model_captions.csv",
        tokenizer=tokenizer,
        device=DEVICE
    )

    # Evaluate the model
    print("Evaluating model performance...")
    evaluation_results = evaluate_model(
        test_csv_path=test_csv,
        generated_csv_path="custom_model_captions.csv"
    )

    if evaluation_results:
        print("\nEvaluation Results for Custom Model:")
        print(f"BLEU Score: {evaluation_results['BLEU']:.4f}")
        print(f"ROUGE-L Score: {evaluation_results['ROUGE-L']:.4f}")
        print(f"METEOR Score: {evaluation_results['METEOR']:.4f}")

## PART B

In [None]:
def occlude_image(image, mask_percentage, grid_size=16, rng=None):
    """
    Apply patch-wise occlusion to an image.

    The image is divided into a `grid_size` x `grid_size` grid that covers every
    pixel (patch sizes differ by at most one pixel when the image size is not a
    multiple of `grid_size`), and a random subset of the patches is set to 0.

    Args:
        image (np.array): Input image laid out as (H, W) or (H, W, C). A torch
            tensor is converted to numpy first and is interpreted with the same
            layout.
        mask_percentage (int): Percentage of patches to mask; values outside
            [0, 100] are clamped.
        grid_size (int): Number of patches along each axis (default 16).
        rng: Optional `np.random.Generator` / `RandomState` for reproducible
            masks; the global numpy random state is used when omitted.

    Returns:
        np.array: Occluded copy of the image (the input is not modified).

    Raises:
        ValueError: If the image is neither 2-D nor 3-D.
    """
    if isinstance(image, torch.Tensor):
        image = image.cpu().numpy()

    if image.ndim not in (2, 3):
        raise ValueError(f"Expected a 2-D or 3-D image, got shape {image.shape}")

    occluded_image = image.copy()

    # Work on an (H, W, C) view so both layouts share the masking code.
    is_grayscale = image.ndim == 2
    if is_grayscale:
        occluded_image = occluded_image[:, :, np.newaxis]
    height, width = occluded_image.shape[:2]

    total_patches = grid_size * grid_size
    clamped_percentage = min(max(mask_percentage, 0), 100)
    patches_to_mask = int((clamped_percentage / 100) * total_patches)

    # Integer patch boundaries spanning the full image, so leftover rows and
    # columns of non-divisible sizes can be occluded as well.
    row_edges = (np.arange(grid_size + 1) * height) // grid_size
    col_edges = (np.arange(grid_size + 1) * width) // grid_size

    patch_indices = np.arange(total_patches)
    (np.random if rng is None else rng).shuffle(patch_indices)

    for patch_idx in patch_indices[:patches_to_mask]:
        row, col = divmod(int(patch_idx), grid_size)
        occluded_image[
            row_edges[row]:row_edges[row + 1],
            col_edges[col]:col_edges[col + 1],
            :,
        ] = 0

    if is_grayscale:
        occluded_image = occluded_image[:, :, 0]

    return occluded_image

In [None]:
class OccludedImageDataset(Dataset):
    """
    Serves (image, caption, filename) triples, optionally occluding each image.
    """
    def __init__(self, csv_file, img_dir, occlusion_percentage=0, transform=None):
        """
        Args:
            csv_file: Path to CSV with the columns 'filename' and 'caption'
            img_dir: Directory containing images
            occlusion_percentage: Percentage of image to occlude (0 disables it)
            transform: Optional transforms to apply; defaults to a 224x224
                resize, tensor conversion and normalization
        """
        self.df = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.occlusion_percentage = occlusion_percentage
        self.transform = transform if transform is not None else transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return (image_tensor, caption, filename) for row `idx`."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        record = self.df.iloc[idx]
        filename = record['filename']
        caption = record['caption']
        img_name = os.path.join(self.img_dir, filename)

        try:
            image = Image.open(img_name).convert('RGB')

            if self.occlusion_percentage > 0:
                # Occlude at the original resolution, then go back to PIL so the
                # usual transform pipeline applies.
                masked = occlude_image(np.array(image), self.occlusion_percentage)
                image = Image.fromarray(masked.astype(np.uint8))

            image = self.transform(image)
        except Exception as e:
            # Best effort: report the failure and substitute an all-zero image.
            print(f"Error processing image {img_name}: {e}")
            image = torch.zeros((3, 224, 224))

        return image, caption, filename

In [None]:
def _denormalize_to_pil(images, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)):
    """Turn a normalized [B, 3, H, W] float batch back into a list of PIL RGB images.

    Assumes the batch was normalized with the given mean/std (the values used by
    the transforms in this notebook).
    """
    images = images.detach().cpu().float()
    mean_t = torch.tensor(mean).view(1, -1, 1, 1)
    std_t = torch.tensor(std).view(1, -1, 1, 1)
    pixels = (images * std_t + mean_t).clamp(0, 1)
    pixels = (pixels * 255).round().to(torch.uint8).permute(0, 2, 3, 1).numpy()
    return [Image.fromarray(array) for array in pixels]


def _score_captions(reference_texts, generated_texts):
    """Compute corpus BLEU, mean ROUGE-L F-measure and mean METEOR for caption pairs."""
    references = [[nltk.word_tokenize(text.lower())] for text in reference_texts]
    hypotheses = [nltk.word_tokenize(text.lower()) for text in generated_texts]

    smooth = SmoothingFunction().method1
    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth)

    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
    rouge_l_score = np.mean([
        rouge.score(reference, generated)['rougeL'].fmeasure
        for reference, generated in zip(reference_texts, generated_texts)
    ])

    meteor_score_avg = np.mean([
        meteor_score(reference_list, hypothesis)
        for reference_list, hypothesis in zip(references, hypotheses)
    ])

    return {
        'BLEU': bleu_score,
        'ROUGE-L': rouge_l_score,
        'METEOR': meteor_score_avg
    }


def evaluate_on_occluded_images(model, dataloader, device, model_name, occlusion_level, metrics_only=False):
    """
    Evaluate performance after occluding images.

    Every image in every batch gets its own generated caption. Unless
    `metrics_only` is set, the captions are written to
    "<model_name>_occlusion_<level>.csv" and appended to 'partc.csv'.

    Args:
        model (nn.Module): Image captioning model. An ImageCaptionModel is run
            through its own forward pass; any other model is treated as a
            SmolVLM-style vision-to-sequence model.
        dataloader (DataLoader): Test dataloader yielding (images, captions, filenames).
        device (str): 'cuda' or 'cpu'.
        model_name (str): Name of the model ('SmolVLM' or 'Custom')
        occlusion_level (int): Current occlusion percentage.
        metrics_only (bool): If True, only return metrics without saving CSVs

    Returns:
        dict: BLEU, ROUGE-L, METEOR scores for the test set.
        DataFrame: Generated captions with filenames
    """
    # AutoModelForVision2Seq is only a factory (from_pretrained returns a
    # concrete model class), so an isinstance check against it never matches;
    # identify the custom model by its own class instead.
    is_custom_model = isinstance(model, ImageCaptionModel)

    processor = None
    prompt = None
    if not is_custom_model:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Describe this image in detail."}
                ]
            }
        ]
        # Load the processor and build the prompt once, not once per batch.
        processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct")
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)

    generated_captions = []
    reference_captions = []
    image_filenames = []

    model.eval()
    with torch.no_grad():
        for images, captions, filenames in tqdm(dataloader, desc=f"Evaluating {model_name} at {occlusion_level}% occlusion"):
            if is_custom_model:
                output_ids = model(images.to(device))
                # Decode every sequence in the batch, not just the first one.
                batch_captions = model.tokenizer.batch_decode(output_ids, skip_special_tokens=True)
            else:
                batch_captions = []
                # The prompt holds a single image slot and the processor does
                # its own preprocessing, so feed it one de-normalized image at a time.
                for pil_image in _denormalize_to_pil(images):
                    inputs = processor(text=prompt, images=[pil_image], return_tensors="pt").to(device)

                    generated_ids = model.generate(**inputs, max_new_tokens=500)
                    # Drop the prompt tokens so only new text is decoded.
                    generated_ids = generated_ids[:, inputs["input_ids"].shape[1]:]
                    batch_captions.append(
                        processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
                    )

            generated_captions.extend(batch_captions)
            reference_captions.extend(captions)
            image_filenames.extend(filenames)

    results_df = pd.DataFrame({
        'filename': image_filenames,
        'original_caption': reference_captions,
        'generated_caption': generated_captions,
        'occlusion_level': occlusion_level,
        'model': model_name
    })

    metrics = _score_captions(reference_captions, generated_captions)

    if not metrics_only:
        output_csv = f"{model_name.lower()}_occlusion_{occlusion_level}.csv"
        results_df.to_csv(output_csv, index=False)
        print(f"Results saved to {output_csv}")

        # Format as required for Part C
        output_label = "Model A" if model_name == "SmolVLM" else "Model B"
        part_c_df = pd.DataFrame([
            {
                'input_text': f"{reference} <SEP> {generated} <SEP> {occlusion_level}",
                'output_label': output_label
            }
            for reference, generated in zip(reference_captions, generated_captions)
        ], columns=['input_text', 'output_label'])

        # Append to partc.csv, writing the header only when the file is new.
        # NOTE(review): re-running the study keeps appending to an existing file.
        part_c_df.to_csv(
            'partc.csv', mode='a', header=not os.path.exists('partc.csv'), index=False
        )

    return metrics, results_df

In [None]:
def run_occlusion_study(test_csv, test_img_dir, custom_model_path, occlusion_levels=(10, 50, 80)):
    """
    Run a complete occlusion study for both models at different levels.

    Both models are first evaluated on the unmodified images (0% occlusion) and
    then at every requested level; the change of each metric relative to the
    baseline is printed and returned.

    Args:
        test_csv: Path to test CSV file
        test_img_dir: Directory containing test images
        custom_model_path: Path to saved custom model
        occlusion_levels: Iterable of occlusion percentages to test

    Returns:
        tuple: (all_results, metric_changes), both shaped as
        {model_name: {occlusion_level: {metric: value}}}.
    """
    occlusion_levels = list(occlusion_levels)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    all_results = {
        'SmolVLM': {},
        'Custom': {}
    }

    # Load SmolVLM model. The model is placed with .to(device) only; combining
    # that with device_map="auto" would move an already-dispatched model.
    print("Loading SmolVLM model...")
    smolVLM_model = AutoModelForVision2Seq.from_pretrained(
        "HuggingFaceTB/SmolVLM-Instruct",
        torch_dtype=torch.bfloat16,
        _attn_implementation="eager"
    ).to(device)

    print("Loading custom model...")
    custom_model = ImageCaptionModel()
    custom_model.load_state_dict(torch.load(custom_model_path, map_location=device))
    custom_model = custom_model.to(device)

    models = (("SmolVLM", smolVLM_model), ("Custom", custom_model))

    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])

    def _evaluate_level(level):
        """Evaluate both models at one occlusion level and record their metrics."""
        # NOTE(review): the dataset draws a new random mask on every access, so
        # the two models do not see identical occlusions for the same image.
        dataset = OccludedImageDataset(
            csv_file=test_csv,
            img_dir=test_img_dir,
            occlusion_percentage=level,
            transform=transform
        )
        loader = DataLoader(
            dataset,
            batch_size=8,
            shuffle=False,
            num_workers=4
        )
        for name, model in models:
            metrics, _ = evaluate_on_occluded_images(
                model=model,
                dataloader=loader,
                device=device,
                model_name=name,
                occlusion_level=level
            )
            all_results[name][level] = metrics

    print("\nEvaluating on original images (0% occlusion)...")
    _evaluate_level(0)

    for occlusion in occlusion_levels:
        print(f"\nEvaluating at {occlusion}% occlusion...")
        _evaluate_level(occlusion)

    print("\nCalculating metric changes across occlusion levels...")
    metric_changes = {
        'SmolVLM': {occlusion: {} for occlusion in occlusion_levels},
        'Custom': {occlusion: {} for occlusion in occlusion_levels}
    }

    for model_name in ['SmolVLM', 'Custom']:
        for occlusion in occlusion_levels:
            for metric in ['BLEU', 'ROUGE-L', 'METEOR']:
                # Calculate change from baseline (0% occlusion)
                change = all_results[model_name][occlusion][metric] - all_results[model_name][0][metric]
                metric_changes[model_name][occlusion][metric] = change

                print(f"{model_name} {metric} change at {occlusion}% occlusion: {change:.4f}")

    return all_results, metric_changes

In [None]:
# Inputs for the occlusion study (Part B).
test_csv = "/kaggle/input/dataset/Dataset/test.csv"  # ground-truth captions
test_img_dir = "/kaggle/input/dataset/Dataset/test"  # test images
custom_model_path = "best_custom_model.pth"  # checkpoint loaded into ImageCaptionModel
occlusion_levels = [10]  # percentages evaluated in addition to the 0% baseline

# Evaluate both models at the baseline and at every level listed above.
print("Starting Occlusion Images ...")
all_results, metric_changes = run_occlusion_study(
    test_csv=test_csv,
    test_img_dir=test_img_dir,
    custom_model_path=custom_model_path,
    occlusion_levels=occlusion_levels
)
print("\nOcclusion study completed!")

In [None]:
# Print one table per model: the baseline row, then a score row and a
# change-from-baseline row for every occlusion level.
print("Metrics at different Occlusion Levels : ")

metric_names = ['BLEU', 'ROUGE-L', 'METEOR']
# One shared layout for header and value rows keeps the columns aligned.
row_format = "{:<15} {:<10} {:<10} {:<10}"

for model_name in ['SmolVLM', 'Custom']:
    print(f"\n{model_name} Model:")
    print(row_format.format('Occlusion Level', *metric_names))

    baseline = all_results[model_name][0]
    print(row_format.format('0% (baseline)', *[f"{baseline[name]:.4f}" for name in metric_names]))

    for occlusion in occlusion_levels:
        scores = all_results[model_name][occlusion]
        changes = metric_changes[model_name][occlusion]

        print(row_format.format(f"{occlusion}%", *[f"{scores[name]:.4f}" for name in metric_names]))
        print(row_format.format('Change', *[f"{changes[name]:+.4f}" for name in metric_names]))

print("dataset saved to partc.csv")

## PART C

In [None]:
class CaptionClassifier(nn.Module):
    """
    BERT-based classifier for identifying which model generated a caption.

    Architecture: pre-trained BERT encoder -> dropout -> one linear layer that
    emits a logit per class.

    Args:
        bert_model_name (str): Name or path handed to ``BertModel.from_pretrained``.
        num_classes (int): Number of output classes (width of the logit vector).
        dropout_prob (float): Dropout probability applied to the pooled BERT
            output before the classification layer (default: 0.1).
    """
    def __init__(self, bert_model_name='bert-base-uncased', num_classes=2, dropout_prob=0.1):
        super().__init__()

        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained(bert_model_name)

        # BERT hidden size determines the classifier's input width
        self.hidden_size = self.bert.config.hidden_size

        # Dropout layer for regularization
        self.dropout = nn.Dropout(dropout_prob)

        # Classification layer
        self.classifier = nn.Linear(self.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """
        Forward pass of the model.

        Args:
            input_ids: Token IDs
            attention_mask: Attention mask for BERT

        Returns:
            torch.Tensor: Output logits for classification (unnormalized; no
            softmax is applied here)
        """
        # Get BERT outputs
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # pooler_output is BERT's pooled sequence summary: it is produced by the
        # model's pooler head from the [CLS] position, so it is not the raw
        # [CLS] hidden state itself.
        pooled_output = outputs.pooler_output

        # Apply dropout (a no-op in eval mode)
        pooled_output = self.dropout(pooled_output)

        # Get logits
        logits = self.classifier(pooled_output)

        return logits

In [None]:
class CaptionClassifierDataset(Dataset):
    """
    Torch dataset that turns (input_text, output_label) rows into tokenized
    inputs for the caption classifier.
    """
    def __init__(self, dataframe, tokenizer, max_length=128):
        """
        Args:
            dataframe: DataFrame with ``input_text`` and ``output_label`` columns
            tokenizer: Callable tokenizer (e.g. a BERT tokenizer)
            max_length: Length every sequence is padded / truncated to
        """
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Text label -> class index ('Model A' is SmolVLM, 'Model B' is Custom)
        self.label_map = {'Model A': 0, 'Model B': 1}

    def __len__(self):
        return self.dataframe.shape[0]

    def __getitem__(self, idx):
        # Samplers may hand over a tensor index; convert it to plain Python first
        position = idx.tolist() if torch.is_tensor(idx) else idx

        # Fetch the row once and read both fields from it
        row = self.dataframe.iloc[position]
        caption_text = row['input_text']
        class_index = self.label_map[row['output_label']]

        tokens = self.tokenizer(
            text=caption_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # The tokenizer output carries a leading batch axis of size 1; drop it
        sample = {
            key: tokens[key].squeeze(0)
            for key in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(class_index, dtype=torch.long)
        return sample

In [None]:
def split_part_c_dataset(part_c_csv, train_ratio=0.7, val_ratio=0.1, test_ratio=0.2, seed=42):
    """
    Split the Part C dataset into train, validation, and test sets.

    The split is done per image, not per row: the text before the first
    '<SEP>' marker of ``input_text`` is used as the image identifier, and all
    rows sharing an identifier land in the same split.

    Args:
        part_c_csv: Path to the Part C CSV file (needs an ``input_text`` column)
        train_ratio: Ratio of images for training
        val_ratio: Ratio of images for validation
        test_ratio: Ratio of images for testing (the test split receives every
            image left over after the train and validation splits are taken)
        seed: Random seed for reproducibility

    Returns:
        tuple: (train_df, val_df, test_df)

    Raises:
        AssertionError: If the three ratios do not sum to 1.

    Note:
        This reseeds NumPy's global random generator with ``seed``.
    """
    # Verify ratios sum to 1
    assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-10, "Ratios must sum to 1"

    # Set random seed for reproducibility
    np.random.seed(seed)

    # Load the dataset
    df = pd.read_csv(part_c_csv)

    # Image identifier = text before the first '<SEP>' marker
    df['image_id'] = df['input_text'].apply(lambda x: x.split('<SEP>')[0].strip())

    # Shuffle the unique image IDs (in place)
    unique_images = df['image_id'].unique()
    np.random.shuffle(unique_images)

    # Calculate split sizes (in images); the test split takes the remainder
    n_train = int(len(unique_images) * train_ratio)
    n_val = int(len(unique_images) * val_ratio)

    # Split image IDs
    train_images = unique_images[:n_train]
    val_images = unique_images[n_train:n_train+n_val]
    test_images = unique_images[n_train+n_val:]

    # Select the rows belonging to each group of images
    train_df = df[df['image_id'].isin(train_images)].reset_index(drop=True)
    val_df = df[df['image_id'].isin(val_images)].reset_index(drop=True)
    test_df = df[df['image_id'].isin(test_images)].reset_index(drop=True)

    total_rows = len(df)

    def _share(part):
        # An empty dataset would otherwise raise ZeroDivisionError here
        return len(part) / total_rows if total_rows else 0.0

    print(f"Dataset split complete:")
    print(f"Total samples: {total_rows}")
    print(f"Training samples: {len(train_df)} ({_share(train_df):.2%})")
    print(f"Validation samples: {len(val_df)} ({_share(val_df):.2%})")
    print(f"Test samples: {len(test_df)} ({_share(test_df):.2%})")

    # Remove the temporary image_id column
    train_df = train_df.drop(columns=['image_id'])
    val_df = val_df.drop(columns=['image_id'])
    test_df = test_df.drop(columns=['image_id'])

    return train_df, val_df, test_df

In [None]:
def train_classifier(model, train_loader, val_loader, optimizer, criterion, device, epochs=5):
    """
    Train the BERT-based caption classifier.

    After every epoch the model is evaluated on ``val_loader``; the weights of
    the epoch with the lowest mean validation loss are restored into ``model``
    at the end and written to 'best_caption_classifier.pth'.

    Args:
        model (nn.Module): Caption classifier model
        train_loader (DataLoader): Training data loader
        val_loader (DataLoader): Validation data loader
        optimizer: Optimizer (e.g., Adam)
        criterion (Loss): Loss function
        device (str): Device to use ('cuda' or 'cpu')
        epochs (int): Number of epochs

    Returns:
        nn.Module: The same ``model`` object, loaded with the best weights.
    """
    import copy  # local import: only needed to snapshot the best weights

    # Best model tracking
    best_val_loss = float('inf')
    best_model_state = None

    # Training loop
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")

        # Training phase
        model.train()
        train_loss = 0
        train_correct = 0
        train_total = 0

        train_pbar = tqdm(train_loader, desc="Training")
        for batch in train_pbar:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Calculate loss
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

            # Calculate accuracy (predicted class = index of the largest logit)
            _, predicted = torch.max(outputs, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

            # Update loss
            train_loss += loss.item()

            # Update progress bar with the running accuracy
            train_pbar.set_postfix({'loss': loss.item(), 'accuracy': train_correct/train_total})

        # Calculate training metrics (loss is averaged over batches)
        train_loss = train_loss / len(train_loader)
        train_accuracy = train_correct / train_total

        # Validation phase
        model.eval()
        val_loss = 0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            val_pbar = tqdm(val_loader, desc="Validation")
            for batch in val_pbar:
                # Move batch to device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['label'].to(device)

                # Forward pass
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)

                # Calculate loss
                loss = criterion(outputs, labels)

                # Calculate accuracy
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

                # Update loss
                val_loss += loss.item()

                # Update progress bar
                val_pbar.set_postfix({'loss': loss.item(), 'accuracy': val_correct/val_total})

        # Calculate validation metrics
        val_loss = val_loss / len(val_loader)
        val_accuracy = val_correct / val_total

        # Print epoch results
        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}")

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            # Deep copy is required: state_dict() tensors alias the live
            # parameters, so a shallow .copy() would keep changing as training
            # continues and the "best" snapshot would end up as the last epoch.
            best_model_state = copy.deepcopy(model.state_dict())
            print(f"  New best model saved with validation loss: {best_val_loss:.4f}")

    if best_model_state is None:
        # No epoch was recorded as best (epochs == 0, or the validation loss
        # was never below infinity, e.g. NaN); keep the current weights.
        print("  Warning: no best checkpoint was recorded; keeping current weights.")
        best_model_state = copy.deepcopy(model.state_dict())
    else:
        # Load best model
        model.load_state_dict(best_model_state)

    # Save best model to disk
    torch.save(best_model_state, 'best_caption_classifier.pth')
    print(f"Best model saved to 'best_caption_classifier.pth'")

    return model

In [None]:
def evaluate_classifier(model, dataloader, device):
    """
    Evaluate the classification model.

    Runs the model over ``dataloader`` without gradients, prints a per-class
    classification report and returns macro-averaged scores.

    Args:
        model (nn.Module): Trained model
        dataloader (DataLoader): Test data loader
        device (str): 'cuda' or 'cpu'

    Returns:
        dict: Macro-averaged 'Precision', 'Recall' and 'F1' for the test set
    """
    # Imported first so a missing scikit-learn fails before the evaluation pass
    from sklearn.metrics import precision_recall_fscore_support, classification_report

    model.eval()

    # Lists to store predictions and true labels
    all_preds = []
    all_labels = []

    # Evaluate model
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            # Move model inputs to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Get predictions (index of the largest logit)
            _, predicted = torch.max(outputs, 1)

            # Store predictions and true labels as plain Python ints
            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(batch['label'].tolist())

    # Get precision, recall, f1 (macro averaging)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='macro', zero_division=0
    )

    # Print classification report. The class ids are passed explicitly so the
    # two target names still match when only one class shows up in the labels
    # and predictions (otherwise scikit-learn raises a ValueError).
    print("\nClassification Report:")
    print(classification_report(
        all_labels, all_preds,
        labels=[0, 1],
        target_names=['Model A (SmolVLM)', 'Model B (Custom)'],
        zero_division=0
    ))

    # Return metrics
    metrics = {
        'Precision': precision,
        'Recall': recall,
        'F1': f1
    }

    return metrics

In [None]:
## TODO: implement the function calls for Part C