In [1]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.102-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [2]:
from google.colab import files
files.upload()  # Manually upload `kaggle.json`

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"f4saken","key":"3e950e6cf11c7fe39797cd9d2104db6b"}'}

In [3]:
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
import kagglehub
aiml_gc_2025_path = kagglehub.competition_download('aiml-gc-2025')

Downloading from https://www.kaggle.com/api/v1/competitions/data/download-all/aiml-gc-2025...


100%|██████████| 1.06G/1.06G [00:49<00:00, 22.8MB/s]

Extracting files...





In [5]:
import shutil

destination = "/content/aiml-gc-2025"
shutil.move(aiml_gc_2025_path, destination)
print(f"Files moved to: {destination}")

Files moved to: /content/aiml-gc-2025


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
import torchvision.models as models
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from ultralytics import YOLO
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
import albumentations as A
from albumentations.pytorch import ToTensorV2
import timm
from torch.cuda.amp import GradScaler, autocast
import random
import warnings
warnings.filterwarnings('ignore')

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [7]:
# Set seeds for reproducibility
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything()

In [8]:
# Create result directories
os.makedirs("/content/working/data/", exist_ok=True)
os.makedirs("/content/working/models/", exist_ok=True)

# Initialize YOLO model with the latest version (YOLOv8x)
model_yolo = YOLO('yolov8x.pt')

# Paths
train_path = '/content/aiml-gc-2025/AI-ML GC 2025 Dataset/train'
test_path = "/content/aiml-gc-2025/AI-ML GC 2025 Dataset/test"

train_folders = os.listdir(train_path)
le = LabelEncoder()

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8x.pt to 'yolov8x.pt'...


100%|██████████| 131M/131M [00:00<00:00, 492MB/s] 


In [9]:
# Enhanced bird detection and cropping with padding and confidence thresholds
def detect_and_crop(file_path, folder=None, is_train=True):
    """
    Enhanced detection and cropping with better padding and handling of edge cases.
    Implements dynamic padding based on bird size and aspect ratio preservation.
    """
    try:
        image = Image.open(file_path).convert('RGB')
        image_np = np.array(image)
        orig_height, orig_width = image_np.shape[:2]

        # Run YOLOv8 detection with higher confidence threshold for precision
        results = model_yolo(image_np, verbose=False, conf=0.25, classes=[14, 15, 16, 17, 18, 19])  # Bird-related classes

        best_box = None
        best_conf = 0

        for result in results:
            for i, box in enumerate(result.boxes.xyxy):
                # Get confidence score
                conf = result.boxes.conf[i].item()

                if conf > best_conf:
                    best_conf = conf
                    best_box = box

        if best_box is not None:
            x1, y1, x2, y2 = map(int, best_box[:4])

            # Calculate padding (30% of the bird size, dynamic)
            width, height = x2 - x1, y2 - y1
            pad_x = int(width * 0.3)
            pad_y = int(height * 0.3)

            # Apply padding with boundary checks
            x1 = max(0, x1 - pad_x)
            y1 = max(0, y1 - pad_y)
            x2 = min(orig_width, x2 + pad_x)
            y2 = min(orig_height, y2 + pad_y)

            cropped_img = image_np[y1:y2, x1:x2]

            if cropped_img.size == 0:
                # Fallback to original image if crop failed
                cropped_img = image_np
        else:
            # No detection - use original image
            cropped_img = image_np

        cropped_img = Image.fromarray(cropped_img)

        # Keep aspect ratio when resizing
        if is_train:
            cropped_img = cropped_img.resize((384, 384), Image.LANCZOS)  # Adjusted size for transformers
            save_path = f"/content/working/data/{folder}_{os.path.basename(file_path)}"
            cropped_img.save(save_path)
            return save_path
        else:
            cropped_img = cropped_img.resize((384, 384), Image.LANCZOS)  # Adjusted size for transformers
            return cropped_img

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        # Return original image in case of error
        if is_train:
            image = Image.open(file_path).convert('RGB')
            image = image.resize((384, 384), Image.LANCZOS)  # Adjusted size for transformers
            save_path = f"/content/working/data/{folder}_{os.path.basename(file_path)}"
            image.save(save_path)
            return save_path
        else:
            return Image.open(file_path).convert('RGB').resize((384, 384), Image.LANCZOS)  # Adjusted size for transformers


In [10]:
# Process and prepare dataset
print("Preparing dataset...")
image_paths = []
labels = []

for folder in tqdm(train_folders, desc="Processing Folders"):
    train_files = os.listdir(os.path.join(train_path, folder))
    for file in train_files:
        file_path = os.path.join(train_path, folder, file)
        save_path = detect_and_crop(file_path, folder)
        image_paths.append(save_path)
        labels.append(folder)

Preparing dataset...


Processing Folders: 100%|██████████| 200/200 [04:09<00:00,  1.25s/it]


In [11]:
# Encode labels
le.fit(labels)
labels = le.transform(labels)

# Create stratified folds for cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Convert to numpy arrays for easier handling
image_paths = np.array(image_paths)
labels = np.array(labels)

In [12]:
# Advanced augmentations using Albumentations
train_transforms = A.Compose([
    A.RandomResizedCrop(size=(384, 384), scale=(0.8, 1.0)),
    A.OneOf([
        A.RandomRotate90(),
        A.Rotate(limit=40),
    ], p=0.5),
    A.OneOf([
        A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2),
        A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20),
        A.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    ], p=0.5),
    A.OneOf([
        A.GaussianBlur(blur_limit=3),
        A.MedianBlur(blur_limit=3),
        A.MotionBlur(blur_limit=3),
    ], p=0.3),
    A.CoarseDropout(max_holes=8, max_height=64, max_width=64, min_holes=1, min_height=32, min_width=32, p=0.3),
    A.GaussNoise(var_limit=(10.0, 50.0), p=0.2),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

val_transforms = A.Compose([
    A.Resize(height=384, width=384),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

In [13]:
# Enhanced Dataset Class
class BirdDataset(Dataset):
    def __init__(self, image_paths, labels=None, transform=None, is_test=False):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform
        self.is_test = is_test

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]

        image = Image.open(img_path).convert('RGB')
        image = np.array(image)

        if self.transform:
            image = self.transform(image=image)['image']

        if self.is_test:
            return image
        else:
            label = self.labels[idx]
            return image, torch.tensor(label)# ViT-based model with transformer architecture

In [14]:
# ViT-based model with transformer architecture
def create_vit_model(num_classes=200):
    """
    Creates a Vision Transformer model with pretrained weights
    """
    # Using ViT-Large model
    model = timm.create_model('vit_large_patch16_384', pretrained=True)

    # Freeze early layers for better transfer learning
    ct = 0
    for name, param in model.named_parameters():
        if ct < 150:  # Freeze first 150 parameters
            param.requires_grad = False
        ct += 1

    # Modify the head for bird classification
    in_features = model.head.in_features
    model.head = nn.Sequential(
        nn.LayerNorm(in_features),
        nn.Linear(in_features, 1024),
        nn.GELU(),
        nn.Dropout(0.2),
        nn.Linear(1024, num_classes)
    )

    return model

In [15]:
# Swin Transformer model
def create_swin_model(num_classes=200):
    """
    Creates a Swin Transformer model with pretrained weights
    """
    model = timm.create_model('swin_large_patch4_window12_384', pretrained=True)

    # Freeze early layers
    ct = 0
    for name, param in model.named_parameters():
        if ct < 150:  # Freeze first 150 parameters
            param.requires_grad = False
        ct += 1

    # Modify the head
    in_features = model.head.in_features
    model.head = nn.Sequential(
        nn.LayerNorm(in_features),
        nn.Linear(in_features, 1024),
        nn.GELU(),
        nn.Dropout(0.2),
        nn.Linear(1024, num_classes)
    )

    return model


In [16]:
# Advanced ConvNeXt model (transformer-inspired architecture)
def create_convnext_model(num_classes=200):
    """
    Creates a ConvNeXt model which incorporates transformer design principles
    into a convolutional network
    """
    model = timm.create_model('convnext_large', pretrained=True)

    # Freeze early layers
    ct = 0
    for name, param in model.named_parameters():
        if ct < 150:
            param.requires_grad = False
        ct += 1

    # Modify the head
    in_features = model.head.fc.in_features
    model.head.fc = nn.Sequential(
        nn.LayerNorm(in_features),
        nn.Linear(in_features, 1024),
        nn.GELU(),
        nn.Dropout(0.2),
        nn.Linear(1024, num_classes)
    )

    return model

In [28]:
def create_efficientnet_model(num_classes=200):
    """
    Creates an EfficientNet-B7 model with pretrained weights.
    It freezes the first 150 parameters and replaces the classification head.
    """
    # Create an EfficientNet-B7 model with pretrained weights.
    model = timm.create_model('tf_efficientnet_b7_ns', pretrained=True)

    # Freeze early layers: iterate over parameters and freeze the first 150.
    ct = 0
    for name, param in model.named_parameters():
        if ct < 150:
            param.requires_grad = False
        ct += 1

    # Modify the classifier head.
    # Retrieve the input features for the classifier.
    in_features = model.get_classifier().in_features
    # Replace the classifier with a custom head.
    model.classifier = nn.Sequential(
        nn.LayerNorm(in_features),
        nn.Linear(in_features, 1024),
        nn.GELU(),
        nn.Dropout(0.2),
        nn.Linear(1024, num_classes)
    )

    return model

In [18]:
# Custom model ensemble
class ModelEnsemble(nn.Module):
    def __init__(self, models):
        super(ModelEnsemble, self).__init__()
        self.models = nn.ModuleList(models)

    def forward(self, x):
        outputs = [model(x) for model in self.models]
        return torch.mean(torch.stack(outputs), dim=0)


In [19]:

# Training function with mixed precision
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, device, model_name, num_epochs=10):
    best_val_acc = 0.0
    scaler = GradScaler()  # For mixed precision training

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        correct = 0
        total = 0

        train_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")
        for batch_idx, (images, labels) in enumerate(train_bar):
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()

            # Mixed precision training
            with autocast():
                outputs = model(images)
                loss = criterion(outputs, labels)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            running_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            train_bar.set_postfix(loss=running_loss/(batch_idx+1), acc=f"{100.0*correct/total:.2f}%")

        train_acc = 100.0 * correct / total
        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {train_acc:.2f}%")

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct = 0
        total = 0

        with torch.no_grad():
            for images, labels in tqdm(val_loader, desc="Validation"):
                images, labels = images.to(device), labels.to(device)

                outputs = model(images)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        val_acc = 100.0 * correct / total
        print(f"Validation Loss: {val_loss/len(val_loader):.4f}, Accuracy: {val_acc:.2f}%")

        # Update scheduler
        scheduler.step()

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), f"/content/working/models/best_{model_name}_epoch_{epoch}.pth")
            print(f"Best model saved with accuracy: {best_val_acc:.2f}%")

    return model

In [20]:
# Test Time Augmentation (TTA)
def tta_inference(model, image, device, transforms_list):
    """
    Performs Test Time Augmentation by averaging predictions from
    multiple augmented versions of the input image
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        # Original image
        outputs = model(image)
        predictions.append(outputs)

        # Horizontally flipped image
        flipped_image = torch.flip(image, dims=[3])
        outputs = model(flipped_image)
        predictions.append(outputs)

        # Vertically flipped image
        flipped_image = torch.flip(image, dims=[2])
        outputs = model(flipped_image)
        predictions.append(outputs)

        # Both flipped
        flipped_image = torch.flip(image, dims=[2, 3])
        outputs = model(flipped_image)
        predictions.append(outputs)

    # Average predictions
    return torch.mean(torch.stack(predictions), dim=0)

In [21]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Define hyperparameters
batch_size = 16
num_epochs = 10

# Using stratified cross-validation
models = []

Using device: cuda


In [56]:
from sklearn.model_selection import train_test_split

train_images, val_images, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.1, stratify=labels, random_state=42
)

In [57]:
# Create datasets
train_dataset = BirdDataset(
    train_images,
    train_labels,
    transform=train_transforms
)

val_dataset = BirdDataset(
    val_images,
    val_labels,
    transform=val_transforms
)

# Create dataloaders
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    pin_memory=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
    pin_memory=True
)

In [None]:
# Train each model separately
models = []
model_types = ["vit", "convnext"]

for model_type in model_types:
    print(f"\nTraining {model_type} model")

    # Create model based on type
    if model_type == "vit":
        model = create_vit_model(num_classes=200)
    elif model_type == "swin":
        model = create_swin_model(num_classes=200)
    else:  # convnext
        model = create_convnext_model(num_classes=200)

    model = model.to(device)

    # Use label smoothing cross entropy loss
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Use AdamW optimizer with weight decay and different learning rates for different parts
    # Higher learning rate for newly added layers
    params = [
        {'params': [p for n, p in model.named_parameters() if 'head' not in n], 'lr': 1e-5},
        {'params': [p for n, p in model.named_parameters() if 'head' in n], 'lr': 1e-4}
    ]

    optimizer = optim.AdamW(params, weight_decay=1e-5)

    # Use cosine annealing scheduler
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=1, eta_min=1e-6)
    try:
      # Train model
      model = train_model(
          model,
          train_loader,
          val_loader,
          criterion,
          optimizer,
          scheduler,
          device,
          model_type,
          num_epochs=num_epochs
      )
    except:
      continue

    # Save the trained model
    torch.save(model.state_dict(), f"/content/working/models/{model_type}_model.pth")
    models.append(model)


Training vit model


Epoch 1/10: 100%|██████████| 559/559 [01:02<00:00,  8.96it/s, acc=59.16%, loss=2.48]


Epoch 1, Loss: 2.4822, Accuracy: 59.16%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Validation Loss: 1.3280, Accuracy: 84.93%
Best model saved with accuracy: 84.93%


Epoch 2/10: 100%|██████████| 559/559 [01:02<00:00,  8.99it/s, acc=84.99%, loss=1.35]


Epoch 2, Loss: 1.3497, Accuracy: 84.99%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Validation Loss: 1.2239, Accuracy: 89.60%
Best model saved with accuracy: 89.60%


Epoch 3/10: 100%|██████████| 559/559 [01:02<00:00,  8.98it/s, acc=89.38%, loss=1.22]


Epoch 3, Loss: 1.2191, Accuracy: 89.38%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.81it/s]


Validation Loss: 1.1617, Accuracy: 91.93%
Best model saved with accuracy: 91.93%


Epoch 4/10: 100%|██████████| 559/559 [01:02<00:00,  8.98it/s, acc=92.24%, loss=1.14]


Epoch 4, Loss: 1.1436, Accuracy: 92.24%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.81it/s]


Validation Loss: 1.1505, Accuracy: 92.14%
Best model saved with accuracy: 92.14%


Epoch 5/10: 100%|██████████| 559/559 [01:02<00:00,  8.99it/s, acc=93.25%, loss=1.11]


Epoch 5, Loss: 1.1111, Accuracy: 93.25%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.83it/s]


Validation Loss: 1.1438, Accuracy: 92.36%
Best model saved with accuracy: 92.36%


Epoch 6/10: 100%|██████████| 559/559 [01:02<00:00,  8.97it/s, acc=91.86%, loss=1.15]


Epoch 6, Loss: 1.1512, Accuracy: 91.86%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Validation Loss: 1.1585, Accuracy: 91.08%


Epoch 7/10: 100%|██████████| 559/559 [01:02<00:00,  9.00it/s, acc=93.57%, loss=1.11]


Epoch 7, Loss: 1.1057, Accuracy: 93.57%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Validation Loss: 1.1596, Accuracy: 91.93%


Epoch 8/10: 100%|██████████| 559/559 [01:02<00:00,  9.01it/s, acc=95.15%, loss=1.05]


Epoch 8, Loss: 1.0460, Accuracy: 95.15%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.81it/s]


Validation Loss: 1.1512, Accuracy: 92.36%


Epoch 9/10: 100%|██████████| 559/559 [01:02<00:00,  8.99it/s, acc=96.31%, loss=1.01]


Epoch 9, Loss: 1.0135, Accuracy: 96.31%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Validation Loss: 1.1519, Accuracy: 92.57%
Best model saved with accuracy: 92.57%


Epoch 10/10: 100%|██████████| 559/559 [01:02<00:00,  8.98it/s, acc=97.25%, loss=0.99]


Epoch 10, Loss: 0.9904, Accuracy: 97.25%


Validation: 100%|██████████| 30/30 [00:10<00:00,  2.82it/s]


Validation Loss: 1.1494, Accuracy: 92.36%

Training convnext model


Epoch 1/10: 100%|██████████| 559/559 [00:56<00:00,  9.96it/s, acc=51.37%, loss=2.82]


Epoch 1, Loss: 2.8239, Accuracy: 51.37%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.69it/s]


Validation Loss: 1.3547, Accuracy: 86.41%
Best model saved with accuracy: 86.41%


Epoch 2/10: 100%|██████████| 559/559 [00:55<00:00, 10.00it/s, acc=78.77%, loss=1.63]


Epoch 2, Loss: 1.6322, Accuracy: 78.77%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.72it/s]


Validation Loss: 1.2401, Accuracy: 89.17%
Best model saved with accuracy: 89.17%


Epoch 3/10: 100%|██████████| 559/559 [00:56<00:00,  9.97it/s, acc=83.56%, loss=1.48]


Epoch 3, Loss: 1.4844, Accuracy: 83.56%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.67it/s]


Validation Loss: 1.1880, Accuracy: 90.66%
Best model saved with accuracy: 90.66%


Epoch 4/10: 100%|██████████| 559/559 [00:56<00:00,  9.96it/s, acc=85.55%, loss=1.41]


Epoch 4, Loss: 1.4092, Accuracy: 85.55%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.72it/s]


Validation Loss: 1.1784, Accuracy: 90.87%
Best model saved with accuracy: 90.87%


Epoch 5/10: 100%|██████████| 559/559 [00:55<00:00,  9.98it/s, acc=86.26%, loss=1.38]


Epoch 5, Loss: 1.3753, Accuracy: 86.26%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.70it/s]


Validation Loss: 1.1612, Accuracy: 91.72%
Best model saved with accuracy: 91.72%


Epoch 6/10: 100%|██████████| 559/559 [00:55<00:00,  9.99it/s, acc=85.91%, loss=1.39]


Epoch 6, Loss: 1.3875, Accuracy: 85.91%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.72it/s]


Validation Loss: 1.1811, Accuracy: 90.02%


Epoch 7/10: 100%|██████████| 559/559 [00:55<00:00, 10.00it/s, acc=87.04%, loss=1.34]


Epoch 7, Loss: 1.3398, Accuracy: 87.04%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.71it/s]


Validation Loss: 1.1631, Accuracy: 92.57%
Best model saved with accuracy: 92.57%


Epoch 8/10: 100%|██████████| 559/559 [00:55<00:00, 10.02it/s, acc=88.70%, loss=1.28]


Epoch 8, Loss: 1.2792, Accuracy: 88.70%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.67it/s]


Validation Loss: 1.1625, Accuracy: 90.87%


Epoch 9/10: 100%|██████████| 559/559 [00:55<00:00,  9.99it/s, acc=90.04%, loss=1.24]


Epoch 9, Loss: 1.2411, Accuracy: 90.04%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.70it/s]


Validation Loss: 1.1435, Accuracy: 91.30%


Epoch 10/10: 100%|██████████| 559/559 [00:55<00:00, 10.00it/s, acc=90.37%, loss=1.22]


Epoch 10, Loss: 1.2240, Accuracy: 90.37%


Validation: 100%|██████████| 30/30 [00:06<00:00,  4.70it/s]


Validation Loss: 1.1398, Accuracy: 91.51%

Training swin model


model.safetensors:  47%|####7     | 377M/801M [00:00<?, ?B/s]

Epoch 1/10:   0%|          | 0/559 [00:00<?, ?it/s]


In [29]:
model_types = ["efficientnet"]

for model_type in model_types:
    print(f"\nTraining {model_type} model")

    # Create model based on type
    if model_type == "vit":
        model = create_vit_model(num_classes=200)
    elif model_type == "swin":
        model = create_swin_model(num_classes=200)
    elif model_type == "efficientnet":
        model = create_efficientnet_model(num_classes=200)
    else:  # convnext
        model = create_convnext_model(num_classes=200)

    model = model.to(device)

    # Use label smoothing cross entropy loss
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

    # Use AdamW optimizer with weight decay and different learning rates for different parts
    # Higher learning rate for newly added layers
    params = [
        {'params': [p for n, p in model.named_parameters() if 'head' not in n], 'lr': 1e-5},
        {'params': [p for n, p in model.named_parameters() if 'head' in n], 'lr': 1e-4}
    ]

    optimizer = optim.AdamW(params, weight_decay=1e-5)

    # Use cosine annealing scheduler
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=1, eta_min=1e-6)
    try:
      # Train model
      model = train_model(
          model,
          train_loader,
          val_loader,
          criterion,
          optimizer,
          scheduler,
          device,
          model_type,
          num_epochs=num_epochs
      )
    except:
      continue

    # Save the trained model
    torch.save(model.state_dict(), f"/content/working/models/{model_type}_model.pth")
    models.append(model)


Training efficientnet model


model.safetensors:   0%|          | 0.00/267M [00:00<?, ?B/s]

Epoch 1/10: 100%|██████████| 559/559 [01:17<00:00,  7.22it/s, acc=12.10%, loss=4.93]


Epoch 1, Loss: 4.9329, Accuracy: 12.10%


Validation: 100%|██████████| 30/30 [00:02<00:00, 12.94it/s]


Validation Loss: 4.0331, Accuracy: 37.58%
Best model saved with accuracy: 37.58%


Epoch 2/10: 100%|██████████| 559/559 [01:16<00:00,  7.33it/s, acc=42.76%, loss=3.47]


Epoch 2, Loss: 3.4735, Accuracy: 42.76%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.40it/s]


Validation Loss: 2.4060, Accuracy: 68.79%
Best model saved with accuracy: 68.79%


Epoch 3/10: 100%|██████████| 559/559 [01:18<00:00,  7.16it/s, acc=64.70%, loss=2.39]


Epoch 3, Loss: 2.3875, Accuracy: 64.70%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.08it/s]


Validation Loss: 1.8681, Accuracy: 78.13%
Best model saved with accuracy: 78.13%


Epoch 4/10: 100%|██████████| 559/559 [01:19<00:00,  7.06it/s, acc=73.63%, loss=2.01]


Epoch 4, Loss: 2.0079, Accuracy: 73.63%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.27it/s]


Validation Loss: 1.6996, Accuracy: 80.89%
Best model saved with accuracy: 80.89%


Epoch 5/10: 100%|██████████| 559/559 [01:17<00:00,  7.20it/s, acc=76.67%, loss=1.87]


Epoch 5, Loss: 1.8668, Accuracy: 76.67%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.14it/s]


Validation Loss: 1.6554, Accuracy: 81.10%
Best model saved with accuracy: 81.10%


Epoch 6/10: 100%|██████████| 559/559 [01:19<00:00,  7.06it/s, acc=78.08%, loss=1.75]


Epoch 6, Loss: 1.7471, Accuracy: 78.08%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.22it/s]


Validation Loss: 1.4818, Accuracy: 83.23%
Best model saved with accuracy: 83.23%


Epoch 7/10: 100%|██████████| 559/559 [01:17<00:00,  7.18it/s, acc=82.60%, loss=1.56]


Epoch 7, Loss: 1.5623, Accuracy: 82.60%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.21it/s]


Validation Loss: 1.4013, Accuracy: 85.77%
Best model saved with accuracy: 85.77%


Epoch 8/10: 100%|██████████| 559/559 [01:16<00:00,  7.33it/s, acc=85.96%, loss=1.45]


Epoch 8, Loss: 1.4453, Accuracy: 85.96%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.23it/s]


Validation Loss: 1.3576, Accuracy: 86.41%
Best model saved with accuracy: 86.41%


Epoch 9/10: 100%|██████████| 559/559 [01:16<00:00,  7.28it/s, acc=88.45%, loss=1.38]


Epoch 9, Loss: 1.3806, Accuracy: 88.45%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.38it/s]


Validation Loss: 1.3413, Accuracy: 87.47%
Best model saved with accuracy: 87.47%


Epoch 10/10: 100%|██████████| 559/559 [01:16<00:00,  7.27it/s, acc=88.84%, loss=1.35]


Epoch 10, Loss: 1.3501, Accuracy: 88.84%


Validation: 100%|██████████| 30/30 [00:02<00:00, 13.34it/s]


Validation Loss: 1.3341, Accuracy: 87.69%
Best model saved with accuracy: 87.69%


In [40]:
model1=create_vit_model(num_classes=200)
model1.load_state_dict(torch.load('/content/working/models/vit_model.pth'))
model1 = model1.cuda()  # Moves the model to GPU
for param in model1.parameters():
    param.data = param.data.to(torch.float32).cuda()

model2=create_convnext_model(num_classes=200)
model2.load_state_dict(torch.load('/content/working/models/convnext_model.pth'))
model2 = model2.cuda()  # Moves the model to GPU
for param in model2.parameters():
    param.data = param.data.to(torch.float32).cuda()

model3=create_efficientnet_model(num_classes=200)
model3.load_state_dict(torch.load('/content/working/models/efficientnet_model.pth'))
model3 = model3.cuda()  # Moves the model to GPU
for param in model3.parameters():
    param.data = param.data.to(torch.float32).cuda()

In [54]:
models=[model1,model2]

In [55]:
ensemble_model = ModelEnsemble(models)
ensemble_model = ensemble_model.to(device)

In [58]:
# Test inference with ensemble model
print("\nPerforming inference on test set with ensemble model...")
test_images = sorted(os.listdir(test_path))


predictions = []

with torch.no_grad():
    for img_name in tqdm(val_images):
        img_path = img_name

        # Process test image
        cropped_image = detect_and_crop(img_path, is_train=False)
        image_np = np.array(cropped_image)

        # Apply transformations
        image = val_transforms(image=image_np)['image']
        image = image.unsqueeze(0).to(device)

        # TTA (Test Time Augmentation)
        outputs = tta_inference(ensemble_model, image, device, val_transforms)

        # Get prediction
        predicted_label = torch.argmax(outputs, dim=1).item()
        predictions.append((img_name, predicted_label))


Performing inference on test set with ensemble model...


100%|██████████| 942/942 [03:26<00:00,  4.57it/s]


In [59]:
preds=[]
for i in predictions:
  preds.append(i[1])

In [60]:
from sklearn.metrics import precision_score


# Compute precision score
precision = precision_score(val_labels, np.array(preds), average='weighted')

print("Precision Score:", precision)


Precision Score: 0.9938782731776362


In [47]:
np.array(preds).shape

(1883,)

In [52]:
# Test inference with ensemble model
print("\nPerforming inference on test set with ensemble model...")
test_images = sorted(os.listdir(test_path))


predictions = []

with torch.no_grad():
    for img_name in tqdm(test_images, desc="Predicting"):
        img_path = os.path.join(test_path, img_name)

        # Process test image
        cropped_image = detect_and_crop(img_path, is_train=False)
        image_np = np.array(cropped_image)

        # Apply transformations
        image = val_transforms(image=image_np)['image']
        image = image.unsqueeze(0).to(device)

        # TTA (Test Time Augmentation)
        outputs = tta_inference(ensemble_model, image, device, val_transforms)

        # Get prediction
        predicted_label = torch.argmax(outputs, dim=1).item()
        predictions.append((img_name, predicted_label))


Performing inference on test set with ensemble model...


Predicting: 100%|██████████| 2374/2374 [14:32<00:00,  2.72it/s]


In [53]:
submission_df = pd.DataFrame(predictions, columns=["ID", "label"])
submission_df["label"] = submission_df["label"] + 1  # Adjusting for 1-indexed labels
submission_df.to_csv("submission_ensemble_model.csv", index=False)
print("Submission file saved: submission.csv")

Submission file saved: submission.csv
