# Caltech-UCSD Birds-200-2011

The project focuses on object recognition and computer vision as part of the BDMA 7 curriculum. The main objective is to develop and refine an image classification model using a subset of the Caltech-UCSD Birds-200-2011 dataset.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import os
import pandas as pd
from PIL import Image

In [None]:
# Path definitions according to folder structure as found in Kaggle
BASE_PATH = "/content/drive/MyDrive/BDMA7_project_files"
TRAIN_DIR = os.path.join(BASE_PATH, "train_images")
VAL_DIR = os.path.join(BASE_PATH, "val_images")
TEST_DIR = os.path.join(BASE_PATH, "test_images")

In [None]:
# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Preprocessing pipelines keyed by usage:
#   'train'    - random crop / flip / colour jitter for augmentation
#   'val_test' - deterministic resize + centre crop for evaluation
# Both pipelines end with the same per-channel normalization.
data_transforms = dict(
    train=transforms.Compose([
        transforms.RandomResizedCrop(size=224),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.1, contrast=0.1),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
    val_test=transforms.Compose([
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
)

In [None]:
# Build one ImageFolder + DataLoader per split.
# ImageFolder derives each sample's label from the name of its sub-folder.
# Each row: (split name, root directory, data_transforms key, shuffle?)
_SPLITS = (
    ('train', TRAIN_DIR, 'train', True),
    ('val', VAL_DIR, 'val_test', False),
    ('test', TEST_DIR, 'val_test', False),
)

image_datasets = {
    split: datasets.ImageFolder(root_dir, data_transforms[tf_key])
    for split, root_dir, tf_key, _ in _SPLITS
}

dataloaders = {
    split: DataLoader(image_datasets[split], batch_size=32, shuffle=do_shuffle, num_workers=4)
    for split, _, _, do_shuffle in _SPLITS
}

dataset_sizes = {split: len(split_ds) for split, split_ds in image_datasets.items()}
class_names = image_datasets['train'].classes
# Sanity check by eye: these should be the 20 classes of the project
print(f"Detected classes: {class_names}")

NameError: name 'data_transforms' is not defined

In [None]:
# Model Definition (ResNet-101 for greater capacity), loaded with its DEFAULT weights
model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)

# Replace the classification head with a linear layer that has one output
# per detected class (len(class_names)), then move the network to the device
num_ftrs = model.fc.in_features
model.fc = nn.Linear(in_features=num_ftrs, out_features=len(class_names))
model = model.to(device)

In [None]:
# Loss Function and Optimization
criterion = nn.CrossEntropyLoss()
# NOTE: model.parameters() hands EVERY layer to Adam, so the whole network is
# fine-tuned end-to-end with a small learning rate (no layer is frozen here).
optimizer = optim.Adam(model.parameters(), lr=0.0001)


In [None]:

# Training and Validation Loop
def train_model(model, criterion, optimizer, num_epochs=10):
    """Fine-tune ``model`` and return it loaded with its best-validation weights.

    Each epoch runs a 'train' phase followed by a 'val' phase over the
    module-level ``dataloaders``; per-phase loss and accuracy are printed.
    The weights of the epoch with the highest validation accuracy are kept in
    memory and restored before returning, so a late-epoch regression does not
    end up in the submission.

    Args:
        model: network to train (already moved to ``device``).
        criterion: loss callable taking ``(outputs, labels)``.
        optimizer: optimizer bound to ``model``'s parameters.
        num_epochs: number of train+val passes; 0 returns ``model`` untouched.

    Returns:
        The same ``model`` object.

    Relies on the module-level ``dataloaders``, ``dataset_sizes`` and ``device``.
    """
    best_acc = -1.0
    best_state = None

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')

        for phase in ['train', 'val']:
            is_train = phase == 'train'
            # train(True) enables dropout / batch-norm updates, train(False) == eval()
            model.train(is_train)

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs, labels = inputs.to(device), labels.to(device)

                # Gradients are only tracked (and applied) during the train phase
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        optimizer.zero_grad()
                        loss.backward()
                        optimizer.step()

                # loss is a batch mean, so weight it by the batch size
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            if not is_train and epoch_acc > best_acc:
                best_acc = epoch_acc
                # Snapshot on CPU so the copy does not occupy GPU memory
                best_state = {
                    key: value.detach().cpu().clone()
                    for key, value in model.state_dict().items()
                }

    if best_state is not None:
        model.load_state_dict(best_state)
    return model


In [None]:
# Execution
model_ft = train_model(model, criterion, optimizer, num_epochs=15)

Epoch 0/14
train Loss: 2.7377 Acc: 0.2884
val Loss: 1.7429 Acc: 0.7573
Epoch 1/14
train Loss: 1.4426 Acc: 0.6996
val Loss: 0.4705 Acc: 0.8738
Epoch 2/14
train Loss: 0.7607 Acc: 0.7828
val Loss: 0.3596 Acc: 0.8835
Epoch 3/14
train Loss: 0.5757 Acc: 0.8355
val Loss: 0.3017 Acc: 0.9029
Epoch 4/14
train Loss: 0.4968 Acc: 0.8503
val Loss: 0.2798 Acc: 0.8738
Epoch 5/14
train Loss: 0.4530 Acc: 0.8632
val Loss: 0.3351 Acc: 0.8932
Epoch 6/14
train Loss: 0.4248 Acc: 0.8752
val Loss: 0.3789 Acc: 0.8738
Epoch 7/14
train Loss: 0.3494 Acc: 0.8983
val Loss: 0.3926 Acc: 0.8738
Epoch 8/14
train Loss: 0.3204 Acc: 0.8983
val Loss: 0.3774 Acc: 0.8641
Epoch 9/14
train Loss: 0.2693 Acc: 0.9177
val Loss: 0.5224 Acc: 0.8641
Epoch 10/14
train Loss: 0.2832 Acc: 0.9187
val Loss: 0.4101 Acc: 0.8738
Epoch 11/14
train Loss: 0.2626 Acc: 0.9288
val Loss: 0.3607 Acc: 0.8738
Epoch 12/14
train Loss: 0.2183 Acc: 0.9381
val Loss: 0.4742 Acc: 0.8641
Epoch 13/14
train Loss: 0.2509 Acc: 0.9279
val Loss: 0.3861 Acc: 0.8932
Ep

In [None]:
# We define a specific Dataset for Test
# This is necessary because we need the file NAME for the Kaggle CSV
class CUBTestDataset(torch.utils.data.Dataset):
    """Dataset over ``main_dir/<sub_folder>/<image>`` that yields ``(image, file_name)``.

    Sub-folders and the files inside them are both visited in sorted order so
    that the resulting sample order (and therefore the CSV row order) is the
    same on every run and every file system. Entries inside a sub-folder that
    are not regular files (e.g. nested directories) are skipped.

    Args:
        main_dir: root directory containing one sub-folder per group of images.
        transform: optional callable applied to the RGB PIL image.
    """

    def __init__(self, main_dir, transform=None):
        self.main_dir = main_dir
        self.transform = transform
        # List of (path relative to main_dir, bare file name)
        self.all_imgs = []

        # We iterate through the structure: test_images / bird_name / image.jpg
        for bird_dir in sorted(os.listdir(main_dir)):
            bird_path = os.path.join(main_dir, bird_dir)
            if not os.path.isdir(bird_path):
                continue
            # os.listdir order is arbitrary, hence the explicit sort
            for img_name in sorted(os.listdir(bird_path)):
                if os.path.isfile(os.path.join(bird_path, img_name)):
                    self.all_imgs.append((os.path.join(bird_dir, img_name), img_name))

    def __len__(self):
        """Number of image files found under ``main_dir``."""
        return len(self.all_imgs)

    def __getitem__(self, idx):
        """Return ``(image, file_name)`` for sample ``idx``; the image is transformed if a transform was given."""
        rel_path, img_name = self.all_imgs[idx]
        # Close the file handle as soon as the pixels have been converted
        with Image.open(os.path.join(self.main_dir, rel_path)) as raw:
            image = raw.convert('RGB')
        if self.transform:
            image = self.transform(image)
        return image, img_name

# Function to generate the .csv file
def generate_submission(model, test_dir, output_file='sample_submission.csv'):
    """Predict a class for every image under ``test_dir`` and write an id/label CSV.

    Args:
        model: trained network; it is switched to eval mode.
        test_dir: root folder read through ``CUBTestDataset``.
        output_file: destination path of the CSV (columns ``id`` and ``label``).

    Relies on the module-level ``data_transforms`` and ``device``.
    """
    model.eval()

    # Evaluation preprocessing only: no random augmentation at prediction time
    eval_tf = data_transforms['val_test']
    loader = DataLoader(CUBTestDataset(test_dir, transform=eval_tf), batch_size=32, shuffle=False)

    print(f"Generating predictions for images in: {test_dir}")

    rows = []
    with torch.no_grad():
        for batch, names in loader:
            predicted = torch.max(model(batch.to(device)), 1)[1]
            # One row per image: file name and index of the highest-scoring class
            rows.extend(
                {'id': name, 'label': cls.item()}
                for name, cls in zip(names, predicted)
            )

    # Save the final file
    pd.DataFrame(rows).to_csv(output_file, index=False)
    print(f"The file '{output_file}' has been created in csv format to upload to Kaggle.")

In [None]:
 # Function call
 generate_submission(model_ft, TEST_DIR)

Generating predictions for images in: /content/drive/MyDrive/BDMA7_project_files/test_images
The file has been created 'sample_submission.csv' in csv format to upload to Kaggle.


# Justifications

The objective is to develop and refine a deep learning model to classify bird species using a subset of the Caltech-UCSD Birds-200-2011 dataset. The competition takes place on Kaggle and success is measured by the accuracy of the model on a test dataset.


1. Architecture (ResNet-101): It was chosen for its ability to handle vanishing gradients through residual connections. It is deep enough to capture fine details of feathers and beaks, yet still efficient enough to train in a reasonable time. ResNet-101 offers a good balance between depth and ease of training through Transfer Learning, which is ideal for this bird dataset.

2. Transfer Learning: Since the CUB-200-2011 dataset is relatively small (approx. 30 images per class), training from scratch would cause overfitting. Using pre-trained ImageNet weights allows the model to already know basic shapes and textures.

3. Data Augmentation: I have included RandomResizedCrop and RandomHorizontalFlip so that the model is robust to changes in scale and orientation, something critical in photos of birds in nature.


4. Class Mapping: ImageFolder sorts classes alphabetically. Make sure that the order of subfolders matches the index (0-19) required by the Kaggle competition to avoid errors in the submission file.



5. val_images Folder (Validation)
It is used during the training process, at the end of each epoch (complete learning cycle).

Purpose: It serves to evaluate how the model is learning in real time with data that was not used to adjust the weights.

Technical utility: Detect overfitting. If the accuracy on train keeps increasing while the accuracy on val decreases, the model is memorizing instead of learning.

Adjust Hyperparameters: It helps to decide whether to change the learning rate, the number of layers or the data augmentation technique.

Save the best model: Normally, you only save the model weights if the accuracy in the validation folder improves compared to the previous epoch.

6. test_images Folder (Test)
It is used at the very end, once training has been completed.

Purpose: To provide a final and impartial measure of model performance.


This folder contains images that do not have a known label for you.

It is the folder you process to generate the submission.csv file.


7. A custom Dataset (`CUBTestDataset`) was used so that every prediction written to the .csv file is paired with the file name of the image it was computed from.

# Architecture 2

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import pandas as pd
import os
from PIL import Image

In [None]:
# Species name -> label id used as the training target.
# Ids are assigned 0..19 in the order the names are listed.
official_mapping = {
    species: label_id
    for label_id, species in enumerate([
        'Groove_billed_Ani', 'Red_winged_Blackbird', 'Rusty_Blackbird',
        'Gray_Catbird', 'Brandt_Cormorant', 'Eastern_Towhee',
        'Indigo_Bunting', 'Brewer_Blackbird', 'Painted_Bunting',
        'Bobolink', 'Lazuli_Bunting', 'Yellow_headed_Blackbird',
        'American_Crow', 'Fish_Crow', 'Brown_Creeper',
        'Yellow_billed_Cuckoo', 'Yellow_breasted_Chat',
        'Black_billed_Cuckoo', 'Gray_crowned_Rosy_Finch',
        'Bronzed_Cowbird',
    ])
}

In [None]:
# Paths and Device Configuration
BASE_DIR = '/content/drive/MyDrive/BDMA7_project_files'
TRAIN_DIR = os.path.join(BASE_DIR, 'train_images')
VAL_DIR = os.path.join(BASE_DIR, 'val_images')
TEST_DIR = os.path.join(BASE_DIR, 'test_images')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Data augmentation
#   'train'    - random crop / flip / rotation / colour jitter
#   'val_test' - deterministic resize + centre crop
data_transforms = dict(
    train=transforms.Compose([
        transforms.RandomResizedCrop(size=224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=15),
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
    val_test=transforms.Compose([
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
)

# Function to ensure correct mapping according to folder name
def get_official_id(target_idx, dataset_class_to_idx):
    """Translate an ImageFolder class index into the id stored in ``official_mapping``.

    The folder name belonging to ``target_idx`` is looked up in
    ``dataset_class_to_idx``; the id of the first ``official_mapping`` species
    whose name occurs inside that folder name is returned.

    Args:
        target_idx: class index assigned by ImageFolder.
        dataset_class_to_idx: folder-name -> index mapping (``class_to_idx``).

    Returns:
        The mapped id, or ``target_idx`` unchanged when no species name matches.

    Raises:
        ValueError: if ``target_idx`` is not a value of ``dataset_class_to_idx``.
    """
    folder_names = list(dataset_class_to_idx.keys())
    folder_indices = list(dataset_class_to_idx.values())
    folder_name = folder_names[folder_indices.index(target_idx)]

    # Substring match: the folder name only has to contain the species name
    matching_ids = (
        official_id
        for bird_name, official_id in official_mapping.items()
        if bird_name in folder_name
    )
    return next(matching_ids, target_idx)


# Loading Datasets
# A transform-less ImageFolder is created only to read the folder-name -> index
# table of the training directory.
temp_ds = datasets.ImageFolder(TRAIN_DIR)
internal_mapping = temp_ds.class_to_idx

# target_transform rewrites ImageFolder's own class index into the id from
# official_mapping. The lambdas read the module-level internal_mapping at call
# time, so re-assigning that name later changes the labels they produce.
train_dataset = datasets.ImageFolder(TRAIN_DIR, transform=data_transforms['train'],
                                     target_transform=lambda y: get_official_id(y, internal_mapping))
# NOTE(review): validation labels are translated with the TRAIN folder table;
# this assumes val_images has exactly the same class sub-folders — confirm.
val_dataset = datasets.ImageFolder(VAL_DIR, transform=data_transforms['val_test'],
                                   target_transform=lambda y: get_official_id(y, internal_mapping))

# Only the training batches are shuffled
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
# 4. ResNet-101 Model (Improved for >85% accuracy)
# The classification head is replaced by Dropout(0.4) + Linear with 20 outputs.
model = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(nn.Dropout(0.4), nn.Linear(num_ftrs, 20))
model = model.to(device)

# Only layer4 and the new head are handed to Adam, each with its own learning
# rate; the parameters of every other layer are never updated by the optimizer.
# NOTE(review): those other layers are not explicitly frozen (requires_grad is
# left untouched), so autograd presumably still computes gradients for them —
# consider setting requires_grad=False on them to save memory and time.
optimizer = optim.Adam([
    {'params': model.layer4.parameters(), 'lr': 1e-5},
    {'params': model.fc.parameters(), 'lr': 1e-3}
])

# Scheduler: mode='max' because train_and_validate() steps it with the
# validation ACCURACY (not the loss); the learning rates are scaled by 0.1 once
# that accuracy has stopped improving for longer than patience=3 epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3)
criterion = nn.CrossEntropyLoss()

In [None]:
def train_and_validate(epochs=20):
    """Train the module-level ``model`` and checkpoint its best validation epoch.

    Every epoch performs one pass over ``train_loader`` followed by an
    evaluation pass over ``val_loader``. The validation accuracy (in percent)
    is fed to ``scheduler`` and, whenever it beats the best value so far, the
    model's state dict is written to 'best_model.pth'.

    Args:
        epochs: number of train + validation passes.

    Relies on the module-level ``model``, ``optimizer``, ``criterion``,
    ``scheduler``, ``train_loader``, ``val_loader`` and ``device``.
    """
    best_acc = 0.0
    for epoch in range(epochs):
        # --- training pass ---
        model.train()
        running_loss = 0.0
        for images, targets in train_loader:
            images = images.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(images), targets)
            batch_loss.backward()
            optimizer.step()
            running_loss += batch_loss.item()

        # --- validation pass (no gradients) ---
        model.eval()
        correct, total = 0, 0
        with torch.no_grad():
            for images, targets in val_loader:
                images = images.to(device)
                targets = targets.to(device)
                preds = torch.max(model(images), 1)[1]
                total += targets.size(0)
                correct += (preds == targets).sum().item()

        val_acc = 100 * correct / total
        # The reported loss is the mean of the per-batch training losses
        print(f"Epoch {epoch+1} | Loss: {running_loss/len(train_loader):.4f} | Val Acc: {val_acc:.2f}%")

        # The plateau scheduler watches validation accuracy
        scheduler.step(val_acc)

        improved = val_acc > best_acc
        if improved:
            best_acc = val_acc
            torch.save(model.state_dict(), 'best_model.pth')

# Start improved training
train_and_validate(epochs=25)

Epoch 1 | Loss: 2.6976 | Val Acc: 49.51%
Epoch 2 | Loss: 1.9355 | Val Acc: 70.87%
Epoch 3 | Loss: 1.3657 | Val Acc: 73.79%
Epoch 4 | Loss: 1.0806 | Val Acc: 77.67%
Epoch 5 | Loss: 0.9233 | Val Acc: 80.58%
Epoch 6 | Loss: 0.7854 | Val Acc: 80.58%
Epoch 7 | Loss: 0.7592 | Val Acc: 82.52%
Epoch 8 | Loss: 0.7085 | Val Acc: 80.58%
Epoch 9 | Loss: 0.6936 | Val Acc: 81.55%
Epoch 10 | Loss: 0.7182 | Val Acc: 80.58%
Epoch 11 | Loss: 0.6232 | Val Acc: 82.52%
Epoch 12 | Loss: 0.6033 | Val Acc: 80.58%
Epoch 13 | Loss: 0.5857 | Val Acc: 82.52%
Epoch 14 | Loss: 0.5942 | Val Acc: 83.50%
Epoch 15 | Loss: 0.5673 | Val Acc: 82.52%
Epoch 16 | Loss: 0.5706 | Val Acc: 81.55%
Epoch 17 | Loss: 0.5648 | Val Acc: 81.55%
Epoch 18 | Loss: 0.5954 | Val Acc: 84.47%
Epoch 19 | Loss: 0.5560 | Val Acc: 82.52%
Epoch 20 | Loss: 0.5255 | Val Acc: 84.47%
Epoch 21 | Loss: 0.5844 | Val Acc: 82.52%
Epoch 22 | Loss: 0.5848 | Val Acc: 80.58%
Epoch 23 | Loss: 0.5216 | Val Acc: 83.50%
Epoch 24 | Loss: 0.5801 | Val Acc: 82.52%
E

In [None]:
# csv generation for kaggle
def generate_submission(model, test_dir, output_name='sample_submission.csv', batch_size=32):
    """Predict a label for every test image and write an id/label CSV.

    Images are read through a plain ``ImageFolder``. Because the loader is not
    shuffled, batch ``k`` corresponds to the next ``len(batch)`` entries of
    ``test_ds.samples``, which is how each prediction is paired with its file
    name.

    Args:
        model: trained network; it is switched to eval mode.
        test_dir: root folder with one sub-folder level containing the images.
        output_name: destination path of the CSV (columns ``id`` and ``label``).
        batch_size: number of images per forward pass.

    Relies on the module-level ``data_transforms`` and ``device``.
    """
    model.eval()
    results = []

    # ImageFolder keeps the (path, class_index) list in .samples; the class
    # index it derives from the test folder layout is ignored here.
    test_ds = datasets.ImageFolder(test_dir, transform=data_transforms['val_test'])
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    print("Starting prediction generation...")

    offset = 0  # position in test_ds.samples of the first image of the batch
    with torch.no_grad():
        for inputs, _ in test_loader:
            outputs = model(inputs.to(device))
            preds = torch.max(outputs, 1)[1].tolist()

            # Recover the actual file names of this batch
            batch_samples = test_ds.samples[offset:offset + len(preds)]
            for (img_path, _), pred in zip(batch_samples, preds):
                results.append({
                    'id': os.path.basename(img_path),
                    'label': pred
                })
            offset += len(preds)

    # Create DataFrame and export
    df = pd.DataFrame(results)
    df.to_csv(output_name, index=False)
    print(f"File '{output_name}' generated correctly for Kaggle.")


In [None]:
# After training, `model` holds the LAST epoch's weights. Restore the
# best-validation checkpoint written by train_and_validate() before predicting.
model.load_state_dict(torch.load('best_model.pth', map_location=device))
generate_submission(model, TEST_DIR)

Starting prediction generation...
File 'sample_submission.csv' generated correctly for Kaggle.


# Architecture 3

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import pandas as pd
import os
from PIL import Image


In [None]:
BASE_DIR = '/content/drive/MyDrive/BDMA7_project_files'
TRAIN_DIR = os.path.join(BASE_DIR, 'train_images')
VAL_DIR = os.path.join(BASE_DIR, 'val_images')
TEST_DIR = os.path.join(BASE_DIR, 'test_images')

In [None]:

# Species name -> label id used as the training target (0..19, in listed order)
_SPECIES_IN_ID_ORDER = (
    'Groove_billed_Ani', 'Red_winged_Blackbird', 'Rusty_Blackbird',
    'Gray_Catbird', 'Brandt_Cormorant', 'Eastern_Towhee',
    'Indigo_Bunting', 'Brewer_Blackbird', 'Painted_Bunting',
    'Bobolink', 'Lazuli_Bunting', 'Yellow_headed_Blackbird',
    'American_Crow', 'Fish_Crow', 'Brown_Creeper',
    'Yellow_billed_Cuckoo', 'Yellow_breasted_Chat',
    'Black_billed_Cuckoo', 'Gray_crowned_Rosy_Finch',
    'Bronzed_Cowbird',
)
official_mapping = dict(zip(_SPECIES_IN_ID_ORDER, range(len(_SPECIES_IN_ID_ORDER))))

In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# High-resolution transformations (384px)
# Increasing resolution is key to distinguishing similar species
data_transforms = dict(
    train=transforms.Compose([
        transforms.RandomResizedCrop(size=384),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
    val_test=transforms.Compose([
        transforms.Resize(size=410),
        transforms.CenterCrop(size=384),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
)

# Data Loading with Label Correction
# A transform-less ImageFolder is created only to read the folder-name -> index
# table of the training directory; get_official_id() reads internal_mapping.
temp_ds = datasets.ImageFolder(TRAIN_DIR)
internal_mapping = temp_ds.class_to_idx

def get_official_id(target_idx):
    """Translate an ImageFolder class index into the id stored in ``official_mapping``.

    The folder name owning ``target_idx`` is looked up in the module-level
    ``internal_mapping``; the id of the first ``official_mapping`` species whose
    name occurs inside that folder name is returned. When nothing matches,
    ``target_idx`` is returned unchanged.

    Raises:
        ValueError: if ``target_idx`` is not a value of ``internal_mapping``.
    """
    folder_names = list(internal_mapping.keys())
    position = list(internal_mapping.values()).index(target_idx)
    folder_name = folder_names[position]

    # Substring match: the folder name only has to contain the species name
    return next(
        (
            official_id
            for bird_name, official_id in official_mapping.items()
            if bird_name in folder_name
        ),
        target_idx,
    )

# Both splits remap ImageFolder's class index through get_official_id
train_dataset = datasets.ImageFolder(
    root=TRAIN_DIR,
    transform=data_transforms['train'],
    target_transform=get_official_id,
)
val_dataset = datasets.ImageFolder(
    root=VAL_DIR,
    transform=data_transforms['val_test'],
    target_transform=get_official_id,
)

# Batch size reduced to 16 due to increased resolution
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True, num_workers=4)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False, num_workers=4)

# Model ConvNeXt-Tiny (loaded with its DEFAULT weights)
model = models.convnext_tiny(weights=models.ConvNeXt_Tiny_Weights.DEFAULT)
n_inputs = model.classifier[2].in_features
# The last classifier entry (index 2) is replaced by
# LayerNorm -> Dropout(0.3) -> Linear with 20 outputs.
model.classifier[2] = nn.Sequential(
    nn.LayerNorm((n_inputs,), eps=1e-06, elementwise_affine=True),
    nn.Dropout(p=0.3),
    nn.Linear(n_inputs, 20)
)
model = model.to(device)

# AdamW Optimizer and Scheduler
# Every model parameter is optimized. The scheduler runs in 'max' mode because
# train_model() steps it with the validation accuracy.
optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.1)
criterion = nn.CrossEntropyLoss()

# Training
def train_model(epochs=15):
    """Train the module-level ``model`` and checkpoint its best validation epoch.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    evaluation pass over ``val_loader``. Validation accuracy (in percent of
    ``len(val_dataset)``) drives ``scheduler``; whenever it exceeds the best
    value so far the state dict is saved to 'best_convnext.pth'.

    Args:
        epochs: number of train + validation passes.

    Relies on the module-level ``model``, ``optimizer``, ``criterion``,
    ``scheduler``, ``train_loader``, ``val_loader``, ``val_dataset`` and ``device``.
    """
    best_acc = 0.0
    for epoch in range(epochs):
        # --- optimisation pass ---
        model.train()
        train_loss = 0.0
        for images, targets in train_loader:
            images = images.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = criterion(model(images), targets)
            batch_loss.backward()
            optimizer.step()
            train_loss += batch_loss.item()

        # --- evaluation pass (no gradients) ---
        model.eval()
        correct = 0
        with torch.no_grad():
            for images, targets in val_loader:
                images = images.to(device)
                targets = targets.to(device)
                preds = torch.max(model(images), 1)[1]
                correct += (preds == targets).sum().item()

        val_acc = 100 * correct / len(val_dataset)
        print(f"Epoch {epoch+1}/{epochs} | Loss: {train_loss/len(train_loader):.4f} | Acc: {val_acc:.2f}%")

        scheduler.step(val_acc)
        improved = val_acc > best_acc
        if improved:
            best_acc = val_acc
            torch.save(model.state_dict(), 'best_convnext.pth')

# CSV Generation
def generate_submission():
    """Write 'sample_submission.csv' from the best ConvNeXt checkpoint with flip TTA.

    The weights saved in 'best_convnext.pth' are loaded into the module-level
    ``model``. For every image under ``TEST_DIR`` the class probabilities of
    the original and the horizontally mirrored image are averaged, and the
    highest-scoring class is written next to the image's file name.

    Relies on the module-level ``model``, ``device``, ``data_transforms`` and
    ``TEST_DIR``.
    """
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only runtime
    model.load_state_dict(torch.load('best_convnext.pth', map_location=device))
    model.eval()

    eval_tf = data_transforms['val_test']
    # ImageFolder is used only to enumerate (path, class_index) pairs; images
    # are opened manually below so that both TTA views can be built.
    test_ds = datasets.ImageFolder(TEST_DIR, transform=eval_tf)
    results = []

    print("Generating predictions with TTA...")
    with torch.no_grad():
        for path, _ in test_ds.samples:
            # Close the file handle as soon as the pixels have been converted
            with Image.open(path) as raw:
                img = raw.convert('RGB')
            img_name = os.path.basename(path)

            # TTA: Original + Mirrored
            t1 = eval_tf(img).unsqueeze(0).to(device)
            t2 = eval_tf(transforms.functional.hflip(img)).unsqueeze(0).to(device)

            # Average the two probability vectors, then take the best class
            out = (torch.softmax(model(t1), dim=1) + torch.softmax(model(t2), dim=1)) / 2
            results.append({'id': img_name, 'label': out.argmax(dim=1).item()})

    pd.DataFrame(results).to_csv('sample_submission.csv', index=False)
    print("CSV generated")


Downloading: "https://download.pytorch.org/models/convnext_tiny-983f1562.pth" to /root/.cache/torch/hub/checkpoints/convnext_tiny-983f1562.pth


100%|██████████| 109M/109M [00:00<00:00, 174MB/s] 


In [None]:
train_model(epochs=20)

Epoch 1/20 | Loss: 2.3501 | Acc: 74.76%
Epoch 2/20 | Loss: 0.9827 | Acc: 85.44%
Epoch 3/20 | Loss: 0.6708 | Acc: 89.32%
Epoch 4/20 | Loss: 0.5149 | Acc: 92.23%
Epoch 5/20 | Loss: 0.4803 | Acc: 87.38%
Epoch 6/20 | Loss: 0.4306 | Acc: 93.20%
Epoch 7/20 | Loss: 0.3885 | Acc: 91.26%
Epoch 8/20 | Loss: 0.3166 | Acc: 92.23%
Epoch 9/20 | Loss: 0.3464 | Acc: 91.26%
Epoch 10/20 | Loss: 0.2654 | Acc: 92.23%
Epoch 11/20 | Loss: 0.2696 | Acc: 92.23%
Epoch 12/20 | Loss: 0.2355 | Acc: 92.23%
Epoch 13/20 | Loss: 0.1976 | Acc: 92.23%
Epoch 14/20 | Loss: 0.2354 | Acc: 92.23%
Epoch 15/20 | Loss: 0.2350 | Acc: 92.23%
Epoch 16/20 | Loss: 0.2187 | Acc: 92.23%
Epoch 17/20 | Loss: 0.1969 | Acc: 92.23%
Epoch 18/20 | Loss: 0.2528 | Acc: 92.23%
Epoch 19/20 | Loss: 0.2411 | Acc: 92.23%
Epoch 20/20 | Loss: 0.2348 | Acc: 92.23%


In [None]:
generate_submission()

Generating predictions with TTA...
CSV generated


# Architecture 4

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import pandas as pd
import os
from PIL import Image
import copy


In [None]:

# Species name -> label id used as the training target.
# Ids are assigned 0..19 in the order the names are listed.
OFFICIAL_MAPPING = {
    species: label_id
    for label_id, species in enumerate((
        'Groove_billed_Ani', 'Red_winged_Blackbird', 'Rusty_Blackbird',
        'Gray_Catbird', 'Brandt_Cormorant', 'Eastern_Towhee',
        'Indigo_Bunting', 'Brewer_Blackbird', 'Painted_Bunting',
        'Bobolink', 'Lazuli_Bunting', 'Yellow_headed_Blackbird',
        'American_Crow', 'Fish_Crow', 'Brown_Creeper',
        'Yellow_billed_Cuckoo', 'Yellow_breasted_Chat',
        'Black_billed_Cuckoo', 'Gray_crowned_Rosy_Finch',
        'Bronzed_Cowbird',
    ))
}

In [None]:


BASE_DIR = '/content/drive/MyDrive/BDMA7_project_files'
TRAIN_DIR = os.path.join(BASE_DIR, 'train_images')
VAL_DIR = os.path.join(BASE_DIR, 'val_images')
TEST_DIR = os.path.join(BASE_DIR, 'test_images')
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data Augmentation
#   'train'    - random crop / flip / rotation / colour jitter
#   'val_test' - deterministic resize + centre crop
data_transforms = dict(
    train=transforms.Compose([
        transforms.RandomResizedCrop(size=224),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(degrees=15),
        transforms.ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
    val_test=transforms.Compose([
        transforms.Resize(size=256),
        transforms.CenterCrop(size=224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]),
)

# DATA LOADING WITH DYNAMIC MAPPING
def get_target_transform(internal_mapping):
    """Build a ``target_transform`` that remaps ImageFolder indices via ``OFFICIAL_MAPPING``.

    Args:
        internal_mapping: folder-name -> index mapping (``class_to_idx``).

    Returns:
        A callable ``transform(target_idx)``. It finds the folder name owning
        ``target_idx`` and returns the id of the first ``OFFICIAL_MAPPING``
        species whose name occurs inside that folder name, or ``target_idx``
        unchanged when nothing matches. It raises ``ValueError`` when
        ``target_idx`` is not a value of ``internal_mapping``.
    """
    def transform(target_idx):
        folder_names = list(internal_mapping.keys())
        position = list(internal_mapping.values()).index(target_idx)
        folder_name = folder_names[position]
        # Substring match: the folder name only has to contain the species name
        return next(
            (
                official_id
                for bird_name, official_id in OFFICIAL_MAPPING.items()
                if bird_name in folder_name
            ),
            target_idx,
        )
    return transform

# Read the folder-name -> index table of the training directory once and
# derive the label-remapping callable shared by both splits
temp_ds = datasets.ImageFolder(TRAIN_DIR)
target_tf = get_target_transform(temp_ds.class_to_idx)

train_dataset = datasets.ImageFolder(
    root=TRAIN_DIR,
    transform=data_transforms['train'],
    target_transform=target_tf,
)
val_dataset = datasets.ImageFolder(
    root=VAL_DIR,
    transform=data_transforms['val_test'],
    target_transform=target_tf,
)

# Only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(dataset=val_dataset, batch_size=32, shuffle=False, num_workers=4)

# 4. MODEL DEFINITION FOR ENSEMBLE
def get_resnet_model():
    """Return a ResNet-101 (DEFAULT weights) with a Dropout(0.5) + Linear(20) head, on DEVICE."""
    net = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
    head = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(net.fc.in_features, 20),
    )
    net.fc = head
    return net.to(DEVICE)

def get_effnet_model():
    """Return an EfficientNet-B0 (DEFAULT weights) whose classifier[1] is Dropout(0.5) + Linear(20), on DEVICE."""
    net = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
    head = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(net.classifier[1].in_features, 20),
    )
    net.classifier[1] = head
    return net.to(DEVICE)

# GENERIC TRAINING FUNCTION
def train_specific_model(model, save_name, epochs=20):
    """Train ``model`` and save the state dict of its best validation epoch to ``save_name``.

    A fresh label-smoothed cross-entropy loss, AdamW optimizer and
    ReduceLROnPlateau scheduler (stepped with validation accuracy) are created
    for every call.

    Args:
        model: network to train (already on ``DEVICE``).
        save_name: checkpoint path; also used as the prefix of the log lines.
        epochs: number of train + validation passes.

    Relies on the module-level ``train_loader``, ``val_loader``,
    ``val_dataset`` and ``DEVICE``.
    """
    # We use Label Smoothing to improve generalization between similar species
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
    opt = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
    plateau = optim.lr_scheduler.ReduceLROnPlateau(opt, mode='max', patience=2, factor=0.2)

    best_acc = 0.0
    for epoch in range(epochs):
        # --- optimisation pass ---
        model.train()
        for images, targets in train_loader:
            images = images.to(DEVICE)
            targets = targets.to(DEVICE)
            opt.zero_grad()
            batch_loss = loss_fn(model(images), targets)
            batch_loss.backward()
            opt.step()

        # --- evaluation pass (no gradients) ---
        model.eval()
        correct = 0
        with torch.no_grad():
            for images, targets in val_loader:
                images = images.to(DEVICE)
                targets = targets.to(DEVICE)
                preds = torch.max(model(images), 1)[1]
                correct += (preds == targets).sum().item()

        val_acc = 100 * correct / len(val_dataset)
        print(f"[{save_name}] Epoch {epoch+1} Val Acc: {val_acc:.2f}%")
        plateau.step(val_acc)

        improved = val_acc > best_acc
        if improved:
            best_acc = val_acc
            torch.save(model.state_dict(), save_name)

# ENSEMBLE GENERATION + TTA
def generate_submission():
    """Write 'sample_submission.csv' from the ResNet + EfficientNet ensemble with flip TTA.

    Both networks are rebuilt and loaded from 'best_resnet.pth' and
    'best_effnet.pth'. For every image under ``TEST_DIR`` each model's class
    probabilities are averaged over the original and the horizontally flipped
    image, the two models are then averaged (soft voting), and the
    highest-scoring class is written next to the image's file name.

    Relies on the module-level ``DEVICE``, ``data_transforms`` and ``TEST_DIR``.
    """
    # Load the trained models; map_location lets checkpoints saved on GPU be
    # restored on a CPU-only runtime
    m1 = get_resnet_model()
    m1.load_state_dict(torch.load('best_resnet.pth', map_location=DEVICE))
    m1.eval()

    m2 = get_effnet_model()
    m2.load_state_dict(torch.load('best_effnet.pth', map_location=DEVICE))
    m2.eval()

    eval_tf = data_transforms['val_test']
    # Test dataset (without real labels): ImageFolder is used only to enumerate
    # (path, class_index) pairs; images are opened manually for the TTA views.
    test_ds = datasets.ImageFolder(TEST_DIR, transform=eval_tf)
    results = []

    print("Starting Ensemble with Test Time Augmentation...")
    with torch.no_grad():
        for path, _ in test_ds.samples:
            # Close the file handle as soon as the pixels have been converted
            with Image.open(path) as raw:
                img = raw.convert('RGB')
            img_name = os.path.basename(path)

            # TTA: Original + Horizontal Flip
            t_orig = eval_tf(img).unsqueeze(0).to(DEVICE)
            t_flip = eval_tf(transforms.functional.hflip(img)).unsqueeze(0).to(DEVICE)

            # Average of probabilities from both models and both image versions
            prob_m1 = (torch.softmax(m1(t_orig), dim=1) + torch.softmax(m1(t_flip), dim=1)) / 2
            prob_m2 = (torch.softmax(m2(t_orig), dim=1) + torch.softmax(m2(t_flip), dim=1)) / 2

            # Soft voting (final average)
            final_probs = (prob_m1 + prob_m2) / 2
            results.append({'id': img_name, 'label': final_probs.argmax(dim=1).item()})

    pd.DataFrame(results).to_csv('sample_submission.csv', index=False)
    print("Final file generated!")


In [None]:
# Train ResNet (best validation epoch is saved to 'best_resnet.pth')
train_specific_model(get_resnet_model(), 'best_resnet.pth')

[best_resnet.pth] Epoch 1 Val Acc: 55.34%
[best_resnet.pth] Epoch 2 Val Acc: 80.58%
[best_resnet.pth] Epoch 3 Val Acc: 88.35%
[best_resnet.pth] Epoch 4 Val Acc: 90.29%
[best_resnet.pth] Epoch 5 Val Acc: 89.32%
[best_resnet.pth] Epoch 6 Val Acc: 86.41%
[best_resnet.pth] Epoch 7 Val Acc: 87.38%
[best_resnet.pth] Epoch 8 Val Acc: 88.35%
[best_resnet.pth] Epoch 9 Val Acc: 89.32%
[best_resnet.pth] Epoch 10 Val Acc: 90.29%
[best_resnet.pth] Epoch 11 Val Acc: 89.32%
[best_resnet.pth] Epoch 12 Val Acc: 89.32%
[best_resnet.pth] Epoch 13 Val Acc: 89.32%
[best_resnet.pth] Epoch 14 Val Acc: 89.32%
[best_resnet.pth] Epoch 15 Val Acc: 90.29%
[best_resnet.pth] Epoch 16 Val Acc: 90.29%
[best_resnet.pth] Epoch 17 Val Acc: 90.29%
[best_resnet.pth] Epoch 18 Val Acc: 90.29%
[best_resnet.pth] Epoch 19 Val Acc: 90.29%
[best_resnet.pth] Epoch 20 Val Acc: 90.29%


In [None]:
# Train EfficientNet (best validation epoch is saved to 'best_effnet.pth')
train_specific_model(get_effnet_model(), 'best_effnet.pth')

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth


100%|██████████| 20.5M/20.5M [00:00<00:00, 170MB/s]


[best_effnet.pth] Epoch 1 Val Acc: 25.24%
[best_effnet.pth] Epoch 2 Val Acc: 52.43%
[best_effnet.pth] Epoch 3 Val Acc: 66.02%
[best_effnet.pth] Epoch 4 Val Acc: 72.82%
[best_effnet.pth] Epoch 5 Val Acc: 81.55%
[best_effnet.pth] Epoch 6 Val Acc: 84.47%
[best_effnet.pth] Epoch 7 Val Acc: 85.44%
[best_effnet.pth] Epoch 8 Val Acc: 85.44%
[best_effnet.pth] Epoch 9 Val Acc: 84.47%
[best_effnet.pth] Epoch 10 Val Acc: 87.38%
[best_effnet.pth] Epoch 11 Val Acc: 88.35%
[best_effnet.pth] Epoch 12 Val Acc: 86.41%
[best_effnet.pth] Epoch 13 Val Acc: 85.44%
[best_effnet.pth] Epoch 14 Val Acc: 87.38%
[best_effnet.pth] Epoch 15 Val Acc: 87.38%
[best_effnet.pth] Epoch 16 Val Acc: 86.41%
[best_effnet.pth] Epoch 17 Val Acc: 85.44%
[best_effnet.pth] Epoch 18 Val Acc: 86.41%
[best_effnet.pth] Epoch 19 Val Acc: 87.38%
[best_effnet.pth] Epoch 20 Val Acc: 87.38%


In [None]:
# Generate CSV
generate_submission()

Starting Ensamble con Test Time Augmentation...
¡File final generated!


In [None]:
# ARCHITECTURE 4 GAVE THE BEST RESULTS IN THE KAGGLE TEST

# Architecture 5

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import pandas as pd
import os
from PIL import Image


In [None]:

# Configuration and label mapping
# Species name -> class id used as the training target. The names are matched
# as substrings of the dataset folder names further below.
OFFICIAL_MAPPING = {
    'Groove_billed_Ani': 0, 'Red_winged_Blackbird': 1, 'Rusty_Blackbird': 2,
    'Gray_Catbird': 3, 'Brandt_Cormorant': 4, 'Eastern_Towhee': 5,
    'Indigo_Bunting': 6, 'Brewer_Blackbird': 7, 'Painted_Bunting': 8,
    'Bobolink': 9, 'Lazuli_Bunting': 10, 'Yellow_headed_Blackbird': 11,
    'American_Crow': 12, 'Fish_Crow': 13, 'Brown_Creeper': 14,
    'Yellow_billed_Cuckoo': 15, 'Yellow_breasted_Chat': 16,
    'Black_billed_Cuckoo': 17, 'Gray_crowned_Rosy_Finch': 18,
    'Bronzed_Cowbird': 19
}

# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Dataset root on the mounted Google Drive and its train/val/test folders
BASE_DIR = '/content/drive/MyDrive/BDMA7_project_files'
TRAIN_DIR = os.path.join(BASE_DIR, 'train_images')
VAL_DIR = os.path.join(BASE_DIR, 'val_images')
TEST_DIR = os.path.join(BASE_DIR, 'test_images')

# MODELS
def get_resnet():
    """Build a pretrained ResNet-101 with a 20-class head and move it to DEVICE.

    The stock ``fc`` layer is replaced by Dropout(0.5) followed by a Linear
    layer with 20 outputs, one per entry of OFFICIAL_MAPPING.
    """
    net = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
    feature_dim = net.fc.in_features
    head = nn.Sequential(nn.Dropout(0.5), nn.Linear(feature_dim, 20))
    net.fc = head
    return net.to(DEVICE)

def get_effnet():
    """Build a pretrained EfficientNet-B0 with a 20-class head and move it to DEVICE.

    Only the final classifier layer (``classifier[1]``) is replaced, by
    Dropout(0.5) followed by a Linear layer with 20 outputs.
    """
    net = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
    feature_dim = net.classifier[1].in_features
    head = nn.Sequential(nn.Dropout(0.5), nn.Linear(feature_dim, 20))
    net.classifier[1] = head
    return net.to(DEVICE)

# TRAINING
def train_model_stage(model, stage_name, resolution, save_path, load_path=None, epochs=10):
    """Fine-tune ``model`` on the training images at one input resolution.

    Args:
        model: Network to train in place.
        stage_name: Label used only in the progress message.
        resolution: Side length in pixels of the random training crop. Values
            above 300 switch the batch size from 32 to 8.
        save_path: File the final ``state_dict`` is written to.
        load_path: Optional checkpoint restored before training (only if the
            file exists). Passing any ``load_path`` lowers the learning rate
            from 1e-4 to 1e-5.
        epochs: Number of passes over the training set.
    """
    print(f"\nIniciando {stage_name} a {resolution}px...")

    # Resolution-dependent training augmentation
    transform = transforms.Compose([
        transforms.RandomResizedCrop(resolution),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

    # Data loading
    temp_ds = datasets.ImageFolder(TRAIN_DIR)
    internal_mapping = temp_ds.class_to_idx

    # Translate ImageFolder's own class indices to OFFICIAL_MAPPING ids once,
    # instead of rebuilding key/value lists for every sample that is loaded.
    official_by_internal = {}
    for folder_name, internal_idx in internal_mapping.items():
        official_by_internal[internal_idx] = next(
            (idx for name, idx in OFFICIAL_MAPPING.items() if name in folder_name),
            internal_idx,  # no species name matched: keep the ImageFolder index
        )

    def target_tf(target_idx):
        return official_by_internal[target_idx]

    train_ds = datasets.ImageFolder(TRAIN_DIR, transform=transform, target_transform=target_tf)
    loader = DataLoader(train_ds, batch_size=8 if resolution > 300 else 32, shuffle=True)

    # Load weights; map_location lets a checkpoint saved on the GPU be restored
    # on whatever device this session is using.
    if load_path and os.path.exists(load_path):
        model.load_state_dict(torch.load(load_path, map_location=DEVICE))
        print(f"Weights loaded from {load_path}")

    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=1e-4 if not load_path else 1e-5)

    # Simple training loop (no validation pass in this architecture)
    model.train()
    for epoch in range(epochs):
        for inputs, labels in loader:
            inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1} completada.")

    torch.save(model.state_dict(), save_path)
    print(f"Model saved to {save_path}")

# Full automated pipeline
def run_full_pipeline():
    """Train ResNet and then EfficientNet through both resolution stages.

    Each model is first trained for 12 epochs at 224px and then for 8 epochs
    at 448px, with the second stage given the first stage's checkpoint as
    ``load_path``.
    """
    # (model factory, stage-1 name, stage-1 checkpoint, stage-2 name, stage-2 checkpoint)
    plan = (
        (get_resnet, "ResNet Stage 1", "resnet_224.pth", "ResNet Stage 2", "resnet_448.pth"),
        (get_effnet, "EffNet Stage 1", "effnet_224.pth", "EffNet Stage 2", "effnet_448.pth"),
    )

    for build_model, low_res_name, low_res_ckpt, high_res_name, high_res_ckpt in plan:
        net = build_model()
        # Stage 1: 224px
        train_model_stage(net, low_res_name, 224, low_res_ckpt, epochs=12)
        # Stage 2: 448px, starting from the 224px checkpoint
        train_model_stage(net, high_res_name, 448, high_res_ckpt, load_path=low_res_ckpt, epochs=8)

    print("\nTraining completed")

# Execution: train both models through the 224px and 448px stages
run_full_pipeline()


Starting ResNet Stage 1 a 224px...
Epoch 1 completada.
Epoch 2 completada.
Epoch 3 completada.
Epoch 4 completada.
Epoch 5 completada.
Epoch 6 completada.
Epoch 7 completada.
Epoch 8 completada.
Epoch 9 completada.
Epoch 10 completada.
Epoch 11 completada.
Epoch 12 completada.
Model saved to resnet_224.pth

Starting ResNet Stage 2 a 448px...
Weights loadingdos desde resnet_224.pth
Epoch 1 completada.
Epoch 2 completada.
Epoch 3 completada.
Epoch 4 completada.
Epoch 5 completada.
Epoch 6 completada.
Epoch 7 completada.
Epoch 8 completada.
Model saved to resnet_448.pth

Starting EffNet Stage 1 a 224px...
Epoch 1 completada.
Epoch 2 completada.
Epoch 3 completada.
Epoch 4 completada.
Epoch 5 completada.
Epoch 6 completada.
Epoch 7 completada.
Epoch 8 completada.
Epoch 9 completada.
Epoch 10 completada.
Epoch 11 completada.
Epoch 12 completada.
Model saved to effnet_224.pth

Starting EffNet Stage 2 a 448px...
Weights loadingdos desde effnet_224.pth
Epoch 1 completada.
Epoch 2 completada.


In [None]:
# Test-time transform (448px): resize to 512, centre-crop to 448, then apply
# the same normalisation statistics used for training
test_transform = transforms.Compose([
    transforms.Resize(512),
    transforms.CenterCrop(448),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# PREDICTION FUNCTION WITH ENSEMBLE AND TTA
def generate_submission(resnet_path, effnet_path, output_csv='sample_submission.csv'):
    """Predict every test image with a two-model ensemble plus flip TTA.

    For each image the softmax outputs of both models on the original and the
    horizontally mirrored view are averaged (four predictions), and the argmax
    is written as the label.

    Args:
        resnet_path: Checkpoint (state_dict) for the model built by get_resnet().
        effnet_path: Checkpoint (state_dict) for the model built by get_effnet().
        output_csv: Destination CSV with columns ``id`` (file basename) and
            ``label`` (predicted class id).
    """
    # Load models; map_location lets GPU-saved checkpoints load on any device
    print("Loading high resolution models...")
    m1 = get_resnet()
    m1.load_state_dict(torch.load(resnet_path, map_location=DEVICE))
    m1.eval()

    m2 = get_effnet()
    m2.load_state_dict(torch.load(effnet_path, map_location=DEVICE))
    m2.eval()

    # Test dataset: only used for its list of image paths
    test_ds = datasets.ImageFolder(TEST_DIR, transform=test_transform)
    results = []

    print(f"Generating predictions for {len(test_ds)} images...")

    with torch.no_grad():
        for i, (path, _) in enumerate(test_ds.samples):
            img_name = os.path.basename(path)
            # Close the file handle as soon as the pixels are decoded
            with Image.open(path) as raw:
                img = raw.convert('RGB')

            # TTA (Test Time Augmentation)
            # Pass 1: original image
            img_orig = test_transform(img).unsqueeze(0).to(DEVICE)
            # Pass 2: horizontal mirror
            img_flip = test_transform(transforms.functional.hflip(img)).unsqueeze(0).to(DEVICE)

            # Class probabilities (softmax)
            # Model 1 (ResNet)
            p1_orig = torch.softmax(m1(img_orig), dim=1)
            p1_flip = torch.softmax(m1(img_flip), dim=1)

            # Model 2 (EffNet)
            p2_orig = torch.softmax(m2(img_orig), dim=1)
            p2_flip = torch.softmax(m2(img_flip), dim=1)

            # Average of the 4 predictions (ensemble + TTA)
            avg_probs = (p1_orig + p1_flip + p2_orig + p2_flip) / 4

            _, pred = torch.max(avg_probs, 1)
            results.append({'id': img_name, 'label': pred.item()})

            if (i+1) % 100 == 0:
                print(f"Processed {i+1} images...")

    # Create and save CSV
    df = pd.DataFrame(results)
    df.to_csv(output_csv, index=False)
    print(f"\nFile '{output_csv}' generated.")


In [None]:
# Build the submission from the 448px checkpoints of both models
generate_submission('resnet_448.pth', 'effnet_448.pth')

Loading high resolution models...
Generating predictions para 400 images...
Processed 100 images...
Processed 200 images...
Processed 300 images...
Processed 400 images...

File 'sample_submission.csv' generated.


# Architecture 6

In [None]:
!pip install torchinfo
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torchinfo import summary
from torch.utils.data import DataLoader
import pandas as pd
import os
from PIL import Image
import numpy as np

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:

# Configuration and official mapping
# Species name -> class id used as the training target; names are matched as
# substrings of the dataset folder names by get_target_transform.
OFFICIAL_MAPPING = {
    'Groove_billed_Ani': 0, 'Red_winged_Blackbird': 1, 'Rusty_Blackbird': 2,
    'Gray_Catbird': 3, 'Brandt_Cormorant': 4, 'Eastern_Towhee': 5,
    'Indigo_Bunting': 6, 'Brewer_Blackbird': 7, 'Painted_Bunting': 8,
    'Bobolink': 9, 'Lazuli_Bunting': 10, 'Yellow_headed_Blackbird': 11,
    'American_Crow': 12, 'Fish_Crow': 13, 'Brown_Creeper': 14,
    'Yellow_billed_Cuckoo': 15, 'Yellow_breasted_Chat': 16,
    'Black_billed_Cuckoo': 17, 'Gray_crowned_Rosy_Finch': 18,
    'Bronzed_Cowbird': 19
}

In [None]:
# Dataset root on the mounted Google Drive and its train/val/test folders
BASE_DIR = '/content/drive/MyDrive/BDMA7_project_files'
TRAIN_DIR = os.path.join(BASE_DIR, 'train_images')
VAL_DIR = os.path.join(BASE_DIR, 'val_images')
TEST_DIR = os.path.join(BASE_DIR, 'test_images')
# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# High-resolution transforms (448px)
# 'train' applies random augmentation; 'val_test' is deterministic
# (resize to 512, centre-crop to 448). Both share the same normalisation.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(448),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(0.3, 0.3, 0.3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val_test': transforms.Compose([
        transforms.Resize(512),
        transforms.CenterCrop(448),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# DATA LOADING WITH DYNAMIC MAPPING
def get_target_transform(internal_mapping):
    """Build a ``target_transform`` mapping dataset indices to official class ids.

    Args:
        internal_mapping: Dict of folder name -> class index, e.g. the
            ``class_to_idx`` attribute of an ImageFolder dataset.

    Returns:
        A callable taking a class index and returning the OFFICIAL_MAPPING id
        of the first species whose name is a substring of that index's folder
        name. If no species name matches, the index is returned unchanged.
        The callable raises ValueError for an index absent from
        ``internal_mapping``.
    """
    # Resolve every index once here; the returned callable runs for each
    # sample loaded, so it should be a plain dictionary lookup.
    official_by_internal = {}
    for folder_name, internal_idx in internal_mapping.items():
        if internal_idx in official_by_internal:
            continue  # keep the first folder seen for a duplicated index
        official_by_internal[internal_idx] = next(
            (official_id for bird_name, official_id in OFFICIAL_MAPPING.items()
             if bird_name in folder_name),
            internal_idx,
        )

    def transform(target_idx):
        try:
            return official_by_internal[target_idx]
        except KeyError:
            raise ValueError(f"{target_idx!r} is not a known class index") from None
    return transform

# Read the folder -> index assignment of the training folder (no transforms
# needed) and derive the index -> official id translation from it
temp_ds = datasets.ImageFolder(TRAIN_DIR)
target_tf = get_target_transform(temp_ds.class_to_idx)

# NOTE(review): target_tf is derived from TRAIN_DIR's class_to_idx but is also
# applied to val_dataset; this is only correct if VAL_DIR yields the same
# folder -> index assignment — confirm both folders contain the same classes.
train_dataset = datasets.ImageFolder(TRAIN_DIR, transform=data_transforms['train'], target_transform=target_tf)
val_dataset = datasets.ImageFolder(VAL_DIR, transform=data_transforms['val_test'], target_transform=target_tf)

# Batch size reduced to handle 448px images on GPU
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=4)

# MODELS
def get_resnet_model():
    """Return a pretrained ResNet-101 on DEVICE whose head predicts 20 classes.

    The original ``fc`` layer is swapped for Dropout(0.5) + Linear(..., 20).
    """
    backbone = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
    backbone.fc = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(backbone.fc.in_features, 20),
    )
    return backbone.to(DEVICE)

def get_effnet_model():
    """Return a pretrained EfficientNet-B0 on DEVICE whose head predicts 20 classes.

    Only ``classifier[1]`` is swapped, for Dropout(0.5) + Linear(..., 20).
    """
    backbone = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
    backbone.classifier[1] = nn.Sequential(
        nn.Dropout(0.5),
        nn.Linear(backbone.classifier[1].in_features, 20),
    )
    return backbone.to(DEVICE)

# Training with mixup and cosine annealing
def train_specific_model(model, save_name, epochs=22):
    """Fine-tune ``model`` with mixup and keep the best-validation checkpoint.

    Every epoch trains on ``train_loader`` with mixup-blended batches,
    measures accuracy on ``val_loader``, logs loss and accuracy, steps the
    cosine learning-rate schedule, and writes the ``state_dict`` to
    ``save_name`` whenever validation accuracy improves.

    Args:
        model: Network to train in place.
        save_name: Checkpoint path; also used as the tag in each log line.
        epochs: Number of epochs, also used as T_max of the cosine schedule.
    """
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
    opt = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    # Cosine schedule for a smoother learning-rate decay
    lr_schedule = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)

    mixup_alpha = 0.2
    best_acc = 0.0
    for epoch in range(epochs):
        # Training pass
        model.train()
        running_loss = 0.0
        for batch, targets in train_loader:
            batch, targets = batch.to(DEVICE), targets.to(DEVICE)

            # Mixup: blend each sample with a randomly chosen partner from the
            # same batch; the loss is weighted with the same coefficient.
            lam = np.random.beta(mixup_alpha, mixup_alpha)
            perm = torch.randperm(batch.size(0)).to(DEVICE)
            blended = lam * batch + (1 - lam) * batch[perm, :]
            partner_targets = targets[perm]

            opt.zero_grad()
            logits = model(blended)
            loss = lam * loss_fn(logits, targets) + (1 - lam) * loss_fn(logits, partner_targets)
            loss.backward()
            opt.step()
            running_loss += loss.item()

        # Validation pass
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch, targets in val_loader:
                batch, targets = batch.to(DEVICE), targets.to(DEVICE)
                predicted = model(batch).argmax(dim=1)
                correct += (predicted == targets).sum().item()

        val_acc = 100 * correct / len(val_dataset)
        print(f"[{save_name}] Epoch {epoch+1} Loss: {running_loss/len(train_loader):.4f} Val Acc: {val_acc:.2f}%")

        lr_schedule.step()

        # Checkpoint only on a strict improvement
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), save_name)

# Inference with ensemble and TTA (test-time augmentation)
def generate_submission(output_csv='sample_submission_0622.csv'):
    """Predict every test image with the ResNet + EfficientNet ensemble and TTA.

    Loads 'best_resnet.pth' and 'best_effnet.pth', averages each model's
    softmax over the original and the horizontally flipped image, averages the
    two models, and writes the argmax per image.

    Args:
        output_csv: Destination CSV with columns ``id`` (file basename) and
            ``label`` (predicted class id).
    """
    # map_location lets checkpoints saved on the GPU load on any device
    m1 = get_resnet_model()
    m1.load_state_dict(torch.load('best_resnet.pth', map_location=DEVICE))
    m1.eval()

    m2 = get_effnet_model()
    m2.load_state_dict(torch.load('best_effnet.pth', map_location=DEVICE))
    m2.eval()

    eval_tf = data_transforms['val_test']
    # The dataset object is only used for its list of image paths
    test_ds = datasets.ImageFolder(TEST_DIR, transform=eval_tf)
    results = []

    print("Starting Ensemble with High Resolution TTA...")
    with torch.no_grad():
        for path, _ in test_ds.samples:
            # Close the file handle as soon as the pixels are decoded
            with Image.open(path) as raw:
                img = raw.convert('RGB')
            img_name = os.path.basename(path)

            t_orig = eval_tf(img).unsqueeze(0).to(DEVICE)
            t_flip = eval_tf(transforms.functional.hflip(img)).unsqueeze(0).to(DEVICE)

            # Average of probabilities from both models and both image versions (4 predictions per image)
            prob_m1 = (torch.softmax(m1(t_orig), dim=1) + torch.softmax(m1(t_flip), dim=1)) / 2
            prob_m2 = (torch.softmax(m2(t_orig), dim=1) + torch.softmax(m2(t_flip), dim=1)) / 2

            final_probs = (prob_m1 + prob_m2) / 2
            _, pred = torch.max(final_probs, 1)

            results.append({'id': img_name, 'label': pred.item()})

    pd.DataFrame(results).to_csv(output_csv, index=False)
    print("Final file generated!")



In [None]:
# Fine-tune ResNet-101; the best-validation weights are written to best_resnet.pth
train_specific_model(get_resnet_model(), 'best_resnet.pth')

[best_resnet.pth] Epoch 1 Loss: 2.8864 Val Acc: 57.28%
[best_resnet.pth] Epoch 2 Loss: 2.0023 Val Acc: 79.61%
[best_resnet.pth] Epoch 3 Loss: 1.6298 Val Acc: 83.50%
[best_resnet.pth] Epoch 4 Loss: 1.5032 Val Acc: 85.44%
[best_resnet.pth] Epoch 5 Loss: 1.4884 Val Acc: 86.41%
[best_resnet.pth] Epoch 6 Loss: 1.4777 Val Acc: 91.26%
[best_resnet.pth] Epoch 7 Loss: 1.3994 Val Acc: 88.35%
[best_resnet.pth] Epoch 8 Loss: 1.3018 Val Acc: 91.26%
[best_resnet.pth] Epoch 9 Loss: 1.3882 Val Acc: 87.38%
[best_resnet.pth] Epoch 10 Loss: 1.2745 Val Acc: 89.32%
[best_resnet.pth] Epoch 11 Loss: 1.3147 Val Acc: 92.23%
[best_resnet.pth] Epoch 12 Loss: 1.3383 Val Acc: 92.23%
[best_resnet.pth] Epoch 13 Loss: 1.2933 Val Acc: 91.26%
[best_resnet.pth] Epoch 14 Loss: 1.2664 Val Acc: 91.26%
[best_resnet.pth] Epoch 15 Loss: 1.2642 Val Acc: 92.23%
[best_resnet.pth] Epoch 16 Loss: 1.2212 Val Acc: 92.23%
[best_resnet.pth] Epoch 17 Loss: 1.1591 Val Acc: 92.23%
[best_resnet.pth] Epoch 18 Loss: 1.2110 Val Acc: 91.26%
[

In [None]:
# Fine-tune EfficientNet-B0; the best-validation weights are written to best_effnet.pth
train_specific_model(get_effnet_model(), 'best_effnet.pth')

[best_effnet.pth] Epoch 1 Loss: 2.9352 Val Acc: 50.49%
[best_effnet.pth] Epoch 2 Loss: 2.6335 Val Acc: 64.08%
[best_effnet.pth] Epoch 3 Loss: 2.2602 Val Acc: 71.84%
[best_effnet.pth] Epoch 4 Loss: 2.0468 Val Acc: 81.55%
[best_effnet.pth] Epoch 5 Loss: 1.8750 Val Acc: 83.50%
[best_effnet.pth] Epoch 6 Loss: 1.8101 Val Acc: 88.35%
[best_effnet.pth] Epoch 7 Loss: 1.6765 Val Acc: 87.38%
[best_effnet.pth] Epoch 8 Loss: 1.6843 Val Acc: 88.35%
[best_effnet.pth] Epoch 9 Loss: 1.5680 Val Acc: 88.35%
[best_effnet.pth] Epoch 10 Loss: 1.5709 Val Acc: 88.35%
[best_effnet.pth] Epoch 11 Loss: 1.5902 Val Acc: 87.38%
[best_effnet.pth] Epoch 12 Loss: 1.4924 Val Acc: 89.32%
[best_effnet.pth] Epoch 13 Loss: 1.4999 Val Acc: 88.35%
[best_effnet.pth] Epoch 14 Loss: 1.5145 Val Acc: 89.32%
[best_effnet.pth] Epoch 15 Loss: 1.4964 Val Acc: 89.32%
[best_effnet.pth] Epoch 16 Loss: 1.4498 Val Acc: 88.35%
[best_effnet.pth] Epoch 17 Loss: 1.4425 Val Acc: 90.29%
[best_effnet.pth] Epoch 18 Loss: 1.4481 Val Acc: 89.32%
[

In [None]:
# Write the ensemble predictions to sample_submission_0622.csv
generate_submission()

Starting Ensamble con TTA en Alta Resolución...
¡File final generated!


In [None]:
import shutil

def save_to_drive(source_path, destination_dir, filename='sample_submission_0622.csv'):
    """Copy a local file into a Google Drive folder and report the outcome.

    Failures are printed rather than raised so the notebook keeps running.

    Args:
        source_path: Path of the local file to copy.
        destination_dir: Folder the file is copied into.
        filename: Name given to the copy inside ``destination_dir``.
    """
    drive_path = os.path.join(destination_dir, filename)
    try:
        shutil.copy(source_path, drive_path)
        print(f"File '{filename}' saved to Drive: {drive_path}")
    except FileNotFoundError:
        # shutil.copy raises this for a missing source AND for a missing
        # destination folder; report whichever one is actually absent.
        if os.path.exists(source_path):
            print(f"Error: Destination directory '{os.path.dirname(drive_path)}' not found.")
        else:
            print(f"Error: Source file '{source_path}' not found.")
    except Exception as e:
        print(f"Error saving to Drive: {e}")

# Copy the generated sample_submission_0622.csv into the project folder on Drive
save_to_drive('sample_submission_0622.csv', BASE_DIR)


File 'sample_submission_0622.csv' saved to Drive: /content/drive/MyDrive/BDMA7_project_files/sample_submission_0622.csv


In [None]:
# Install torchview, used to draw the model's computation graph
!pip install torchview
from torchview import draw_graph
import torchvision.models as models

# input_size follows the format (batch_size, channels, height, width)
# NOTE(review): the graph is traced with 224x224 inputs although this
# architecture's transforms produce 448x448 crops — confirm this is intended.
model_graph = draw_graph(get_resnet_model(), input_size=(32, 3, 224, 224), expand_nested=True)
# Bare expression that is not the cell's last line, so the notebook does not display it
model_graph.visual_graph
# Write the graph to arquitectura6_22.png
model_graph.visual_graph.render(filename='arquitectura6_22', format='png', cleanup=True)

'arquitectura6_22.png'



'arquitectura6_22.png'

In [None]:
from torchinfo import summary
# NOTE(review): summary() is called without an input size here; presumably the
# report then omits per-layer output shapes — pass input_size if they are wanted.
summary(get_resnet_model())

# NOTE(review): the EfficientNet graph is rendered under the same
# 'arquitectura6_22' filename as the ResNet graph from the previous cell, so it
# replaces that PNG — confirm which diagram is meant to be kept.
model_graph = draw_graph(get_effnet_model(), input_size=(32, 3, 224, 224), expand_nested=True)
# Bare expression that is not the cell's last line, so the notebook does not display it
model_graph.visual_graph
model_graph.visual_graph.render(filename='arquitectura6_22', format='png', cleanup=True)

'arquitectura6_22.png'

'arquitectura6_22.png'

In [None]:
import shutil
import os

def save_image_to_drive(source_path, destination_dir, filename='arquitectura6_22.png'):
    """Copy a local image file into a Google Drive folder and report the outcome.

    Failures are printed rather than raised so the notebook keeps running.

    Args:
        source_path: Path of the local image to copy.
        destination_dir: Folder the image is copied into.
        filename: Name given to the copy inside ``destination_dir``.
    """
    drive_path = os.path.join(destination_dir, filename)
    try:
        shutil.copy(source_path, drive_path)
        print(f"File '{filename}' saved to Drive: {drive_path}")
    except FileNotFoundError:
        # shutil.copy raises this for a missing source AND for a missing
        # destination folder; report whichever one is actually absent.
        if os.path.exists(source_path):
            print(f"Error: Destination directory '{os.path.dirname(drive_path)}' not found.")
        else:
            print(f"Error: Source file '{source_path}' not found.")
    except Exception as e:
        print(f"Error saving to Drive: {e}")

# Copy the rendered architecture diagram into the project folder on Drive
save_image_to_drive('arquitectura6_22.png', BASE_DIR)

File 'arquitectura6_22.png' saved to Drive: /content/drive/MyDrive/BDMA7_project_files/arquitectura6_22.png


## Architecture 7

### High-Resolution ResNet-50 with Ensemble

In this experiment, a super ensemble is constructed by combining three models:
1. The trained ResNet-50 model
2. ResNet-101 pretrained on ImageNet
3. EfficientNet-B0 pretrained on ImageNet

Test-Time Augmentation (TTA) is applied during inference by averaging predictions from the original and horizontally flipped images. Predictions from all three models are averaged to produce the final class probabilities, and the class with the highest probability is selected as the final prediction.


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, models, transforms
from torch.utils.data import DataLoader
import pandas as pd
import numpy as np
import os
from PIL import Image

# Dataset root and its train/val/test folders.
# NOTE(review): this section reads from /content/bdma07 rather than the Drive
# path used by the earlier architectures — confirm the data is present there.
BASE_DIR = '/content/bdma07/BDMA7_project_files'
TRAIN_DIR = os.path.join(BASE_DIR, 'train_images')
VAL_DIR = os.path.join(BASE_DIR, 'val_images')
TEST_DIR = os.path.join(BASE_DIR, 'test_images')

# Use the GPU when one is available, otherwise fall back to the CPU
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [None]:
# Species name -> class id used as the training target; names are matched as
# substrings of the dataset folder names by get_target_transform.
OFFICIAL_MAPPING = {
    'Groove_billed_Ani': 0, 'Red_winged_Blackbird': 1, 'Rusty_Blackbird': 2,
    'Gray_Catbird': 3, 'Brandt_Cormorant': 4, 'Eastern_Towhee': 5,
    'Indigo_Bunting': 6, 'Brewer_Blackbird': 7, 'Painted_Bunting': 8,
    'Bobolink': 9, 'Lazuli_Bunting': 10, 'Yellow_headed_Blackbird': 11,
    'American_Crow': 12, 'Fish_Crow': 13, 'Brown_Creeper': 14,
    'Yellow_billed_Cuckoo': 15, 'Yellow_breasted_Chat': 16,
    'Black_billed_Cuckoo': 17, 'Gray_crowned_Rosy_Finch': 18,
    'Bronzed_Cowbird': 19
}


In [None]:
# High-resolution (448px) pipelines: 'train' applies random augmentation,
# 'val_test' is deterministic (resize to 512, centre-crop to 448).
# Both share the same normalisation statistics.
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(448),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(15),
        transforms.ColorJitter(0.3, 0.3, 0.3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ]),
    'val_test': transforms.Compose([
        transforms.Resize(512),
        transforms.CenterCrop(448),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],
                             [0.229, 0.224, 0.225])
    ]),
}


In [None]:
def get_target_transform(internal_mapping):
    """Build a ``target_transform`` mapping dataset indices to official class ids.

    Args:
        internal_mapping: Dict of folder name -> class index, e.g. the
            ``class_to_idx`` attribute of an ImageFolder dataset.

    Returns:
        A callable taking a class index and returning the OFFICIAL_MAPPING id
        of the first species whose name is a substring of that index's folder
        name. If no species name matches, the index is returned unchanged.
        The callable raises ValueError for an index absent from
        ``internal_mapping``.
    """
    # Resolve every index once up front: the returned callable runs for each
    # sample loaded, so it should not rebuild key/value lists every time.
    resolved = {}
    for folder_name, internal_idx in internal_mapping.items():
        if internal_idx in resolved:
            continue  # keep the first folder seen for a duplicated index
        resolved[internal_idx] = internal_idx  # fallback when nothing matches
        for bird_name, official_id in OFFICIAL_MAPPING.items():
            if bird_name in folder_name:
                resolved[internal_idx] = official_id
                break

    def transform(target_idx):
        if target_idx not in resolved:
            raise ValueError(f"{target_idx!r} is not a known class index")
        return resolved[target_idx]
    return transform

# Read the folder -> index assignment of the training folder (no transforms
# needed) and derive the index -> official id translation from it
temp_ds = datasets.ImageFolder(TRAIN_DIR)
target_tf = get_target_transform(temp_ds.class_to_idx)

train_dataset = datasets.ImageFolder(
    TRAIN_DIR,
    transform=data_transforms['train'],
    target_transform=target_tf
)

# NOTE(review): target_tf was derived from TRAIN_DIR's class_to_idx; reusing it
# here is only correct if VAL_DIR yields the same folder -> index assignment.
val_dataset = datasets.ImageFolder(
    VAL_DIR,
    transform=data_transforms['val_test'],
    target_transform=target_tf
)

# Small batches (8) for the 448px inputs; only the training data is shuffled
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=2)


In [None]:
def get_resnet101():
    """Return a pretrained ResNet-101 on DEVICE with a Dropout + 20-way Linear head."""
    net = models.resnet101(weights=models.ResNet101_Weights.DEFAULT)
    feature_dim = net.fc.in_features
    net.fc = nn.Sequential(nn.Dropout(0.5), nn.Linear(feature_dim, 20))
    return net.to(DEVICE)

def get_effnet():
    """Return a pretrained EfficientNet-B0 on DEVICE with a Dropout + 20-way Linear head."""
    net = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.DEFAULT)
    feature_dim = net.classifier[1].in_features
    net.classifier[1] = nn.Sequential(nn.Dropout(0.5), nn.Linear(feature_dim, 20))
    return net.to(DEVICE)

def get_resnet50():
    """Return a pretrained ResNet-50 on DEVICE with a Dropout + 20-way Linear head."""
    net = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)
    feature_dim = net.fc.in_features
    net.fc = nn.Sequential(nn.Dropout(0.5), nn.Linear(feature_dim, 20))
    return net.to(DEVICE)


In [None]:
def train_model(model, save_name, epochs=15):
    """Fine-tune ``model`` with mixup and keep the best-validation checkpoint.

    Every epoch trains on ``train_loader`` with mixup-blended batches, then
    measures accuracy on ``val_loader`` and writes the ``state_dict`` to
    ``save_name`` whenever that accuracy improves. The best accuracy is
    printed once at the end.

    Args:
        model: Network to train in place.
        save_name: Checkpoint path; also used as the tag in the final message.
        epochs: Number of epochs, also used as T_max of the cosine schedule.
    """
    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1)
    opt = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.05)
    lr_schedule = optim.lr_scheduler.CosineAnnealingLR(opt, T_max=epochs)

    mixup_alpha = 0.2
    best_acc = 0.0

    for _ in range(epochs):
        # Training pass
        model.train()
        for batch, targets in train_loader:
            batch, targets = batch.to(DEVICE), targets.to(DEVICE)

            # Mixup: blend each sample with a random partner from the batch
            lam = np.random.beta(mixup_alpha, mixup_alpha)
            perm = torch.randperm(batch.size(0)).to(DEVICE)
            blended = lam * batch + (1 - lam) * batch[perm]
            partner_targets = targets[perm]

            opt.zero_grad()
            logits = model(blended)
            loss = lam * loss_fn(logits, targets) + (1 - lam) * loss_fn(logits, partner_targets)
            loss.backward()
            opt.step()

        # Validation pass
        model.eval()
        n_correct = 0
        with torch.no_grad():
            for batch, targets in val_loader:
                batch, targets = batch.to(DEVICE), targets.to(DEVICE)
                predicted = model(batch).argmax(dim=1)
                n_correct += (predicted == targets).sum().item()

        val_acc = 100 * n_correct / len(val_dataset)
        # Checkpoint only on a strict improvement
        if val_acc > best_acc:
            best_acc = val_acc
            torch.save(model.state_dict(), save_name)

    print(f"{save_name} best val acc: {best_acc:.2f}%")


In [None]:
# Fine-tune the three ensemble members for 12 epochs each (instead of the
# default 15); each call saves its best-validation checkpoint
train_model(get_resnet101(), 'resnet101.pth', epochs=12)
train_model(get_effnet(), 'effnet_b0.pth', epochs=12)
train_model(get_resnet50(), 'resnet50.pth', epochs=12)


In [None]:
def generate_submission(output_csv='submission_super_ensemble.csv'):
    """Predict every test image with the three-model ensemble and flip TTA.

    Loads 'resnet101.pth', 'effnet_b0.pth' and 'resnet50.pth', averages each
    model's softmax over the original and the horizontally flipped image,
    averages the three models, and writes the argmax per image.

    Args:
        output_csv: Destination CSV with columns ``id`` (file basename) and
            ``label`` (predicted class id).
    """
    # map_location lets checkpoints saved on the GPU load on any device
    m1 = get_resnet101()
    m1.load_state_dict(torch.load('resnet101.pth', map_location=DEVICE))
    m1.eval()

    m2 = get_effnet()
    m2.load_state_dict(torch.load('effnet_b0.pth', map_location=DEVICE))
    m2.eval()

    m3 = get_resnet50()
    m3.load_state_dict(torch.load('resnet50.pth', map_location=DEVICE))
    m3.eval()

    eval_tf = data_transforms['val_test']
    # The dataset object is only used for its list of image paths
    test_ds = datasets.ImageFolder(TEST_DIR, transform=eval_tf)
    results = []

    with torch.no_grad():
        for path, _ in test_ds.samples:
            # Close the file handle as soon as the pixels are decoded
            with Image.open(path) as raw:
                img = raw.convert('RGB')
            img_name = os.path.basename(path)

            t_orig = eval_tf(img).unsqueeze(0).to(DEVICE)
            t_flip = eval_tf(
                transforms.functional.hflip(img)
            ).unsqueeze(0).to(DEVICE)

            # Per-model TTA average over the two views
            p1 = (torch.softmax(m1(t_orig), 1) + torch.softmax(m1(t_flip), 1)) / 2
            p2 = (torch.softmax(m2(t_orig), 1) + torch.softmax(m2(t_flip), 1)) / 2
            p3 = (torch.softmax(m3(t_orig), 1) + torch.softmax(m3(t_flip), 1)) / 2

            # Ensemble average over the three models
            avg_pred = (p1 + p2 + p3) / 3
            _, pred = torch.max(avg_pred, 1)

            results.append({'id': img_name, 'label': pred.item()})

    pd.DataFrame(results).to_csv(output_csv, index=False)

# Write the three-model ensemble predictions to submission_super_ensemble.csv
generate_submission()


In [None]:
# Reshape the ensemble output to follow the layout of the reference submission
my_sub = pd.read_csv('submission_super_ensemble.csv')
sample_sub = pd.read_csv('sample_submission (1).csv')

# generate_submission writes 'id'/'label' columns; rename them to the
# 'path'/'class_idx' names used below
if 'id' in my_sub.columns:
    my_sub = my_sub.rename(columns={'id': 'path', 'label': 'class_idx'})

# Reorder the rows to follow sample_sub's 'path' column.
# NOTE(review): reindex produces NaN rows for any sample path missing from
# my_sub; the ids written above are file basenames, so verify they match
# sample_sub['path'] exactly before submitting.
my_sub = my_sub.set_index('path')
my_sub = my_sub.reindex(sample_sub['path'])
my_sub = my_sub.reset_index()

my_sub.to_csv('submission_opt_2.csv', index=False)
print("Saved submission_opt_2.csv")

