In [1]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so files stored on Drive can be read.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
!unzip -qq "/content/drive/MyDrive/민호/private/task1.zip"

In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import torch
import torch.nn as nn
import random
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from PIL import Image


def set_seed(seed=42):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator, and PyTorch
    (CPU generator and all CUDA devices), then puts cuDNN into deterministic
    mode with benchmark autotuning switched off.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


# Fix the seed once, before any splitting or model construction happens.
set_seed(42)

In [4]:
# Load training labels
train_labels = pd.read_csv("task1/train_labels.csv")

# Define paths
train_dir = "task1/train"
test_dir = "task1/test"

# Split into train and validation.
# Stratify on column 1 (the column ImageDataset reads as the label) so both
# splits keep the same class ratio; an unstratified split can leave the
# validation set with too few positives for a meaningful F1 score.
train_data, val_data = train_test_split(
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels.iloc[:, 1],
)

In [5]:
class Base:
    """Container for hyperparameters shared across the notebook."""

    # Mini-batch size used when building the train/validation DataLoaders.
    batch_size = 64

In [6]:
import os
import random
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Per-channel mean / std handed to Normalize (one entry per RGB channel).
NORMALIZE_MEAN = [0.485, 0.456, 0.406]
NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline applied to every image, in order:
#   1. resize to a fixed 224x224,
#   2. convert the PIL image to a PyTorch tensor,
#   3. normalize each channel with the statistics above.
_preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=NORMALIZE_MEAN, std=NORMALIZE_STD),
]
image_transforms = transforms.Compose(_preprocessing_steps)


# Custom Dataset with Sparse Gaussian Noise
class ImageDataset(Dataset):
    """Image dataset backed by a DataFrame of file names and labels.

    Args:
        data: DataFrame whose column 0 holds image file names (relative to
            ``img_dir``) and, unless ``is_test`` is set, whose column 1 holds
            the label.
        img_dir: Directory that contains the image files.
        transform: Optional callable applied to the RGB PIL image.
        is_test: When True, ``__getitem__`` returns only the image.
        apply_gaussian_noise: When True, each sample has a 50% chance of
            receiving sparse Gaussian noise. This requires ``transform`` to
            return a floating-point tensor.
    """

    NOISE_PROBABILITY = 0.5  # chance that a given sample is perturbed
    NOISE_FRACTION = 0.02    # share of tensor elements that receive noise
    NOISE_STD = 0.5          # standard deviation of the injected noise

    def __init__(self, data, img_dir, transform=None, is_test=False, apply_gaussian_noise=False):
        self.data = data
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test  # Flag to indicate if it's a test dataset
        self.apply_gaussian_noise = apply_gaussian_noise

    def __len__(self):
        return len(self.data)

    def _add_sparse_noise(self, image):
        """Return a copy of ``image`` with Gaussian noise on a sparse set of elements.

        Positions are drawn with replacement, so slightly fewer than
        ``NOISE_FRACTION`` of the elements may end up perturbed.
        """
        total = image.numel()
        count = int(total * self.NOISE_FRACTION)
        if count == 0:
            return image

        # The tensor may already be normalized (values outside [0, 1]), so a
        # fixed [0, 1] clamp would destroy it. Clamp to the image's own
        # pre-noise range instead; for an un-normalized image this range lies
        # inside [0, 1] anyway.
        lower = image.min().item()
        upper = image.max().item()

        # Vectorized scatter: one randint/randn call instead of a Python loop
        # over thousands of individual element assignments.
        flat_noise = torch.zeros(total, dtype=image.dtype, device=image.device)
        positions = torch.randint(0, total, (count,), device=image.device)
        flat_noise[positions] = torch.randn(count, dtype=image.dtype, device=image.device) * self.NOISE_STD

        noisy = image + flat_noise.view(image.shape)
        return torch.clamp(noisy, lower, upper)

    def __getitem__(self, idx):
        img_name = os.path.join(self.img_dir, self.data.iloc[idx, 0])
        image = Image.open(img_name).convert('RGB')

        if self.transform:
            image = self.transform(image)

        # Add sparse Gaussian noise to roughly half of the samples
        if self.apply_gaussian_noise and random.random() < self.NOISE_PROBABILITY:
            image = self._add_sparse_noise(image)

        if self.is_test:  # If test dataset, return only the image
            return image

        # Train/val dataset: return image and label
        label = self.data.iloc[idx, 1]
        return image, label

# train_data and val_data are the DataFrames produced by the train/validation
# split (file name in column 0, label in column 1); both index into train_dir.

# Only the training split is augmented with sparse Gaussian noise. Validation
# stays clean so its loss/F1 are deterministic and comparable across epochs.
# Note: train_model_on_full_data_with_progress concatenates val_dataset into
# its training set, so that portion is trained on without noise.
train_dataset = ImageDataset(train_data, train_dir, transform=image_transforms, apply_gaussian_noise=True)
val_dataset = ImageDataset(val_data, train_dir, transform=image_transforms, apply_gaussian_noise=False)

# Shuffle only the training batches; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=Base.batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=Base.batch_size, shuffle=False)


In [7]:
from torchvision.models import EfficientNet_B0_Weights, efficientnet_b0

# Load pre-trained EfficientNet
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# `pretrained=True` is deprecated in torchvision; the weights enum selects the
# same ImageNet checkpoint explicitly and avoids the deprecation warning.
model = efficientnet_b0(weights=EfficientNet_B0_Weights.IMAGENET1K_V1)  # Use EfficientNet-B0 as an example
# Replace the final Linear layer with a single-logit head for binary classification
model.classifier[1] = nn.Linear(model.classifier[1].in_features, 1)
model = model.to(device)

# Loss and optimizer
# BCEWithLogitsLoss takes raw logits, so the model itself applies no sigmoid.
criterion = nn.BCEWithLogitsLoss()
# NOTE: the training helpers in this notebook build their own optimizer from
# their `optimizer_class` argument; this instance is not passed to them.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 160MB/s]


In [9]:
def evaluate_model(model, data_loader, criterion):
    """Compute the mean loss and binary F1 score of ``model`` on ``data_loader``.

    The training function below calls this helper, but no other cell of the
    notebook defines it, so it is defined here to keep Restart & Run All working.

    Args:
        model: Model producing one logit per sample (output shape ``(B, 1)``).
        data_loader: DataLoader yielding ``(images, labels)`` batches.
        criterion: Loss applied to ``(logits, labels)``.

    Returns:
        Tuple ``(mean_loss, f1)``; ``(0.0, 0.0)`` when the loader yields no samples.
    """
    device = next(model.parameters()).device
    model.eval()

    total_loss = 0.0
    targets, predictions = [], []
    with torch.no_grad():
        for images, labels in data_loader:
            images = images.to(device)
            labels = labels.to(device).float().unsqueeze(1)

            logits = model(images)
            total_loss += criterion(logits, labels).item() * images.size(0)

            # Threshold the sigmoid probability at 0.5 to get hard 0/1 predictions.
            predictions.extend((torch.sigmoid(logits) > 0.5).long().view(-1).tolist())
            targets.extend(labels.long().view(-1).tolist())

    if not targets:
        return 0.0, 0.0
    return total_loss / len(targets), f1_score(targets, predictions)


# Simplified Train Function Using Predefined Loaders
def train_model_with_loaders(model, train_loader, val_loader, criterion, optimizer_class, num_epochs=10):
    """Train ``model`` on ``train_loader`` and report validation metrics each epoch.

    Args:
        model: PyTorch model to train; batches are moved to the device its
            parameters live on.
        train_loader: DataLoader yielding ``(images, labels)`` training batches.
        val_loader: DataLoader used for per-epoch validation.
        criterion: Loss function applied to ``(outputs, labels)``.
        optimizer_class: Callable that takes the model parameters and returns
            an optimizer (a factory, despite the name).
        num_epochs: Number of training epochs.

    Returns:
        None. Progress and final metrics are printed.
    """
    # Use the model's own device instead of relying on a notebook-level global.
    device = next(model.parameters()).device

    # Initialize optimizer
    optimizer = optimizer_class(model.parameters())

    val_loss = val_f1 = None
    for epoch in range(num_epochs):
        model.train()  # evaluate_model switches to eval mode, so reset every epoch
        train_loss = 0.0

        # Training loop
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device).float().unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch average is per sample.
            train_loss += loss.item() * images.size(0)

        # Validation loop
        val_loss, val_f1 = evaluate_model(model, val_loader, criterion)
        print(f"Epoch {epoch + 1}/{num_epochs}, "
              f"Train Loss: {train_loss / len(train_loader.dataset):.4f}, "
              f"Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")

    # Final evaluation: the model is unchanged since the last epoch's
    # validation, so reuse those numbers; evaluate only if no epoch ran.
    if val_loss is None:
        val_loss, val_f1 = evaluate_model(model, val_loader, criterion)
    print(f"Final Results - Val Loss: {val_loss:.4f}, Val F1: {val_f1:.4f}")

# Train the model using predefined loaders
def adam_factory(params):
    """Build an Adam optimizer (lr=0.001) over ``params``."""
    return torch.optim.Adam(params, lr=0.001)


train_model_with_loaders(
    model,
    train_loader,
    val_loader,
    criterion,
    optimizer_class=adam_factory,
    num_epochs=50,
)

KeyboardInterrupt: 

In [10]:
from torch.utils.data import DataLoader, ConcatDataset

def train_model_on_full_data_with_progress(model, train_loader, val_loader, criterion, optimizer_class,
                                           num_epochs=10, log_every=10):
    """
    Train the model on the combined dataset (train + validation) and display progress for each epoch.

    Args:
    - model: PyTorch model to train; batches are moved to the device its parameters live on.
    - train_loader: DataLoader for training dataset (its batch size is reused for the combined loader).
    - val_loader: DataLoader for validation dataset; its dataset is merged into the training data.
    - criterion: Loss function.
    - optimizer_class: Function that takes the model parameters and returns an optimizer.
    - num_epochs: Number of training epochs.
    - log_every: Print the current batch loss every this many batches; 0 or None disables batch logs.

    Returns:
    - None
    """
    # Use the model's own device instead of relying on a notebook-level global.
    device = next(model.parameters()).device

    # Combine train and validation datasets
    combined_dataset = ConcatDataset([train_loader.dataset, val_loader.dataset])
    combined_loader = DataLoader(combined_dataset, batch_size=train_loader.batch_size, shuffle=True)
    num_batches = len(combined_loader)

    # Initialize optimizer
    optimizer = optimizer_class(model.parameters())

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        # Training loop on combined dataset (batch_idx is 1-based for readable logs)
        for batch_idx, (images, labels) in enumerate(combined_loader, 1):
            images, labels = images.to(device), labels.to(device).float().unsqueeze(1)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Weight by batch size so the epoch average is per sample.
            train_loss += loss.item() * images.size(0)

            # Periodic progress log
            if log_every and batch_idx % log_every == 0:
                print(f"Epoch [{epoch + 1}/{num_epochs}] Batch [{batch_idx}/{num_batches}] "
                      f"Loss: {loss.item():.4f}")

        # Epoch summary
        epoch_loss = train_loss / len(combined_loader.dataset)
        print(f"Epoch [{epoch + 1}/{num_epochs}] Completed. Average Loss: {epoch_loss:.4f}")

    print("Training on full dataset complete.")


def build_full_data_optimizer(params):
    """Build an Adam optimizer (lr=0.001) over ``params``."""
    return torch.optim.Adam(params, lr=0.001)


# Continue training the same model on train + validation data combined.
train_model_on_full_data_with_progress(
    model,
    train_loader,
    val_loader,
    criterion,
    build_full_data_optimizer,
    num_epochs=50,
)

Epoch [1/50] Batch [10/55] Loss: 0.1740
Epoch [1/50] Batch [20/55] Loss: 0.0885
Epoch [1/50] Batch [30/55] Loss: 0.0857
Epoch [1/50] Batch [40/55] Loss: 0.0861
Epoch [1/50] Batch [50/55] Loss: 0.0380
Epoch [1/50] Completed. Average Loss: 0.1113
Epoch [2/50] Batch [10/55] Loss: 0.0432
Epoch [2/50] Batch [20/55] Loss: 0.0149
Epoch [2/50] Batch [30/55] Loss: 0.0059
Epoch [2/50] Batch [40/55] Loss: 0.2251
Epoch [2/50] Batch [50/55] Loss: 0.0535
Epoch [2/50] Completed. Average Loss: 0.0490
Epoch [3/50] Batch [10/55] Loss: 0.0591
Epoch [3/50] Batch [20/55] Loss: 0.0219
Epoch [3/50] Batch [30/55] Loss: 0.0047
Epoch [3/50] Batch [40/55] Loss: 0.0167
Epoch [3/50] Batch [50/55] Loss: 0.0196
Epoch [3/50] Completed. Average Loss: 0.0374
Epoch [4/50] Batch [10/55] Loss: 0.0199
Epoch [4/50] Batch [20/55] Loss: 0.0173
Epoch [4/50] Batch [30/55] Loss: 0.0617
Epoch [4/50] Batch [40/55] Loss: 0.0604
Epoch [4/50] Batch [50/55] Loss: 0.0052
Epoch [4/50] Completed. Average Loss: 0.0203
Epoch [5/50] Batch [

KeyboardInterrupt: 

In [11]:
# Filter only image files. os.listdir returns entries in arbitrary order, so
# sort them to make the row order (and the output file) deterministic.
test_files = sorted(
    f for f in os.listdir(test_dir) if f.lower().endswith(('.jpeg', '.jpg', '.png'))
)

# Create test DataFrame
test_data = pd.DataFrame(test_files, columns=['file_name'])

# Create test dataset and dataloader (no labels, no noise, fixed order so
# predictions line up with the rows of test_data)
test_dataset = ImageDataset(test_data, test_dir, transform=image_transforms, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Predict on test dataset
model.eval()
test_preds = []

with torch.no_grad():
    for images in test_loader:  # No labels in test data
        images = images.to(device)
        outputs = model(images)
        # The model emits one logit per image with shape (B, 1). Flatten to
        # (B,) so the collected predictions form a 1-D array rather than an
        # (N, 1) array that only works as a column by pandas' leniency.
        preds = (torch.sigmoid(outputs) > 0.5).reshape(-1).cpu().numpy()
        test_preds.extend(preds)

# Save predictions in the answer_sample format
test_data['bad'] = np.array(test_preds).astype(int)  # Add predictions as 'bad'
output_df = test_data[['file_name', 'bad']]          # Ensure columns match answer_sample

# Sort by file_name
output_df = output_df.sort_values(by='file_name').reset_index(drop=True)

# Save to CSV
output_path = "final_predictions.csv"
output_df.to_csv(output_path, index=False)

print(f"Predictions saved to {output_path}")


Predictions saved to final_predictions.csv
