In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn, optim
from transformers import AutoImageProcessor, AutoModelForImageClassification
from torchvision import transforms
from PIL import Image
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
from ultralytics import YOLO
from PIL import Image
import numpy as np
import cv2

In [2]:
object_det_model = YOLO("yolov8x.pt")

In [3]:
def rotate(img, angle):
    if angle == 0:
        return img
    h, w = img.size[1], img.size[0]
    m = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1)
    rad = np.radians(angle)
    nw = int(abs(np.sin(rad)) * h + abs(np.cos(rad)) * w)
    nh = int(abs(np.cos(rad)) * h + abs(np.sin(rad)) * w)
    m[0, 2] += (nw - w) / 2
    m[1, 2] += (nh - h) / 2
    return Image.fromarray(cv2.warpAffine(np.array(img), m, (nw, nh)))

In [4]:
def detect_best_class(model, img_path):
    img = Image.open(img_path).convert("RGB")
    angles = [0, 45, 90, 180]

    best = {"cls": None, "conf": 0}

    for a in angles:
        rimg = rotate(img, a)
        res = model(rimg)[0]

        if res.boxes:
            for b in res.boxes:
                conf = float(b.conf)
                cls = int(b.cls)

                if conf > best["conf"]:
                    best.update({"cls": cls, "conf": conf})

    if best["cls"] is None:
        return "None"

    return model.names[best["cls"]]

In [5]:
class AddGaussianNoise(object):
    def __init__(self, mean=0., std=0.05):
        self.mean = mean
        self.std = std

    def __call__(self, tensor):
        noise = torch.randn(tensor.size()) * self.std + self.mean
        return torch.clamp(tensor + noise, 0., 1.)

In [6]:
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),

    # 무작위 회전 (30, 45, 60도 중 하나)
    transforms.RandomChoice([
        transforms.RandomRotation(30),
        transforms.RandomRotation(45),
        transforms.RandomRotation(60),
        transforms.RandomRotation(75),
        transforms.RandomRotation(90),
        transforms.RandomRotation(120),
        transforms.RandomRotation(150),
        transforms.RandomRotation(180),
        transforms.RandomRotation(210),
        transforms.RandomRotation(240),
        transforms.RandomRotation(270),
        transforms.RandomRotation(300)
    ]),

    # 좌우 / 상하 반전
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.5),

    # 30% 확률로 흑백화
    transforms.RandomApply([transforms.Grayscale(num_output_channels=3)], p=0.3),

    transforms.ToTensor(),

    # 가우시안 노이즈 추가
    AddGaussianNoise(0., 0.05),
])

# 검증 및 테스트용은 변형 최소화 (노이즈, 회전 등 제외)
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

In [7]:
class CustomImageDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None, label_encoder=None):
        self.data = pd.read_csv(csv_file)
        self.img_dir = img_dir
        self.transform = transform
        self.label_encoder = label_encoder

        if 'target' in self.data.columns and self.label_encoder is not None:
            self.data['target'] = self.label_encoder.transform(self.data['target'])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = f"{self.img_dir}/{self.data.iloc[idx, 0]}"
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        if 'target' in self.data.columns:
            label = torch.tensor(self.data.iloc[idx, 1], dtype=torch.long)
            return image, label
        else:
            return image, -1

In [8]:
train_df = pd.read_csv("/root/CV_/datasets/data/train.csv")
le = LabelEncoder()
train_df['target'] = le.fit_transform(train_df['target'])

train_dataset = CustomImageDataset(
    csv_file="/root/CV_/datasets/data/train.csv",
    img_dir="/root/CV_/datasets/data/train",
    transform=train_transform,
    label_encoder=le
)

test_dataset = CustomImageDataset(
    csv_file="/root/CV_/datasets/data/sample_submission.csv",
    img_dir="/root/CV_/datasets/data/test",
    transform=test_transform,
    label_encoder=le
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)


In [9]:
processor = AutoImageProcessor.from_pretrained("Falconsai/nsfw_image_detection")
model = AutoModelForImageClassification.from_pretrained("Falconsai/nsfw_image_detection")

num_classes = len(le.classes_)
if model.config.num_labels != num_classes:
    if hasattr(model, 'classifier'):
        in_features = model.classifier.in_features
        model.classifier = nn.Linear(in_features, num_classes)
    elif hasattr(model, 'score'):
        in_features = model.score.in_features
        model.score = nn.Linear(in_features, num_classes)
    else:
        print("모델 구조 확인 필요 - 마지막 레이어 이름 다를 수 있음")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

In [None]:
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images).logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} - Loss: {avg_loss:.4f}")


Epoch 1/30 - Loss: 1.9747
Epoch 2/30 - Loss: 1.0046
Epoch 3/30 - Loss: 0.6729
Epoch 4/30 - Loss: 0.5405
Epoch 5/30 - Loss: 0.4572
Epoch 6/30 - Loss: 0.4107
Epoch 7/30 - Loss: 0.3200
Epoch 8/30 - Loss: 0.2906
Epoch 9/30 - Loss: 0.2553
Epoch 10/30 - Loss: 0.2817
Epoch 11/30 - Loss: 0.2586
Epoch 12/30 - Loss: 0.2870
Epoch 13/30 - Loss: 0.2187
Epoch 14/30 - Loss: 0.2154


In [None]:
model.eval()
all_preds = []

with torch.no_grad():
    for images, _ in test_loader:
        images = images.to(device)

        for img_tensor in images:
            img_np = (img_tensor.cpu().permute(1, 2, 0).numpy() * 255).astype(np.uint8)
            pil_img = Image.fromarray(img_np)

            yolo_res = object_det_model(pil_img)[0]

            detected_car = False

            if yolo_res.boxes:
                for box in yolo_res.boxes:
                    cls = int(box.cls[0])
                    class_name = object_det_model.names[cls]

                    if class_name == "car":
                        detected_car = True
                        break

            if detected_car:
                all_preds.append(16)
            else:
                img_batch = img_tensor.unsqueeze(0).to(device)
                outputs = model(img_batch).logits
                pred_class = outputs.argmax(dim=1).cpu().item()
                all_preds.append(pred_class)

pred_labels = le.inverse_transform(all_preds)

result = pd.read_csv('/root/CV_/datasets/data/sample_submission.csv')
result['target'] = pred_labels
result.to_csv('add_aug_ob_det_vit_output.csv', index=False)
print("✅ 저장 완료: vit_output.csv")


0: 640x640 (no detections), 16.0ms
Speed: 1.7ms preprocess, 16.0ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 16.0ms
Speed: 1.5ms preprocess, 16.0ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 laptop, 16.0ms
Speed: 1.4ms preprocess, 16.0ms inference, 14.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 16.0ms
Speed: 1.5ms preprocess, 16.0ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 1 clock, 16.0ms
Speed: 1.4ms preprocess, 16.0ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 16.0ms
Speed: 1.4ms preprocess, 16.0ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 16.0ms
Speed: 1.4ms preprocess, 16.0ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 640)

0: 640x640 (no detections), 16.0ms
Speed: 1.4ms preprocess, 16.0ms inference, 0.4m