In [2]:
import torch
import torchvision.transforms as T
from timm import create_model
from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import ImageFolder
import yaml
import os

class FaceDataset(Dataset):
    """Image dataset that pairs every labelled image with a face crop.

    Expected layout under ``data_folder``:

    * ``images/<class_name>/<file>`` -- read through ``ImageFolder``.
    * ``labels/<image stem>.txt``    -- first line holds five whitespace
      separated fields; the first one is ignored and the remaining four are
      parsed as ``x y size_x size_y``.

    Images without a usable label file are skipped, and the dataset length
    only counts the labelled images, so image and coordinates stay aligned.

    Args:
        data_folder: Root folder containing ``images`` and ``labels``.
        model: Stored on the instance only; the dataset never calls it.
        transform: Transform forwarded to ``ImageFolder``.
        center_size: Divisor applied to the raw ``x`` / ``y`` label values.
            When ``None`` the module-level ``center_size`` variable is used if
            it exists, otherwise ``1.0``.
        verbose: When true, ``__getitem__`` prints the per-sample debug lines.
    """

    def __init__(self, data_folder, model=None, transform=None, center_size=None, verbose=True):
        self.data_folder = data_folder
        self.model = model
        self.center_size = center_size
        self.verbose = verbose
        self.image_folder = ImageFolder(os.path.join(data_folder, 'images'), transform=transform)
        # Indices into ``self.image_folder`` that have a usable label file;
        # filled in by ``load_face_coordinates``.
        self.valid_indices = []
        self.face_coordinates = self.load_face_coordinates()

    def load_face_coordinates(self):
        """Read the label file of every image and return the usable boxes.

        Returns:
            A list of ``[x / center_size, y / center_size, size_x, size_y]``
            entries, one per image that has a well-formed label file.

        Side effects:
            Sets ``self.valid_indices`` so that ``valid_indices[i]`` is the
            ``ImageFolder`` index belonging to the ``i``-th returned entry.
        """
        scale = getattr(self, 'center_size', None)
        if scale is None:
            # Backward compatible fallback: the notebook defines ``center_size``
            # at module level before the dataset is instantiated.
            scale = globals().get('center_size', 1.0)

        face_coordinates = []
        valid_indices = []
        labels_folder = os.path.join(self.data_folder, 'labels')

        for sample_index, (path, _) in enumerate(self.image_folder.samples):
            image_name = os.path.basename(path)
            # splitext handles any image extension, not only ".jpg".
            stem, _ext = os.path.splitext(image_name)
            face_txt_path = os.path.join(labels_folder, stem + '.txt')

            if not os.path.exists(face_txt_path):
                print(f"파일이 없습니다 : {image_name}")
                continue

            with open(face_txt_path, 'r') as f:
                fields = f.readline().strip().split()

            try:
                if len(fields) != 5:
                    raise ValueError("expected 5 fields")
                x, y, size_x, size_y = map(float, fields[1:])
            except ValueError:
                # The file exists but its first line cannot be parsed.
                print(f"라벨 형식이 올바르지 않습니다 : {face_txt_path}")
                continue

            # NOTE(review): x / y are divided by ``center_size`` and later
            # multiplied by the resized width / height, i.e. they are treated
            # as pixel values here while size_x / size_y are treated as
            # already normalised. Confirm this matches the label format.
            face_coordinates.append([x / scale, y / scale, size_x, size_y])
            valid_indices.append(sample_index)

        self.valid_indices = valid_indices
        return face_coordinates

    def __getitem__(self, index):
        """Return ``(resized_image, resized_face_img, face_coords, label)``.

        ``resized_image`` is the image resized to 300x300, ``resized_face_img``
        is the crop described by ``face_coords`` resized to 224x224,
        ``face_coords`` is the stored 4-element list and ``label`` is the
        ``ImageFolder`` class index.
        """
        # Map the dataset index to the matching ImageFolder sample so that the
        # image and the coordinates always belong to the same file.
        face_coords = self.face_coordinates[index]
        image, label_tensor = self.image_folder[self.valid_indices[index]]

        if isinstance(label_tensor, (int, float)):
            label = label_tensor
        else:
            label = label_tensor.item()

        if not torch.is_tensor(image):
            # No tensor-producing transform was supplied; convert here so the
            # shape handling below works.
            image = T.functional.to_tensor(image)

        H, W = image.shape[-2:]
        if self.verbose:
            print(f"Original Image size: ({H}, {W}, 3)")

        resized_image = T.functional.resize(image, (300, 300))
        resized_H, resized_W = resized_image.shape[1:3]

        x_scaled, y_scaled, size_x_scaled, size_y_scaled = face_coords
        left = int(x_scaled * resized_W)
        top = int(y_scaled * resized_H)
        # Keep the crop at least one pixel wide / high.
        width = max(int(size_x_scaled * resized_W), 1)
        height = max(int(size_y_scaled * resized_H), 1)

        face_img = T.functional.crop(resized_image, top=top, left=left, height=height, width=width)

        # 224x224 is the input size used for the Swin Transformer model.
        resized_face_img = T.functional.resize(face_img, (224, 224))

        if self.verbose:
            print(f"얼굴 좌표값 : {face_coords}")
            print(f"label : {label}")

        return resized_image, resized_face_img, face_coords, label

    def __len__(self):
        """Number of images that have usable face coordinates."""
        return len(self.face_coordinates)

# Read the experiment configuration from data.yaml.
with open('./data.yaml', 'r', encoding='UTF8') as f:
    config = yaml.safe_load(f)

# Divisor applied to the raw x / y label values; falls back to 1.0 when the
# config has no 'img_size' entry.
center_size = config.get('img_size', 1.0)

# Pick the GPU when one is available.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Pretrained Swin Transformer with a single output logit, placed on `device`.
swin_model = create_model(
    'swin_base_patch4_window7_224',
    pretrained=True,
    num_classes=1,
).to(device)

# Show the input size the model config declares, then build the data pipeline.
print(swin_model.default_cfg['input_size'])
transform = T.Compose([T.ToTensor()])
face_dataset = FaceDataset('./PJT_MCL/train', model=swin_model, transform=transform)
dataloader = DataLoader(
    face_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=0,
)

# Binary classification objective on the single logit, optimised with AdamW.
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(swin_model.parameters(), lr=1e-4)

# Train for 10 passes over the dataloader, then store the weights.
for epoch in range(10):
    for batch in dataloader:
        resized_images, face_images, face_coords, labels = batch

        # Move the image tensors to the training device.
        resized_images = resized_images.to(device)
        face_images = face_images.to(device)

        print(f"얼굴 좌표값(훈련) : {face_coords}")

        # face_coords arrives as a list of tensors (one per coordinate field);
        # each one is moved to the device individually.
        face_coords = [coord.to(device) for coord in face_coords]

        # Forward pass on the cropped face images.
        outputs = swin_model(face_images)
        print(f"결과값 : {outputs}")

        # Shape the labels like the model output (extra dim, float dtype).
        labels = labels.to(device).unsqueeze(1).float()
        print(f"labels : {labels}")

        loss = criterion(outputs, labels)
        print(f"loss : {loss}")

        # Clear old gradients, back-propagate and update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

torch.save(swin_model.state_dict(), 'swin_model_ver3.pt')

(3, 224, 224)
Original Image size: (300, 300, 3)
얼굴 좌표값 : [0.0014333333333333333, 0.0010666666666666667, 0.03666666666666667, 0.03666666666666667]
label : 0
Original Image size: (300, 300, 3)
얼굴 좌표값 : [0.0022444444444444443, 1.1111111111111112e-05, 0.07666666666666666, 0.07666666666666666]
label : 0
Original Image size: (300, 300, 3)
얼굴 좌표값 : [0.0021, 0.00043333333333333337, 0.12666666666666668, 0.12666666666666668]
label : 0
Original Image size: (300, 300, 3)
얼굴 좌표값 : [0.0022, 0.00024444444444444443, 0.023333333333333334, 0.023333333333333334]
label : 0
얼굴 좌표값(훈련) : [tensor([0.0014, 0.0022, 0.0021, 0.0022], dtype=torch.float64), tensor([1.0667e-03, 1.1111e-05, 4.3333e-04, 2.4444e-04], dtype=torch.float64), tensor([0.0367, 0.0767, 0.1267, 0.0233], dtype=torch.float64), tensor([0.0367, 0.0767, 0.1267, 0.0233], dtype=torch.float64)]
결과값 : tensor([[0.0199],
        [0.1223],
        [0.1880],
        [0.1685]], grad_fn=<AddmmBackward0>)
labels : tensor([[0.],
        [0.],
        [0.],
 

Swin Transformer 모델 적용

In [4]:
import torch
from timm import create_model

# Choose the device first so the weights can be mapped straight onto it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Same architecture as in training, without downloading pretrained weights.
swin_model = create_model(
    'swin_base_patch4_window7_224',
    pretrained=False,
    num_classes=1,
)
swin_model.to(device)

# Restore the weights saved by the training cell and switch to eval mode.
saved_weights = torch.load('swin_model_ver3.pt', map_location=device)
swin_model.load_state_dict(saved_weights)
swin_model.eval()

SwinTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 128, kernel_size=(4, 4), stride=(4, 4))
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  )
  (layers): Sequential(
    (0): SwinTransformerStage(
      (downsample): Identity()
      (blocks): Sequential(
        (0): SwinTransformerBlock(
          (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (attn): WindowAttention(
            (qkv): Linear(in_features=128, out_features=384, bias=True)
            (attn_drop): Dropout(p=0.0, inplace=False)
            (proj): Linear(in_features=128, out_features=128, bias=True)
            (proj_drop): Dropout(p=0.0, inplace=False)
            (softmax): Softmax(dim=-1)
          )
          (drop_path1): Identity()
          (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
          (mlp): Mlp(
            (fc1): Linear(in_features=128, out_features=512, bias=True)
            (act): GELU(approximate='none')
            (

In [12]:
import torch
import torchvision.transforms as T
import cv2
from PIL import Image
import numpy as np

# Rebuild the network and restore the weights saved by the training cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
swin_model = create_model('swin_base_patch4_window7_224', pretrained=False, num_classes=1)
swin_model.load_state_dict(torch.load('swin_model_ver3.pt', map_location=device))
swin_model.to(device)
swin_model.eval()

# Image to run the model on.
image_path = './images/248.jpg'

# Preprocessing: resize the whole image to the 224x224 model input.
transform = T.Compose([T.Resize((224, 224)), T.ToTensor()])
image = Image.open(image_path).convert('RGB')
input_image = transform(image).unsqueeze(0).to(device)

with torch.no_grad():
    output = swin_model(input_image)

probability = torch.sigmoid(output)

# Flatten the outputs of the single image in the batch.
model_outputs = probability[0].detach().cpu().numpy().reshape(-1)
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

if model_outputs.size == 1:
    # The model above is built with num_classes=1, so its only output is a
    # class probability, not a box. Casting it to int (as a pixel coordinate)
    # always yields 0, so the score is written onto the image instead.
    # NOTE(review): which class the probability refers to depends on the
    # ImageFolder class ordering used in training -- confirm before labelling.
    score = float(model_outputs[0])
    cv2.putText(image_np, f"p = {score:.3f}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0, 0, 255), 2)
elif model_outputs.size == 4:
    # A 4-value head is interpreted as a normalised (x, y, width, height) box.
    x_scaled, y_scaled, size_x_scaled, size_y_scaled = model_outputs
    x = int(x_scaled * image.width)
    y = int(y_scaled * image.height)
    w = int(size_x_scaled * image.width)
    h = int(size_y_scaled * image.height)
    cv2.rectangle(image_np, (x, y), (x + w, y + h), (0, 0, 255), 2)
else:
    raise ValueError(f"Unexpected number of model outputs: {model_outputs.size}")

# Opens a native OpenCV window; any key press closes it.
cv2.imshow('Image with Face Detection', image_np)
cv2.waitKey(0)
cv2.destroyAllWindows()


In [None]:
# Build the test dataset.
# Use an explicit ToTensor-only transform (the same preprocessing the training
# cell used) instead of whatever `transform` the previous cell left behind,
# and pass `model`, which FaceDataset expects.
test_transform = T.Compose([T.ToTensor()])
test_face_dataset = FaceDataset(config['test'], model=swin_model, transform=test_transform)
test_dataloader = DataLoader(test_face_dataset, batch_size=4, shuffle=False, num_workers=0)

# Evaluate the model.
swin_model.eval()
correct_predictions = 0
total_samples = 0

with torch.no_grad():
    # FaceDataset yields four items per sample; only the face crops and the
    # labels are needed here.
    for resized_images, face_images, face_coords, labels in test_dataloader:
        # The model input must live on the same device as the model.
        face_images = face_images.to(device)
        labels = labels.to(device)

        # Predict on the cropped faces.
        outputs = swin_model(face_images)

        # Threshold the sigmoid probability at 0.5 and compare with the labels.
        predictions = torch.round(torch.sigmoid(outputs))
        correct_predictions += (predictions == labels.unsqueeze(1).float()).sum().item()
        total_samples += labels.size(0)

# Report accuracy; an empty dataset would otherwise fail with ZeroDivisionError.
if total_samples == 0:
    raise RuntimeError("Test dataset produced no samples; cannot compute accuracy.")

accuracy = correct_predictions / total_samples
print(f"Test Accuracy: {accuracy * 100:.2f}%")


In [None]:
# RT-DETR 모델 적용

import torch
import numpy as np
import cv2
import time
from ultralytics import RTDETR
import supervision as sv
class DETRClass:
    """Runs an RT-DETR model on frames from a video capture device.

    Calling the instance opens the capture device, annotates every frame with
    the predicted boxes and an FPS counter, and shows it in an OpenCV window
    until ``q`` is pressed or no more frames can be read.
    """

    def __init__(self, capture_index):
        """Load the ``rtdetr-l.pt`` model and prepare the box annotator.

        Args:
            capture_index: Value passed to ``cv2.VideoCapture``.
        """
        self.capture_index=capture_index
        # Stored for reference; the model is not explicitly moved to it here.
        self.device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = RTDETR('rtdetr-l.pt')
        self.CLASS_NAMES_DICT=self.model.model.names
        self.box_annotator=sv.BoxAnnotator(sv.ColorPalette.default(),thickness=3, text_thickness=3, text_scale=1.5)

    def plot_bboxs(self, results, frame):
        """Draw the detections of ``results[0]`` onto ``frame`` and return it.

        Also stores the generated ``"<class name> <confidence>"`` strings in
        ``self.labels``.
        """
        boxes=results[0].boxes.cpu().numpy()
        class_ids=boxes.cls.astype(np.int32)
        detections=sv.Detections(xyxy=boxes.xyxy, class_id=class_ids, confidence=boxes.conf)
        self.labels = [
            f"{self.CLASS_NAMES_DICT[class_id]} {confidence: .2f}"
            for class_id, confidence in zip(detections.class_id, detections.confidence)
        ]
        frame=self.box_annotator.annotate(frame, detections, self.labels)
        return frame

    def __call__(self):
        """Run the capture / predict / display loop until stopped."""
        cap=cv2.VideoCapture(self.capture_index)
        assert cap.isOpened()
        try:
            cap.set(cv2.CAP_PROP_FRAME_WIDTH,1280)
            cap.set(cv2.CAP_PROP_FRAME_HEIGHT,720)
            while cap.isOpened():
                start_time=time.perf_counter()
                ret, frame=cap.read()
                if not ret:
                    # No frame was delivered (device unplugged / stream ended);
                    # stop instead of passing None to the model.
                    break
                results=self.model.predict(frame)
                frame=self.plot_bboxs(results, frame)
                elapsed=time.perf_counter()-start_time
                # Guard against a zero-length interval on coarse timers.
                fps=1/elapsed if elapsed > 0 else 0.0
                cv2.putText(frame, f"FPS: {fps:.2f}",(20,70), cv2.FONT_HERSHEY_SCRIPT_SIMPLEX,1,(0,255,0),2)
                cv2.imshow("DETR",frame)
                if cv2.waitKey(1)==ord('q'):
                    break
        finally:
            # Always free the capture device and close the window, even when
            # prediction or drawing raises.
            cap.release()
            cv2.destroyAllWindows()
# Create the detector for capture index 0 and start its display loop; the loop
# ends when 'q' is pressed in the OpenCV window.
# NOTE(review): `trasformer_detector` is presumably a typo for
# `transformer_detector`; the name is kept so existing references still work.
trasformer_detector=DETRClass(0)
trasformer_detector()