In [1]:
# 필요한 라이브러리 설치
!git clone https://github.com/KaiyangZhou/Dassl.pytorch.git
%cd Dassl.pytorch/
!pip install -r requirements.txt
!cp -r dassl ../
%cd ..

Cloning into 'Dassl.pytorch'...
remote: Enumerating objects: 2477, done.[K
remote: Counting objects: 100% (993/993), done.[K
remote: Compressing objects: 100% (288/288), done.[K
remote: Total 2477 (delta 777), reused 861 (delta 705), pack-reused 1484 (from 1)[K
Receiving objects: 100% (2477/2477), 428.00 KiB | 572.00 KiB/s, done.
Resolving deltas: 100% (1658/1658), done.
/Users/and___young/Documents/@24-2/DeepLearning/COSE474_NayoungKim/FinalProject/Dassl.pytorch



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
/Users/and___young/Documents/@24-2/DeepLearning/COSE474_NayoungKim/FinalProject


In [5]:
# 기본 import
import os.path as osp
from collections import OrderedDict
import math
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.cuda.amp import GradScaler, autocast
from dassl.utils import load_pretrained_weights, load_checkpoint
from dassl.optim import build_optimizer, build_lr_scheduler
from clip import clip
from clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
import time

_tokenizer = _Tokenizer()

In [8]:
# CLIP 모델 로드 함수 정의

def load_clip_to_cpu(cfg):
    backbone_name = cfg.MODEL.BACKBONE.NAME
    url = clip._MODELS[backbone_name]
    model_path = clip._download(url, root="./clip_checkpoints")  # 저장 경로 지정

    try:
        model = torch.jit.load(model_path, map_location="cpu").eval()
        state_dict = None
    except RuntimeError:
        state_dict = torch.load(model_path, map_location="cpu")

    model = clip.build_model(state_dict or model.state_dict())
    return model

In [9]:
# 설정 및 모델 로드

from dassl.config import get_cfg_default
cfg = get_cfg_default()
cfg.MODEL.BACKBONE.NAME = "ViT-B/16"  # CLIP의 vision encoder backbone 설정
clip_model = load_clip_to_cpu(cfg)

100%|███████████████████████████████████████| 335M/335M [00:11<00:00, 29.9MiB/s]


In [10]:
# 텍스트 인코더 클래스 정의

class TextEncoder(nn.Module):
    def __init__(self, clip_model):
        super().__init__()
        self.transformer = clip_model.transformer
        self.positional_embedding = clip_model.positional_embedding
        self.ln_final = clip_model.ln_final
        self.text_projection = clip_model.text_projection
        self.dtype = clip_model.dtype

    def forward(self, prompts, tokenized_prompts):
        x = prompts + self.positional_embedding.type(self.dtype)
        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD
        x = self.ln_final(x).type(self.dtype)
        x = x[torch.arange(x.shape[0]), tokenized_prompts.argmax(dim=-1)] @ self.text_projection
        return x

In [11]:
# 클래스 이름 및 프롬프트 설정

# 데이터셋의 클래스 정의
classnames = ['AnnualCrop', 'Forest', 'HerbaceousVegetation', 'Highway', 
              'Industrial', 'Pasture', 'PermanentCrop', 'Residential', 
              'River', 'SeaLake']

# 클래스 이름에서 언더스코어 제거
classnames = [name.replace("_", " ") for name in classnames]

# 각 클래스 이름의 토큰 길이 계산
name_lens = [len(_tokenizer.encode(name)) for name in classnames]

# 프롬프트 생성
prompts = ["A photo of " + name + "." for name in classnames]

# 프롬프트 토큰화
tokenized_prompts = torch.cat([clip.tokenize(p) for p in prompts])

# 예시로 첫 번째 프롬프트 출력
print("프롬프트 예시:", prompts[1])
print("토큰화된 프롬프트 예시:", tokenized_prompts[1])

프롬프트 예시: A photo of Forest.
토큰화된 프롬프트 예시: tensor([49406,   320,  1125,   539,  4167,   269, 49407,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0], dtype=torch.int32)


In [13]:
# 필요한 추가 라이브러리 import

import torchvision
from torchvision import transforms

# CIFAR-100 데이터셋 다운로드 및 로드
transform = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), 
                       (0.26862954, 0.26130258, 0.27577711))
])

test_dataset = torchvision.datasets.CIFAR100(
    root='./data', 
    train=False,
    download=True, 
    transform=transform
)

Downloading https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz to ./data/cifar-100-python.tar.gz


  0%|          | 0/169001437 [00:00<?, ?it/s]

Extracting ./data/cifar-100-python.tar.gz to ./data


In [16]:
# 데이터 전처리 및 로더 정의

def preprocess_image(image_path):
    # CLIP 모델에 맞는 이미지 전처리
    preprocess = transforms.Compose([
        transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), 
                           (0.26862954, 0.26130258, 0.27577711))
    ])
    
    image = Image.open(image_path).convert('RGB')
    return preprocess(image)

class ImageDataset:
    def __init__(self, image_dir):
        self.image_paths = list(Path(image_dir).glob('*/*'))  # 모든 하위 폴더의 이미지
        self.actual_labels = [path.parent.name for path in self.image_paths]  # 폴더명을 레이블로
        
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        image = preprocess_image(image_path)
        label = self.actual_labels[idx]
        return image, label

In [19]:
# 추론 함수 정의

def inference_cifar(model, dataset, classnames, device='cuda'):
    print(f"Moving model to {device}...")
    model = model.to(device)  # 모델을 GPU로
    model.eval()
    
    # 텍스트 특징 추출
    print("Extracting text features...")
    text_inputs = torch.cat([clip.tokenize(f"A photo of a {c}") for c in classnames]).to(device)
    with torch.no_grad():
        text_features = model.encode_text(text_inputs)
        text_features /= text_features.norm(dim=-1, keepdim=True)
    
    correct = 0
    total = 0
    predictions = []
    
    # 데이터로더 생성 - num_workers 추가
    dataloader = torch.utils.data.DataLoader(
        dataset, 
        batch_size=128,  # 배치 사이즈 증가
        shuffle=False,
        num_workers=4,   # 데이터 로딩 병렬화
        pin_memory=True  # GPU 전송 최적화
    )
    
    print("Starting inference...")
    with torch.no_grad():
        for images, labels in dataloader:
            # 데이터를 GPU로
            images = images.to(device)
            labels = labels.to(device)
            
            # 이미지 특징 추출
            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            
            # 유사도 계산
            similarity = (100.0 * image_features @ text_features.T)
            
            # 예측값 구하기
            _, predicted = similarity.max(1)
            
            # 정확도 계산
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
            
            # 예측 결과 저장
            for i in range(len(labels)):
                predictions.append({
                    'true': dataset.classes[labels[i].cpu().item()],
                    'predicted': dataset.classes[predicted[i].cpu().item()],
                    'confidence': similarity[i].max().item()
                })
    
    accuracy = 100 * correct / total
    return accuracy, predictions

In [20]:
# 실제 추론 실행

# GPU 사용 가능 여부 확인 및 설정
if torch.cuda.is_available():
    device = "cuda"
    print("Using GPU:", torch.cuda.get_device_name(0))
else:
    device = "cpu"
    print("Using CPU")

# CIFAR-100 클래스명 가져오기
cifar100_classnames = test_dataset.classes

# 시작 시간 기록
start_time = time.time()

# 추론 실행
accuracy, predictions = inference_cifar(clip_model, test_dataset, cifar100_classnames, device)

# 종료 시간 기록
end_time = time.time()

# 결과 출력
print(f"\nInference completed in {end_time - start_time:.2f} seconds")
print(f"Overall Accuracy: {accuracy:.2f}%")
print("\nSample Predictions:")
for i, pred in enumerate(predictions[:5]):
    print(f"Image {i+1}:")
    print(f"True label: {pred['true']}")
    print(f"Predicted: {pred['predicted']}")
    print(f"Confidence: {pred['confidence']:.2f}%\n")

Using CPU
Moving model to cpu...
Extracting text features...


RuntimeError: "softmax_lastdim_kernel_impl" not implemented for 'Half'

In [None]:
# 결과 시각화

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import numpy as np

def visualize_results(predictions, dataset, num_samples=5):
    # 1. 샘플 이미지와 예측 결과 시각화
    plt.figure(figsize=(15, 3))
    for i in range(num_samples):
        plt.subplot(1, num_samples, i + 1)
        img, _ = dataset[i]
        plt.imshow(img.permute(1, 2, 0).cpu().numpy())
        plt.title(f'True: {predictions[i]["true"]}\nPred: {predictions[i]["predicted"]}', 
                 fontsize=8)
        plt.axis('off')
    plt.tight_layout()
    plt.show()

    # 2. 정확도 분포 시각화
    confidences = [p['confidence'] for p in predictions]
    plt.figure(figsize=(10, 5))
    plt.hist(confidences, bins=50)
    plt.title('Distribution of Prediction Confidence')
    plt.xlabel('Confidence (%)')
    plt.ylabel('Count')
    plt.show()

    # 3. 혼동 행렬 시각화 (상위 10개 클래스만)
    true_labels = [p['true'] for p in predictions]
    pred_labels = [p['predicted'] for p in predictions]
    
    # 상위 10개 클래스만 선택
    unique_labels = list(set(true_labels))[:10]
    mask = [(t in unique_labels) and (p in unique_labels) for t, p in zip(true_labels, pred_labels)]
    filtered_true = [l for l, m in zip(true_labels, mask) if m]
    filtered_pred = [l for l, m in zip(pred_labels, mask) if m]
    
    cm = confusion_matrix(filtered_true, filtered_pred, labels=unique_labels)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=unique_labels, yticklabels=unique_labels)
    plt.title('Confusion Matrix (Top 10 Classes)')
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.xticks(rotation=45)
    plt.yticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# 결과 시각화 실행

# 이전에 얻은 predictions와 test_dataset을 사용하여 시각화
visualize_results(predictions, test_dataset)