# Implementation guide

## Installation
## Data Preparation
## Foundation model Training

# Report guide
--------------
motivations
related works
methods (formulation, architecture)
- Describe your computing resources and reformulate your problem so that it is manageable within them (e.g., Colab Pro at $10/month)
experiments (data preparation, hyperparameter tuning, quantitative/qualitative experimental results)
discussion & future direction
--------------
## basic points (overall) (2): 
- length (0.5)
- format (0.5)
- clarity of writing (1)

## Introduction (5): 
- motivation (2)
- problem definition (2)
- concise description of contribution (1)

## Methods (5): 
- significance/novelty (2)
- figure (1)
- reproducibility (2)-algorithm
: 수도코드 등 implementation의 architecture를 설계하는 파트가 포함이 되어야 한다.

## Experiments (7): 
- dataset (1)
- computer resource (CPU,GPU, OS, pytorch etc.) & experimental design (1)
- quantitative results (1)
  : -> 숫자, Plot(그래프 그림) 얘는 정량적 결과(quantitative)
- qualitative results (1)
  : -> 수로는 설명되지 않는, 그림 같은 느낌적인 느낌을 전달하는. 정성적.(ex 어텐션 맵)
- Figures (plots)/Tables and their analysis (2)
  : Visualising result를 통해 result가 좋은지 아닌지를 확인할 것.
- discussion why the proposed method is successful or unsuccessful (1) 
  : If your model is not competitive(degradation이 observe된다면), why&future direction 설명하면 됨.

## Future direction (1).

## Github history (2)

## Overleaf history (2)
## (Bonus+1) 
- pre-trained foundation models beyond ImageNet-pretrained CNNs (distillation, adaption, pseudo-labeling, baseline etc.), CLIP, BERT, RoBERTa
- CLIP처럼 foundation model을 선택해서 했으면 좋겠다… stable diffusion… se(segment efficient)m2,...

# CLIP model 기본 구조 구현

In [1]:
pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /private/var/folders/4p/kqk9nd_51cd1t2bqc3l1fyjr0000gn/T/pip-req-build-2tgydonu
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /private/var/folders/4p/kqk9nd_51cd1t2bqc3l1fyjr0000gn/T/pip-req-build-2tgydonu
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25ldone

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:

# CLIP의 기본 구성 요소만 초기화


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from clip import clip
from clip.simple_tokenizer import SimpleTokenizer as _Tokenizer
import os

# 토크나이저 초기화 
_tokenizer = _Tokenizer()

class TextEncoder(nn.Module):
    """Text branch of CLIP, assembled from the components of a loaded CLIP model.

    The encoder can be driven in two ways:

    * ``encoder(prompts, tokenized_prompts)`` -- ``prompts`` are already
      embedded tokens (e.g. with learned context vectors spliced in) and
      ``tokenized_prompts`` are the matching token ids.
    * ``encoder(token_ids)`` -- raw token ids; they are embedded with the CLIP
      model's own ``token_embedding`` first.  Other code in this file calls the
      encoder this way, which previously raised ``TypeError``.

    Args:
        clip_model: object exposing ``transformer``, ``positional_embedding``,
            ``ln_final``, ``text_projection`` and ``dtype``; ``token_embedding``
            is optional and only needed for the single-argument call form.
    """

    def __init__(self, clip_model):
        super().__init__()
        # Reuse (not copy) the text-side components of the CLIP model.
        self.transformer = clip_model.transformer
        self.positional_embedding = clip_model.positional_embedding
        self.ln_final = clip_model.ln_final
        self.text_projection = clip_model.text_projection
        self.dtype = clip_model.dtype
        # Registered as a submodule so that ``.to(device)`` moves it together
        # with the rest of the encoder.  May be absent on duck-typed models.
        self.token_embedding = getattr(clip_model, "token_embedding", None)

    def forward(self, prompts, tokenized_prompts=None):
        """Encode prompts into the projected text feature space.

        Args:
            prompts: embedded prompts, or raw token ids when
                ``tokenized_prompts`` is omitted.
            tokenized_prompts: token ids matching ``prompts``; the position of
                the largest id in each row selects the feature that is returned.

        Returns:
            One projected feature vector per prompt.

        Raises:
            ValueError: if called with token ids only and the wrapped CLIP
                model provided no ``token_embedding``.
        """
        if tokenized_prompts is None:
            if self.token_embedding is None:
                raise ValueError(
                    "tokenized_prompts was not given and the CLIP model has no "
                    "token_embedding to embed raw token ids with"
                )
            tokenized_prompts = prompts
            prompts = self.token_embedding(tokenized_prompts).type(self.dtype)

        x = prompts + self.positional_embedding.type(self.dtype)
        # The transformer is fed with the first two axes swapped.
        x = x.permute(1, 0, 2)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)
        x = self.ln_final(x).type(self.dtype)
        # Keep only the feature at the argmax token position of every row.
        rows = torch.arange(x.shape[0])
        x = x[rows, tokenized_prompts.argmax(dim=-1)] @ self.text_projection
        return x

def load_clip_to_cpu(model_name="ViT-B/16"):
    """Load a CLIP model on the CPU and return it in evaluation mode.

    Args:
        model_name: model identifier handed to ``clip.load``.

    Returns:
        The loaded model after ``eval()``; the preprocessing object that
        ``clip.load`` also returns is discarded.
    """
    # Weights are stored under the user's cache directory.
    cache_dir = os.path.expanduser("~/.cache/clip")
    loaded_model, _ = clip.load(model_name, device="cpu", download_root=cache_dir)
    return loaded_model.eval()

def initialize_clip():
    """Build the CLIP text and image encoders and place them on the best device.

    Returns:
        ``(text_encoder, image_encoder, device)`` where ``device`` is the
        string ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise.
    """
    backbone = load_clip_to_cpu()

    # Text side is wrapped in TextEncoder; the image side is the model's
    # ``visual`` module used as-is.
    text_branch = TextEncoder(backbone)
    image_branch = backbone.visual

    target = "cuda" if torch.cuda.is_available() else "cpu"
    return text_branch.to(target), image_branch.to(target), target

# 기본적인 데이터 전처리를 위한 transform 정의
def get_transforms():
    """Return the image preprocessing pipeline: resize, to-tensor, normalize."""
    # Per-channel mean / std used for normalization (CLIP's default values,
    # per the original comment in this file).
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)

    steps = [
        transforms.Resize((224, 224)),  # fixed 224x224 input size
        transforms.ToTensor(),
        transforms.Normalize(channel_mean, channel_std),
    ]
    return transforms.Compose(steps)

# Usage example
# NOTE: the names bound here (in particular `device`) are module-level and are
# read by prepare_text / prepare_image further down in this file.
if __name__ == "__main__":
    # Initialize the CLIP model
    text_encoder, image_encoder, device = initialize_clip()
    print(f"Device: {device}")
    print("CLIP model initialized successfully!")

Device: cpu
CLIP model initialized successfully!


In [None]:
# 이미지-텍스트 쌍의 처리

class CLIPModel(nn.Module):
    """Image/text similarity model built from the parts of a loaded CLIP model.

    Args:
        clip_model: object exposing ``visual``, ``token_embedding``,
            ``logit_scale`` and ``dtype``, plus whatever ``TextEncoder`` reads.
    """

    def __init__(self, clip_model):
        super().__init__()
        self.text_encoder = TextEncoder(clip_model)
        self.image_encoder = clip_model.visual
        # Needed to turn token ids into the embedded prompts that
        # TextEncoder.forward(prompts, tokenized_prompts) expects.
        self.token_embedding = clip_model.token_embedding
        self.logit_scale = clip_model.logit_scale
        self.dtype = clip_model.dtype

    def forward(self, image, text):
        """Return scaled similarity logits between images and texts.

        Args:
            image: image batch, cast to ``self.dtype`` before encoding.
            text: token ids (e.g. the output of ``clip.tokenize``).

        Returns:
            Logits with one row per image and one column per text.
        """
        # Image branch: encode, then L2-normalize along the feature axis.
        image_features = self.image_encoder(image.type(self.dtype))
        image_features = F.normalize(image_features, dim=-1)

        # Text branch: embed the token ids first.  The previous code passed the
        # ids alone, which does not match TextEncoder.forward's two-argument
        # signature.
        token_embeddings = self.token_embedding(text).type(self.dtype)
        text_features = self.text_encoder(token_embeddings, text)
        text_features = F.normalize(text_features, dim=-1)

        # logit_scale is stored in log space, hence the exp().
        logit_scale = self.logit_scale.exp()
        logits = logit_scale * image_features @ text_features.T

        return logits

In [None]:
# 손실 함수 정의

def clip_loss(similarity):
    """Symmetric cross-entropy over a similarity matrix.

    Row ``i`` is treated as matching column ``i``: cross-entropy is computed
    once over the rows and once over the columns (via the transpose), and the
    two values are averaged.
    """
    targets = torch.arange(similarity.shape[0], device=similarity.device)
    row_wise = F.cross_entropy(similarity, targets)
    column_wise = F.cross_entropy(similarity.T, targets)
    total = row_wise + column_wise
    return total / 2

In [None]:
# 추론 기능

@torch.no_grad()
def get_image_features(model, image):
    """Encode ``image`` with ``model.image_encoder`` and L2-normalize the result.

    The input is cast to ``model.dtype`` first; gradients are disabled.
    """
    cast_image = image.type(model.dtype)
    encoded = model.image_encoder(cast_image)
    return F.normalize(encoded, dim=-1)

@torch.no_grad()
def get_text_features(model, text):
    """Encode ``text`` with ``model.text_encoder`` and L2-normalize the result.

    ``text`` is forwarded to the encoder unchanged as its only argument;
    gradients are disabled.
    """
    encoded = model.text_encoder(text)
    return F.normalize(encoded, dim=-1)

In [None]:
# 토큰화 및 전처리

def prepare_text(text):
    """Tokenize ``text`` with ``clip.tokenize`` and move it to the module-level ``device``."""
    token_ids = clip.tokenize(text)
    return token_ids.to(device)

def prepare_image(image):
    """Apply ``get_transforms()`` to ``image``, add a leading batch axis, move to ``device``."""
    pipeline = get_transforms()
    batched = pipeline(image).unsqueeze(0)
    return batched.to(device)

# CoOp 핵심 아이디어 구현

In [3]:
class CoOpTextEncoder(nn.Module):
    """CLIP text encoder with learnable context tokens (CoOp).

    Args:
        clip_model: object exposing ``transformer`` (with a ``width``
            attribute), ``positional_embedding``, ``ln_final``,
            ``text_projection`` and ``dtype``.
        n_ctx: number of learnable context tokens.
        n_cls: number of classes the class-specific context bank covers.
    """

    def __init__(self, clip_model, n_ctx=16, n_cls=1000):
        super().__init__()
        # Imported locally: the file-level import of OrderedDict only appears
        # in a later cell, so this class would otherwise depend on cell order.
        from collections import OrderedDict

        self.transformer = clip_model.transformer
        self.positional_embedding = clip_model.positional_embedding
        self.ln_final = clip_model.ln_final
        self.text_projection = clip_model.text_projection
        self.dtype = clip_model.dtype

        width = clip_model.transformer.width

        # Core of CoOp: the learnable context tokens, shape (n_ctx, width).
        self.n_ctx = n_ctx
        self.n_cls = n_cls
        self.ctx = nn.Parameter(torch.randn(n_ctx, width))

        # Small bottleneck network producing class-specific context tokens:
        # width -> width // 16 -> n_cls * n_ctx * width.
        self.meta_net = nn.Sequential(OrderedDict([
            ("linear1", nn.Linear(width, width // 16)),
            ("relu", nn.ReLU(inplace=True)),
            ("linear2", nn.Linear(width // 16, n_cls * n_ctx * width)),
        ]))

    def construct_prompts(self, ctx, prefix, suffix, label=None):
        """Assemble embedded prompts as ``[prefix, ctx, suffix]`` along axis 1.

        Args:
            ctx: context tokens, either shared ``(n_ctx, width)`` or already
                batched ``(batch, n_ctx, width)``.
            prefix: embedded fixed start of the prompt, ``(batch, prefix_len, width)``.
            suffix: embedded end of the prompt (class name etc.),
                ``(batch, suffix_len, width)``.
            label: optional class indices.  When given, ``ctx`` is passed
                through ``meta_net`` to build a ``(n_cls, n_ctx, width)`` bank
                of class-specific contexts and the rows for ``label`` are used.

        Returns:
            Tensor of shape ``(batch, prefix_len + n_ctx + suffix_len, width)``.
        """
        if label is not None:
            width = ctx.shape[-1]
            # meta_net is applied to every context token; each output holds a
            # full (n_cls, n_ctx, width) bank.  The per-token banks are averaged
            # into one.  (The previous indexing `ctx[arange(N), label]` could
            # not produce a (batch, n_ctx, width) tensor.)
            per_token = self.meta_net(ctx)
            bank = per_token.view(-1, self.n_cls, self.n_ctx, width).mean(dim=0)
            label = torch.as_tensor(label, device=bank.device).reshape(-1)
            ctx = bank[label]  # (batch, n_ctx, width)
        elif ctx.dim() == 2:
            # Share the same context across every prompt in the batch so it can
            # be concatenated with the 3-D prefix / suffix.
            ctx = ctx.unsqueeze(0).expand(prefix.shape[0], -1, -1)

        prompts = torch.cat(
            [
                prefix,  # (batch, prefix_len, width)
                ctx,     # (batch, n_ctx, width)
                suffix,  # (batch, suffix_len, width)
            ],
            dim=1,
        )
        return prompts

    def forward(self, prompts, tokenized_prompts, label=None):
        """Encode embedded prompts into projected text features.

        Args:
            prompts: embedded prompts, ``(batch, seq_len, width)``.
            tokenized_prompts: token ids; the argmax position of each row
                selects which output feature is kept.
            label: accepted for call compatibility; not used here.

        Returns:
            One projected feature vector per prompt.
        """
        x = prompts + self.positional_embedding.type(self.dtype)
        # The transformer is fed with the first two axes swapped.
        x = x.permute(1, 0, 2)
        x = self.transformer(x)
        x = x.permute(1, 0, 2)
        x = self.ln_final(x).type(self.dtype)

        # Pick the feature at the argmax token position of every prompt.
        x = x[torch.arange(x.shape[0]), tokenized_prompts.argmax(dim=-1)]

        # Apply the text projection.
        x = x @ self.text_projection

        return x

def initialize_coop(clip_model, n_ctx=16, n_cls=1000):
    """Create the CoOp text encoder and the image encoder on the best device.

    Args:
        clip_model: loaded CLIP model to take components from.
        n_ctx: number of context tokens passed to ``CoOpTextEncoder``.
        n_cls: number of classes passed to ``CoOpTextEncoder``.

    Returns:
        ``(text_encoder, image_encoder, device)`` where ``device`` is
        ``"cuda"`` when CUDA is available and ``"cpu"`` otherwise.
    """
    coop_text = CoOpTextEncoder(clip_model, n_ctx=n_ctx, n_cls=n_cls)
    visual = clip_model.visual

    target = "cuda" if torch.cuda.is_available() else "cpu"
    return coop_text.to(target), visual.to(target), target

# 학습을 위한 손실 함수
def compute_loss(image_features, text_features, logit_scale):
    """Cross-entropy between matched image and text features.

    Both inputs are L2-normalized along the last axis, their scaled pairwise
    similarities are used as logits, and row ``i`` is labelled with target
    ``i`` (the diagonal entries are the positives).

    Args:
        image_features: image feature batch.
        text_features: text feature batch, aligned row by row with the images.
        logit_scale: log-space scale tensor; ``exp()`` is applied here.
    """
    img = F.normalize(image_features, dim=-1)
    txt = F.normalize(text_features, dim=-1)

    scaled_similarity = logit_scale.exp() * img @ txt.t()

    diagonal_targets = torch.arange(
        scaled_similarity.shape[0], device=scaled_similarity.device
    )
    return F.cross_entropy(scaled_similarity, diagonal_targets)

## Caltech-101 데이터셋을 사용한 CoOp (Context Optimization) 학습 및 시각화 구현

In [8]:
pip install kaggle

Collecting kaggle
  Downloading kaggle-1.6.17.tar.gz (82 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.7/82.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting python-slugify (from kaggle)
  Downloading python_slugify-8.0.4-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting text-unidecode>=1.3 (from python-slugify->kaggle)
  Downloading text_unidecode-1.3-py2.py3-none-any.whl.metadata (2.4 kB)
Downloading python_slugify-8.0.4-py2.py3-none-any.whl (10 kB)
Downloading text_unidecode-1.3-py2.py3-none-any.whl (78 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.2/78.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.6.17-py3-none-any.whl size=105800 sha256=a1e6f8124d3fdb0b75c4f1b8edde6ee646f88fd0bdd3127b48ab8

In [9]:
import os
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi
from tqdm import tqdm
import shutil

def download_caltech101(root_dir='./data'):
    """Download Caltech-101 through the Kaggle API and unpack it.

    The archive is fetched into ``root_dir``, extracted there, the
    ``101_ObjectCategories`` folder is moved to ``<root_dir>/caltech101``
    (replacing any existing folder of that name) and the zip is deleted.

    Args:
        root_dir: directory to download and extract into; created if missing.

    Returns:
        Path of the final ``caltech101`` directory.
    """
    os.makedirs(root_dir, exist_ok=True)

    print("Downloading Caltech-101 dataset from Kaggle...")

    # Authenticate and download via the Kaggle client.
    kaggle_client = KaggleApi()
    kaggle_client.authenticate()
    kaggle_client.dataset_download_files(
        'huanghanchina/caltech101',
        path=root_dir,
        quiet=False,
    )

    # Unpack the downloaded archive in place.
    archive_path = os.path.join(root_dir, 'caltech101.zip')
    print("\nExtracting files...")
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(root_dir)

    # Move the extracted category folder to its final name.
    extracted_dir = os.path.join(root_dir, "101_ObjectCategories")
    final_dir = os.path.join(root_dir, "caltech101")
    if os.path.exists(final_dir):
        shutil.rmtree(final_dir)
    shutil.move(extracted_dir, final_dir)

    # The zip is no longer needed.
    os.remove(archive_path)

    print(f"\nDataset downloaded and extracted to {final_dir}")
    print("Number of categories:", len(os.listdir(final_dir)))
    return final_dir

if __name__ == "__main__":
    # Download and set up the dataset; `dataset_path` becomes a module-level name.
    dataset_path = download_caltech101()
    print(f"Dataset ready at: {dataset_path}")

OSError: Could not find kaggle.json. Make sure it's located in /Users/and___young/.kaggle. Or use the environment method. See setup instructions at https://github.com/Kaggle/kaggle-api/

In [10]:
# NOTE(review): this statement needs `Caltech101Dataset`, `dataset_path` and
# `transform` to exist already. In this file the class is defined further down,
# `dataset_path` is only bound when a download cell succeeds, and no
# module-level `transform` assignment is visible -- confirm the intended order.
train_dataset = Caltech101Dataset(root_dir=dataset_path, transform=transform)

NameError: name 'dataset_path' is not defined

In [11]:
import os
import tarfile
import urllib.request
from tqdm import tqdm
import shutil

def _is_within_directory(directory, target):
    """Return True when ``target`` resolves to ``directory`` or a path below it."""
    abs_directory = os.path.abspath(directory)
    abs_target = os.path.abspath(target)
    try:
        # commonpath compares whole path components.  commonprefix compares
        # characters, so "/data_evil/x" would wrongly count as inside "/data".
        return os.path.commonpath([abs_directory, abs_target]) == abs_directory
    except ValueError:
        # Raised e.g. for paths on different drives: definitely not inside.
        return False


def _safe_extract(tar, path=".", members=None, *, numeric_owner=False):
    """Extract ``tar`` into ``path`` after rejecting members that escape it."""
    for member in tar.getmembers():
        member_path = os.path.join(path, member.name)
        if not _is_within_directory(path, member_path):
            raise Exception("Attempted path traversal in tar file")

    tar.extractall(path, members, numeric_owner=numeric_owner)


def download_caltech101(root_dir='./data'):
    """Download the Caltech-101 archive from data.caltech.edu and unpack it.

    The tarball is streamed into ``root_dir`` with a progress bar, extracted
    after a path-traversal check, and its ``101_ObjectCategories`` folder is
    moved to ``<root_dir>/caltech101`` (replacing any existing folder of that
    name).  The tarball and the leftover extraction folder are then removed.

    Args:
        root_dir: directory to download and extract into; created if missing.

    Returns:
        Path of the final ``caltech101`` directory.

    Raises:
        Exception: if an archive member would be written outside ``root_dir``.
    """
    os.makedirs(root_dir, exist_ok=True)

    # Dataset URL and local archive path.
    url = "https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.tar.gz"
    filename = os.path.join(root_dir, "caltech-101.tar.gz")

    # Stream the download in chunks with a progress bar.
    print("Downloading Caltech-101 dataset...")
    with urllib.request.urlopen(url) as response:
        # Content-Length is optional; without it the bar has no known total.
        content_length = response.headers.get('Content-Length')
        total_size = int(content_length) if content_length is not None else None
        with open(filename, 'wb') as f:
            with tqdm(total=total_size, unit='B', unit_scale=True) as pbar:
                while True:
                    chunk = response.read(8192)
                    if not chunk:
                        break
                    f.write(chunk)
                    pbar.update(len(chunk))

    # Extract, refusing archives that try to write outside root_dir.
    print("\nExtracting files...")
    with tarfile.open(filename, 'r:gz') as tar:
        _safe_extract(tar, path=root_dir)

    # Move the category folder to its final name.
    src_dir = os.path.join(root_dir, "caltech-101", "101_ObjectCategories")
    dst_dir = os.path.join(root_dir, "caltech101")

    if os.path.exists(dst_dir):
        shutil.rmtree(dst_dir)
    shutil.move(src_dir, dst_dir)

    # Remove the tarball and what is left of the extraction folder.
    os.remove(filename)
    shutil.rmtree(os.path.join(root_dir, "caltech-101"))

    print(f"\nDataset downloaded and extracted to {dst_dir}")
    print("Number of categories:", len(os.listdir(dst_dir)))
    return dst_dir

if __name__ == "__main__":
    # Download and set up the dataset; `dataset_path` becomes a module-level name.
    dataset_path = download_caltech101()
    print(f"Dataset ready at: {dataset_path}")

Downloading Caltech-101 dataset...


HTTPError: HTTP Error 500: INTERNAL SERVER ERROR

In [4]:
import torch
import torchvision.datasets as datasets
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
from collections import OrderedDict
import os

class Caltech101Dataset(Dataset):
    """Train or test portion of an image-folder dataset, split 80:20.

    The folder is read with ``datasets.ImageFolder`` and divided with
    ``random_split`` using a generator seeded with 42, so the train and test
    instances built from the same folder use the same partition.

    Args:
        root_dir: folder with one sub-folder per class.
        transform: transform handed to ``ImageFolder``.
        train: select the 80% part when True, the remaining part otherwise.
    """

    def __init__(self, root_dir='./data/caltech101', transform=None, train=True):
        self.dataset = datasets.ImageFolder(root_dir, transform=transform)
        self.classes = self.dataset.classes

        n_total = len(self.dataset)
        n_train = int(0.8 * n_total)
        split_lengths = [n_train, n_total - n_train]
        seeded = torch.Generator().manual_seed(42)
        train_part, test_part = random_split(
            self.dataset, split_lengths, generator=seeded
        )

        if train:
            self.data = train_part
        else:
            self.data = test_part

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image, target = self.data[idx]
        return image, target

def train_coop(num_epochs=50, batch_size=32, learning_rate=1e-4):
    """Train CoOp context tokens on Caltech-101 and record per-epoch metrics.

    Every class gets one prompt laid out as
    ``[SOS, "A photo of a", <n_ctx learnable tokens>, class name, ".", EOS]``.
    Images are classified by their similarity to the text features of *all*
    class prompts, and cross-entropy against the class label is minimized.

    Changes over the previous version:

    * prefix / suffix are taken from the CLIP token *embeddings* (they used to
      be raw token-id tensors, which cannot be concatenated with ``ctx``);
    * logits have one column per class, so ``argmax`` is a class index that
      can be compared with ``labels`` (it used to be an in-batch index);
    * evaluation no longer builds its prompts from the ground-truth labels;
    * the image encoder runs without autograd and only ``ctx`` is optimized,
      since nothing else is meant to change in context optimization.

    Args:
        num_epochs: number of passes over the training split.
        batch_size: batch size of both data loaders.
        learning_rate: Adam learning rate for the context tokens.

    Returns:
        ``(train_losses, train_accuracies, test_accuracies)``: three lists
        with one entry per epoch (accuracies in percent).
    """
    # Data
    transform = get_transforms()
    train_dataset = Caltech101Dataset(train=True, transform=transform)
    test_dataset = Caltech101Dataset(train=False, transform=transform)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    class_names = train_dataset.classes
    n_cls = len(class_names)
    n_ctx = 16

    # Model
    clip_model = load_clip_to_cpu()
    text_encoder, image_encoder, device = initialize_coop(
        clip_model,
        n_ctx=n_ctx,
        n_cls=n_cls
    )
    image_encoder.eval()
    dtype = text_encoder.dtype

    # Only the context tokens are trained; everything else stays frozen.
    for param in text_encoder.parameters():
        param.requires_grad_(False)
    text_encoder.ctx.requires_grad_(True)
    optimizer = torch.optim.Adam([text_encoder.ctx], lr=learning_rate)

    # Fixed parts of the prompts.  The "X" placeholders only reserve n_ctx
    # token positions; their embeddings are replaced by the learned context.
    prefix_text = "A photo of a"
    placeholder = " ".join(["X"] * n_ctx)
    prompt_texts = [
        f"{prefix_text} {placeholder} {name.replace('_', ' ')}."
        for name in class_names
    ]
    tokenized_prompts = clip.tokenize(prompt_texts).to(device)
    # The end-of-text token has the largest id, so its position in the
    # tokenized prefix equals the number of tokens before it (SOS + words).
    prefix_len = int(clip.tokenize(prefix_text).argmax(dim=-1).item())

    token_embedding = clip_model.token_embedding.to(device)
    with torch.no_grad():
        embedded = token_embedding(tokenized_prompts).type(dtype)
    prefix = embedded[:, :prefix_len, :]            # (n_cls, prefix_len, width)
    suffix = embedded[:, prefix_len + n_ctx:, :]    # (n_cls, rest, width)

    logit_scale = clip_model.logit_scale.detach().exp().to(device)

    def class_text_features():
        """Return normalized text features for every class, shape (n_cls, dim)."""
        # Expand here so construct_prompts receives an already batched context.
        ctx = text_encoder.ctx.type(dtype).unsqueeze(0).expand(n_cls, -1, -1)
        prompts = text_encoder.construct_prompts(ctx, prefix=prefix, suffix=suffix)
        return F.normalize(text_encoder(prompts, tokenized_prompts), dim=-1)

    def encode_images(images):
        """Return normalized image features computed without autograd."""
        with torch.no_grad():
            return F.normalize(image_encoder(images.type(dtype)), dim=-1)

    # History
    train_losses = []
    train_accuracies = []
    test_accuracies = []

    for epoch in range(num_epochs):
        # Training
        text_encoder.train()
        epoch_loss = 0.0
        correct = 0
        total = 0

        for images, labels in train_loader:
            images = images.to(device)
            labels = labels.to(device)

            image_features = encode_images(images)
            text_features = class_text_features()

            # (batch, n_cls) logits: one column per class.
            logits = logit_scale * image_features @ text_features.t()
            loss = F.cross_entropy(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            pred = torch.argmax(logits.detach(), dim=1)
            correct += (pred == labels).sum().item()
            total += labels.size(0)

            epoch_loss += loss.item()

        # Evaluation
        text_encoder.eval()
        test_correct = 0
        test_total = 0

        with torch.no_grad():
            # The prompts do not depend on the batch: encode them once.
            text_features = class_text_features()
            for images, labels in test_loader:
                images = images.to(device)
                labels = labels.to(device)

                logits = encode_images(images) @ text_features.t()
                pred = torch.argmax(logits, dim=1)
                test_correct += (pred == labels).sum().item()
                test_total += labels.size(0)

        # Per-epoch averages; max(..., 1) guards against empty loaders.
        avg_loss = epoch_loss / max(len(train_loader), 1)
        train_accuracy = 100 * correct / max(total, 1)
        test_accuracy = 100 * test_correct / max(test_total, 1)

        train_losses.append(avg_loss)
        train_accuracies.append(train_accuracy)
        test_accuracies.append(test_accuracy)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"Training - Loss: {avg_loss:.4f}, Accuracy: {train_accuracy:.2f}%")
        print(f"Testing - Accuracy: {test_accuracy:.2f}%\n")

    return train_losses, train_accuracies, test_accuracies

def visualize_training(train_losses, train_accuracies, test_accuracies):
    """Plot the training loss and the train/test accuracy curves side by side.

    Args:
        train_losses: one loss value per epoch.
        train_accuracies: one training accuracy (percent) per epoch.
        test_accuracies: one test accuracy (percent) per epoch.
    """
    _, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(15, 5))

    # Left panel: loss curve.
    loss_ax.plot(train_losses)
    loss_ax.set_title('Training Loss')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')

    # Right panel: both accuracy curves with a legend.
    for curve_name, curve in (('Train', train_accuracies), ('Test', test_accuracies)):
        acc_ax.plot(curve, label=curve_name)
    acc_ax.set_title('Accuracy')
    acc_ax.set_xlabel('Epoch')
    acc_ax.set_ylabel('Accuracy (%)')
    acc_ax.legend()

    plt.tight_layout()
    plt.show()

def visualize_learned_prompts(text_encoder, class_names):
    """Show ``text_encoder.ctx`` as a heat map (rows: context position).

    Args:
        text_encoder: object with a ``ctx`` tensor attribute.
        class_names: accepted but not used by this function.
    """
    context_matrix = text_encoder.ctx.detach().cpu()

    plt.figure(figsize=(12, 8))
    plt.imshow(context_matrix, cmap='viridis')
    plt.colorbar()
    plt.title('Learned Context Tokens')
    plt.xlabel('Embedding Dimension')
    plt.ylabel('Context Position')
    plt.show()

# Main entry point
if __name__ == "__main__":
    # Run training
    train_losses, train_accuracies, test_accuracies = train_coop(
        num_epochs=50,
        batch_size=32,
        learning_rate=1e-4
    )
    
    # Visualize the results
    visualize_training(train_losses, train_accuracies, test_accuracies)
    
    # Visualize the learned prompts
    # NOTE(review): initialize_coop builds a new CoOpTextEncoder here and
    # train_coop returns only metric lists, so this appears to plot freshly
    # initialized context tokens rather than the trained ones -- confirm.
    clip_model = load_clip_to_cpu()
    text_encoder, _, _ = initialize_coop(
        clip_model,
        n_ctx=16,
        n_cls=101  # Number of classes in Caltech-101
    )
    visualize_learned_prompts(text_encoder, None)

FileNotFoundError: [Errno 2] No such file or directory: './data/caltech101'