# VisionTransformer finetuning 4チャンネルにしてみる

In [1]:
import torch

print("CUDA Available:", torch.cuda.is_available())
print("GPU Count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU Name:", torch.cuda.get_device_name(0))


CUDA Available: True
GPU Count: 1
GPU Name: NVIDIA GeForce RTX 4070 Ti SUPER


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from PIL import Image
from torchvision import transforms

In [3]:
import random
# 再現性のためにランダムシードを設定
def set_seed(seed):
    random.seed(seed) # Python
    np.random.seed(seed) # NumPy
    torch.manual_seed(seed) # PyTorch
    
    # GPUを使う場合
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False # 再現性を重視

set_seed(42)  # 任意のシード値

- Pillow は RGB、OpenCV は BGRで読み込む

In [4]:
metadata = pd.read_csv("./data/HAM10000_metadata")
metadata

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,dataset
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,vidir_modern
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,vidir_modern
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,vidir_modern
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,vidir_modern
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,vidir_modern
...,...,...,...,...,...,...,...,...
10010,HAM_0002867,ISIC_0033084,akiec,histo,40.0,male,abdomen,vidir_modern
10011,HAM_0002867,ISIC_0033550,akiec,histo,40.0,male,abdomen,vidir_modern
10012,HAM_0002867,ISIC_0033536,akiec,histo,40.0,male,abdomen,vidir_modern
10013,HAM_0000239,ISIC_0032854,akiec,histo,80.0,male,face,vidir_modern


In [5]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   lesion_id     10015 non-null  object 
 1   image_id      10015 non-null  object 
 2   dx            10015 non-null  object 
 3   dx_type       10015 non-null  object 
 4   age           9958 non-null   float64
 5   sex           10015 non-null  object 
 6   localization  10015 non-null  object 
 7   dataset       10015 non-null  object 
dtypes: float64(1), object(7)
memory usage: 626.1+ KB


In [6]:
metadata_labels = metadata[['image_id', 'dx']]
metadata_labels

Unnamed: 0,image_id,dx
0,ISIC_0027419,bkl
1,ISIC_0025030,bkl
2,ISIC_0026769,bkl
3,ISIC_0025661,bkl
4,ISIC_0031633,bkl
...,...,...
10010,ISIC_0033084,akiec
10011,ISIC_0033550,akiec
10012,ISIC_0033536,akiec
10013,ISIC_0032854,akiec


In [7]:
labels = metadata_labels['dx'].map({"akiec":0, "bcc":1, "bkl":2, "df":3, "mel":4, "nv":5, "vasc":6})
labels

0        2
1        2
2        2
3        2
4        2
        ..
10010    0
10011    0
10012    0
10013    0
10014    4
Name: dx, Length: 10015, dtype: int64

In [8]:
# 画像ファイル名を取得
image_id = metadata_labels["image_id"].values
image_id

array(['ISIC_0027419', 'ISIC_0025030', 'ISIC_0026769', ...,
       'ISIC_0033536', 'ISIC_0032854', 'ISIC_0032258'], dtype=object)

In [9]:
mask_id = image_id
mask_id

array(['ISIC_0027419', 'ISIC_0025030', 'ISIC_0026769', ...,
       'ISIC_0033536', 'ISIC_0032854', 'ISIC_0032258'], dtype=object)

In [10]:
# ラベルを取得
labels = labels.values
labels

array([2, 2, 2, ..., 0, 0, 4])

In [11]:
# インデックスを分割
from sklearn.model_selection import train_test_split
train_idx, val_idx = train_test_split(
    np.arange(len(image_id)),
    test_size = 0.3,
    stratify=labels, # クラスごとに均等に分割
    random_state = 42
)

### Vitの前処理
'input_size': (3, 224, 224)
 'interpolation': 'bicubic
 'mean': (0.5, 0.5, 0.5)
 'std': (0.5, 0.5, 0.5)
 'crop_pct': 0.9
 'crop_mode': 'center'

In [12]:
from timm.data import resolve_data_config
import torchvision.transforms as transforms
import timm

# モデルの前処理設定を取得
model = timm.create_model("vit_base_patch16_224", pretrained=True)
config = resolve_data_config({}, model=model)
config

{'input_size': (3, 224, 224),
 'interpolation': 'bicubic',
 'mean': (0.5, 0.5, 0.5),
 'std': (0.5, 0.5, 0.5),
 'crop_pct': 0.9,
 'crop_mode': 'center'}

In [13]:
config["input_size"][1:]

(224, 224)

In [19]:
#from timm.data import create_transform

mean = [0.5, 0.5, 0.5, 0.0]
std = [0.5, 0.5, 0.5, 1.0]

# torchvision.transforms で統一
train_transform = transforms.Compose([
    transforms.Resize(config["input_size"][1:]),
    #transforms.ToTensor(), # PIL画像をTensorに変換する処理だが、4チャンネルにした時点ですでにTesnorになっている想定
    transforms.Normalize(mean=mean, std=std)
])

val_transform = transforms.Compose([
    transforms.Resize(config["input_size"][1:]),
    #transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std)
])

In [20]:
# ImageDataset クラスを定義し、データローダーで取得
from torch.utils.data import Dataset, DataLoader

image_path = "./data/images"
mask_file_path = "./data/HAM10000_segmentations_lesion_tschandl"

class ImageDataset(Dataset):
    def __init__(self, image_id, mask_id, labels, transform=None):
        self.image_id = [f if f.endswith('.jpg') else f + ".jpg" for f in image_id] # 拡張子を追加
        self.mask_id = [f if f.endswith('.png') else f + "_segmentation.png" for f in mask_id] # 拡張子を追加
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_id)
    
    def __getitem__(self, idx):
        # 画像を指定するディレクトリを作成
        img_path = os.path.join(image_path, self.image_id[idx]) # self.image_id[idx]は画像ファイル名
        mask_path = os.path.join(mask_file_path, self.mask_id[idx]) # self.mask_id[idx]は画像ファイル名
        
        # ファイルが存在しない場合のエラー
        if not os.path.exists(img_path):
            raise FileNotFoundError(f"File not found: {img_path}")
        
        # ディレクトリで画像を指定しPIL形式で開く
        img = Image.open(img_path).convert("RGB")
        img_mask =  Image.open(mask_path)
        
        label  = torch.tensor(self.labels[idx], dtype=torch.long)
        
        return img, img_mask, label

In [21]:
# CustomSubsetクラス
class CustomSubset(Dataset):
    def __init__(self, dataset, indices, transform=None):
        self.dataset = dataset
        self.indices = indices
        self.transform = transform
    
    def __len__(self):
        return len(self.indices)
    
    def __getitem__(self, idx):
        img, img_mask, label = self.dataset[self.indices[idx]] # 元のデータセットから取得
    
        img = transforms.ToTensor()(img) # [3, H, W]
        img_mask = transforms.ToTensor()(img_mask) # [1, H, W]
        # 4チャンネルに結合
        img_4ch = torch.cat([img, img_mask], dim=0) # [4, H, W]

        if self.transform: # transformが渡されていれば使うし、なければ何もしない
            img_4ch = self.transform(img_4ch)
        
        return img_4ch, label

In [22]:
# 画像とラベルのデータセットを作成
dataset = ImageDataset(image_id, mask_id, labels, transform=None)

In [23]:
img, img_mask, label = dataset[0]
print(type(img), img.size)
print(type(img_mask), img_mask.size)
print(label)

<class 'PIL.Image.Image'> (600, 450)
<class 'PIL.PngImagePlugin.PngImageFile'> (600, 450)
tensor(2)


In [24]:
# データセットの分割
train_dataset = CustomSubset(dataset, train_idx, transform=train_transform)
val_dataset = CustomSubset(dataset, val_idx, transform=val_transform)

In [25]:
img_4ch, label = train_dataset[0]
print(type(img_4ch), img_4ch.size())
print(label)

<class 'torch.Tensor'> torch.Size([4, 224, 224])
tensor(5)


In [26]:
# Dataloaderの作成
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [27]:
for img_4ch, labels in train_loader:
    print(f"Train Batch shape: {img_4ch.shape}, Labels: {labels}")
    break

for img_4ch, labels in val_loader:
    print(f"Val Batch shape: {img_4ch.shape}, Labels: {labels}")
    break

Train Batch shape: torch.Size([32, 4, 224, 224]), Labels: tensor([5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 5, 5, 2, 2,
        5, 5, 0, 4, 5, 5, 5, 5])
Val Batch shape: torch.Size([32, 4, 224, 224]), Labels: tensor([4, 5, 5, 5, 5, 2, 5, 5, 5, 5, 5, 2, 5, 1, 0, 5, 5, 5, 2, 4, 2, 5, 5, 5,
        5, 5, 2, 5, 0, 5, 5, 4])


In [28]:
import psutil
print(f"使用中メモリ: {psutil.virtual_memory().used / (1024 ** 3):.2f} GB")

使用中メモリ: 7.27 GB


In [29]:
# メモリの削減
# del image_tensors
# import gc
# gc.collect()

## Vit の事前学習済みモデルを適用

- このあと、普通に適用としたらエラーになるので、Datasetクラスを作成して適用させる
- torchvision.transforms の ToTensor() は PIL画像 または numpy.ndarray (H, W, C) のみを処理できる

In [30]:
#train_labels = train_labels.values  # Pandas Series → NumPy 配列
#val_labels = val_labels.values      # 検証データも同様に変換

- torch.long型は整数系（int64）

In [31]:
import mlflow
import mlflow.pytorch

# MLflow のトラッキング URI を設定
mlflow.set_tracking_uri("http://127.0.0.1:5000")

# MLflow の実験を設定
mlflow.set_experiment("2503_skin_cancer_classification")

<Experiment: artifact_location='/works/mlruns/3', creation_time=1742211699954, experiment_id='3', last_update_time=1742211699954, lifecycle_stage='active', name='2503_skin_cancer_classification', tags={}>

In [32]:
# モデルを定義
import timm
import torch.nn as nn

num_classes = 7

def create_model(model_name="vit_base_patch16_224", num_classes=7):
    model = timm.create_model(model_name, pretrained=True)
    model.patch_embed.proj = nn.Conv2d(4, 768, kernel_size=(16, 16), stride=(16, 16)) # 4チャンネルにしてみる
    model.head = nn.Linear(model.num_features, 7)
    return model

In [33]:
model = create_model()
print(model.head)

Linear(in_features=768, out_features=7, bias=True)


In [34]:
model = timm.create_model(model_name="vit_base_patch16_224", pretrained=True)
model

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity(

In [35]:
model.patch_embed.proj = nn.Conv2d(4, 768, kernel_size=(16, 16), stride=(16, 16))

- Mixupは２つのサンプルを線形に組み合わせることでデータ拡張を行います。具体的には2つの画像を選択し、それぞれを一定比率でブレンドします。このとき、正解ラベルもブレンド比率に従って再計算します。

- CutMixは、画像の一部を切り取って他の画像に挿入することでデータ拡張を行う手法です。具体的には、1つ目の画像から矩形の領域をランダムに選択し、矩形内を2つ目の画像に置き換えます。正解ラベルは、領域の面積比率に基づいて再計算します。

In [36]:
import torch.nn.functional as F

def train(model, train_loader, criterion, optimizer, device, mixup_fn):
    model.train()
    train_loss = 0
    train_correct = 0
    total_samples = 0

    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        imgs, labels = mixup_fn(imgs, labels)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_size = labels.size(0)
        train_loss += loss.item() * batch_size

        # Mixup が適用されなかった場合に one-hot へ変換
        if labels.dim() == 1:
            labels = F.one_hot(labels, num_classes=7).float() # SoftTargetCrossEntropy を使う場合、ラベルは float にする

        train_correct += (outputs.argmax(1) == labels.argmax(1)).sum().item()
        total_samples += batch_size
    
    train_loss /= total_samples
    train_acc = train_correct/ total_samples
    
    return train_loss, train_acc

In [37]:
def evaluate(model, val_loader, val_criterion, device):
    model.eval()
    val_loss = 0
    val_correct = 0
    total_val_samples = 0

    val_criterion = nn.CrossEntropyLoss()  # 評価時は `CrossEntropyLoss` を使用

    with torch.no_grad():
        for batch_idx, (imgs, labels) in enumerate(val_loader):
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = val_criterion(outputs, labels)

            #loss = criterion(outputs, labels)

            batch_size = labels.size(0)
            val_loss += loss.item() * batch_size
            val_correct += (outputs.argmax(1) == labels).sum().item()
            total_val_samples += batch_size
        
    val_loss /= total_val_samples
    val_acc = val_correct / total_val_samples
        
    return val_loss, val_acc,

In [38]:
# モデルの作成・訓練・評価・ログ記録
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from timm.data.mixup import Mixup
from timm.loss import SoftTargetCrossEntropy

def objective(train_loader, val_loader, num_classes, num_epochs):
    set_seed(42)
    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = create_model(num_classes=num_classes).to(device)
    criterion = SoftTargetCrossEntropy() # Mixupで変換したラベルを受け取る
    val_criterion = nn.CrossEntropyLoss()
    optimizer =optim.AdamW([
        {"params":model.patch_embed.parameters(), "lr":1e-6}, # 埋め込み層
        {"params":model.blocks.parameters(), "lr": 1e-5}, # Transformerブロック
        {"params":model.head.parameters(), "lr":1e-4} # 分類層
    ])
    scheduler = CosineAnnealingLR(optimizer, T_max=5)
    
    mixup_fn = Mixup(mixup_alpha=0.2, cutmix_alpha=1.0, prob=0.5, num_classes=7)

    best_val_loss = float("inf")
    best_epoch = 0
    best_acc = 0
    best_model_state = None

    # ダミー入力データ（モデルの入力形式を記録するだけ）を作成（バッチサイズ1, チャンネル3, 224x224画像）
    input_example = torch.randn(1, 3, 224, 224).to(torch.float32).cpu().numpy() # mlflowはnumpy配列は受け付ける

    with mlflow.start_run() as run:

        run_id = run.info.run_id
        model_filename = f"best_model_{run_id}.pth"

        # ハイパラメータの記録
        mlflow.log_param("learning_rate", 1e-4)
        mlflow.log_param("batch_size", 32)
        mlflow.log_param("num_epochs", num_epochs)
        mlflow.log_param("loss_function", "SoftTargetCrossEntropy")
        mlflow.log_param("optimizer", "AdamW")

        # データセットの情報を記録
        mlflow.log_param("dataset_path", "./data/HAM10000_metadata.csv")
        mlflow.log_param("num_train_samples", len(train_loader.dataset))
        mlflow.log_param("num_val_samples", len(val_loader.dataset))

        # タグを追加
        mlflow.set_tag("model_name", "Vit_finetuning")
        mlflow.set_tag("dataset", "HAM10000")
        mlflow.set_tag("optimizer", "AdamW")
        mlflow.set_tag("experiment", "skin_cancer_classification")

        for epoch in range(num_epochs):
            train_loss, train_acc = train(model, train_loader, criterion, optimizer, device, mixup_fn)
            val_loss, val_acc = evaluate(model, val_loader, val_criterion, device)

            mlflow.log_metric("train_loss", train_loss, step=epoch)
            mlflow.log_metric("train_acc", train_acc, step=epoch)
            mlflow.log_metric("val_loss", val_loss, step=epoch)
            mlflow.log_metric("val_acc", val_acc, step=epoch)

            # ベストモデルを更新
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_epoch = epoch
                best_acc = val_acc
                best_model_state = model.state_dict() # ここで更新
            
            scheduler.step()

            print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")
        
        # ベストモデルの状態をロード
        model.load_state_dict(best_model_state)
        # ベストモデルを保存
        torch.save(best_model_state, model_filename) # モデルの状態を.pthファイルに記録
        mlflow.log_artifact(model_filename) # MLflow にファイルとして記録  
        # 最終的なbest_model_stateをログする
        mlflow.pytorch.log_model(model, "best_model", input_example=input_example)

        return best_epoch, best_val_loss, best_acc, best_model_state            

In [None]:
num_epochs = 10
best_epoch, best_val_loss, best_acc, best_model_state = objective(train_loader, val_loader, num_classes, num_epochs)

print("Best Epoch:", best_epoch)
print("Best Val Loss:", best_val_loss)
print("Best Acc:", best_acc)
print("Best Model Sate:", best_model_state)

Epoch 0: Train Loss: 1.1765, Train Acc: 0.6793, Val Loss: 0.8700, Val Acc: 0.6985
