# Check whether a GPU is available and usable

In [None]:
import tensorflow as tf

# Query the visible GPUs once and reuse the list for every check below.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

if not gpu_devices:
    print("TensorFlow is NOT using a GPU. Training will be very slow.")
else:
    print("GPU Name: ", gpu_devices[0].name)
    try:
        # Smoke test: run a small (2x3) @ (3x2) product pinned to the first GPU.
        with tf.device('/device:GPU:0'):
            a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
            b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
            c = tf.linalg.matmul(a, b)
        print("Matrix multiplication on GPU successful:", c)
    except RuntimeError as e:
        print("Error during GPU test:", e)

# Report the numeric-stack versions alongside the GPU status.
import numpy
import scipy
print("NumPy version: " + numpy.__version__)
print("SciPy version: " + scipy.__version__)

# Import libraries

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam, AdamW
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint,CSVLogger
from tensorflow.keras.models import load_model
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization, MultiHeadAttention, Conv2D
import numpy as np
import os
import shutil
import math

# Import Dataset

In [4]:
# Dataset root directory on the local machine; both names refer to the same path.
datapath = train_path = '/media/capybara/Data/dataset_vit/archive'

# Set the configuration for ViT structure.

In [6]:
# --- Model run configuration ---

# Input geometry
IMAGE_SIZE = 224   # side length used for both image height and width
PATCH_SIZE = 16    # side length of one patch

# Data
BATCH_SIZE = 32      # may be adjusted to fit the available memory
NUM_CLASSES = 1000   # number of classes in the dataset

# Transformer dimensions
EMBED_DIM = 384          # reduced size for experimentation
NUM_HEADS = 8            # number of attention heads
NUM_LAYERS = 8           # number of encoder layers
MLP_DIM = EMBED_DIM * 4  # hidden width of the MLP, four times the embedding size

# Regularisation and schedule length
DROPOUT_RATE = 0.1
EPOCHS = 50   # raised so the model has enough time to learn

# Preprocess Image before training the model

In [None]:
# Let TensorFlow choose a suitable level of parallelism on its own.
AUTOTUNE = tf.data.AUTOTUNE

# Lower-case aliases of the global configuration used by the data pipeline.
img_height = img_width = IMAGE_SIZE
batch_size = BATCH_SIZE

# # --- Remove unwanted directories (bad_dirs) ---
# bad_dirs = ['.ipynb_checkpoints']
# for bad in bad_dirs:
#     path_to_check = os.path.join(datapath, bad) # named path_to_check so it does not clash with other `path` variables
#     if os.path.exists(path_to_check):
#         print(f"Removing directory: {path_to_check}")
#         shutil.rmtree(path_to_check)

# --- 1. Load the datasets with image_dataset_from_directory ---
# The utility reads from the directory, resizes the images, builds the batches
# and performs the train/validation split.

# Arguments common to both subsets: the same split fraction and seed are passed
# to the training and the validation call.
_shared_loader_args = dict(
    validation_split=0.04,
    seed=42,
    image_size=(img_height, img_width),
    batch_size=batch_size,       # datasets are produced already batched
    label_mode='categorical',    # labels come out one-hot encoded
)

# Training subset: shuffled.
train_dataset = tf.keras.utils.image_dataset_from_directory(
    datapath,
    subset="training",
    shuffle=True,
    **_shared_loader_args,
)

# Validation subset: no shuffling needed.
val_dataset = tf.keras.utils.image_dataset_from_directory(
    datapath,
    subset="validation",
    shuffle=False,
    **_shared_loader_args,
)

# Show the class names and build a name -> index mapping
# (similar to ImageDataGenerator's .class_indices).
class_names = train_dataset.class_names
print("Class names:", class_names)
class_indices = dict(zip(class_names, range(len(class_names))))
print("Class indices:", class_indices)


# --- 2. Build the data-augmentation model from Keras preprocessing layers ---
# Layers are appended in the order they are applied; rescaling comes first.
data_augmentation_layers = tf.keras.Sequential()
data_augmentation_layers.add(tf.keras.layers.Rescaling(1./255))
data_augmentation_layers.add(tf.keras.layers.RandomFlip("horizontal"))
data_augmentation_layers.add(
    tf.keras.layers.RandomRotation(factor=(-10/360, 10/360), fill_mode='nearest')
)
data_augmentation_layers.add(
    tf.keras.layers.RandomTranslation(
        height_factor=0.1, width_factor=0.1, fill_mode='nearest'
    )
)
data_augmentation_layers.add(
    tf.keras.layers.RandomZoom(
        height_factor=(-0.1, 0.1), width_factor=(-0.1, 0.1), fill_mode='nearest'
    )
)

# Rescale-only layer, used for the validation set.
rescale_layer_only = tf.keras.layers.Rescaling(1./255)

# --- 3. Define the functions that apply augmentation and preprocessing ---

def augment_and_preprocess_train_data(images, labels):
    """Run the training augmentation stack on `images`; `labels` pass through unchanged.

    `training=True` is forced so that the Random* layers are active.
    """
    return data_augmentation_layers(images, training=True), labels

def preprocess_val_data(images, labels):
    """Rescale validation `images` only (no augmentation); `labels` pass through unchanged."""
    return rescale_layer_only(images), labels

# --- 4. Build efficient input pipelines ---
# Training: augment in parallel, then prefetch so data is prepared ahead of time.
train_pipeline = (
    train_dataset
    .map(augment_and_preprocess_train_data, num_parallel_calls=AUTOTUNE)
    .prefetch(buffer_size=AUTOTUNE)
)

# Validation: rescale only, then prefetch.
val_pipeline = (
    val_dataset
    .map(preprocess_val_data, num_parallel_calls=AUTOTUNE)
    .prefetch(buffer_size=AUTOTUNE)
)

# # --- Inspect the output of the datasets ---
# print("\n--- ตรวจสอบ Output ของ Datasets (หลัง Preprocessing) ---")
# for X_batch_train, y_batch_train in train_pipeline.take(1):
#     print("Shape of first BATCH of TRAIN images:", X_batch_train.shape)
#     print("Data type of TRAIN images:", X_batch_train.dtype)
#     print("Min value in TRAIN images:", tf.reduce_min(X_batch_train).numpy())
#     print("Max value in TRAIN images:", tf.reduce_max(X_batch_train).numpy())
#     print("Shape of first BATCH of TRAIN labels:", y_batch_train.shape)

# for X_batch_val, y_batch_val in val_pipeline.take(1):
#     print("Shape of first BATCH of VAL images:", X_batch_val.shape)
#     print("Data type of VAL images:", X_batch_val.dtype)
#     print("Min value in VAL images:", tf.reduce_min(X_batch_val).numpy())
#     print("Max value in VAL images:", tf.reduce_max(X_batch_val).numpy())
#     print("Shape of first BATCH of VAL labels:", y_batch_val.shape)


Found 1281167 files belonging to 1000 classes.
Using 1229921 files for training.
Found 1281167 files belonging to 1000 classes.
Using 51246 files for validation.
Class names: ['n01440764', 'n01443537', 'n01484850', 'n01491361', 'n01494475', 'n01496331', 'n01498041', 'n01514668', 'n01514859', 'n01518878', 'n01530575', 'n01531178', 'n01532829', 'n01534433', 'n01537544', 'n01558993', 'n01560419', 'n01580077', 'n01582220', 'n01592084', 'n01601694', 'n01608432', 'n01614925', 'n01616318', 'n01622779', 'n01629819', 'n01630670', 'n01631663', 'n01632458', 'n01632777', 'n01641577', 'n01644373', 'n01644900', 'n01664065', 'n01665541', 'n01667114', 'n01667778', 'n01669191', 'n01675722', 'n01677366', 'n01682714', 'n01685808', 'n01687978', 'n01688243', 'n01689811', 'n01692333', 'n01693334', 'n01694178', 'n01695060', 'n01697457', 'n01698640', 'n01704323', 'n01728572', 'n01728920', 'n01729322', 'n01729977', 'n01734418', 'n01735189', 'n01737021', 'n01739381', 'n01740131', 'n01742172', 'n01744401', 'n017

# Calculate steps per epoch

In [None]:
# Sample counts as reported by image_dataset_from_directory for the 0.04 split.
num_train_samples = 1229921
num_val_samples = 51246
# `batch_size` must already be defined by an earlier cell (e.g. batch_size = 32).

# Steps per epoch: round up so the final partial batch is counted.
steps_per_epoch = math.ceil(num_train_samples / batch_size)
print(f"Total training samples (tf.data): {num_train_samples}")
print(f"Batch size (tf.data): {batch_size}")
print(f"Steps per epoch for training (tf.data): {steps_per_epoch}")

# Validation steps: rounded up the same way.
if num_val_samples > 0:
    validation_steps = math.ceil(num_val_samples / batch_size)
    print(f"Total validation samples (tf.data): {num_val_samples}")
    print(f"Validation steps (tf.data): {validation_steps}")
else:
    # Always define `validation_steps` so the later model.fit(...) call does not
    # hit a NameError when there is no validation data.
    validation_steps = None

Total training samples (tf.data): 1229921
Batch size (tf.data): 32
Steps per epoch for training (tf.data): 38436
Total validation samples (tf.data): 51246
Validation steps (tf.data): 1602


## ENCODEBLOCK & VISIONTRANSFORMER

In [None]:
# Corrected Transformer Encoder Block with Pre-LN (Layer Normalization before sub-layer)
class EncoderBlock(Model):
    """Pre-LN Transformer encoder block.

    Two sub-layers, each preceded by LayerNormalization and followed by a
    residual connection:

    1. multi-head self-attention (query, key and value are the same normalized input);
    2. a two-layer MLP with GELU activation.

    Args:
        embed_dim: Width of the token embeddings; also the MLP output width.
        num_heads: Number of attention heads; each head uses
            ``embed_dim // num_heads`` as its key dimension.
        mlp_dim: Hidden width of the MLP.
        dropout_rate: Dropout rate used inside attention, on the attention
            output, and twice inside the MLP.
        **kwargs: Forwarded to ``Model.__init__``.
    """

    def __init__(self, embed_dim, num_heads, mlp_dim, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.mlp_dim = mlp_dim
        self.dropout_rate = dropout_rate

        # Layer Normalization 1 (before Multi-Head Attention)
        self.norm1 = LayerNormalization(epsilon=1e-6, name="norm1")
        # Multi-Head Attention layer
        self.mha = MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim // num_heads,  # key_dim per head
            dropout=dropout_rate,  # Dropout within MHA's attention scores
            name="multi_head_attention"
        )

        self.dropout_mha_output = Dropout(dropout_rate)

        # Layer Normalization 2 (before MLP)
        self.norm2 = LayerNormalization(epsilon=1e-6, name="norm2")
        # MLP Block (Feed-Forward Network)
        self.mlp = Sequential([
            Dense(mlp_dim, activation='gelu', name="mlp_dense_1"),
            Dropout(dropout_rate),  # Dropout within MLP
            Dense(embed_dim, name="mlp_dense_2"),
            Dropout(dropout_rate)  # Dropout after projection, before adding to residual
        ], name="mlp_block")

    def call(self, inputs, training=False):
        """Apply the attention and MLP sub-layers to `inputs`.

        Args:
            inputs: Token sequence tensor; its last dimension is expected to be
                ``embed_dim`` so the residual additions line up.
            training: Enables the dropout layers when True.

        Returns:
            A tensor with the same shape as `inputs`.
        """
        # --- Multi-Head Attention sub-layer (Pre-LN) ---
        # 1. Layer Normalization
        x_norm1 = self.norm1(inputs)
        # 2. Multi-Head Attention (produces attention_output)
        attn_output = self.mha(query=x_norm1, value=x_norm1, key=x_norm1, training=training)
        # 3. Dropout on MHA output
        attn_output_dropped = self.dropout_mha_output(attn_output, training=training)
        # 4. Residual connection
        x_res1 = inputs + attn_output_dropped

        # --- MLP sub-layer (Pre-LN) ---
        # 1. Layer Normalization on the output of the first residual connection
        x_norm2 = self.norm2(x_res1)
        # 2. MLP (its last layer is already a dropout)
        mlp_output = self.mlp(x_norm2, training=training)
        # 3. Residual connection
        x_res2 = x_res1 + mlp_output

        return x_res2

    def get_config(self):
        """Return the constructor arguments so the block can be re-created from a saved config."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "mlp_dim": self.mlp_dim,
            "dropout_rate": self.dropout_rate,
        })
        return config

# Vision Transformer (ViT) model implementation (Unchanged from your provided code, but uses the modified EncoderBlock)
class VisionTransformer(Model):
    """Vision Transformer (ViT) classifier built from Pre-LN `EncoderBlock`s.

    The image is split into non-overlapping patches by a strided convolution,
    a learnable CLS token is prepended, learnable position embeddings are
    added, the sequence runs through `num_layers` encoder blocks, and the
    normalized CLS token is fed to a softmax classification layer.

    Args:
        image_size: Side length of the input images.
        num_classes: Number of output classes.
        patch_size: Side length of one patch (kernel size and stride of the
            patch-embedding convolution).
        embed_dim: Width of the token embeddings.
        num_heads: Attention heads per encoder block.
        num_layers: Number of encoder blocks.
        mlp_dim: Hidden width of each encoder block's MLP.
        dropout_rate: Dropout rate for the position-embedding dropout and the
            encoder blocks.
        **kwargs: Forwarded to ``Model.__init__`` (e.g. ``name``), which is
            also what lets the model be rebuilt from `get_config()`.
    """

    def __init__(self, image_size=IMAGE_SIZE, num_classes=NUM_CLASSES, patch_size=PATCH_SIZE, embed_dim=EMBED_DIM,
                 num_heads=NUM_HEADS, num_layers=NUM_LAYERS, mlp_dim=MLP_DIM, dropout_rate=0.1, **kwargs):
        super().__init__(**kwargs)

        self.image_size = image_size  # kept so get_config() can report it
        self.num_classes = num_classes
        self.patch_size = patch_size
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.mlp_dim = mlp_dim
        self.dropout_rate = dropout_rate
        self.num_patches = (image_size // patch_size) ** 2

        # 1. CLS Token
        self.cls_token = self.add_weight(
            name="cls_token",
            shape=[1, 1, embed_dim],  # (1, 1, D)
            initializer=tf.keras.initializers.RandomNormal(stddev=0.02),
            trainable=True,
        )

        # 2. Position Embedding (one vector per patch plus one for the CLS token)
        self.pos_embed = self.add_weight(
            name="position_embedding",
            shape=[1, self.num_patches + 1, embed_dim],  # (1, num_patches + 1, D)
            initializer=tf.keras.initializers.RandomNormal(stddev=0.02),
            trainable=True,
        )
        self.pos_dropout = Dropout(dropout_rate)

        # Patch embedding layer: kernel == stride == patch size, so every patch
        # is projected independently.
        self.patch_embed = Conv2D(
            filters=embed_dim,
            kernel_size=patch_size,
            strides=patch_size,
            padding='valid',
            name="patch_embed"
        )

        # Transformer encoder layers (Pre-LN EncoderBlock)
        self.encoder_layers = [
            EncoderBlock(embed_dim, num_heads, mlp_dim, dropout_rate) for _ in range(num_layers)
        ]

        # Classifier head
        self.norm_head = LayerNormalization(epsilon=1e-6, name="head_norm")  # norm before the head
        self.head = Dense(num_classes, activation='softmax', name="classification_head")

    def call(self, inputs, training=False):
        """Classify a batch of images.

        Args:
            inputs: Image batch; assumed to be (B, image_size, image_size, C)
                channels-last, matching the dummy input used to build the model.
            training: Enables dropout when True.

        Returns:
            Tensor of shape (B, num_classes) holding softmax class
            probabilities (not raw logits).
        """
        batch_size = tf.shape(inputs)[0]

        # 1. Embed patches
        x = self.patch_embed(inputs)  # (B, H/P, W/P, D)
        x = tf.reshape(x, (batch_size, -1, self.embed_dim))  # (B, num_patches, D)

        # 2. Prepend CLS token
        cls_tokens = tf.tile(self.cls_token, [batch_size, 1, 1])  # (B, 1, D)
        x = tf.concat([cls_tokens, x], axis=1)  # (B, num_patches + 1, D)

        # 3. Add position embedding (broadcast over the batch dimension)
        x = x + self.pos_embed
        x = self.pos_dropout(x, training=training)

        # 4. Transformer Encoder layers
        for encoder in self.encoder_layers:
            x = encoder(x, training=training)

        # 5. Classifier head on the CLS token (first token of the sequence)
        cls_token_output = x[:, 0]
        cls_token_output = self.norm_head(cls_token_output, training=training)
        # The head applies softmax itself, so the result is already probabilities.
        probabilities = self.head(cls_token_output, training=training)

        return probabilities

    def get_config(self):
        """Return the constructor arguments so a saved model can be re-created with the same architecture."""
        config = super().get_config()
        config.update({
            "image_size": self.image_size,
            "num_classes": self.num_classes,
            "patch_size": self.patch_size,
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "num_layers": self.num_layers,
            "mlp_dim": self.mlp_dim,
            "dropout_rate": self.dropout_rate,
        })
        return config

# WarmupCosineDecay

In [None]:
class WarmupCosineDecay(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule: linear warmup followed by cosine decay.

    For steps below `warmup_steps` the rate grows linearly from 0 towards
    `initial_learning_rate`. From `warmup_steps` onwards a `CosineDecay`
    schedule spanning ``decay_steps - warmup_steps`` steps takes over.

    Args:
        initial_learning_rate: Peak learning rate, reached at the end of warmup.
        decay_steps: Total number of steps of the schedule, warmup included.
        warmup_steps: Number of warmup steps.
        alpha: Passed to `CosineDecay`; the minimum rate is
            ``initial_learning_rate * alpha``.
        name: Optional name, stored and returned by `get_config`.
    """

    def __init__(self, initial_learning_rate, decay_steps, warmup_steps, alpha=0.0, name=None):
        super().__init__()
        self.name = name
        self.initial_learning_rate = initial_learning_rate
        self.decay_steps = decay_steps
        self.warmup_steps = warmup_steps
        self.alpha = alpha

        # The cosine part only covers what remains after the warmup.
        remaining_steps = self.decay_steps - self.warmup_steps
        self.cosine_decay_schedule = tf.keras.optimizers.schedules.CosineDecay(
            initial_learning_rate=self.initial_learning_rate,
            decay_steps=remaining_steps,
            alpha=self.alpha
        )

    def __call__(self, step):
        """Return the learning rate for optimizer step `step`."""
        current = tf.cast(step, tf.float32)
        warmup = tf.cast(self.warmup_steps, tf.float32)

        return tf.cond(
            current < warmup,
            # Linear warmup phase.
            lambda: (self.initial_learning_rate / warmup) * current,
            # Cosine decay phase; its step counter starts at the end of warmup.
            lambda: self.cosine_decay_schedule(current - warmup),
        )

    def get_config(self):
        """Return the constructor arguments as a dict."""
        return dict(
            initial_learning_rate=self.initial_learning_rate,
            decay_steps=self.decay_steps,
            warmup_steps=self.warmup_steps,
            alpha=self.alpha,
            name=self.name,
        )

# Model setup and Vision Transformer compile

In [None]:
# Optimizer hyper-parameters
peak_learning_rate = 1e-4  # peak LR to use (alternatives: 1e-3, 3e-4)
weight_decay_rate = 0.05   # weight decay for AdamW (may need tuning)

# Schedule length, expressed in optimizer steps
warmup_epochs = 10  # number of epochs spent warming up
warmup_steps = steps_per_epoch * warmup_epochs
total_steps = steps_per_epoch * EPOCHS

# Learning-rate schedule instance; alpha=0.0 decays the LR to 0 by the end.
lr_schedule = WarmupCosineDecay(
    initial_learning_rate=peak_learning_rate,
    warmup_steps=warmup_steps,
    decay_steps=total_steps,
    alpha=0.0,
)

# Resolve AdamW from whichever namespace this TensorFlow build provides:
# tf.keras.optimizers.AdamW first, then tf.keras.optimizers.experimental.AdamW.
_adamw_cls = getattr(tf.keras.optimizers, "AdamW", None)
if _adamw_cls is None:
    print("tf.keras.optimizers.AdamW not found, trying experimental version.")
    _adamw_cls = getattr(
        getattr(tf.keras.optimizers, "experimental", None), "AdamW", None
    )
if _adamw_cls is None:
    # Fail here with a clear message instead of leaving `optimizer` undefined
    # and crashing later with a NameError in model.compile().
    raise ImportError(
        "Experimental AdamW not found. Please ensure TensorFlow version is "
        "compatible or install tensorflow-addons."
    )

# Both code paths use the same settings, including gradient-norm clipping.
optimizer = _adamw_cls(
    learning_rate=lr_schedule,
    weight_decay=weight_decay_rate,
    clipnorm=1.0,
)
        
# Instantiate the ViT from the global configuration.
model = VisionTransformer(
    image_size=IMAGE_SIZE,
    patch_size=PATCH_SIZE,
    num_classes=NUM_CLASSES,
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    num_layers=NUM_LAYERS,
    mlp_dim=MLP_DIM,
    dropout_rate=DROPOUT_RATE,
)

# Attach optimizer, loss and metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

print("Optimizer set to AdamW with WarmupCosineDecay schedule.")
print(
    "Total training steps: {}, Warmup steps: {}, Peak LR: {}".format(
        total_steps, warmup_steps, peak_learning_rate
    )
)

# Calling the model once on a dummy input forces call() to run, which builds
# all inner layers. The result itself is not needed, hence `_`.
dummy_input = tf.zeros([1, IMAGE_SIZE, IMAGE_SIZE, 3])
_ = model(dummy_input)
# EarlyStopping callback: stop when val_loss has not improved for `patience`
# epochs and roll back to the best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    restore_best_weights=True,
    patience=5,
)

# ModelCheckpoint callback: keep only the best model according to val_loss.
model_checkpoint_path = 'best_vit_model_1000_classes.keras'
model_checkpoint = ModelCheckpoint(
    filepath=model_checkpoint_path,
    monitor='val_loss',        # criterion used to pick the best model
    save_best_only=True,       # save only when the monitored value improves
    save_weights_only=False,   # False: save architecture and weights, not weights only
    verbose=1,                 # print a message whenever a checkpoint is written
)

# Per-epoch metrics are appended to a CSV file.
csv_logger = CSVLogger(filename='training_log.csv', append=True)

# Print the model summary.
model.summary()

# Training Model

In [None]:
# Train the model
print(
    "Starting training with image_size={}, patch_size={}, embed_dim={}".format(
        IMAGE_SIZE, PATCH_SIZE, EMBED_DIM
    )
)
# Fit on the augmented training pipeline and validate on the rescale-only
# pipeline, using the step counts computed earlier.
history = model.fit(
    x=train_pipeline,
    validation_data=val_pipeline,
    epochs=EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_steps=validation_steps,
    callbacks=[early_stopping, model_checkpoint, csv_logger],
)
print("Training completed.")


# Evaluate the model (Validation)

In [None]:
print("\n--- Evaluating the BEST model saved by ModelCheckpoint ---")
# Load the best model written by ModelCheckpoint; the path must match the
# `filepath` given to that callback.
best_model_path = model_checkpoint_path
if os.path.exists(best_model_path):
    # The checkpoint contains custom classes, so they are passed to the loader
    # explicitly. compile=False skips restoring the saved optimizer because the
    # model is compiled again right below.
    best_model = load_model(
        best_model_path,
        custom_objects={
            "VisionTransformer": VisionTransformer,
            "EncoderBlock": EncoderBlock,
            "WarmupCosineDecay": WarmupCosineDecay,
        },
        compile=False,
    )

    # Compile explicitly so the loss and metric used for evaluation are known.
    best_model.compile(optimizer=optimizer,
                       loss='categorical_crossentropy',
                       metrics=['accuracy'])

    val_loss, val_accuracy = best_model.evaluate(val_pipeline)
    print(f"Validation Loss (Best Model): {val_loss}")
    print(f"Validation Accuracy (Best Model): {val_accuracy}")
else:
    print(f"Error: Best model file '{best_model_path}' not found. Training might not have completed or saved a model.")