In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import OneCycleLR

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import time

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import (
    classification_report, 
    accuracy_score, 
    f1_score, 
    cohen_kappa_score
)
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# ==========================================
# Configuration
# ==========================================
BATCH_SIZE = 64        # mini-batch size of the training DataLoader
NUM_EPOCHS = 100       # upper bound on epochs; also sizes the OneCycleLR schedule
LEARNING_RATE = 1e-3   # optimizer lr and OneCycleLR peak (max_lr)
PATIENCE = 10          # epochs without validation improvement before early stopping
SEED = 42              # seed for every RNG used by the notebook
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the RNGs up front so weight initialisation, DataLoader shuffling and
# dropout are repeatable across "Restart & Run All" executions.
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"Using device: {DEVICE}")

In [None]:
# 1. Load Data
# Read the three pre-split CSV files (train / validation / test).
print("--- Loading Data ---")
df_train = pd.read_csv("dataset/train_interpolated.csv")
df_valid = pd.read_csv("dataset/valid_interpolated.csv")
df_test  = pd.read_csv("dataset/test_interpolated.csv")

# 2. Apply Mapping and Drop Invalid Rows
# CR_ID -> crop name. Rows whose CR_ID is not listed here map to NaN and are dropped.
mapping_crop = {
    1: "Sweet potato", 2: "Pepper", 8: "Aralia", 9: "Perilla",
    10: "Garlic", 11: "Sapling", 12: "Pear", 13: "Cabbage",
    14: "Peach", 16: "Apple", 17: "Sudangrass", 19: "Greenhouse",
    22: "Onion", 23: "Maize", 24: "Yuzu", 27: "Sesame",
    28: "Kiwi", 29: "Soybean", 30: "Grape", 31: "Radish",
}

for df in (df_train, df_valid, df_test):
    df["crop_name"] = df["CR_ID"].map(mapping_crop)
    # In-place on purpose: the loop variable aliases the module-level frames.
    df.dropna(subset=["crop_name"], inplace=True)

# 3. Define Features
# Column names follow "<band>_<YYYYMM>_<1..3>"; the list is ordered band-major:
# all (month, index) combinations of the first band, then the second band, ...
months = [f"2021{m:02d}" for m in range(7, 13)]
bands = ['b02','b03','b04','b05','b06','b07','b08','b8a','b11','b12']
features = []
for band in bands:
    for month in months:
        for part in range(1, 4):
            features.append(f"{band}_{month}_{part}")

# 4. Prepare X and y
# The label encoder is fitted on the training labels only.
le = LabelEncoder()
le.fit(df_train["crop_name"])
n_classes = len(le.classes_)

X_train, y_train = df_train[features].values, le.transform(df_train["crop_name"])
X_valid, y_valid = df_valid[features].values, le.transform(df_valid["crop_name"])
X_test,  y_test  = df_test[features].values,  le.transform(df_test["crop_name"])

# 5. Scaling (MinMaxScaler)
# Fit on the training split only, then apply the same scaling to every split.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_valid_scaled = scaler.transform(X_valid)
X_test_scaled  = scaler.transform(X_test)

# 6. Tensor Conversion & Reshaping
# Dimensions:
n_bands = len(bands)       # 10
n_steps = len(months) * 3  # 18


def _to_step_band_tensor(arr):
    """Convert a scaled (N, n_bands * n_steps) array to a float32 (N, n_steps, n_bands) tensor.

    `features` is built band-major (outer loop over bands, inner loops over
    month and within-month index), so each flat row is laid out as
    (bands, steps). It therefore has to be viewed as (N, n_bands, n_steps)
    first and then transposed; reshaping straight to (N, n_steps, n_bands)
    would mix values of different bands and time steps in every row.
    """
    flat = torch.tensor(arr, dtype=torch.float32)
    return flat.reshape(-1, n_bands, n_steps).permute(0, 2, 1).contiguous()


# Both models below consume (N, steps, bands); ViT adds its own channel dimension.
X_train_t = _to_step_band_tensor(X_train_scaled)
X_valid_t = _to_step_band_tensor(X_valid_scaled)
X_test_t  = _to_step_band_tensor(X_test_scaled)

y_train_t = torch.tensor(y_train, dtype=torch.long)
y_valid_t = torch.tensor(y_valid, dtype=torch.long)
y_test_t  = torch.tensor(y_test,  dtype=torch.long)

# 7. Create DataLoaders
# Lower num_workers on Windows to avoid overhead/errors
train_ds = TensorDataset(X_train_t, y_train_t)
valid_ds = TensorDataset(X_valid_t, y_valid_t)
test_ds  = TensorDataset(X_test_t,  y_test_t)

# Pinned host memory only helps host->GPU copies; on CPU-only runs it just emits a warning.
_pin = DEVICE.type == "cuda"

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, pin_memory=_pin)
val_loader   = DataLoader(valid_ds, batch_size=256, shuffle=False, num_workers=2, pin_memory=_pin)
test_loader  = DataLoader(test_ds,  batch_size=256, shuffle=False, num_workers=2, pin_memory=_pin)

print(f"Data ready. Classes: {n_classes}")
print(f"Input Shape: (Steps={n_steps}, Bands={n_bands})")

In [None]:
# ViT
class PatchEmbed(nn.Module):
    """
    Non-overlapping patch embedding implemented with a strided Conv2d.

    Inputs whose height/width are not multiples of the patch size are
    zero-padded on the bottom/right before the projection.
    """
    def __init__(self, in_chans: int, embed_dim: int, patch_size=(2, 3)):
        super().__init__()
        self.patch_size = patch_size
        # kernel == stride, so every patch is projected exactly once.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x: torch.Tensor):
        """
        Args:
            x: tensor of shape (B, C, H, W).

        Returns:
            (tokens, (grid_h, grid_w)) where tokens has shape
            (B, grid_h * grid_w, embed_dim) and the grid is the number of
            patches along the (padded) height and width.
        """
        _, _, height, width = x.shape
        patch_h, patch_w = self.patch_size

        # Amount needed to reach the next multiple of the patch size (0 if already aligned).
        extra_h = -height % patch_h
        extra_w = -width % patch_w

        if extra_h > 0 or extra_w > 0:
            # F.pad order for the last two dims: (left, right, top, bottom)
            x = F.pad(x, (0, extra_w, 0, extra_h))
            height += extra_h
            width += extra_w

        projected = self.proj(x)                        # (B, E, H/ph, W/pw)
        tokens = projected.flatten(2).transpose(1, 2)   # (B, N_patches, E)

        return tokens, (height // patch_h, width // patch_w)


def _get_2d_sincos_pos_embed(embed_dim: int, grid_h: int, grid_w: int, device):
    """
    Generates 2D sinusoidal positional embeddings.
    Not learnable, so it handles variable input sizes safely.
    """
    grid_y = torch.arange(grid_h, device=device, dtype=torch.float32)
    grid_x = torch.arange(grid_w, device=device, dtype=torch.float32)
    yy, xx = torch.meshgrid(grid_y, grid_x, indexing='ij')
    
    omega = torch.arange(embed_dim // 4, device=device, dtype=torch.float32)
    omega = 1.0 / (10000 ** (omega / (embed_dim // 4)))
    
    out = []
    for pos in (yy.reshape(-1), xx.reshape(-1)):
        pos = pos.unsqueeze(1) * omega.unsqueeze(0)
        out.append(torch.sin(pos))
        out.append(torch.cos(pos))
        
    pe = torch.cat(out, dim=1)  # (N, D)
    return pe.unsqueeze(0)      # (1, N, D)


class ViT(nn.Module):
    """
    Vision-Transformer classifier over a (bands x steps) single-channel "image".

    The input is patchified by `PatchEmbed`, given fixed 2D sin-cos positional
    embeddings, passed through a pre-norm Transformer encoder and classified
    from the CLS token (or the mean of all tokens when `use_cls` is False).

    Args:
        n_bands: number of bands (image height after layout normalisation).
        n_steps: number of time steps (image width after layout normalisation).
        num_classes: number of output logits.
        embed_dim: token width; must be a multiple of 4 for the sin-cos embedding.
        depth: number of encoder layers.
        num_heads: attention heads per layer.
        mlp_ratio: feed-forward width as a multiple of embed_dim.
        dropout: dropout used inside the encoder layers.
        patch_size: (height, width) of each patch.
        use_cls: prepend a learnable CLS token and classify from it.
    """
    def __init__(self, n_bands, n_steps, num_classes, embed_dim=256, depth=6,
                 num_heads=8, mlp_ratio=4.0, dropout=0.1, patch_size=(2,3), use_cls=True):
        super().__init__()
        if embed_dim % 4 != 0:
            # Fail at construction time instead of with a shape mismatch in forward().
            raise ValueError(f"embed_dim must be a multiple of 4, got {embed_dim}")

        self.n_bands = n_bands
        self.n_steps = n_steps
        self.use_cls = use_cls

        # Patch Embedding
        self.patch_embed = PatchEmbed(1, embed_dim, patch_size)

        # Class Token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if use_cls else None

        # Transformer Encoder
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim, nhead=num_heads,
            dim_feedforward=int(embed_dim*mlp_ratio),
            dropout=dropout, activation='gelu', batch_first=True, norm_first=True
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=depth)

        self.norm = nn.LayerNorm(embed_dim)
        self.head = nn.Linear(embed_dim, num_classes)

    def _to_image(self, x: torch.Tensor) -> torch.Tensor:
        """Normalise the supported input layouts to (B, 1, H, W).

        Accepted layouts:
            (B, n_bands, n_steps)      -> channel dim added
            (B, n_steps, n_bands)      -> transposed, channel dim added
            (B, 1, H, W)               -> returned unchanged
            (B, n_bands, n_steps, 1)   -> trailing dim moved to the channel slot
            (B, n_steps, n_bands, 1)   -> same, after transposing

        Raises:
            ValueError: for any other shape.
        """
        if x.dim() == 3:
            layout = tuple(x.shape[1:])
            # (n_bands, n_steps) is checked first, so it wins when n_bands == n_steps.
            if layout == (self.n_bands, self.n_steps):
                return x.unsqueeze(1)
            if layout == (self.n_steps, self.n_bands):
                return x.permute(0, 2, 1).unsqueeze(1)
            raise ValueError(f"Unexpected 3D shape {x.shape}")

        if x.dim() == 4:
            if x.size(1) == 1:
                return x
            layout = tuple(x.shape[1:])
            # The old code called unsqueeze(1) here, which produced a 5D tensor
            # that PatchEmbed cannot unpack. Relocate the singleton dim instead.
            if layout == (self.n_bands, self.n_steps, 1):
                return x.squeeze(-1).unsqueeze(1)
            if layout == (self.n_steps, self.n_bands, 1):
                return x.squeeze(-1).permute(0, 2, 1).unsqueeze(1)
            raise ValueError(f"Unexpected 4D shape {x.shape}")

        raise ValueError(f"Expected a 3D or 4D input, got shape {x.shape}")

    def forward(self, x: torch.Tensor):
        """Return class logits of shape (B, num_classes)."""
        # Input handling: Ensure shape is (B, 1, H, W)
        x = self._to_image(x)

        # 1. Patch Embed
        x, (gh, gw) = self.patch_embed(x) # -> (B, N, E)
        B, N, E = x.shape

        # 2. Add Positional Embeddings
        pe = _get_2d_sincos_pos_embed(E, gh, gw, x.device)

        if self.use_cls:
            cls = self.cls_token.expand(B, -1, -1)
            x = torch.cat([cls, x], dim=1)
            # The CLS token carries no spatial position: give it an all-zero embedding.
            pe = torch.cat([torch.zeros(1, 1, E, device=x.device), pe], dim=1)

        x = x + pe

        # 3. Encoder
        x = self.encoder(x)

        # 4. Classification Head
        # Use CLS token if enabled, else global average pooling
        out = x[:, 0] if self.use_cls else x.mean(dim=1)
        out = self.norm(out)

        return self.head(out)

# Initialize Model
# Build the ViT on the host first, then move its parameters to the target device.
model = ViT(
    n_bands, n_steps, n_classes,
    embed_dim=256, depth=6, num_heads=8,
    mlp_ratio=4.0, dropout=0.1,
    patch_size=(2, 3), use_cls=True,
)
model = model.to(DEVICE)


In [None]:
# ViT TRAIN
# 1. Class Weights
# 'balanced' weights are inversely proportional to class frequency in y_train.
class_weight_array = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = torch.tensor(class_weight_array, dtype=torch.float32, device=DEVICE)

# 2. Loss, Optimizer, Scheduler
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

# OneCycleLR Scheduler
# Sized for the full NUM_EPOCHS run; early stopping simply ends the cycle sooner.
scheduler = OneCycleLR(
    optimizer, 
    max_lr=LEARNING_RATE,
    epochs=NUM_EPOCHS, 
    steps_per_epoch=len(train_loader),
    pct_start=0.1, 
    div_factor=10.0, 
    final_div_factor=1000.0
)

# 3. Mixed Precision Scaler
# AMP is only enabled on CUDA; on CPU the scaler and autocast are no-ops.
use_amp = DEVICE.type == 'cuda'
grad_scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

max_grad_norm = 2.0
best_val_loss = float('inf')
patience_cnt = 0
SAVE_PATH = "weights/ViT.pt"

# torch.save does not create missing directories, so make sure the target exists
# before the first checkpoint is written (otherwise epoch 1 crashes on a fresh checkout).
save_dir = os.path.dirname(SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

print("--- Starting ViT Training ---")

for epoch in range(1, NUM_EPOCHS + 1):
    # Train Phase
    model.train()
    train_loss_sum = 0.0

    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad(set_to_none=True)

        # Mixed Precision Forward
        with torch.cuda.amp.autocast(enabled=use_amp):
            logits = model(xb)
            loss = criterion(logits, yb)

        # Backward & Step
        grad_scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradient norm.
        grad_scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        grad_scaler.step(optimizer)
        grad_scaler.update()

        scheduler.step() # Step per batch for OneCycleLR

        # Weight the batch-mean loss by batch size to get a per-sample epoch average.
        train_loss_sum += loss.item() * xb.size(0)

    train_loss = train_loss_sum / len(train_loader.dataset)

    # Validation Phase
    model.eval()
    val_loss_sum = 0.0
    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=use_amp):
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                logits = model(xb)
                loss = criterion(logits, yb)
                val_loss_sum += loss.item() * xb.size(0)

    val_loss = val_loss_sum / len(val_loader.dataset)
    current_lr = optimizer.param_groups[0]['lr']

    print(f"[{epoch:3d}/{NUM_EPOCHS}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {current_lr:.6g}")

    # Early Stopping
    # An epoch only counts as an improvement if it beats the best loss by more than 1e-4.
    if val_loss < best_val_loss - 1e-4:
        best_val_loss = val_loss
        torch.save(model.state_dict(), SAVE_PATH)
        patience_cnt = 0
    else:
        patience_cnt += 1
        if patience_cnt >= PATIENCE:
            print(f"Early stopping triggered at epoch {epoch}")
            break

In [None]:
# ViT TEST
# 1. Load Best Model
# The path is spelled out here rather than read from SAVE_PATH because a later
# training cell rebinds SAVE_PATH to a different checkpoint.
VIT_WEIGHTS_PATH = "weights/ViT.pt"
print("--- Loading Best ViT Model ---")
# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load(VIT_WEIGHTS_PATH, map_location=DEVICE))
model.eval()

# Calculate Parameters
n_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {n_params}")

# 2. Inference
test_preds, test_labels = [], []
start_time = time.perf_counter()

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        # .cpu() waits for the device to finish, so the timer covers the real compute.
        test_preds.extend(logits.argmax(1).cpu().tolist())
        test_labels.extend(yb.tolist())

end_time = time.perf_counter()

# 3. Metrics
# Wall-clock time per sample; includes data loading and host<->device transfers.
total_time = end_time - start_time
inference_time_per_sample = (total_time / len(test_labels)) * 1000
print(f"\nInference Time: {inference_time_per_sample:.4f} ms/sample")

print(f"Test Accuracy: {accuracy_score(test_labels, test_preds):.4f}")
print(f"Macro F1: {f1_score(test_labels, test_preds, average='macro'):.4f}")
print(f"Cohen's Kappa: {cohen_kappa_score(test_labels, test_preds):.4f}")
print("-" * 60)
print(classification_report(test_labels, test_preds, target_names=le.classes_, digits=4))

In [None]:
# STTRE model (labelled "SSTRE" in the training/test cells and in the weights filename)

def causal_mask(L: int, device=None):
    """Return an (L, L) boolean mask that is True strictly above the diagonal.

    Entry [i, j] is True exactly when j > i.
    """
    positions = torch.arange(L, device=device)
    # Column index greater than row index <=> strictly upper-triangular.
    return positions.unsqueeze(0) > positions.unsqueeze(1)

class STTREInputEmbedding(nn.Module):
    """Embed every scalar of an [B, L, M] input as a d_model-wide token.

    Each value is projected with a shared Linear(1, d_model), then summed with
    a learned embedding of its index along L and a learned embedding of its
    index along M, and finally layer-normalised.
    """
    def __init__(self, L: int, M: int, d_model: int):
        super().__init__()
        self.value_proj = nn.Linear(1, d_model)
        self.pos_emb = nn.Embedding(L, d_model)
        self.var_emb = nn.Embedding(M, d_model)
        self.norm = nn.LayerNorm(d_model)
        # Index buffers move with the module across devices but are excluded from the state_dict.
        self.register_buffer("pos_idx", torch.arange(L).long(), persistent=False)
        self.register_buffer("var_idx", torch.arange(M).long(), persistent=False)

    def forward(self, x):  # x: [B, L, M]
        _, seq_len, n_vars = x.shape
        values = self.value_proj(x.unsqueeze(-1))                 # [B, L, M, d]
        # Shape the two embedding tables so they broadcast over the other axes.
        step_emb = self.pos_emb(self.pos_idx).view(1, seq_len, 1, -1)
        var_emb = self.var_emb(self.var_idx).view(1, 1, n_vars, -1)
        return self.norm(values + step_emb + var_emb)             # [B, L, M, d]

class AttnFFNBlock(nn.Module):
    """Post-norm Transformer block: self-attention then a 2x-wide GELU MLP.

    Both sub-layers are wrapped as LayerNorm(input + sublayer(input)).
    Dropout is applied only inside the attention module.
    """
    def __init__(self, d_model, nhead=4, dropout=0.1):
        super().__init__()
        hidden = 2 * d_model
        self.mha = nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True)
        self.ffn = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.GELU(),
            nn.Linear(hidden, d_model),
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, z, attn_mask=None):  # z: [B, T, d]
        # Self-attention: queries, keys and values all come from z.
        attended = self.mha(z, z, z, attn_mask=attn_mask)[0]
        hidden = self.norm1(z + attended)
        return self.norm2(hidden + self.ffn(hidden))

class TemporalStack(nn.Module):
    """Stack of attention blocks that attend along the L axis.

    Every one of the M series is treated as an independent length-L sequence
    by folding M into the batch dimension.
    """
    def __init__(self, L, d_model, nhead=4, dropout=0.1, depth=3):
        super().__init__()
        self.L = L
        blocks = [AttnFFNBlock(d_model, nhead, dropout) for _ in range(depth)]
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):  # [B, L, M, d]
        batch, steps, n_vars, dim = x.shape
        # Bring M next to the batch dim, then merge: [B, M, L, d] -> [B*M, L, d]
        seq = x.transpose(1, 2).reshape(batch * n_vars, steps, dim)

        for block in self.layers:
            seq = block(seq)  # no attention mask: every step sees every other step

        # Undo the folding and restore the [B, L, M, d] layout.
        return seq.view(batch, n_vars, steps, dim).transpose(1, 2)

class SpatialStack(nn.Module):
    """Stack of attention blocks that attend along the M axis.

    Every one of the L steps is treated as an independent length-M sequence
    by folding L into the batch dimension.
    """
    def __init__(self, M, d_model, nhead=4, dropout=0.1, depth=3):
        super().__init__()
        blocks = [AttnFFNBlock(d_model, nhead, dropout) for _ in range(depth)]
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):  # [B, L, M, d]
        batch, steps, n_vars, dim = x.shape
        seq = x.flatten(0, 1)                           # [B*L, M, d]
        for block in self.layers:
            seq = block(seq)  # no attention mask
        return seq.view(batch, steps, n_vars, dim)

class SpatioTemporalStack(nn.Module):
    """Stack of attention blocks that attend jointly over all L*M tokens.

    Args:
        LM: total number of tokens (L * M); accepted for signature symmetry
            with the sibling stacks, not used by the layers themselves.
        d_model: token width.
        nhead: attention heads per block.
        dropout: attention dropout.
        depth: number of blocks.
    """
    def __init__(self, LM, d_model, nhead=4, dropout=0.1, depth=3):
        super().__init__()
        self.layers = nn.ModuleList([AttnFFNBlock(d_model, nhead, dropout) for _ in range(depth)])

    def forward(self, x):  # [B, L, M, d]
        B, L, M, d = x.shape
        # reshape (not view) so a non-contiguous input, e.g. a permuted tensor,
        # is accepted as well -- consistent with SpatialStack.
        z = x.reshape(B, L * M, d)                      # [B, LM, d]
        for layer in self.layers:
            z = layer(z, attn_mask=None)
        z = z.reshape(B, L, M, d)
        return z

class STTREClassifier(nn.Module):
    """Three-branch attention classifier over an [B, L, M] input.

    The embedded tokens are processed in parallel by a temporal, a spatial and
    a joint spatio-temporal attention stack. The branch outputs are
    concatenated on the feature axis, projected back to d_model, averaged over
    all L*M tokens and fed to a LayerNorm -> Dropout -> Linear head.
    """
    def __init__(self, L: int, M: int, num_classes: int,
                 d_model: int = 32, nhead: int = 4, dropout: float = 0.1, depth: int = 3):
        super().__init__()
        self.embed = STTREInputEmbedding(L, M, d_model)
        self.temporal = TemporalStack(L, d_model, nhead, dropout, depth)
        self.spatial  = SpatialStack(M, d_model, nhead, dropout, depth)
        self.st_block = SpatioTemporalStack(L * M, d_model, nhead, dropout, depth)
        # Fuses the three concatenated branch outputs back to d_model.
        self.proj = nn.Linear(d_model * 3, d_model)
        self.head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Dropout(dropout),
            nn.Linear(d_model, num_classes)
        )

    def forward(self, x):  # x: [B, L, M]
        tokens = self.embed(x)                                   # [B, L, M, d]
        # Each branch sees the same embedded tokens; outputs keep the [B, L, M, d] shape.
        branch_outputs = [
            branch(tokens)
            for branch in (self.temporal, self.spatial, self.st_block)
        ]
        fused = self.proj(torch.cat(branch_outputs, dim=-1))     # [B, L, M, 3d] -> [B, L, M, d]
        pooled = fused.mean(dim=(1, 2))                          # global avg pool -> [B, d]
        return self.head(pooled)                                 # logits: [B, num_classes]
    
    
# Build the STTRE classifier on the host, then move it to the target device.
model = STTREClassifier(
    L=n_steps,
    M=n_bands,
    num_classes=n_classes,
    d_model=32,
    nhead=4,
    dropout=0.2,
    depth=2,
)
model = model.to(DEVICE)


In [None]:
# SSTRE TRAIN
# 1. Class Weights
# 'balanced' weights are inversely proportional to class frequency in y_train.
class_weight_array = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = torch.tensor(class_weight_array, dtype=torch.float32, device=DEVICE)

# 2. Loss, Optimizer, Scheduler
criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-4)

# OneCycleLR Scheduler
# Sized for the full NUM_EPOCHS run; early stopping simply ends the cycle sooner.
scheduler = OneCycleLR(
    optimizer, 
    max_lr=LEARNING_RATE,
    epochs=NUM_EPOCHS, 
    steps_per_epoch=len(train_loader),
    pct_start=0.1, 
    div_factor=10.0, 
    final_div_factor=1000.0
)

# 3. Mixed Precision Scaler
# AMP is only enabled on CUDA; on CPU the scaler and autocast are no-ops.
use_amp = DEVICE.type == 'cuda'
grad_scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

max_grad_norm = 2.0
best_val_loss = float('inf')
patience_cnt = 0
SAVE_PATH = "weights/SSTRE.pt"

# torch.save does not create missing directories, so make sure the target exists
# before the first checkpoint is written (otherwise epoch 1 crashes on a fresh checkout).
save_dir = os.path.dirname(SAVE_PATH)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

print("--- Starting SSTRE Training ---")

for epoch in range(1, NUM_EPOCHS + 1):
    # Train Phase
    model.train()
    train_loss_sum = 0.0

    for xb, yb in train_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        optimizer.zero_grad(set_to_none=True)

        # Mixed Precision Forward
        with torch.cuda.amp.autocast(enabled=use_amp):
            logits = model(xb)
            loss = criterion(logits, yb)

        # Backward & Step
        grad_scaler.scale(loss).backward()
        # Unscale before clipping so the threshold applies to the true gradient norm.
        grad_scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        grad_scaler.step(optimizer)
        grad_scaler.update()

        scheduler.step() # Step per batch for OneCycleLR

        # Weight the batch-mean loss by batch size to get a per-sample epoch average.
        train_loss_sum += loss.item() * xb.size(0)

    train_loss = train_loss_sum / len(train_loader.dataset)

    # Validation Phase
    model.eval()
    val_loss_sum = 0.0
    with torch.no_grad():
        with torch.cuda.amp.autocast(enabled=use_amp):
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                logits = model(xb)
                loss = criterion(logits, yb)
                val_loss_sum += loss.item() * xb.size(0)

    val_loss = val_loss_sum / len(val_loader.dataset)
    current_lr = optimizer.param_groups[0]['lr']

    print(f"[{epoch:3d}/{NUM_EPOCHS}] Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | LR: {current_lr:.6g}")

    # Early Stopping
    # An epoch only counts as an improvement if it beats the best loss by more than 1e-4.
    if val_loss < best_val_loss - 1e-4:
        best_val_loss = val_loss
        torch.save(model.state_dict(), SAVE_PATH)
        patience_cnt = 0
    else:
        patience_cnt += 1
        if patience_cnt >= PATIENCE:
            print(f"Early stopping triggered at epoch {epoch}")
            break

In [None]:
# SSTRE TEST
# 1. Load Best Model
# The path is spelled out here rather than read from SAVE_PATH, which other
# training cells rebind to a different checkpoint.
SSTRE_WEIGHTS_PATH = "weights/SSTRE.pt"
print("--- Loading Best SSTRE Model ---")
# map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
model.load_state_dict(torch.load(SSTRE_WEIGHTS_PATH, map_location=DEVICE))
model.eval()

# Calculate Parameters
n_params = sum(p.numel() for p in model.parameters())
print(f"Total Parameters: {n_params}")

# 2. Inference
test_preds, test_labels = [], []
start_time = time.perf_counter()

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(DEVICE)
        logits = model(xb)
        # .cpu() waits for the device to finish, so the timer covers the real compute.
        test_preds.extend(logits.argmax(1).cpu().tolist())
        test_labels.extend(yb.tolist())

end_time = time.perf_counter()

# 3. Metrics
# Wall-clock time per sample; includes data loading and host<->device transfers.
total_time = end_time - start_time
inference_time_per_sample = (total_time / len(test_labels)) * 1000
print(f"\nInference Time: {inference_time_per_sample:.4f} ms/sample")

print(f"Test Accuracy: {accuracy_score(test_labels, test_preds):.4f}")
print(f"Macro F1: {f1_score(test_labels, test_preds, average='macro'):.4f}")
print(f"Cohen's Kappa: {cohen_kappa_score(test_labels, test_preds):.4f}")
print("-" * 60)
print(classification_report(test_labels, test_preds, target_names=le.classes_, digits=4))