# Environment & basic diagnostics (optional)

In [None]:
import sys, platform
import numpy as np

# Report interpreter, OS and NumPy versions so a run can be reproduced later.
for caption, value in (
    ("Python  :", sys.version.split()[0]),
    ("Platform:", platform.platform()),
    ("NumPy   :", np.__version__),
):
    print(caption, value)

# Optional TensorFlow setup: switch on GPU memory growth for every visible GPU.
# Any exception raised in this section (a failed import or a failure while
# configuring the devices) is reported with the same message below and does
# not stop the notebook.
try:
    import tensorflow as tf

    gpus = tf.config.list_physical_devices("GPU")
    if not gpus:
        print("TF GPUs  : none")
    else:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"TF GPUs  : {len(gpus)} (memory growth enabled)")
except Exception as e:
    print(f"TensorFlow not used / not available ({e})")


# Config & paths

In [None]:
from pathlib import Path

# Dataset root: every immediate subfolder is treated as one class, and the
# subfolder name is used as the class label.
DATA_DIR = Path("/workspace/masaboe_gmail.com/aboe/keris/baselineOriginalNoBG/pamor")

# Destination folder for the generated .npy arrays and the label mapping CSV.
# It is created here (including missing parents) if it does not exist yet.
OUT_NPY_DIR = Path("/workspace/masaboe_gmail.com/aboe/keris/baselineOriginalNoBG/PAMOR_/npy")
OUT_NPY_DIR.mkdir(parents=True, exist_ok=True)

# Image processing
TARGET_SIZE = (128, 128)     # passed to PIL resize as (W, H); square, so the resulting arrays are (128, 128, 3)
# Accepted file extensions. Entries must be lowercase because file suffixes
# are lowercased before they are compared against this set.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

# Splits
RANDOM_SEED = 42             # random_state used for both train_test_split calls
TRAIN_FRAC = 0.77            # train share; the remaining 0.23 is split into val/test (0.115/0.115)
VAL_FRAC_OF_TEMP = 0.5       # share of the held-out 0.23 that goes to validation; the rest goes to test

# Echo the resolved locations so the cell output records where data is read and written.
print("DATA_DIR    :", DATA_DIR)
print("OUT_NPY_DIR :", OUT_NPY_DIR)


# Inspect image size variability per class (optional but nice for appendix)

In [None]:
from PIL import Image

def check_image_sizes_per_class(root: Path) -> dict[str, list[tuple[int, int]]]:
    """
    Collect the distinct image dimensions found in each class folder.

    Every immediate subdirectory of ``root`` is treated as one class. Only
    regular files whose lowercased suffix is in ``IMG_EXTS`` are opened.
    Files that cannot be read are reported with a warning and skipped.

    Args:
      root: directory whose subfolders are the class folders.

    Returns:
      dict mapping class folder name -> sorted list of unique (W, H) sizes.
      Classes are inserted in sorted folder order; a class without readable
      images maps to an empty list.
    """
    class_folders = sorted(entry for entry in root.iterdir() if entry.is_dir())

    sizes_by_class = {}
    for folder in class_folders:
        seen_sizes = set()
        for fp in folder.iterdir():
            # Skip sub-directories and other non-regular entries first.
            if not fp.is_file():
                continue
            if fp.suffix.lower() not in IMG_EXTS:
                continue
            try:
                with Image.open(fp) as im:
                    seen_sizes.add(im.size)  # PIL reports size as (W, H)
            except Exception as e:
                print(f"[WARN] Failed reading: {fp} | {e}")
        sizes_by_class[folder.name] = sorted(seen_sizes)

    return sizes_by_class

# Print the unique (W, H) sizes found per class; 'None' marks a class with no readable images.
size_report = check_image_sizes_per_class(DATA_DIR)
for class_name, unique_sizes in size_report.items():
    shown = unique_sizes if unique_sizes else "None"
    print(f"Class '{class_name}': unique sizes = {shown}")


# Load images (RGB), resize, stack to array

In [None]:
import numpy as np
from PIL import Image

def load_images_rgb(root: Path, target_size=(128, 128)) -> tuple[np.ndarray, np.ndarray]:
    """
    Load every image under ``root/class_name/*.ext`` as a resized RGB array.

    Class folders and the files inside them are visited in name order, so the
    sample order is deterministic for a given directory tree. Only regular
    files whose lowercased suffix is in ``IMG_EXTS`` are considered. A file
    that fails to open, convert or resize is reported with a warning and
    skipped.

    Args:
      root: directory whose immediate subfolders are the class folders.
      target_size: (W, H) passed to PIL ``resize``. The resulting arrays are
        laid out as (H, W, 3).

    Returns:
      X: uint8 array, shape (N, H, W, 3)
      y: object array of class-folder names, shape (N,)

    Raises:
      RuntimeError: if no image could be loaded from ``root``.
    """
    X_list = []
    y_list = []

    class_dirs = sorted(
        (p for p in root.iterdir() if p.is_dir()),
        key=lambda p: p.name,
    )

    for class_dir in class_dirs:
        label = class_dir.name
        files = sorted(
            (f for f in class_dir.iterdir() if f.is_file() and f.suffix.lower() in IMG_EXTS),
            key=lambda p: p.name,
        )

        for fp in files:
            try:
                # Open inside a context manager so the file handle is released
                # as soon as the pixels are decoded. Without this, handles
                # stay open until garbage collection and can run out on
                # large datasets.
                with Image.open(fp) as im:
                    img = im.convert("RGB").resize(
                        target_size, resample=Image.Resampling.LANCZOS
                    )
                arr = np.asarray(img, dtype=np.uint8)  # (H, W, 3)
            except Exception as e:
                print(f"[WARN] Failed processing: {fp} | {e}")
                continue

            X_list.append(arr)
            y_list.append(label)

    if not X_list:
        raise RuntimeError(f"No valid images found under: {root}")

    X = np.stack(X_list, axis=0)          # (N, H, W, 3)
    y = np.asarray(y_list, dtype=object)  # string labels

    return X, y

# Load the whole dataset into memory, then report array shapes, dtypes and the class names found.
X_u8, y_str = load_images_rgb(DATA_DIR, target_size=TARGET_SIZE)

for tag, loaded in (("X_u8:", X_u8), ("y   :", y_str)):
    print(tag, loaded.shape, loaded.dtype)
print("Classes:", sorted(set(y_str.tolist())))


# Quick preview (optional)

In [None]:
import matplotlib.pyplot as plt

def preview_per_class(X: np.ndarray, y: np.ndarray, max_per_class: int = 5):
    """
    Show one row of sample images per class.

    For each distinct value in ``y``, the first ``max_per_class`` matching
    samples of ``X`` are drawn side by side in their own figure, each titled
    with the class value. Nothing is returned.
    """
    for cls in np.unique(y):
        idxs = np.where(y == cls)[0][:max_per_class]
        n_shown = idxs.size
        if n_shown == 0:
            continue

        # One figure per class, 3 inches per thumbnail.
        plt.figure(figsize=(3 * n_shown, 3))
        for column, sample_idx in enumerate(idxs, start=1):
            plt.subplot(1, n_shown, column)
            plt.imshow(X[sample_idx])
            plt.title(str(cls))
            plt.axis("off")
        plt.tight_layout()
        plt.show()

# Visual sanity check: show up to five of the loaded uint8 images for each class.
preview_per_class(X_u8, y_str, max_per_class=5)


# Normalize to float32 [0,1]

In [None]:
# Convert the uint8 pixels to float32 and scale them into [0, 1].
X = X_u8.astype(np.float32)  # (N, H, W, 3) float32
X /= 255.0
print("X:", X.shape, X.dtype, f"min={X.min():.3f}, max={X.max():.3f}")

# Class distribution

In [None]:
# Count how many samples each class has and print one "label: count" line per class.
unique_vals, counts = np.unique(y_str, return_counts=True)
for class_label, n_samples in zip(unique_vals, counts):
    print(f"{class_label}: {n_samples}")


# Stratified split (Train / Val / Test)

In [None]:
from sklearn.model_selection import train_test_split

# Stage 1: keep TRAIN_FRAC of the samples for training and hold out the rest
# as a temporary pool, stratified on the string labels.
temp_frac = 1.0 - TRAIN_FRAC
X_train, X_temp, y_train_str, y_temp_str = train_test_split(
    X,
    y_str,
    test_size=temp_frac,
    stratify=y_str,
    random_state=RANDOM_SEED,
)

# Stage 2: divide the temporary pool into validation and test, again
# stratified and with the same seed.
test_frac_of_temp = 1.0 - VAL_FRAC_OF_TEMP
X_val, X_test, y_val_str, y_test_str = train_test_split(
    X_temp,
    y_temp_str,
    test_size=test_frac_of_temp,
    stratify=y_temp_str,
    random_state=RANDOM_SEED,
)

for split_name, split_X, split_y in (
    ("Train:", X_train, y_train_str),
    ("Val  :", X_val, y_val_str),
    ("Test :", X_test, y_test_str),
):
    print(split_name, split_X.shape, split_y.shape)


# Build ONE label mapping from TRAIN, apply to all splits (fix critical bug)

In [None]:
# Derive the class list from the TRAIN labels only. Sorting makes the
# label -> index assignment stable and reproducible.
classes = sorted(np.unique(y_train_str).tolist())
num_classes = len(classes)

label2idx = dict(zip(classes, range(num_classes)))  # class name -> integer index
idx2label = dict(enumerate(classes))                # integer index -> class name

print("num_classes:", num_classes)
print("label2idx:", label2idx)

def encode_labels_onehot(y: np.ndarray, label2idx: dict[str, int], num_classes: int) -> np.ndarray:
    """
    Turn a sequence of labels into a one-hot float32 matrix.

    Args:
      y: labels to encode; every label must be a key of ``label2idx``.
      label2idx: mapping from label to its column index.
      num_classes: number of columns in the output.

    Returns:
      float32 array of shape (len(y), num_classes) with a single 1.0 per row.

    Raises:
      KeyError: if a label is missing from ``label2idx``.
    """
    columns = np.asarray([label2idx[label] for label in y], dtype=np.int64)
    encoded = np.zeros((columns.shape[0], num_classes), dtype=np.float32)
    # Set the column selected by each row's class index.
    encoded[np.arange(columns.shape[0]), columns] = 1.0
    return encoded

# Encode all three splits with the same label -> index mapping so that
# column i means the same class in train, validation and test.
y_train, y_val, y_test = (
    encode_labels_onehot(split_labels, label2idx, num_classes)
    for split_labels in (y_train_str, y_val_str, y_test_str)
)

for caption, encoded in (("y_train:", y_train), ("y_val  :", y_val), ("y_test :", y_test)):
    print(caption, encoded.shape, encoded.dtype)


In [None]:
# Save .npy (and optional label_mapping.csv)
import csv

# Write each split to OUT_NPY_DIR: features as x_*.npy, one-hot labels as y_*.npy.
for file_name, payload in (
    ("x_train.npy", X_train),
    ("y_train.npy", y_train),
    ("x_valid.npy", X_val),
    ("y_valid.npy", y_val),
    ("x_test.npy", X_test),
    ("y_test.npy", y_test),
):
    np.save(OUT_NPY_DIR / file_name, payload)

print("[OK] Saved .npy files to:", OUT_NPY_DIR)

# Store the class-name -> index table next to the arrays so the one-hot
# columns can be decoded later (recommended even for private data).
mapping_path = OUT_NPY_DIR / "label_mapping.csv"
with open(mapping_path, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["class_name", "class_index"])
    w.writerows(label2idx.items())

print("[OK] Saved label mapping to:", mapping_path)
