In [43]:
# Mount Google Drive at /content/drive; the project paths configured in the
# following cell all live underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [44]:
from pathlib import Path
import random, shutil, re

# Project root on the mounted Google Drive.
PROJECT_ROOT = Path("/content/drive/MyDrive/SkinCare_AI_Component")

# Input: curated images, one sub-folder per class.
CURATED_DIR = PROJECT_ROOT.joinpath("data", "11_skin_type", "curated_by_class")
# Output: train/val/test folders are created underneath this directory.
SPLITS_DIR = PROJECT_ROOT.joinpath("data", "11_skin_type", "splits")

# Class names double as folder names in both CURATED_DIR and SPLITS_DIR.
CLASSES = ["oily", "dry", "combination"]

# Lower-case file extensions that are treated as images.
IMG_EXTS = {
    ".jpg", ".jpeg", ".png", ".webp", ".jfif",
    ".bmp", ".tif", ".tiff", ".gif",
}

# Share of groups assigned to each split.
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Seed the global RNG used for shuffling.
SEED = 42
random.seed(SEED)

print("CURATED_DIR:", CURATED_DIR, "exists:", CURATED_DIR.exists())
print("SPLITS_DIR :", SPLITS_DIR)


CURATED_DIR: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/curated_by_class exists: True
SPLITS_DIR : /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/splits


In [45]:
def is_img(p: Path) -> bool:
    """Tell whether ``p`` is a regular file with a recognised image extension.

    The extension comparison is case-insensitive and uses ``IMG_EXTS``.
    """
    if not p.is_file():
        return False
    extension = p.suffix.lower()
    return extension in IMG_EXTS

def list_images_recursive(folder: Path):
    """Collect every image file found anywhere below ``folder``.

    Returns an empty list when ``folder`` does not exist; otherwise a list of
    the paths under it (at any depth) for which ``is_img`` is true.
    """
    if folder.exists():
        return list(filter(is_img, folder.rglob("*")))
    return []

# Matches a ".rf.<alphanumeric>" fragment inside a file stem.
_rf_pattern = re.compile(r"\.rf\.[A-Za-z0-9]+")
def base_key(filename: str) -> str:
    """Build the grouping key for a file name.

    Every ".rf.<alphanumeric>" fragment is removed from the stem and the
    extension is lower-cased, so files that differ only by such a fragment
    (or by extension case) map to the same key.
    """
    name = Path(filename)
    cleaned_stem = _rf_pattern.sub("", name.stem)
    return f"{cleaned_stem}{name.suffix.lower()}"

def group_by_base(files):
    """Bucket paths by the ``base_key`` of their file name.

    Returns a dict mapping each key to the list of paths that produced it,
    in the order the paths were encountered.
    """
    buckets = {}
    for path in files:
        key = base_key(path.name)
        if key not in buckets:
            buckets[key] = []
        buckets[key].append(path)
    return buckets


In [46]:
# Sanity check: every class folder must contain at least one image
# before the split is attempted.
print("=== CURATED COUNTS (skin type) ===")
for class_name in CLASSES:
    class_dir = CURATED_DIR / class_name
    image_count = len(list_images_recursive(class_dir))
    print(f"{class_name:12s} -> {image_count} images")
    if not image_count:
        raise ValueError(f"❌ '{class_name}' is empty. Put images into: {class_dir}")

print("\n✅ Curated folders are OK. Proceeding to split.")


=== CURATED COUNTS (skin type) ===
oily         -> 2471 images
dry          -> 2974 images
combination  -> 1573 images

✅ Curated folders are OK. Proceeding to split.


In [47]:
# Start from a clean slate: any previous splits directory is deleted entirely,
# then one empty folder per (split, class) pair is created.
if SPLITS_DIR.exists():
    shutil.rmtree(SPLITS_DIR)

for split_name in ("train", "val", "test"):
    split_root = SPLITS_DIR / split_name
    for class_name in CLASSES:
        split_root.joinpath(class_name).mkdir(parents=True, exist_ok=True)

print("✅ Fresh splits structure created:", SPLITS_DIR)


✅ Fresh splits structure created: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/splits


In [48]:
def split_keys(keys, train_ratio, val_ratio, test_ratio):
    """Slice an already-ordered sequence of keys into train/val/test parts.

    The validation and test sizes are ``int(len(keys) * ratio)`` (truncated);
    the training part receives every remaining key, so all keys are assigned
    to exactly one part. No shuffling happens here; the caller decides the
    order.

    Args:
        keys: Sliceable sequence of keys (e.g. a list).
        train_ratio: Intended share of the training part. Only used for
            validation, because the training size is derived as the remainder.
        val_ratio: Share of the validation part.
        test_ratio: Share of the test part.

    Returns:
        A ``(train_keys, val_keys, test_keys)`` tuple of slices of ``keys``.

    Raises:
        ValueError: If a ratio is negative or the ratios do not sum to 1.
    """
    ratios = (train_ratio, val_ratio, test_ratio)
    if any(r < 0 for r in ratios):
        raise ValueError(f"Split ratios must be non-negative, got {ratios}")
    # Without this check an inconsistent train_ratio would be silently
    # ignored, since the training size is whatever val/test leave over.
    if abs(sum(ratios) - 1.0) > 1e-6:
        raise ValueError(f"Split ratios must sum to 1.0, got {ratios}")

    n = len(keys)
    n_test = int(n * test_ratio)
    n_val = int(n * val_ratio)
    n_train = n - n_val - n_test

    val_end = n_train + n_val
    train_keys = keys[:n_train]
    val_keys = keys[n_train:val_end]
    test_keys = keys[val_end:]
    return train_keys, val_keys, test_keys


In [49]:
# Per-class result: class name -> (train, val, test) number of copy operations.
summary = {}

# Dedicated generator seeded from SEED, so re-running this cell alone always
# reproduces the same split instead of depending on how far the global
# `random` state has already advanced in this kernel session.
split_rng = random.Random(SEED)


def _copy_key_groups(groups, keys_list, dst):
    """Copy every file of the listed groups into ``dst``.

    Files keep their original name, so two sources with the same file name
    end up at the same destination path. Returns the number of copy
    operations performed.
    """
    copied = 0
    for key in keys_list:
        for src in groups[key]:
            shutil.copy2(src, dst / src.name)
            copied += 1
    return copied


for c in CLASSES:
    files = list_images_recursive(CURATED_DIR / c)
    # Files sharing a base key stay together so they never straddle two splits.
    groups = group_by_base(files)

    # Directory traversal order is not guaranteed to be stable, so sort the
    # keys before shuffling; the seed then fully determines the outcome.
    keys = sorted(groups)
    split_rng.shuffle(keys)

    train_keys, val_keys, test_keys = split_keys(keys, TRAIN_RATIO, VAL_RATIO, TEST_RATIO)

    tr = _copy_key_groups(groups, train_keys, SPLITS_DIR / "train" / c)
    va = _copy_key_groups(groups, val_keys, SPLITS_DIR / "val" / c)
    te = _copy_key_groups(groups, test_keys, SPLITS_DIR / "test" / c)

    summary[c] = (tr, va, te)

print("✅ Split completed.")
for c in CLASSES:
    tr, va, te = summary[c]
    print(f"{c:12s} train={tr} val={va} test={te}")


✅ Split completed.
oily         train=1764 val=346 test=361
dry          train=2100 val=431 test=443
combination  train=1114 val=231 test=229


In [50]:
def count_split(split_name):
    """Print the number of images per class, plus the total, for one split.

    Only files directly inside ``SPLITS_DIR / split_name / <class>`` are
    counted (no recursion); a file counts when ``is_img`` accepts it.
    """
    print(f"\n{split_name.upper()}:")
    running_total = 0
    for class_name in CLASSES:
        class_dir = SPLITS_DIR / split_name / class_name
        image_count = sum(1 for entry in class_dir.iterdir() if is_img(entry))
        running_total += image_count
        print(f"  {class_name:12s} {image_count}")
    print(f"  {'TOTAL':12s} {running_total}")

# Report the final per-class counts for each split, in train/val/test order.
for split_name in ("train", "val", "test"):
    count_split(split_name)

print("\n✅ DONE: splits created. Next is Notebook 11 (build index).")



TRAIN:
  oily         1764
  dry          2100
  combination  1114
  TOTAL        4978

VAL:
  oily         346
  dry          431
  combination  231
  TOTAL        1008

TEST:
  oily         361
  dry          443
  combination  229
  TOTAL        1033

✅ DONE: splits created. Next is Notebook 11 (build index).
