In [1]:
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [2]:
from pathlib import Path
import pandas as pd
import shutil
import random
import json


In [3]:
PROJECT_ROOT = Path("/content/drive/MyDrive/SkinCare_AI_Component")

CURATED  = PROJECT_ROOT / "data/11_skin_type/curated_by_class"
SPLITS   = PROJECT_ROOT / "data/11_skin_type/splits"
META_DIR = PROJECT_ROOT / "data/11_skin_type/metadata"

# From 11C
REVIEW_CSV = META_DIR / "skin_type_to_review.csv"

# 11D outputs
ACTIONS_CSV = META_DIR / "skin_type_review_actions.csv"
STAGING_DIR = PROJECT_ROOT / "data/11_skin_type/review_staging"   # safe copies for manual review (optional)

assert PROJECT_ROOT.exists(), f"❌ PROJECT_ROOT not found: {PROJECT_ROOT}"
assert CURATED.exists(), f"❌ curated_by_class not found: {CURATED}"
assert META_DIR.exists(), f"❌ metadata folder not found: {META_DIR}"
assert REVIEW_CSV.exists(), f"❌ Run 11C first. Missing: {REVIEW_CSV}"

print("✅ PROJECT_ROOT:", PROJECT_ROOT)
print("✅ REVIEW_CSV:", REVIEW_CSV)


✅ PROJECT_ROOT: /content/drive/MyDrive/SkinCare_AI_Component
✅ REVIEW_CSV: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/metadata/skin_type_to_review.csv


In [4]:
df = pd.read_csv(REVIEW_CSV)

if "review_action" not in df.columns:
    df["review_action"] = ""  # keep / delete / move_to_oily / move_to_dry / move_to_combination
if "review_note" not in df.columns:
    df["review_note"] = ""

df.to_csv(ACTIONS_CSV, index=False)

print("✅ Created/updated:", ACTIONS_CSV)
print("Rows:", len(df))
df.head(10)


✅ Created/updated: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/metadata/skin_type_review_actions.csv
Rows: 27


Unnamed: 0,image_path,true_id,true_label,pred_id,pred_label,confidence,review_action,review_note
0,data/11_skin_type/splits/train/oily/oily_c2df5...,0,oily,1,dry,0.976994,,
1,data/11_skin_type/splits/train/oily/d4b60e5c27...,0,oily,2,combination,0.975313,,
2,data/11_skin_type/splits/train/oily/02daf6c4-2...,0,oily,1,dry,0.965852,,
3,data/11_skin_type/splits/train/oily/th-15-_jpe...,0,oily,2,combination,0.961424,,
4,data/11_skin_type/splits/train/oily/21acc6806c...,0,oily,1,dry,0.958797,,
5,data/11_skin_type/splits/train/oily/oily_f0d8b...,0,oily,1,dry,0.947232,,
6,data/11_skin_type/splits/train/oily/oily_f0d8b...,0,oily,1,dry,0.946643,,
7,data/11_skin_type/splits/train/dry/6b67e4a21f0...,1,dry,2,combination,0.940385,,
8,data/11_skin_type/splits/train/dry/black-young...,1,dry,0,oily,0.934397,,
9,data/11_skin_type/splits/train/dry/2aacc66f034...,1,dry,2,combination,0.928725,,


In [5]:
def safe_abs(image_path: str) -> Path:
    p = Path(image_path)
    return p if p.is_absolute() else (PROJECT_ROOT / p)

# rebuild staging folder
if STAGING_DIR.exists():
    shutil.rmtree(STAGING_DIR)
STAGING_DIR.mkdir(parents=True, exist_ok=True)

copied = 0
skipped_missing = 0

for _, r in df.iterrows():
    src = safe_abs(r["image_path"])
    if not src.exists():
        skipped_missing += 1
        continue

    sub = f'{r["true_label"]}__pred_{r["pred_label"]}'
    dst_dir = STAGING_DIR / sub
    dst_dir.mkdir(parents=True, exist_ok=True)

    dst = dst_dir / src.name
    if not dst.exists():
        shutil.copy2(src, dst)
        copied += 1

print("✅ Staging created:", STAGING_DIR)
print("Copied:", copied, "| Missing skipped:", skipped_missing)


✅ Staging created: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/review_staging
Copied: 27 | Missing skipped: 0


In [6]:
dfA = pd.read_csv(ACTIONS_CSV)

VALID = {"", "keep", "delete", "move_to_oily", "move_to_dry", "move_to_combination"}
bad = sorted(set(dfA["review_action"].fillna("").astype(str).str.strip()) - VALID)
assert not bad, f"❌ Invalid actions found: {bad}"

def is_img(p: Path) -> bool:
    return p.suffix.lower() in [".jpg",".jpeg",".png",".webp",".bmp"]

def find_in_curated(true_label: str, filename: str) -> Path | None:
    hits = list((CURATED / true_label).rglob(filename))
    return hits[0] if hits else None

moved = 0
deleted = 0
missing = 0
kept = 0

for _, r in dfA.iterrows():
    action = str(r["review_action"]).strip()
    if action in ("", "keep"):
        kept += 1
        continue

    true_label = str(r["true_label"]).strip()
    fname = Path(str(r["image_path"])).name

    src = find_in_curated(true_label, fname)
    if src is None or not src.exists():
        missing += 1
        continue

    if action == "delete":
        src.unlink()
        deleted += 1
        continue

    target = action.replace("move_to_", "").strip()
    dst_dir = CURATED / target
    dst_dir.mkdir(parents=True, exist_ok=True)

    dst = dst_dir / src.name
    if dst.exists():
        dst = dst_dir / f"dup_{src.name}"

    shutil.move(str(src), str(dst))
    moved += 1

print("✅ Actions applied")
print("Kept:", kept, "| Moved:", moved, "| Deleted:", deleted, "| Missing in curated:", missing)


✅ Actions applied
Kept: 0 | Moved: 27 | Deleted: 0 | Missing in curated: 0


In [7]:
classes = ["oily", "dry", "combination"]
train_p, val_p, test_p = 0.70, 0.15, 0.15
seed = 42
random.seed(seed)

# remove old splits
if SPLITS.exists():
    shutil.rmtree(SPLITS)

for sp in ["train", "val", "test"]:
    for c in classes:
        (SPLITS / sp / c).mkdir(parents=True, exist_ok=True)

for c in classes:
    imgs = [p for p in (CURATED / c).rglob("*") if p.is_file() and is_img(p)]
    random.shuffle(imgs)

    n = len(imgs)
    n_train = int(train_p * n)
    n_val   = int(val_p * n)

    train_imgs = imgs[:n_train]
    val_imgs   = imgs[n_train:n_train+n_val]
    test_imgs  = imgs[n_train+n_val:]

    for p in train_imgs:
        shutil.copy2(p, SPLITS / "train" / c / p.name)
    for p in val_imgs:
        shutil.copy2(p, SPLITS / "val" / c / p.name)
    for p in test_imgs:
        shutil.copy2(p, SPLITS / "test" / c / p.name)

    print(c, "total", n, "| train", len(train_imgs), "val", len(val_imgs), "test", len(test_imgs))

print("✅ Splits rebuilt at:", SPLITS)


oily total 2454 | train 1717 val 368 test 369
dry total 2968 | train 2077 val 445 test 446
combination total 1570 | train 1099 val 235 test 236
✅ Splits rebuilt at: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/splits


In [8]:
META_DIR.mkdir(parents=True, exist_ok=True)

label_map = {c:i for i, c in enumerate(classes)}
rows = []

for sp in ["train", "val", "test"]:
    for c in classes:
        for p in (SPLITS / sp / c).rglob("*"):
            if p.is_file() and is_img(p):
                rows.append({
                    "image_path": str(p.relative_to(PROJECT_ROOT)),
                    "label_name": c,
                    "label_id": label_map[c],
                    "split": sp
                })

df_new = pd.DataFrame(rows)

out_index = META_DIR / "image_index_skin_type.csv"
df_new.to_csv(out_index, index=False)

out_labelmap = META_DIR / "label_map_skin_type.json"
with open(out_labelmap, "w") as f:
    json.dump(label_map, f, indent=2)

print("✅ Rebuilt:", out_index, "rows:", len(df_new))
print("✅ Rebuilt:", out_labelmap, label_map)
df_new.head(10)


✅ Rebuilt: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/metadata/image_index_skin_type.csv rows: 6992
✅ Rebuilt: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/metadata/label_map_skin_type.json {'oily': 0, 'dry': 1, 'combination': 2}


Unnamed: 0,image_path,label_name,label_id,split
0,data/11_skin_type/splits/train/oily/b804f8b859...,oily,0,train
1,data/11_skin_type/splits/train/oily/33a3b2173d...,oily,0,train
2,data/11_skin_type/splits/train/oily/891545fe95...,oily,0,train
3,data/11_skin_type/splits/train/oily/dry_new_69...,oily,0,train
4,data/11_skin_type/splits/train/oily/29775af660...,oily,0,train
5,data/11_skin_type/splits/train/oily/16e4b2e6c7...,oily,0,train
6,data/11_skin_type/splits/train/oily/6ee123e65f...,oily,0,train
7,data/11_skin_type/splits/train/oily/normal_new...,oily,0,train
8,data/11_skin_type/splits/train/oily/kering-6-_...,oily,0,train
9,data/11_skin_type/splits/train/oily/normal_new...,oily,0,train
