In [3]:
# Colab-only step: mount Google Drive at /content/drive so that paths under
# /content/drive/MyDrive can be read and written by the cells that follow.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from pathlib import Path
import json, shutil, random, hashlib
import pandas as pd


In [5]:
# Project root on the mounted Drive; every other path is derived from it.
PROJECT_ROOT = Path("/content/drive/MyDrive/SkinCare_AI_Component")

# Input: the existing per-class image folders. Stop right away if they are missing.
SRC_CURATED = PROJECT_ROOT / "data/10_images/curated_by_class"
assert SRC_CURATED.exists(), f"‚ùå Not found: {SRC_CURATED}"

# Output: a separate tree that holds the skin-concern dataset and its metadata.
BASE_DIR = PROJECT_ROOT / "data/12_skin_concerns"
DST_CURATED = BASE_DIR / "curated"
META_DIR = BASE_DIR / "metadata"
for out_dir in (META_DIR, DST_CURATED):
    out_dir.mkdir(parents=True, exist_ok=True)

# Metadata files written at the end of the notebook.
OUT_CSV = META_DIR / "image_index_skin_concerns.csv"
OUT_LABELMAP = META_DIR / "label_map_skin_concerns.json"

# Final concern classes; the list order later determines the numeric label ids.
CONCERN_CLASSES = [
    "acne_pimples",
    "redness_irritation",
    "dark_spots_uneven_tone",
    "dryness_flaking",
    "normal_clear",
]

# One destination folder per concern class (no-op when it already exists).
for concern in CONCERN_CLASSES:
    DST_CURATED.joinpath(concern).mkdir(parents=True, exist_ok=True)

print("‚úÖ PROJECT_ROOT:", PROJECT_ROOT)
print("‚úÖ SRC_CURATED:", SRC_CURATED)
print("‚úÖ DST_CURATED:", DST_CURATED)
print("‚úÖ OUT_CSV:", OUT_CSV)


‚úÖ PROJECT_ROOT: /content/drive/MyDrive/SkinCare_AI_Component
‚úÖ SRC_CURATED: /content/drive/MyDrive/SkinCare_AI_Component/data/10_images/curated_by_class
‚úÖ DST_CURATED: /content/drive/MyDrive/SkinCare_AI_Component/data/12_skin_concerns/curated
‚úÖ OUT_CSV: /content/drive/MyDrive/SkinCare_AI_Component/data/12_skin_concerns/metadata/image_index_skin_concerns.csv


In [6]:
# File extensions (lower-case, with leading dot) that count as images.
IMG_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".bmp"}

def is_img(p: Path) -> bool:
    """Return True when `p` is an existing regular file with an image extension.

    The extension check is case-insensitive. Directories and missing paths
    are rejected regardless of their name.
    """
    if not p.is_file():
        return False
    extension = p.suffix.lower()
    return extension in IMG_EXTS

# Every immediate sub-folder of the source tree is treated as one source class.
src_classes = [entry for entry in SRC_CURATED.iterdir() if entry.is_dir()]

print("\nüìå Classes inside data/10_images/curated_by_class:\n")
for class_dir in sorted(src_classes):
    # Count images recursively, so nested sub-folders are included.
    image_count = len([f for f in class_dir.rglob("*") if is_img(f)])
    print(f"- {class_dir.name:<35} {image_count} images")

print("\n‚úÖ Total class folders:", len(src_classes))



üìå Classes inside data/10_images/curated_by_class:

- Eczema                              3133 images
- Rosacea                             1108 images
- acne                                3017 images
- dark_spots                          1057 images
- dry_flaking                         624 images
- dry_irritated                       375 images
- normal                              2640 images
- redness_prone                       1255 images
- wrinkles                            1100 images

‚úÖ Total class folders: 9


In [7]:
def map_to_concern(src_class_name: str):
    """Map a source class folder name to one of the concern classes.

    Matching is case-insensitive and keyword based. The groups are checked in
    a fixed order and the first match wins, so a name that fits several groups
    goes to the earliest one (for example a name containing both "dry" and
    "irritat" maps to "redness_irritation", not "dryness_flaking").

    Args:
        src_class_name: Name of the source class (typically a folder name).

    Returns:
        The concern class name as a string, or None when no keyword matches.
    """
    name = src_class_name.lower()

    def starts_a_word(key: str) -> bool:
        """True if `key` occurs in `name` at a spot not preceded by a letter."""
        start = name.find(key)
        while start != -1:
            if start == 0 or not name[start - 1].isalpha():
                return True
            start = name.find(key, start + 1)
        return False

    # Acne / pimples
    if any(k in name for k in ["acne", "pimple", "zit", "comedone", "blackhead", "whitehead"]):
        return "acne_pimples"

    # Redness / rash / irritation / eczema-like.
    # "red" is only accepted at the start of a word ("red_skin", "redness",
    # "reddish"); as a plain substring it would also hit unrelated names such
    # as "textured", "scarred" or "colored".
    if starts_a_word("red") or any(
        k in name for k in ["rash", "redness", "eczema", "dermatitis", "irritat", "allerg", "rosacea"]
    ):
        return "redness_irritation"

    # Dark spots / pigmentation / uneven tone
    if any(k in name for k in ["spot", "dark", "pigment", "melasma", "freckle", "hyperpig", "uneven"]):
        return "dark_spots_uneven_tone"

    # Dryness / flaking / peeling
    if any(k in name for k in ["dry", "flak", "peel", "xerosis", "scaly"]):
        return "dryness_flaking"

    # Normal / clear / healthy
    if any(k in name for k in ["normal", "clear", "healthy", "clean"]):
        return "normal_clear"

    return None


In [8]:
MAX_PER_SOURCE_CLASS = 5000  # safety cap on images taken from one source class
copied_total = 0
skipped = []  # source class names for which map_to_concern() returned None

for src_dir in sorted(src_classes):
    dst_class = map_to_concern(src_dir.name)

    if dst_class is None:
        skipped.append(src_dir.name)
        continue

    dst_dir = DST_CURATED / dst_class
    # rglob() gives no ordering guarantee; sort so that the capped subset is
    # the same on every run.
    imgs = sorted(p for p in src_dir.rglob("*") if is_img(p))
    imgs = imgs[:MAX_PER_SOURCE_CLASS]

    count = 0
    for p in imgs:
        # Build the destination name from the source class plus the path
        # relative to the class folder. A file directly inside src_dir becomes
        # "<class>__<filename>"; a nested file also carries its sub-folder
        # parts, so equally named files in different sub-folders cannot
        # collide and be silently dropped by the exists() check below.
        rel_parts = p.relative_to(src_dir).parts
        new_name = "__".join((src_dir.name, *rel_parts))
        out_path = dst_dir / new_name
        # Only copy what is not there yet, so re-running the cell is cheap.
        if not out_path.exists():
            shutil.copy2(p, out_path)
            count += 1

    copied_total += count
    print(f"‚úÖ {src_dir.name:<30} -> {dst_class:<22} | copied {count}")

print("\n‚úÖ TOTAL COPIED:", copied_total)

if skipped:
    print("\n‚ö†Ô∏è SKIPPED (no keyword match):")
    # Show at most 50 names to keep the cell output short.
    for s in skipped[:50]:
        print(" -", s)
    if len(skipped) > 50:
        print(" ... more skipped")


‚úÖ Eczema                         -> redness_irritation     | copied 3133
‚úÖ Rosacea                        -> redness_irritation     | copied 1108
‚úÖ acne                           -> acne_pimples           | copied 3017
‚úÖ dark_spots                     -> dark_spots_uneven_tone | copied 1057
‚úÖ dry_flaking                    -> dryness_flaking        | copied 624
‚úÖ dry_irritated                  -> redness_irritation     | copied 375
‚úÖ normal                         -> normal_clear           | copied 2640
‚úÖ redness_prone                  -> redness_irritation     | copied 1255

‚úÖ TOTAL COPIED: 13209

‚ö†Ô∏è SKIPPED (no keyword match):
 - wrinkles


In [9]:
print("\n=== Concern folder counts (12_skin_concerns/curated) ===")

# Recursive image count for each concern folder, reported in class-list order.
per_class_counts = {}
for concern in CONCERN_CLASSES:
    concern_dir = DST_CURATED / concern
    per_class_counts[concern] = len([f for f in concern_dir.rglob("*") if is_img(f)])
    print(f"{concern:<25} {per_class_counts[concern]}")

total = sum(per_class_counts.values())
print("\nTotal concern images:", total)



=== Concern folder counts (12_skin_concerns/curated) ===
acne_pimples              3017
redness_irritation        5871
dark_spots_uneven_tone    1057
dryness_flaking           624
normal_clear              2640

Total concern images: 13209


In [10]:
TRAIN_P, VAL_P, TEST_P = 0.70, 0.15, 0.15
SEED = 42
# The test split receives whatever is left after train and val, so TEST_P is
# never applied directly; check that the three fractions are consistent.
assert abs(TRAIN_P + VAL_P + TEST_P - 1.0) < 1e-9, "split fractions must sum to 1"
random.seed(SEED)

# Numeric label id = position of the class in CONCERN_CLASSES.
label_map = {c: i for i, c in enumerate(CONCERN_CLASSES)}
rows = []

for c in CONCERN_CLASSES:
    # Sort before shuffling: rglob() has no guaranteed order, and a seeded
    # shuffle is only reproducible when it starts from a fixed order.
    files = sorted(p for p in (DST_CURATED / c).rglob("*") if is_img(p))
    random.shuffle(files)

    n = len(files)
    if n == 0:
        print(f"‚ö†Ô∏è Warning: class '{c}' has 0 images. It will be missing in splits.")
        continue

    # Truncate for train and val; the remainder (including rounding
    # leftovers) goes to test.
    n_train = int(TRAIN_P * n)
    n_val   = int(VAL_P * n)

    train_files = files[:n_train]
    val_files   = files[n_train:n_train+n_val]
    test_files  = files[n_train+n_val:]

    # One index row per image; paths are stored relative to PROJECT_ROOT.
    for split_name, split_files in (("train", train_files), ("val", val_files), ("test", test_files)):
        for p in split_files:
            rows.append({
                "image_path": str(p.relative_to(PROJECT_ROOT)),
                "label_name": c,
                "label_id": label_map[c],
                "split": split_name
            })

    print(f"{c:<25} total={n:>5} | train={len(train_files):>5} val={len(val_files):>5} test={len(test_files):>5}")

# Explicit columns keep the schema intact even when no rows were collected.
df = pd.DataFrame(rows, columns=["image_path", "label_name", "label_id", "split"])
# Every image must appear exactly once, i.e. in exactly one split.
assert df["image_path"].is_unique, "an image was assigned to more than one split"
print("\n‚úÖ Total rows:", len(df))
df.head()


acne_pimples              total= 3017 | train= 2111 val=  452 test=  454
redness_irritation        total= 5871 | train= 4109 val=  880 test=  882
dark_spots_uneven_tone    total= 1057 | train=  739 val=  158 test=  160
dryness_flaking           total=  624 | train=  436 val=   93 test=   95
normal_clear              total= 2640 | train= 1847 val=  396 test=  397

‚úÖ Total rows: 13209


Unnamed: 0,image_path,label_name,label_id,split
0,data/12_skin_concerns/curated/acne_pimples/acn...,acne_pimples,0,train
1,data/12_skin_concerns/curated/acne_pimples/acn...,acne_pimples,0,train
2,data/12_skin_concerns/curated/acne_pimples/acn...,acne_pimples,0,train
3,data/12_skin_concerns/curated/acne_pimples/acn...,acne_pimples,0,train
4,data/12_skin_concerns/curated/acne_pimples/acn...,acne_pimples,0,train


In [11]:
# Write the image index without the DataFrame's integer index column.
df.to_csv(OUT_CSV, index=False)

# Write the class-name -> id mapping as indented JSON.
label_map_json = json.dumps(label_map, indent=2)
OUT_LABELMAP.write_text(label_map_json)

print("‚úÖ Saved:", OUT_CSV)
print("‚úÖ Saved:", OUT_LABELMAP)
print("Label map:", label_map)


‚úÖ Saved: /content/drive/MyDrive/SkinCare_AI_Component/data/12_skin_concerns/metadata/image_index_skin_concerns.csv
‚úÖ Saved: /content/drive/MyDrive/SkinCare_AI_Component/data/12_skin_concerns/metadata/label_map_skin_concerns.json
Label map: {'acne_pimples': 0, 'redness_irritation': 1, 'dark_spots_uneven_tone': 2, 'dryness_flaking': 3, 'normal_clear': 4}


In [12]:
# Sanity check of the split: rows per split, then rows per split and class.
split_counts = df["split"].value_counts()
split_by_class = pd.crosstab(df["split"], df["label_name"])

print("\n=== Split counts ===")
print(split_counts)

print("\n=== Crosstab (split x class) ===")
print(split_by_class)



=== Split counts ===
split
train    9242
test     1988
val      1979
Name: count, dtype: int64

=== Crosstab (split x class) ===
label_name  acne_pimples  dark_spots_uneven_tone  dryness_flaking  \
split                                                               
test                 454                     160               95   
train               2111                     739              436   
val                  452                     158               93   

label_name  normal_clear  redness_irritation  
split                                         
test                 397                 882  
train               1847                4109  
val                  396                 880  
