In [6]:
# Mount Google Drive at /content/drive so the project folder stored there is
# reachable through the normal filesystem (relies on the google.colab package).
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
from pathlib import Path
import json
import pandas as pd

# Root of the project on the mounted Google Drive.
PROJECT_ROOT = Path("/content/drive/MyDrive/SkinCare_AI_Component")

# Images are read from splits/<split>/<class>/; files produced by this
# notebook are written to the metadata folder.
SPLITS_DIR = PROJECT_ROOT.joinpath("data", "11_skin_type", "splits")
META_DIR = PROJECT_ROOT.joinpath("data", "11_skin_type", "metadata")

# Order matters for CLASSES: label ids are assigned by position in this list.
CLASSES = ["oily", "dry", "combination"]
SPLITS = ["train", "val", "test"]

# Lower-case file suffixes that count as images.
IMG_EXTS = {
    ".jpg", ".jpeg", ".png", ".webp", ".jfif",
    ".bmp", ".tif", ".tiff", ".gif",
}

# Create the metadata folder (with any missing parents); a no-op if it exists.
META_DIR.mkdir(parents=True, exist_ok=True)

print("SPLITS_DIR:", SPLITS_DIR)
print("META_DIR  :", META_DIR)


SPLITS_DIR: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/splits
META_DIR  : /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/metadata


In [8]:
def is_img(p: Path):
    """Tell whether ``p`` is an existing regular file with an image suffix.

    The suffix of ``p`` is lower-cased and looked up in ``IMG_EXTS``.
    Directories and non-existent paths yield ``False``.
    """
    if not p.is_file():
        return False
    extension = p.suffix.lower()
    return extension in IMG_EXTS


In [9]:
# Report, for every split, how many image files each class folder contains,
# followed by the total for that split.
print("=== SKIN TYPE SPLIT COUNTS ===")
for split_name in SPLITS:
    print(f"\n{split_name.upper()}:")
    split_total = 0
    for class_name in CLASSES:
        class_dir = SPLITS_DIR / split_name / class_name
        # Count lazily instead of materialising a list of paths.
        image_count = sum(1 for entry in class_dir.iterdir() if is_img(entry))
        split_total += image_count
        print(f"  {class_name:12s} {image_count}")
    print(f"  {'TOTAL':12s} {split_total}")


=== SKIN TYPE SPLIT COUNTS ===

TRAIN:
  oily         1764
  dry          2100
  combination  1114
  TOTAL        4978

VAL:
  oily         346
  dry          431
  combination  231
  TOTAL        1008

TEST:
  oily         361
  dry          443
  combination  229
  TOTAL        1033


In [10]:
# Class name -> integer id, numbered in the order the names appear in CLASSES.
label_map = dict(zip(CLASSES, range(len(CLASSES))))
# Reverse lookup: integer id -> class name.
id_to_label = dict(zip(label_map.values(), label_map.keys()))

# Persist the name -> id mapping as indented JSON in the metadata folder.
label_map_path = META_DIR / "label_map_skin_type.json"
with label_map_path.open("w") as f:
    f.write(json.dumps(label_map, indent=2))

print("✅ Saved:", label_map_path)
label_map


✅ Saved: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/metadata/label_map_skin_type.json


{'oily': 0, 'dry': 1, 'combination': 2}

In [11]:
# Build one index row per image file found under splits/<split>/<class>/.
rows = [
    {
        # Stored relative to PROJECT_ROOT and with forward slashes, so the CSV
        # does not depend on the mount point or on the OS that wrote it.
        "image_path": p.relative_to(PROJECT_ROOT).as_posix(),
        "label_name": c,
        "label_id": label_map[c],
        "split": sp,
    }
    for sp in SPLITS
    for c in CLASSES
    for p in (SPLITS_DIR / sp / c).iterdir()
    if is_img(p)
]

# With zero rows the frame has no columns and sort_values would fail with an
# opaque KeyError; stop here with a message that points at the real problem.
if not rows:
    raise FileNotFoundError(f"No image files found under {SPLITS_DIR}")

# Sort so the saved index is deterministic: iterdir() yields entries in
# arbitrary order.
df = pd.DataFrame(rows).sort_values(["split","label_id","image_path"]).reset_index(drop=True)

index_path = META_DIR / "image_index_skin_type.csv"
df.to_csv(index_path, index=False)

print("✅ Saved:", index_path)
print("Total rows:", len(df))
df.head()


✅ Saved: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/metadata/image_index_skin_type.csv
Total rows: 7019


Unnamed: 0,image_path,label_name,label_id,split
0,data/11_skin_type/splits/test/oily/00705a62e3f...,oily,0,test
1,data/11_skin_type/splits/test/oily/0108b7638b1...,oily,0,test
2,data/11_skin_type/splits/test/oily/0108b7638b1...,oily,0,test
3,data/11_skin_type/splits/test/oily/0108b7638b1...,oily,0,test
4,data/11_skin_type/splits/test/oily/0108b7638b1...,oily,0,test


In [12]:
# Check 1: count index rows whose image_path repeats an earlier row.
dup = df.duplicated(subset="image_path").sum()
print("Duplicate image_path rows:", dup)

# Check 2: spot-check that indexed files exist on disk, using a seeded random
# sample of at most 100 rows so the check is repeatable.
sample_size = min(100, len(df))
sampled_paths = df["image_path"].sample(sample_size, random_state=42)
missing = sum(1 for rel_path in sampled_paths if not (PROJECT_ROOT / rel_path).exists())

print("Missing files in sample:", missing)


Duplicate image_path rows: 0
Missing files in sample: 0
