In [1]:
# Mount Google Drive inside the Colab VM at /content/drive; the project paths
# used by the following cells all live under this mount point.
from google.colab import drive
drive.mount("/content/drive")


Mounted at /content/drive


In [2]:
!pip -q install timm pandas scikit-learn tqdm


In [3]:
from pathlib import Path
import json, shutil
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

import timm
from tqdm import tqdm

from sklearn.metrics import confusion_matrix


In [4]:
# ---------------------------------------------------------------------------
# Paths (everything is resolved relative to the project folder on Drive)
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path("/content/drive/MyDrive/SkinCare_AI_Component")

META_DIR = PROJECT_ROOT.joinpath("data/11_skin_type/metadata")
INDEX_CSV = META_DIR.joinpath("image_index_skin_type.csv")
LABEL_MAP_JSON = META_DIR.joinpath("label_map_skin_type.json")

# Checkpoint search order: the first path that exists is the one that is used.
CANDIDATE_CKPTS = [
    PROJECT_ROOT.joinpath("models/vision/skin_type_best.pt"),
    PROJECT_ROOT.joinpath("models/vision/skin_type_convnext_best.pt"),
    PROJECT_ROOT.joinpath("models/vision/skin_type_swin_best.pt"),
    PROJECT_ROOT.joinpath("models/vision/skin_type_mobilenet_best.pt"),
]

# Destination folder for copies of flagged images (created here if absent).
OUT_REVIEW_DIR = PROJECT_ROOT.joinpath("data/11_skin_type/to_review")
OUT_REVIEW_DIR.mkdir(parents=True, exist_ok=True)

# CSV listing of the flagged samples.
OUT_CSV = META_DIR.joinpath("skin_type_to_review.csv")

# ---------------------------------------------------------------------------
# Flagging rules / inference settings
# ---------------------------------------------------------------------------
CONF_THRESHOLD = 0.90   # raise to 0.95 to flag fewer samples, lower to 0.85 to flag more
MAX_COPY = 3000         # upper bound on images copied to to_review/
BATCH_SIZE = 64
IMG_SIZE = 224

# Prefer the GPU when one is available, otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# Fail early if the metadata files this notebook reads are not present.
assert INDEX_CSV.exists(), f"❌ Missing index CSV: {INDEX_CSV}"
assert LABEL_MAP_JSON.exists(), f"❌ Missing label map JSON: {LABEL_MAP_JSON}"


Device: cpu


In [5]:
# Image index; its `split` column is used below to select the training rows.
df = pd.read_csv(INDEX_CSV)

# Label name -> integer id, e.g. {"oily":0,"dry":1,"combination":2}
label_map = json.loads(LABEL_MAP_JSON.read_text())

# Reverse lookup (integer id -> label name) and the number of classes.
num_classes = len(label_map)
id_to_label = dict(zip(label_map.values(), label_map.keys()))

print("Classes:", label_map)
print("Total rows:", len(df))

# Independent copy of the training rows only.
is_train = df["split"] == "train"
df_train = df.loc[is_train].copy()
print("Train rows:", len(df_train))

df_train.head()


Classes: {'oily': 0, 'dry': 1, 'combination': 2}
Total rows: 6992
Train rows: 4893


Unnamed: 0,image_path,label_name,label_id,split
0,data/11_skin_type/splits/train/oily/b804f8b859...,oily,0,train
1,data/11_skin_type/splits/train/oily/33a3b2173d...,oily,0,train
2,data/11_skin_type/splits/train/oily/891545fe95...,oily,0,train
3,data/11_skin_type/splits/train/oily/dry_new_69...,oily,0,train
4,data/11_skin_type/splits/train/oily/29775af660...,oily,0,train


In [6]:
# Take the first candidate checkpoint that exists on disk (None if there is none).
ckpt_path = next((cand for cand in CANDIDATE_CKPTS if cand.exists()), None)

if ckpt_path is None:
    # List every location that was tried so the missing file is easy to track down.
    tried = "\n".join(str(cand) for cand in CANDIDATE_CKPTS)
    raise FileNotFoundError("❌ Could not find any checkpoint.\nTried:\n" + tried)

print("✅ Using checkpoint:", ckpt_path)


✅ Using checkpoint: /content/drive/MyDrive/SkinCare_AI_Component/models/vision/skin_type_best.pt


In [7]:
# NOTE(security): torch.load unpickles the file, and unpickling can execute
# arbitrary code. Only load checkpoints that you created or otherwise trust
# (consider passing weights_only=True if the checkpoint format allows it).
ckpt = torch.load(ckpt_path, map_location=device)

# If your checkpoint saved model_name, use it. Otherwise fallback.
MODEL_NAME = ckpt.get("model_name", "convnext_tiny")
print("Model name:", MODEL_NAME)

model = timm.create_model(MODEL_NAME, pretrained=False, num_classes=num_classes).to(device)

# strict=False makes load_state_dict report key mismatches instead of raising,
# so the result has to be inspected explicitly below.
missing, unexpected = model.load_state_dict(ckpt["model_state"], strict=False)
print("Loaded. Missing keys:", len(missing), "| Unexpected keys:", len(unexpected))

if unexpected:
    # Extra checkpoint entries are ignored by the model; show them so a renamed
    # layer (which would also appear under "missing") is easy to spot.
    print("⚠️ Checkpoint keys not used by the model:", list(unexpected))

if missing:
    # Parameters listed here were NOT loaded and keep their random initialisation.
    # Predictions (and therefore every confidence used for flagging) would be
    # meaningless, so stop instead of silently producing a bogus review list.
    raise RuntimeError(
        "❌ Checkpoint does not provide weights for these model parameters:\n"
        + "\n".join(missing)
    )

# Inference mode: disables dropout / uses running statistics in norm layers.
model.eval()
# Converts logits to per-class probabilities along the class dimension.
softmax = nn.Softmax(dim=1)
print("✅ Model ready.")


Model name: convnext_tiny
Loaded. Missing keys: 0 | Unexpected keys: 0
✅ Model ready.


In [8]:
# Per-channel statistics passed to Normalize (named after ImageNet).
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD  = (0.229, 0.224, 0.225)

# Preprocessing pipeline: resize to IMG_SIZE x IMG_SIZE, convert to a tensor,
# then normalise each channel.
_eval_steps = [
    transforms.Resize(size=(IMG_SIZE, IMG_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
]
eval_tfms = transforms.Compose(_eval_steps)

class TrainDS(Dataset):
    """Dataset over the rows of an image-index DataFrame.

    Each item is a tuple ``(x, y, image_path)`` where ``x`` is the transformed
    image, ``y`` is the integer label id and ``image_path`` is the path string
    exactly as stored in the index (not resolved against ``project_root``).

    Args:
        df_: DataFrame with at least ``image_path`` and ``label_id`` columns.
        tfm: Callable applied to the RGB PIL image to produce ``x``.
        project_root: Base directory that relative ``image_path`` values are
            resolved against.
    """

    def __init__(self, df_, tfm, project_root):
        # Positional access via iloc below requires a clean 0..n-1 index.
        self.df = df_.reset_index(drop=True)
        self.tfm = tfm
        self.project_root = Path(project_root)

    def __len__(self):
        """Return the number of rows (samples) in the index."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load, convert and transform the image stored at row ``idx``."""
        r = self.df.iloc[idx]

        # Works whether image_path is relative or absolute
        p = Path(r["image_path"])
        img_path = p if p.is_absolute() else (self.project_root / p)

        # The context manager closes the underlying file deterministically;
        # convert() returns an independent image that stays valid afterwards.
        with Image.open(img_path) as raw:
            img = raw.convert("RGB")

        x = self.tfm(img)
        y = int(r["label_id"])
        return x, y, str(r["image_path"])

# Dataset over the training rows, using the eval preprocessing pipeline.
ds = TrainDS(df_train, eval_tfms, PROJECT_ROOT)

# Fixed iteration order; loading happens in the main process without pinned
# memory (the "Drive-safe" settings).
loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=0,
    pin_memory=False,
)
loader = DataLoader(ds, **loader_kwargs)

n_batches = int(np.ceil(len(ds) / BATCH_SIZE))
print("✅ Loader ready. Batches:", n_batches)


✅ Loader ready. Batches: 77


In [9]:
# Per-sample results, accumulated batch by batch in loader order.
all_true, all_pred, all_conf, all_path = [], [], [], []

# Gradients are not needed for inference.
with torch.no_grad():
    for images, targets, paths in tqdm(loader):
        class_probs = softmax(model(images.to(device)))

        # Highest class probability per sample and the index of that class.
        top_prob, top_idx = class_probs.max(dim=1)

        all_true += targets.tolist()
        all_pred += top_idx.cpu().tolist()
        all_conf += top_prob.cpu().tolist()
        all_path += list(paths)

print("✅ Inference done.")


100%|██████████| 77/77 [46:00<00:00, 35.85s/it]

✅ Inference done.





In [10]:
# Column layout of the review table. Passing it explicitly keeps the frame
# well-formed (and sortable) even when no sample is flagged; without it an
# empty result has no "confidence" column and sort_values raises KeyError.
review_columns = ["image_path", "true_id", "true_label", "pred_id", "pred_label", "confidence"]

# A sample is flagged when the model disagrees with the stored label AND is
# at least CONF_THRESHOLD confident in its own prediction.
rows = [
    {
        "image_path": rp,
        "true_id": int(t),
        "true_label": id_to_label[int(t)],
        "pred_id": int(p),
        "pred_label": id_to_label[int(p)],
        "confidence": float(c),
    }
    for t, p, c, rp in zip(all_true, all_pred, all_conf, all_path)
    if (p != t) and (c >= CONF_THRESHOLD)
]

# Most confident disagreements first.
review_df = pd.DataFrame(rows, columns=review_columns).sort_values("confidence", ascending=False)

print("Flagged samples:", len(review_df))
review_df.head(20)


Flagged samples: 205


Unnamed: 0,image_path,true_id,true_label,pred_id,pred_label,confidence
100,data/11_skin_type/splits/train/dry/4cfea2162b0...,1,dry,2,combination,0.999632
17,data/11_skin_type/splits/train/oily/c7da614896...,0,oily,2,combination,0.999334
57,data/11_skin_type/splits/train/oily/c7da614896...,0,oily,2,combination,0.999173
108,data/11_skin_type/splits/train/dry/combination...,1,dry,2,combination,0.997956
139,data/11_skin_type/splits/train/dry/combination...,1,dry,2,combination,0.997818
141,data/11_skin_type/splits/train/dry/combination...,1,dry,2,combination,0.997361
72,data/11_skin_type/splits/train/oily/dry13_jpg....,0,oily,1,dry,0.992617
3,data/11_skin_type/splits/train/oily/d51421f003...,0,oily,1,dry,0.992026
61,data/11_skin_type/splits/train/oily/d51421f003...,0,oily,1,dry,0.991125
25,data/11_skin_type/splits/train/oily/asian-woma...,0,oily,1,dry,0.989807


In [11]:
# Write the flagged samples to OUT_CSV; the DataFrame index is not written.
review_df.to_csv(path_or_buf=OUT_CSV, index=False)
print("✅ Saved:", OUT_CSV)


✅ Saved: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/metadata/skin_type_to_review.csv


In [12]:
# Copy at most MAX_COPY flagged images (all of them when MAX_COPY is None),
# taking rows in the order they appear in review_df.
copied = 0
limit = len(review_df) if MAX_COPY is None else MAX_COPY

for rec in review_df.head(limit).itertuples(index=False):
    # Resolve the stored path against the project root unless it is absolute.
    rel = Path(rec.image_path)
    src = rel if rel.is_absolute() else PROJECT_ROOT / rel

    # One sub-folder per (true label, predicted label) pair.
    dst_dir = OUT_REVIEW_DIR / f"{rec.true_label}__pred_{rec.pred_label}"
    dst_dir.mkdir(parents=True, exist_ok=True)

    dst = dst_dir / src.name
    if dst.exists():
        # A file with this name is already there; leave it and do not count it.
        continue

    shutil.copy2(src, dst)
    copied += 1

print(f"✅ Copied {copied} images into: {OUT_REVIEW_DIR}")


✅ Copied 205 images into: /content/drive/MyDrive/SkinCare_AI_Component/data/11_skin_type/to_review


In [13]:
# Breakdown of the flagged samples by (true label, predicted label).
if len(review_df) > 0:
    print("\n=== Top mismatch pairs (true -> pred) ===")
    pair_counts = review_df.groupby(["true_label", "pred_label"]).size()
    print(pair_counts.sort_values(ascending=False).head(20))
else:
    print("No high-confidence mismatches found at this threshold.")

# Confusion matrix over ALL train predictions (not just the flagged ones);
# rows are true classes, columns are predicted classes, both in id order.
print("\n=== TRAIN confusion matrix (all train predictions) ===")
class_ids = list(range(num_classes))
cm = confusion_matrix(all_true, all_pred, labels=class_ids)
labels = [id_to_label[class_id] for class_id in class_ids]
row_names = [f"true_{name}" for name in labels]
col_names = [f"pred_{name}" for name in labels]
cm_df = pd.DataFrame(cm, index=row_names, columns=col_names)
cm_df



=== Top mismatch pairs (true -> pred) ===
true_label   pred_label 
oily         dry            56
combination  dry            38
oily         combination    32
dry          combination    30
             oily           27
combination  oily           22
dtype: int64

=== TRAIN confusion matrix (all train predictions) ===


Unnamed: 0,pred_oily,pred_dry,pred_combination
true_oily,1380,182,155
true_dry,131,1813,133
true_combination,105,103,891
