In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
import re
import numpy as np
import pandas as pd

BASE_DIR = "/content/drive/MyDrive/SkinAI_Project"

DATA_DIR = os.path.join(BASE_DIR, "data")
RAW_DIR  = os.path.join(DATA_DIR, "raw")
TEXT_DIR = os.path.join(DATA_DIR, "text")
SYN_DIR  = os.path.join(DATA_DIR, "synthetic")
KG_DIR   = os.path.join(DATA_DIR, "knowledge_graph")
PROCESSED_DIR = os.path.join(DATA_DIR, "processed")

os.makedirs(PROCESSED_DIR, exist_ok=True)

print("BASE_DIR:", BASE_DIR)
print("RAW_DIR:", RAW_DIR)
print("TEXT_DIR:", TEXT_DIR)
print("SYN_DIR:", SYN_DIR)
print("KG_DIR:", KG_DIR)
print("PROCESSED_DIR:", PROCESSED_DIR)

print("\nRAW_DIR contents:", os.listdir(RAW_DIR) if os.path.exists(RAW_DIR) else "NOT FOUND")
print("TEXT_DIR contents:", os.listdir(TEXT_DIR) if os.path.exists(TEXT_DIR) else "NOT FOUND")
print("SYN_DIR contents:", os.listdir(SYN_DIR) if os.path.exists(SYN_DIR) else "NOT FOUND")
print("KG_DIR contents:", os.listdir(KG_DIR) if os.path.exists(KG_DIR) else "NOT FOUND")


Mounted at /content/drive
BASE_DIR: /content/drive/MyDrive/SkinAI_Project
RAW_DIR: /content/drive/MyDrive/SkinAI_Project/data/raw
TEXT_DIR: /content/drive/MyDrive/SkinAI_Project/data/text
SYN_DIR: /content/drive/MyDrive/SkinAI_Project/data/synthetic
KG_DIR: /content/drive/MyDrive/SkinAI_Project/data/knowledge_graph
PROCESSED_DIR: /content/drive/MyDrive/SkinAI_Project/data/processed

RAW_DIR contents: ['train-00000-of-00001', 'ps_md_nlp_skin_data', 'combined_data', 'Skin-Disease-Text-Data', 'Skin_text_classifier']
TEXT_DIR contents: ['skin_text_dataset_processed.csv', 'Skin-Disease-Text-Data', 'Skin_text_classifier.csv']
SYN_DIR contents: ['patient_profiles_10000.csv', 'treatment_records_10000.csv', 'disease_master.csv', 'symptom_list.csv', 'disease_symptom_edges.csv', 'treatment_stats_grouped.csv', 'clinical_cases_10000.csv', 'label_mapping_normalize.csv', 'label_mapping_6class_groups.csv']
KG_DIR contents: NOT FOUND


In [2]:
def clean_text(t: str) -> str:
    """Basic text cleaning for TF-IDF models."""
    t = str(t)
    t = t.lower()
    t = re.sub(r"\s+", " ", t).strip()
    return t

def safe_read_csv(path):
    """Read CSV with safe defaults."""
    return pd.read_csv(path)

def safe_read_excel(path):
    """Read Excel with safe defaults."""
    return pd.read_excel(path)

def safe_read_parquet(path):
    """Read parquet if present."""
    return pd.read_parquet(path)


In [3]:
LABEL_MAP_PATH = os.path.join(SYN_DIR, "label_mapping_normalize.csv")

label_map = None
if os.path.exists(LABEL_MAP_PATH):
    lm = pd.read_csv(LABEL_MAP_PATH)
    print("Loaded label mapping:", lm.shape)
    print("Mapping columns:", lm.columns.tolist())

    # auto-detect columns
    cols_lower = [c.lower() for c in lm.columns]
    if "old_label" in cols_lower and "new_label" in cols_lower:
        old_col = lm.columns[cols_lower.index("old_label")]
        new_col = lm.columns[cols_lower.index("new_label")]
    else:
        # fallback: first 2 columns
        old_col, new_col = lm.columns[0], lm.columns[1]

    label_map = dict(
        zip(lm[old_col].astype(str).str.strip(),
            lm[new_col].astype(str).str.strip())
    )

    print("Mapping size:", len(label_map))
else:
    print("⚠️ label_mapping_normalize.csv NOT FOUND.")
    print("Labels will not be normalized (not recommended).")

def normalize_label(lbl: str) -> str:
    lbl = str(lbl).strip()
    if label_map and lbl in label_map:
        return label_map[lbl]
    return lbl


Loaded label mapping: (47, 2)
Mapping columns: ['original_label', 'normalized_label']
Mapping size: 47


In [4]:
labeled_dfs = []

# ---- A) data/text/ ----
text_skin_csv = os.path.join(TEXT_DIR, "Skin_text_classifier.csv")
text_processed_csv = os.path.join(TEXT_DIR, "skin_text_dataset_processed.csv")

if os.path.exists(text_skin_csv):
    df = safe_read_csv(text_skin_csv)
    print("Loaded:", text_skin_csv, df.shape, df.columns.tolist())
    # try to standardize columns
    if "text" in df.columns and "label" in df.columns:
        df2 = df[["text","label"]].copy()
    else:
        # fallback: common raw naming
        rename = {}
        if "Text" in df.columns: rename["Text"] = "text"
        if "Disease name" in df.columns: rename["Disease name"] = "label"
        df2 = df.rename(columns=rename)[["text","label"]].copy()
    df2["source"] = "text/Skin_text_classifier.csv"
    labeled_dfs.append(df2)

if os.path.exists(text_processed_csv):
    df = safe_read_csv(text_processed_csv)
    print("Loaded:", text_processed_csv, df.shape, df.columns.tolist())
    # expected already standardized
    if "text" in df.columns and "label" in df.columns:
        df2 = df[["text","label"]].copy()
        df2["source"] = "text/skin_text_dataset_processed.csv"
        labeled_dfs.append(df2)

# ---- B) data/raw/ Skin_text_classifier folder ----
raw_skin_csv = os.path.join(RAW_DIR, "Skin_text_classifier", "Skin_text_classifier.csv")
if os.path.exists(raw_skin_csv):
    df = safe_read_csv(raw_skin_csv)
    print("Loaded:", raw_skin_csv, df.shape, df.columns.tolist())
    rename = {}
    if "Text" in df.columns: rename["Text"] = "text"
    if "Disease name" in df.columns: rename["Disease name"] = "label"
    df2 = df.rename(columns=rename)
    if "text" in df2.columns and "label" in df2.columns:
        df2 = df2[["text","label"]].copy()
        df2["source"] = "raw/Skin_text_classifier/Skin_text_classifier.csv"
        labeled_dfs.append(df2)

# ---- C) data/raw ps_md_nlp_skin_data ----
raw_xlsx = os.path.join(RAW_DIR, "ps_md_nlp_skin_data", "ps_md_nlp_skin_data.xlsx")
if os.path.exists(raw_xlsx):
    df = safe_read_excel(raw_xlsx)
    print("Loaded:", raw_xlsx, df.shape, df.columns.tolist())
    rename = {}
    if "Patient_Statement" in df.columns: rename["Patient_Statement"] = "text"
    if "Disease Class" in df.columns: rename["Disease Class"] = "label"
    df2 = df.rename(columns=rename)
    if "text" in df2.columns and "label" in df2.columns:
        df2 = df2[["text","label"]].copy()
        df2["source"] = "raw/ps_md_nlp_skin_data.xlsx"
        labeled_dfs.append(df2)

# ---- D) data/raw HuggingFace parquet train-00000-of-00001 ----
raw_parquet = os.path.join(RAW_DIR, "train-00000-of-00001", "train-00000-of-00001.parquet")
if os.path.exists(raw_parquet):
    df = safe_read_parquet(raw_parquet)
    print("Loaded:", raw_parquet, df.shape, df.columns.tolist())
    rename = {}
    if "question" in df.columns: rename["question"] = "text"
    if "condition" in df.columns: rename["condition"] = "label"
    df2 = df.rename(columns=rename)
    if "text" in df2.columns and "label" in df2.columns:
        df2 = df2[["text","label"]].copy()
        df2["source"] = "raw/hf_parquet"
        labeled_dfs.append(df2)

print("\nTotal labeled sources loaded:", len(labeled_dfs))


Loaded: /content/drive/MyDrive/SkinAI_Project/data/text/Skin_text_classifier.csv (143, 2) ['Disease name', 'Text']
Loaded: /content/drive/MyDrive/SkinAI_Project/data/text/skin_text_dataset_processed.csv (286, 2) ['Disease name', 'Text']
Loaded: /content/drive/MyDrive/SkinAI_Project/data/raw/Skin_text_classifier/Skin_text_classifier.csv (143, 2) ['Disease name', 'Text']
Loaded: /content/drive/MyDrive/SkinAI_Project/data/raw/ps_md_nlp_skin_data/ps_md_nlp_skin_data.xlsx (903, 4) ['Disease Class', 'Disease Definition', 'Patient_Statement', 'File Name']
Loaded: /content/drive/MyDrive/SkinAI_Project/data/raw/train-00000-of-00001/train-00000-of-00001.parquet (11, 5) ['question', 'answer', 'condition', 'difficulty', 'source_url']

Total labeled sources loaded: 4


In [5]:
def load_text_folder(root_dir, source="raw/Skin-Disease-Text-Data"):
    """
    Expected:
      root_dir/
        DiseaseName/
          *.txt
    Returns columns: text, label, source
    """
    records = []
    if not os.path.exists(root_dir):
        print("Folder not found:", root_dir)
        return pd.DataFrame(columns=["text","label","source"])

    for disease in os.listdir(root_dir):
        dpath = os.path.join(root_dir, disease)
        if not os.path.isdir(dpath):
            continue
        for fname in os.listdir(dpath):
            fpath = os.path.join(dpath, fname)
            if not os.path.isfile(fpath):
                continue
            if not fname.lower().endswith(".txt"):
                continue
            try:
                with open(fpath, "r", encoding="utf-8", errors="ignore") as f:
                    txt = f.read().strip()
                if txt:
                    records.append({"text": txt, "label": disease, "source": source})
            except Exception as e:
                print("Read error:", fpath, e)

    df = pd.DataFrame(records)
    print("Loaded folder dataset:", df.shape)
    return df

folder_dir = os.path.join(RAW_DIR, "Skin-Disease-Text-Data")
df_folder = load_text_folder(folder_dir)

if not df_folder.empty:
    labeled_dfs.append(df_folder)

print("Total labeled sources after folder:", len(labeled_dfs))


Loaded folder dataset: (143, 3)
Total labeled sources after folder: 5


In [6]:
if len(labeled_dfs) == 0:
    raise ValueError("No labeled datasets found. Check your folders/paths.")

df_all = pd.concat(labeled_dfs, ignore_index=True)

# Basic cleanup
df_all = df_all.dropna(subset=["text","label"]).copy()
df_all["text"] = df_all["text"].astype(str)
df_all["label"] = df_all["label"].astype(str)

# Apply text cleaning + label normalization
df_all["text"] = df_all["text"].apply(clean_text)
df_all["label"] = df_all["label"].apply(normalize_label)

# Remove empty text/labels
df_all = df_all[(df_all["text"].str.len() > 5) & (df_all["label"].str.len() > 0)]

# Remove duplicates
before = len(df_all)
df_all = df_all.drop_duplicates(subset=["text","label"])
after = len(df_all)

print("Merged dataset shape:", df_all.shape)
print("Duplicates removed:", before - after)

print("\nTop 30 labels after normalization:")
print(df_all["label"].value_counts().head(30))


Merged dataset shape: (619, 3)
Duplicates removed: 724

Top 30 labels after normalization:
label
Dermatofibroma                  40
Benign keratosis                40
Actinic keratosis               36
Atopic Dermatitis               36
Squamous cell carcinoma         36
Melanocytic nevus               36
Melanoma                        36
Vascular lesion                 35
Tinea Ringworm Candidiasis      35
Acne                            30
Ringworm (Tinea Corporis)       22
Eczema                          22
Folliculitis                    22
Hives (Urticaria)               22
Scabies                         22
Vitiligo                        22
Psoriasis                       22
Rosacea                         22
Impetigo                        22
Shingles (Herpes Zoster)        22
Contact Dermatitis              14
Athlete's Foot (Tinea Pedis)    11
Athlete Foot (Tinea Pedis)      11
Acne vulgaris                    3
Name: count, dtype: int64


In [7]:
USE_SYNTHETIC_CASES = True

clinical_path = os.path.join(SYN_DIR, "clinical_cases_10000.csv")
df_syn = None

def build_case_text(row: pd.Series) -> str:
    """
    Build natural language text from structured clinical case row.
    Schema-flexible: uses the most informative columns if available.
    """
    parts = []
    preferred_cols = [
        "age","gender","sex","body_part","location","site","duration",
        "itching","pain","burning","fever",
        "rash_type","lesion_type","appearance",
        "symptoms","symptom_text",
        "history","notes","description","case_text"
    ]
    for c in preferred_cols:
        if c in row.index and pd.notna(row[c]) and str(row[c]).strip():
            parts.append(f"{c.replace('_',' ')}: {row[c]}")

    # fallback: include other non-empty fields (except label-like)
    if len(parts) < 2:
        for c in row.index:
            if c.lower() in ["label","disease","condition","diagnosis","dx","final_diagnosis"]:
                continue
            v = row[c]
            if pd.notna(v) and str(v).strip():
                parts.append(f"{c}: {v}")

    return " | ".join(parts).strip()

def detect_label_column(columns):
    """
    Robust label/diagnosis column detection.
    """
    cols = list(columns)
    cols_lower = [c.lower().strip() for c in cols]

    # 1) strong exact matches
    strong = ["label", "disease", "condition", "diagnosis", "final_diagnosis", "disease_name", "disease label", "dx"]
    for key in strong:
        if key in cols_lower:
            return cols[cols_lower.index(key)]

    # 2) contains-based matches
    keywords = ["diagnos", "disease", "condition", "label", "class", "dx"]
    candidates = []
    for c, cl in zip(cols, cols_lower):
        if any(k in cl for k in keywords):
            candidates.append(c)

    # prefer candidates that look most label-like
    priority = ["final", "primary", "main", "target"]
    if candidates:
        # if any candidate contains priority words, pick the first
        for p in priority:
            for c in candidates:
                if p in c.lower():
                    return c
        return candidates[0]

    return None

if USE_SYNTHETIC_CASES and os.path.exists(clinical_path):
    raw = pd.read_csv(clinical_path)
    print("Loaded synthetic clinical cases:", raw.shape)
    print("Columns:", raw.columns.tolist())

    lbl_col = detect_label_column(raw.columns)

    if lbl_col is None:
        print("⚠️ Could not auto-detect label column in clinical_cases_10000.csv.")
        print("➡️ Candidate columns that might be the label:")
        # show any column that contains clue keywords
        clue_cols = [c for c in raw.columns if any(k in c.lower() for k in ["diagnos","disease","condition","label","class","dx"])]
        print(clue_cols if clue_cols else "(No obvious candidates)")

        # HARD FAIL (so you notice immediately instead of silently skipping)
        raise ValueError(
            "No label column found in clinical_cases_10000.csv. "
            "Please tell me which column contains the disease/diagnosis label."
        )

    print("✅ Using label column:", lbl_col)

    df_syn = pd.DataFrame({
        "text": raw.apply(build_case_text, axis=1),
        "label": raw[lbl_col].astype(str),
        "source": "synthetic/clinical_cases_10000.csv"
    })

    # Clean + normalize
    df_syn["text"] = df_syn["text"].apply(clean_text)
    df_syn["label"] = df_syn["label"].apply(normalize_label)

    # Filter junk
    df_syn = df_syn[(df_syn["text"].str.len() > 10) & (df_syn["label"].str.len() > 0)]
    df_syn = df_syn.drop_duplicates(subset=["text","label"])

    print("Prepared synthetic rows:", df_syn.shape)
    print("Top synthetic labels:")
    print(df_syn["label"].value_counts().head(20))

    # Merge into df_all
    before = len(df_all)
    df_all = pd.concat([df_all, df_syn], ignore_index=True)
    df_all = df_all.drop_duplicates(subset=["text","label"])
    print("After adding synthetic:", df_all.shape, "| Added:", len(df_all) - before)

else:
    print("Synthetic clinical cases not used (disabled or file missing).")


Loaded synthetic clinical cases: (10000, 20)
Columns: ['case_id', 'patient_id', 'age', 'gender', 'region', 'skin_type', 'fitzpatrick_type', 'primary_disease', 'disease_category', 'severity', 'duration_weeks', 'body_site', 'symptom_text', 'allergies', 'comorbidities', 'cultural_practice', 'recommended_medicine', 'is_over_the_counter', 'follow_up_required', 'treatment_outcome']
✅ Using label column: primary_disease
Prepared synthetic rows: (10000, 3)
Top synthetic labels:
label
Tinea corporis (Ringworm)         393
Urticaria (Hives)                 384
Seborrheic dermatitis             377
Dyshidrotic eczema                367
Herpes simplex (Cold sores)       362
Impetigo                          361
Pityriasis versicolor             360
Lichen planus                     355
Contact dermatitis (Irritant)     351
Perioral dermatitis               351
Acne vulgaris                     351
Tinea capitis (Scalp ringworm)    351
Molluscum contagiosum             350
Tinea cruris (Jock itch) 

In [8]:
normalized_master_path = os.path.join(PROCESSED_DIR, "text_cleaned_normalized.csv")
df_all.to_csv(normalized_master_path, index=False)
print("✅ Saved normalized master dataset:", normalized_master_path)
print("Rows:", df_all.shape[0], "| Classes:", df_all["label"].nunique())


✅ Saved normalized master dataset: /content/drive/MyDrive/SkinAI_Project/data/processed/text_cleaned_normalized.csv
Rows: 10619 | Classes: 47


In [9]:
group_map_path = os.path.join(SYN_DIR, "label_mapping_6class_groups.csv")

df_group6 = None
if os.path.exists(group_map_path):
    gm = pd.read_csv(group_map_path)
    print("Loaded 6-class mapping:", gm.shape, gm.columns.tolist())

    cols_lower = [c.lower() for c in gm.columns]
    if "label" in cols_lower and "group" in cols_lower:
        lbl_col = gm.columns[cols_lower.index("label")]
        grp_col = gm.columns[cols_lower.index("group")]
    else:
        lbl_col, grp_col = gm.columns[0], gm.columns[1]

    group_map = dict(zip(gm[lbl_col].astype(str).str.strip(), gm[grp_col].astype(str).str.strip()))

    df_group6 = df_all.copy()
    df_group6["label_group"] = df_group6["label"].apply(lambda x: group_map.get(x, "OTHER"))

    group6_path = os.path.join(PROCESSED_DIR, "text_cleaned_grouped6.csv")
    df_group6.to_csv(group6_path, index=False)
    print("✅ Saved grouped6 dataset:", group6_path)
    print(df_group6["label_group"].value_counts())
else:
    print("No 6-class mapping found, skipping grouped dataset.")


Loaded 6-class mapping: (48, 2) ['original_label', 'group_label']
✅ Saved grouped6 dataset: /content/drive/MyDrive/SkinAI_Project/data/processed/text_cleaned_grouped6.csv
label_group
OTHER    10619
Name: count, dtype: int64


In [10]:
from sklearn.model_selection import train_test_split

# ✅ FORCE: always split from the normalized master dataset we just saved
df_data = pd.read_csv(normalized_master_path)

print("Splitting from normalized dataset:", df_data.shape)
print(df_data["label"].value_counts().head(30))

# Stratify only if safe (each class must have >=2)
vc = df_data["label"].value_counts()
strat = None if (vc < 2).any() else df_data["label"]

train_df, temp_df = train_test_split(
    df_data,
    test_size=0.30,
    random_state=42,
    stratify=strat
)

vc_temp = temp_df["label"].value_counts()
strat_temp = None if (vc_temp < 2).any() else temp_df["label"]

val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    random_state=42,
    stratify=strat_temp
)

train_out = os.path.join(PROCESSED_DIR, "train.csv")
val_out   = os.path.join(PROCESSED_DIR, "val.csv")
test_out  = os.path.join(PROCESSED_DIR, "test.csv")

train_df.to_csv(train_out, index=False)
val_df.to_csv(val_out, index=False)
test_df.to_csv(test_out, index=False)

print("\n✅ Saved splits (OVERWRITTEN):")
print("Train:", train_df.shape, train_out)
print("Val  :", val_df.shape,   val_out)
print("Test :", test_df.shape,  test_out)


Splitting from normalized dataset: (10619, 3)
label
Tinea corporis (Ringworm)         393
Urticaria (Hives)                 384
Impetigo                          383
Seborrheic dermatitis             377
Dyshidrotic eczema                367
Rosacea                           362
Herpes simplex (Cold sores)       362
Pityriasis versicolor             360
Lichen planus                     355
Folliculitis                      355
Acne vulgaris                     354
Tinea capitis (Scalp ringworm)    351
Perioral dermatitis               351
Contact dermatitis (Irritant)     351
Molluscum contagiosum             350
Tinea cruris (Jock itch)          350
Chronic urticaria                 348
Vitiligo                          347
Nummular eczema                   345
Scabies                           342
Acne rosacea                      337
Cellulitis                        334
Psoriasis vulgaris                334
Contact dermatitis (Allergic)     330
Herpes zoster (Shingles)          32

In [11]:
df_train_check = pd.read_csv(os.path.join(PROCESSED_DIR, "train.csv"))

print(df_train_check.shape)
print(df_train_check["label"].value_counts().head(30))

print("\nVerification:")
print("Contains 'Acne vulgaris'?", (df_train_check["label"] == "Acne vulgaris").any())
print("Contains 'Tinea Ringworm Candidiasis'?", (df_train_check["label"] == "Tinea Ringworm Candidiasis").any())
print("Contains 'Athlete Foot (Tinea Pedis)'?", (df_train_check["label"] == "Athlete Foot (Tinea Pedis)").any())
print("Contains \"Athlete's Foot (Tinea Pedis)\"?", (df_train_check["label"] == "Athlete's Foot (Tinea Pedis)").any())


(7433, 3)
label
Tinea corporis (Ringworm)         275
Urticaria (Hives)                 269
Impetigo                          268
Seborrheic dermatitis             264
Dyshidrotic eczema                257
Herpes simplex (Cold sores)       253
Rosacea                           253
Pityriasis versicolor             252
Lichen planus                     248
Folliculitis                      248
Acne vulgaris                     248
Tinea capitis (Scalp ringworm)    246
Contact dermatitis (Irritant)     246
Perioral dermatitis               246
Tinea cruris (Jock itch)          245
Molluscum contagiosum             245
Chronic urticaria                 244
Vitiligo                          243
Nummular eczema                   242
Scabies                           239
Acne rosacea                      236
Cellulitis                        234
Psoriasis vulgaris                234
Contact dermatitis (Allergic)     231
Herpes zoster (Shingles)          230
Atopic dermatitis (Eczema)        