Check Folder Structure

In [1]:
import os

def print_folders_only(root_dir, prefix=""):
    """
    Recursively print the sub-directory tree below ``root_dir``.

    Only directories are listed (plain files are ignored). Siblings are
    printed in sorted order, one per line, using box-drawing connectors.

    Parameters:
    - root_dir: directory whose sub-directories are printed
    - prefix: indentation accumulated from the parent levels; top-level
      callers leave it at the default

    Returns:
    - None (the tree is written to stdout)
    """
    try:
        entries = sorted(
            e for e in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, e))
        )
    except PermissionError:
        print(prefix + "└── [Permission Denied]")
        return

    last_index = len(entries) - 1
    for i, folder in enumerate(entries):
        path = os.path.join(root_dir, folder)
        is_last = (i == last_index)

        connector = "└── " if is_last else "├── "
        print(prefix + connector + folder)

        # A symlinked directory is listed but not descended into: following
        # it could recurse forever when the link points back at a parent.
        if os.path.islink(path):
            continue

        # Children of the last entry need blank padding; children of any
        # other entry keep the vertical bar that joins the later siblings.
        extension = "    " if is_last else "│   "
        print_folders_only(path, prefix + extension)

if __name__ == "__main__":
    # Print the directory tree of the notebook's working directory; the
    # banner echoes the root so the tree can be matched to a disk location.
    ROOT_DIR = os.getcwd()   # current directory
    print(f"\n📁 Folder Structure (Directories Only)\n📍 Root: {ROOT_DIR}\n")
    print_folders_only(ROOT_DIR)



📁 Folder Structure (Directories Only)
📍 Root: c:\Users\umair\Videos\PhD\PhD Data\Week 8 Jannuary\Code

├── CleanData
│   ├── HAM10000
│   │   ├── images
│   │   └── segmentations
│   └── ISIC2019
│       ├── images_test
│       └── images_train
├── Datasets
│   ├── HAM10000
│   │   ├── HAM10000_images_part_1
│   │   ├── HAM10000_images_part_2
│   │   ├── HAM10000_segmentations_lesion_tschandl
│   │   │   ├── HAM10000_segmentations_lesion_tschandl
│   │   │   └── __MACOSX
│   │   │       └── HAM10000_segmentations_lesion_tschandl
│   │   └── ISIC2018_Task3_Test_Images
│   │       ├── ISIC2018_Task3_Test_Images
│   │       └── __MACOSX
│   │           └── ISIC2018_Task3_Test_Images
│   └── ISIC 2019
│       ├── ISIC_2019_Test_Input
│       └── ISIC_2019_Training_Input
└── scik

Clean Folder Structure

In [None]:
import os
import shutil

# File suffixes (compared lower-case) that are treated as images.
IMG_EXTS = (".jpg", ".jpeg", ".png")

# Every location is resolved against the directory the notebook runs from:
# DATASETS is the raw download area, CLEAN the flattened workspace.
ROOT = os.getcwd()
DATASETS, CLEAN = (
    os.path.join(ROOT, folder) for folder in ("Datasets", "CleanData")
)

def safe_mkdir(path):
    """
    Create ``path`` (including missing parents) unless it already exists.

    A ``[CREATE]`` line is printed only when the directory is actually made;
    nothing happens, and nothing is printed, when ``path`` is already present.
    """
    if os.path.exists(path):
        return

    os.makedirs(path)
    print(f"[CREATE] {path}")

def move_images(src, dst):
    """
    Move every image file found anywhere under ``src`` into the folder ``dst``.

    The source tree is walked recursively, skipping ``__MACOSX`` folders, and
    files whose lower-cased name ends with one of ``IMG_EXTS`` are moved
    directly into ``dst`` (sub-folder structure is not kept). A file whose
    name already exists in ``dst`` is left where it is and reported.

    Parameters:
    - src: folder to scan; if it does not exist a ``[SKIP]`` line is printed
      and nothing else happens
    - dst: destination folder, created via ``safe_mkdir`` when missing

    Returns:
    - None (progress is printed: one ``[MOVE]`` or ``[SKIP]`` line per image)
    """
    if not os.path.exists(src):
        print(f"[SKIP] Missing: {src}")
        return

    safe_mkdir(dst)

    for current_dir, subdirs, filenames in os.walk(src):
        # Prune in place so os.walk never descends into macOS junk folders.
        while "__MACOSX" in subdirs:
            subdirs.remove("__MACOSX")

        image_names = (n for n in filenames if n.lower().endswith(IMG_EXTS))
        for image_name in image_names:
            target = os.path.join(dst, image_name)

            if os.path.exists(target):
                print(f"[SKIP] Exists: {target}")
            else:
                shutil.move(os.path.join(current_dir, image_name), target)
                print(f"[MOVE] {image_name}")

def main():
    """
    Move the raw HAM10000 and ISIC 2019 images into the ``CleanData`` layout.

    Each dataset gets its own folder under ``CLEAN``; every raw sub-folder is
    emptied of images into the listed clean sub-folder via ``move_images``.
    """
    print("\n=== MOVING IMAGES TO CLEAN WORKSPACE (NO COPY) ===\n")

    safe_mkdir(CLEAN)

    # One entry per dataset:
    # (banner, folder under DATASETS, folder under CLEAN,
    #  ((raw sub-folder, clean sub-folder), ...))
    plan = (
        (
            "--- HAM10000 ---",
            "HAM10000",
            "HAM10000",
            (
                ("HAM10000_images_part_1", "images"),
                ("HAM10000_images_part_2", "images"),
                ("HAM10000_segmentations_lesion_tschandl", "segmentations"),
            ),
        ),
        (
            "\n--- ISIC2019 ---",
            "ISIC 2019",
            "ISIC2019",
            (
                ("ISIC_2019_Training_Input", "images_train"),
                ("ISIC_2019_Test_Input", "images_test"),
            ),
        ),
    )

    for banner, raw_name, clean_name, moves in plan:
        print(banner)

        clean_dir = os.path.join(CLEAN, clean_name)
        safe_mkdir(clean_dir)

        for raw_sub, clean_sub in moves:
            move_images(
                os.path.join(DATASETS, raw_name, raw_sub),
                os.path.join(clean_dir, clean_sub),
            )

    print("\n=== DONE (IMAGES MOVED, NO DELETION) ===")

if __name__ == "__main__":
    main()



Corrupted Images Check

In [2]:
import cv2
import os

# File suffixes (compared lower-case) that are treated as images.
IMG_EXTS = (".jpg", ".jpeg", ".png")

def check_folder(folder):
    """
    List the image files in ``folder`` that ``cv2.imread`` cannot load.

    Only the direct entries of ``folder`` whose lower-cased name ends with one
    of ``IMG_EXTS`` are tested; a file counts as bad when ``cv2.imread``
    returns ``None`` for it.

    Returns:
    - list of the bad file names (names only, not full paths), in the order
      ``os.listdir`` yields them
    """
    return [
        name
        for name in os.listdir(folder)
        if name.lower().endswith(IMG_EXTS)
        and cv2.imread(os.path.join(folder, name)) is None
    ]

if __name__ == "__main__":
    # Folders (relative to the working directory) to scan for unreadable images.
    paths = [
        "CleanData/HAM10000/images",
        "CleanData/ISIC2019/images_train",
        "CleanData/ISIC2019/images_test"
    ]

    for p in paths:
        print(f"\nChecking: {p}")
        bad = check_folder(p)
        # Show the offending file names, or a check mark when there are none.
        print("Corrupted:", bad if bad else "None ✅")



Checking: CleanData/HAM10000/images
Corrupted: None ✅

Checking: CleanData/ISIC2019/images_train
Corrupted: None ✅

Checking: CleanData/ISIC2019/images_test
Corrupted: None ✅


Image Count Sanity Check

In [1]:
import os

# File suffixes (compared lower-case) that are treated as images.
IMG_EXTS = (".jpg", ".jpeg", ".png")

def count_images(folder):
    """
    Count the direct entries of ``folder`` whose name ends in an image suffix.

    The comparison is case-insensitive and uses ``IMG_EXTS``; sub-folders are
    not searched.
    """
    image_names = [
        name for name in os.listdir(folder)
        if name.lower().endswith(IMG_EXTS)
    ]
    return len(image_names)

# Display label -> folder (relative to the working directory) to be counted.
paths = dict(
    [
        ("HAM10000", "CleanData/HAM10000/images"),
        ("ISIC2019 Train", "CleanData/ISIC2019/images_train"),
        ("ISIC2019 Test", "CleanData/ISIC2019/images_test"),
    ]
)

# One summary line per dataset split.
for name in paths:
    path = paths[name]
    print(f"{name}: {count_images(path)} images")


HAM10000: 10015 images
ISIC2019 Train: 25331 images
ISIC2019 Test: 8238 images


In [3]:
import pandas as pd
import os

# All three paths are relative to the notebook's working directory (the
# project root), so the cell does not depend on one machine's folder layout.
# UPDATE the metadata filename if yours differs slightly.
META_PATH = "CleanData/HAM10000/HAM10000_metadata"
IMG_DIR = "CleanData/HAM10000/images"
OUT_CSV = "CleanData/HAM10000/labels_binary.csv"

def map_label(dx):
    """
    Convert a diagnosis code into the binary MEL vs NON-MEL target.

    Returns 1 when ``dx`` equals "mel" ignoring letter case, otherwise 0.
    """
    is_melanoma = dx.lower() == "mel"
    return int(is_melanoma)

def main():
    """
    Write the binary-label CSV for the HAM10000 images.

    Reads the metadata table at ``META_PATH``. For every row whose
    ``<image_id>.jpg`` exists in ``IMG_DIR`` it records the file name, its
    path and the label from ``map_label(dx)``. Rows without a matching file
    are counted and skipped. The result is saved to ``OUT_CSV`` with the
    columns ``image``, ``path``, ``label``, and a short summary is printed.
    """
    metadata = pd.read_csv(META_PATH)

    rows = []
    n_missing = 0

    for record in metadata.to_dict("records"):
        filename = record["image_id"] + ".jpg"
        full_path = os.path.join(IMG_DIR, filename)

        if os.path.exists(full_path):
            rows.append([filename, full_path, map_label(record["dx"])])
        else:
            n_missing += 1

    out = pd.DataFrame(rows, columns=["image", "path", "label"])
    out.to_csv(OUT_CSV, index=False)

    for line in (
        f"Saved: {OUT_CSV}",
        f"Missing images: {n_missing}",
        f"Total labeled images: {len(out)}",
    ):
        print(line)

if __name__ == "__main__":
    main()


Saved: CleanData/HAM10000/labels_binary.csv
Missing images: 0
Total labeled images: 10015


In [4]:
import pandas as pd
import os

# ---- PATHS (match your structure) ----
# All three paths are relative to the notebook's working directory (the
# project root), so the cell does not depend on one machine's folder layout.
META_PATH = "CleanData/ISIC2019/ISIC_2019_Training_GroundTruth.csv"
IMG_DIR = "CleanData/ISIC2019/images_train"
OUT_CSV = "CleanData/ISIC2019/labels_binary.csv"

def main():
    """
    Write the binary-label CSV for the ISIC 2019 training images.

    Reads the ground-truth table at ``META_PATH``. For every row whose
    ``<image>.jpg`` exists in ``IMG_DIR`` it records the file name, its path
    and a label that is 1 when the ``MEL`` column equals 1 and 0 otherwise.
    Rows without a matching file are counted and skipped. The result is saved
    to ``OUT_CSV`` with the columns ``image``, ``path``, ``label``, and a
    short summary is printed.
    """
    ground_truth = pd.read_csv(META_PATH)

    rows = []
    n_missing = 0

    for record in ground_truth.to_dict("records"):
        filename = record["image"] + ".jpg"
        full_path = os.path.join(IMG_DIR, filename)

        if os.path.exists(full_path):
            # Binary classification: MEL vs NON-MEL
            is_melanoma = record["MEL"] == 1
            rows.append([filename, full_path, int(is_melanoma)])
        else:
            n_missing += 1

    out = pd.DataFrame(rows, columns=["image", "path", "label"])
    out.to_csv(OUT_CSV, index=False)

    for line in (
        f"Saved: {OUT_CSV}",
        f"Missing images: {n_missing}",
        f"Total labeled images: {len(out)}",
    ):
        print(line)

if __name__ == "__main__":
    main()


Saved: CleanData/ISIC2019/labels_binary.csv
Missing images: 0
Total labeled images: 25331


What is LBP (Local Binary Pattern)?

LBP is a handcrafted texture descriptor that encodes local micro-patterns in an image.

For each pixel:

Compare it with its neighbors

If neighbor ≥ center → 1

Else → 0

Concatenate bits → binary number (the pixel's LBP code) → histogram of the codes over the image

Skin lesions differ not only in color but also in micro-texture. LBP captures fine-scale textural variations such as irregular borders, roughness, and pigment granularity, which are clinically relevant indicators of malignancy.

In [5]:
import numpy as np
import cv2
from skimage.feature import local_binary_pattern

def extract_lbp(
    image,
    radius=1,
    n_points=8,
    method="uniform",
    n_bins=10
):
    """
    Extract a normalised LBP histogram from an image.

    Parameters:
    - image: BGR colour image as returned by cv2.imread, or an image that is
      already grayscale (2-D array)
    - radius: radius of LBP
    - n_points: number of sampling points
    - method: LBP method (uniform recommended)
    - n_bins: histogram bins. Codes are binned over the range (0, n_bins)
      and codes outside that range are dropped, so n_bins must cover every
      code the chosen method can produce. The default of 10 corresponds to
      n_points + 2 for the default n_points=8; when changing n_points with
      the "uniform" method, pass n_bins=n_points + 2 as well.

    Returns:
    - 1D float32 numpy array of length n_bins (LBP histogram, density-normalised)

    Raises:
    - ValueError: if image is None or has no pixels, e.g. because
      cv2.imread could not read the file and returned None
    """
    # Fail early with a readable message instead of an opaque OpenCV error.
    if image is None:
        raise ValueError("extract_lbp received None - could the image file not be read?")

    image = np.asarray(image)
    if image.size == 0:
        raise ValueError("extract_lbp received an empty image")

    # Convert to grayscale; a 2-D array is already single-channel and would
    # be rejected by the BGR -> gray conversion.
    if image.ndim == 2:
        gray = image
    else:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Compute LBP
    lbp = local_binary_pattern(gray, n_points, radius, method)

    # Histogram
    hist, _ = np.histogram(
        lbp.ravel(),
        bins=n_bins,
        range=(0, n_bins),
        density=True
    )

    return hist.astype(np.float32)


In [3]:
import importlib
import features.lbp
importlib.reload(features.lbp)
import cv2
from features.lbp import extract_lbp

img = cv2.imread("CleanData/HAM10000/images/ISIC_0024306.jpg")
# cv2.imread returns None instead of raising when the file cannot be read.
if img is None:
    raise FileNotFoundError("Could not read CleanData/HAM10000/images/ISIC_0024306.jpg")

# Stored as lbp_features (not `features`) so the `features` package imported
# at the top of this cell is not replaced by the result array.
lbp_features = extract_lbp(img)

print("LBP feature length:", len(lbp_features))
print(lbp_features)


LBP feature length: 10
[0.05273333 0.07977778 0.05538518 0.09934445 0.11282222 0.11996666
 0.10085926 0.09474074 0.11718889 0.16718148]


GLCM captures second-order texture statistics such as contrast and homogeneity, describing how pixel intensities co-occur spatially, which complements the local micro-patterns captured by LBP.

In [4]:
import cv2
from features.glcm import extract_glcm

img = cv2.imread("CleanData/HAM10000/images/ISIC_0024306.jpg")
# cv2.imread returns None instead of raising when the file cannot be read.
if img is None:
    raise FileNotFoundError("Could not read CleanData/HAM10000/images/ISIC_0024306.jpg")

features = extract_glcm(img)

print("GLCM feature length:", len(features))
print(features)


GLCM feature length: 5
[10.3674755   2.0494194   0.40197736  0.05172076  0.9946185 ]
