# Feature Extraction from ROI Segmentation

In [None]:
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm
from sklearn.cluster import KMeans
from scipy.stats import skew, kurtosis
from skimage.feature import graycomatrix, graycoprops

# 1. Configuration & Data Loading

In [None]:
# Project root is the parent of the current working directory
# (assumes the notebook is launched from a sub-folder of the project — TODO confirm).
PROJECT_ROOT = Path("..").resolve()
OUT_DIR = PROJECT_ROOT / "outputs"
# Input: index of images to process (read below); output: feature table written by the extraction cell.
INDEX_CSV = OUT_DIR / "preprocessed_index.csv"
FEAT_OUT = OUT_DIR / "extracted_features.csv"

# Root folder searched recursively for *.txt annotation files.
ANNOT_DIR = PROJECT_ROOT / "data" / "Annotated Files"

RANDOM_STATE = 42        # seed passed to KMeans
FIXED_SIZE = (300, 300)  # target size handed to cv2.resize by the image readers
KMEANS_K = 3             # number of colour clusters used for ROI segmentation
PAD_RATIO = 0.35         # fraction of the lesion box width/height added as padding on each side

df = pd.read_csv(INDEX_CSV)
print(f"Loaded {len(df)} images from {INDEX_CSV}")
df.head()

Index shape: (724, 10)


Unnamed: 0,orig_path,prep_path,class,Output,width,height,resize_mode,gaussian,gamma,equalization
0,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,Anthracnose,1,300,300,fixed300_bilinear,"(5, 5)_sigma0",1.2,hist_eq
1,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,Anthracnose,1,300,300,fixed300_bilinear,"(5, 5)_sigma0",1.2,hist_eq
2,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,Anthracnose,1,300,300,fixed300_bilinear,"(5, 5)_sigma0",1.2,hist_eq
3,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,Anthracnose,1,300,300,fixed300_bilinear,"(5, 5)_sigma0",1.2,hist_eq
4,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,Anthracnose,1,300,300,fixed300_bilinear,"(5, 5)_sigma0",1.2,hist_eq


# 2. Annotation & Segmentation Utilities

In [None]:
def build_annot_index(annot_root: Path) -> dict:
    """Map annotation file stems to their paths.

    Recursively collects every ``*.txt`` file below ``annot_root``. When two
    files share a stem, the one visited last wins. Returns an empty dict if
    ``annot_root`` does not exist.
    """
    if not annot_root.exists():
        return {}
    return {txt_file.stem: txt_file for txt_file in annot_root.rglob("*.txt")}

# Build the stem -> annotation path lookup once; find_annotation_txt reads it for every image.
ANNOT_INDEX = build_annot_index(ANNOT_DIR)
print(f"Found {len(ANNOT_INDEX)} annotation files")

def find_annotation_txt(img_path: str):
    """Return the annotation path whose stem matches the image file, or None.

    The lookup key is the image file name without its extension.
    """
    return ANNOT_INDEX.get(Path(img_path).stem)

def parse_yolo_txt(txt_path: Path):
    """Read an annotation file and return a list of ``(cls, xc, yc, bw, bh)``.

    Only lines made of exactly five whitespace-separated fields are used;
    blank lines and lines with any other field count are ignored. The first
    field is converted to ``int`` (via ``float``), the remaining four to
    ``float``. A five-field line with a non-numeric token raises ValueError.
    """
    with open(txt_path) as handle:
        token_rows = [raw_line.split() for raw_line in handle]
    return [
        (int(float(tok[0])), float(tok[1]), float(tok[2]), float(tok[3]), float(tok[4]))
        for tok in token_rows
        if len(tok) == 5
    ]

def yolo_to_xyxy(box, W, H):
    """Convert a normalised ``(cls, xc, yc, bw, bh)`` box to pixel corners.

    Returns ``(x1, y1, x2, y2)`` as ints. The top-left corner is clamped to
    be >= 0 and the bottom-right corner to be <= ``W-1`` / ``H-1``; the
    opposite bounds are not clamped.
    """
    _, xc, yc, bw, bh = box
    left = (xc - bw/2) * W
    top = (yc - bh/2) * H
    right = (xc + bw/2) * W
    bottom = (yc + bh/2) * H
    return (
        int(max(0, round(left))),
        int(max(0, round(top))),
        int(min(W-1, round(right))),
        int(min(H-1, round(bottom))),
    )

def lesion_mask_from_txt(shape_hw, txt_path: Path):
    """Rasterise the boxes of an annotation file into a binary mask.

    Parameters
    ----------
    shape_hw : tuple
        ``(H, W)`` of the mask to create.
    txt_path : Path
        Annotation file parsed with ``parse_yolo_txt``.

    Returns
    -------
    np.ndarray
        ``uint8`` array of shape ``(H, W)`` with 1 inside every box
        (corners inclusive) and 0 elsewhere.
    """
    H, W = shape_hw
    mask = np.zeros((H, W), dtype=np.uint8)
    for box in parse_yolo_txt(txt_path):
        x1, y1, x2, y2 = yolo_to_xyxy(box, W, H)
        # A box lying outside the image (or with a negative size) comes back
        # with x2 < x1 or y2 < y1, possibly with a negative x2/y2. Used as a
        # slice bound, a negative value would wrap around and fill most of the
        # mask, so such boxes are skipped.
        if x2 < x1 or y2 < y1:
            continue
        mask[y1:y2+1, x1:x2+1] = 1
    return mask

def bbox_from_mask(mask01: np.ndarray):
    """Return the tight bounding box ``(x1, y1, x2, y2)`` of a 2-D mask.

    Corners are inclusive pixel indices of the positive entries. Returns
    None when the mask has no positive entry.
    """
    row_idx, col_idx = np.nonzero(mask01 > 0)
    if not col_idx.size:
        return None
    left, right = int(col_idx.min()), int(col_idx.max())
    top, bottom = int(row_idx.min()), int(row_idx.max())
    return left, top, right, bottom

def bbox_from_mask_xywh(mask01: np.ndarray):
    """Return the mask's bounding box as ``(x, y, width, height)``.

    Width and height count pixels inclusively. Returns None when
    ``bbox_from_mask`` finds no positive entry.
    """
    corners = bbox_from_mask(mask01)
    if corners is None:
        return None
    left, top, right, bottom = corners
    return left, top, right - left + 1, bottom - top + 1

def clip_bbox(x1, y1, x2, y2, W, H):
    """Clamp box corners into ``[0, W-1]`` x ``[0, H-1]``.

    Returns the clamped ``(x1, y1, x2, y2)``, or None if the clamped box is
    inverted (``x2 < x1`` or ``y2 < y1``).
    """
    def _clamp(value, upper):
        return max(0, min(upper, value))

    cx1, cx2 = _clamp(x1, W-1), _clamp(x2, W-1)
    cy1, cy2 = _clamp(y1, H-1), _clamp(y2, H-1)
    if cx2 < cx1 or cy2 < cy1:
        return None
    return (cx1, cy1, cx2, cy2)

def expand_bbox(x1, y1, x2, y2, W, H, pad_ratio=0.35):
    """Grow an inclusive box by ``pad_ratio`` of its size on every side.

    The horizontal margin is ``round(width * pad_ratio)`` and the vertical
    margin ``round(height * pad_ratio)``. The grown box is passed through
    ``clip_bbox`` and that result is returned.
    """
    box_w = x2 - x1 + 1
    box_h = y2 - y1 + 1
    margin_x = int(round(box_w * pad_ratio))
    margin_y = int(round(box_h * pad_ratio))
    return clip_bbox(
        x1 - margin_x, y1 - margin_y,
        x2 + margin_x, y2 + margin_y,
        W, H,
    )

def read_bgr_300(path):
    """Load an image with ``cv2.imread`` and resize it to ``FIXED_SIZE``.

    Uses bilinear interpolation. Returns None when the image cannot be read.
    """
    image = cv2.imread(str(path))
    if image is None:
        return None
    return cv2.resize(image, FIXED_SIZE, interpolation=cv2.INTER_LINEAR)

def read_gray_300(path):
    """Load an image as grayscale and resize it to ``FIXED_SIZE``.

    Uses bilinear interpolation. Returns None when the image cannot be read.
    """
    image = cv2.imread(str(path), cv2.IMREAD_GRAYSCALE)
    if image is None:
        return None
    return cv2.resize(image, FIXED_SIZE, interpolation=cv2.INTER_LINEAR)

def kmeans_labels_lab(bgr, k=3):
    """Cluster the pixels of a BGR image by colour in LAB space.

    Each pixel is one 3-component sample. Returns the KMeans cluster index
    of every pixel as an array with the image's height and width. The
    clustering is seeded with ``RANDOM_STATE``.
    """
    lab_img = cv2.cvtColor(bgr, cv2.COLOR_BGR2LAB)
    grid_shape = lab_img.shape[:2]
    pixels = lab_img.reshape(-1, 3).astype(np.float32)
    clusterer = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init="auto")
    flat_labels = clusterer.fit_predict(pixels)
    return flat_labels.reshape(grid_shape[0], grid_shape[1])

def refine_roi_mask(mask01):
    """Smooth a binary mask and keep only its largest connected component.

    Steps: morphological close (2 iterations), then open (1 iteration),
    both with a 7x7 square kernel, then 8-connected component labelling.
    If no component other than label 0 exists, the smoothed mask is
    returned as is. The result is a ``uint8`` 0/1 mask.
    """
    binary = (mask01 > 0).astype(np.uint8)
    struct_elem = np.ones((7, 7), np.uint8)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, struct_elem, iterations=2)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, struct_elem, iterations=1)
    num_labels, component_map, stats, _ = cv2.connectedComponentsWithStats(binary * 255, connectivity=8)
    if num_labels > 1:
        # Row 0 of `stats` is skipped, hence the +1 to recover the label id.
        largest = 1 + int(np.argmax(stats[1:, cv2.CC_STAT_AREA]))
        return (component_map == largest).astype(np.uint8)
    return binary

def prf_from_masks(pred_mask01, gt_mask01):
    """Pixel-wise precision, recall and F1 of a predicted mask.

    Both masks are binarised with ``> 0``. A small epsilon (1e-9) in each
    denominator avoids division by zero, so empty masks give 0.0 scores.
    Returns ``(precision, recall, f1)`` as floats.
    """
    pred_on = pred_mask01 > 0
    gt_on = gt_mask01 > 0
    true_pos = int(np.count_nonzero(pred_on & gt_on))
    false_pos = int(np.count_nonzero(pred_on & ~gt_on))
    false_neg = int(np.count_nonzero(~pred_on & gt_on))
    precision = true_pos / (true_pos + false_pos + 1e-9)
    recall = true_pos / (true_pos + false_neg + 1e-9)
    f_score = 2 * precision * recall / (precision + recall + 1e-9)
    return float(precision), float(recall), float(f_score)

def border_touch_ratio(mask01: np.ndarray):
    """Fraction of the mask's positive pixels that lie on the image border.

    The border is the first/last row and first/last column of the 2-D mask.
    An epsilon (1e-9) in the denominator makes an empty mask return 0.0.
    """
    binary = (mask01 > 0).astype(np.uint8)
    n_rows, n_cols = binary.shape
    frame = np.zeros_like(binary)
    frame[0, :] = 1
    frame[n_rows - 1, :] = 1
    frame[:, 0] = 1
    frame[:, n_cols - 1] = 1
    on_border = (binary & frame).sum()
    return float(on_border / (binary.sum() + 1e-9))

def score_roi(labels, lesion_mask, chosen):
    """Score the union of the chosen clusters against the lesion mask.

    ``chosen`` is a single cluster id (int) or a list of ids. The returned
    dict holds the chosen ids as a list, precision, recall, the ROI's
    border-touch ratio, its area, its intersection with the lesion mask,
    and ``"f1"``: the F1 score reduced by up to 35% in proportion to the
    border-touch ratio.
    """
    cluster_ids = [chosen] if isinstance(chosen, int) else chosen

    candidate = np.zeros(labels.shape, dtype=np.uint8)
    for cluster_id in cluster_ids:
        candidate[labels == cluster_id] = 1

    precision, recall, raw_f1 = prf_from_masks(candidate, lesion_mask)
    touch = border_touch_ratio(candidate)
    penalised_f1 = raw_f1 * (1.0 - 0.35 * min(1.0, touch))

    result = {}
    result["chosen"] = cluster_ids
    result["precision"] = precision
    result["recall"] = recall
    result["f1"] = float(penalised_f1)
    result["border_touch"] = float(touch)
    result["roi_area"] = int(candidate.sum())
    result["inter"] = int((candidate & lesion_mask).sum())
    return result

def pick_roi_clusters_robust(labels, lesion_mask):
    """Pick the cluster, or pair of clusters, with the best ``score_roi`` F1.

    Every single cluster is tried first, then every unordered pair. A later
    candidate replaces the current best only if its score is strictly
    higher. Returns the winning ``score_roi`` dict, or None if there are no
    clusters to try.
    """
    n_clusters = int(labels.max()) + 1
    singles = list(range(n_clusters))
    pairs = [[a, b] for a in range(n_clusters) for b in range(a + 1, n_clusters)]

    winner = None
    for candidate in singles + pairs:
        scored = score_roi(labels, lesion_mask, candidate)
        if winner is None or scored["f1"] > winner["f1"]:
            winner = scored
    return winner

def segment_roi_mask_guided(bgr_300, txt_path, pad_ratio=0.35):
    """Segment a lesion ROI by colour clustering around the annotated boxes.

    The annotation boxes are rasterised, their joint bounding box is padded
    by ``pad_ratio``, the padded crop is clustered with KMeans
    (``KMEANS_K`` clusters), and the cluster or cluster pair that best
    matches the annotation is kept and cleaned with ``refine_roi_mask``.

    Parameters
    ----------
    bgr_300 : np.ndarray
        Colour image; its first two dimensions define the mask size.
    txt_path : Path
        Annotation file for the image.
    pad_ratio : float
        Padding added around the annotation bounding box, as a fraction of
        its width/height.

    Returns
    -------
    tuple
        ``(roi_full, lesion_full, best, (precision, recall, f1))``.
        ``roi_full`` is None when no usable ROI exists: no annotation box
        (metrics are NaN), a crop too small to cluster, or no cluster
        overlapping the annotation (metrics are 0.0). ``best`` is the
        ``score_roi`` dict of the selected clusters, or None. The metrics
        compare the final ROI against the full annotation mask.
    """
    H, W = bgr_300.shape[:2]
    lesion_full = lesion_mask_from_txt((H, W), txt_path)
    bb = bbox_from_mask(lesion_full)
    if bb is None:
        return None, lesion_full, None, (np.nan, np.nan, np.nan)

    expanded = expand_bbox(*bb, W=W, H=H, pad_ratio=pad_ratio)
    if expanded is None:
        # expand_bbox may return None; unpacking it would raise TypeError.
        return None, lesion_full, None, (0.0, 0.0, 0.0)
    x1, y1, x2, y2 = expanded
    bgr_crop = bgr_300[y1:y2+1, x1:x2+1]
    lesion_crop = lesion_full[y1:y2+1, x1:x2+1]

    # KMeans raises ValueError when there are fewer samples (pixels) than
    # clusters, which a tiny annotation box can trigger. Report it as an
    # unusable ROI instead of aborting the whole extraction run.
    if lesion_crop.size < KMEANS_K:
        return None, lesion_full, None, (0.0, 0.0, 0.0)

    labels = kmeans_labels_lab(bgr_crop, k=KMEANS_K)
    best = pick_roi_clusters_robust(labels, lesion_crop)
    if best is None or best["f1"] <= 0:
        return None, lesion_full, best, (0.0, 0.0, 0.0)

    selected = np.zeros_like(labels, dtype=np.uint8)
    for kk in best["chosen"]:
        selected |= (labels == kk).astype(np.uint8)

    roi_crop = refine_roi_mask(selected)
    if not roi_crop.any():
        # The morphological opening can erase a small or thin selection
        # completely. An empty ROI would make the feature extractor fall back
        # to the whole image, so keep the unrefined selection instead.
        roi_crop = selected

    # Paste the crop-level ROI back into full-image coordinates.
    roi_full = np.zeros((H, W), dtype=np.uint8)
    roi_full[y1:y2+1, x1:x2+1] = roi_crop

    P2, R2, F12 = prf_from_masks(roi_full, lesion_full)
    return roi_full, lesion_full, best, (P2, R2, F12)

Annotation txt files found: 545


# 3. Feature Extraction (13 GLCM & Statistical Features)

In [None]:
def entropy_gray(vals_uint8):
    """Shannon entropy (in bits) of a 1-D array of integer gray values.

    Built from a histogram with at least 256 bins; empty bins are ignored.
    An empty input yields 0.
    """
    counts = np.bincount(vals_uint8, minlength=256).astype(np.float64)
    total = counts.sum() + 1e-12  # epsilon guards against an empty input
    probs = counts / total
    occupied = probs > 0
    return float(-np.sum(probs[occupied] * np.log2(probs[occupied])))

def extract_13_from_gray(gray, mask01):
    """Compute 13 intensity and GLCM texture features for a masked region.

    Parameters
    ----------
    gray : np.ndarray
        2-D grayscale image (assumed 8-bit so that ``gray // 4`` fits into
        64 GLCM levels — TODO confirm for other inputs).
    mask01 : np.ndarray
        Mask of the same shape; entries ``> 0`` mark the region of interest.

    Returns
    -------
    dict
        Keys ``CNT, CRL, SKEN, KTS, VAR, STD, ENT, EG, MN, HGN, RMS, SM,
        IDM``.

    Notes
    -----
    * The first-order statistics (MN, VAR, STD, SKEN, KTS, RMS, SM, ENT)
      use only the pixels inside the mask. If the mask is empty they use
      the whole image.
    * The GLCM features are computed on the mask's bounding-box crop, so
      pixels of the crop that lie outside the mask also contribute.
    * HGN uses the ``1 / (1 + |i - j|)`` weighting and IDM the
      ``1 / (1 + (i - j)^2)`` weighting. skimage's built-in "homogeneity"
      property uses the squared form, which made HGN an exact copy of IDM;
      HGN is therefore computed here directly from the GLCM.
    """
    mask = (mask01 > 0).astype(np.uint8)

    # Restrict the work to the bounding box of the mask (whole image if empty).
    bb = bbox_from_mask_xywh(mask)
    if bb is not None:
        x, y, w, h = bb
        gray_roi = gray[y:y+h, x:x+w]
        mask_roi = mask[y:y+h, x:x+w]
    else:
        gray_roi = gray
        mask_roi = mask

    vals = gray_roi[mask_roi == 1].astype(np.float64)
    if vals.size == 0:
        vals = gray_roi.flatten().astype(np.float64)

    # --- first-order statistics over the masked gray values ---
    mn = float(np.mean(vals))
    var = float(np.var(vals))
    std = float(np.std(vals))
    sken = float(skew(vals)) if vals.size > 2 else 0.0
    # fisher=False: Pearson kurtosis (a normal distribution gives 3.0).
    kts = float(kurtosis(vals, fisher=False)) if vals.size > 3 else 0.0
    rms = float(np.sqrt(np.mean(vals**2)))
    sm = float(1.0 - 1.0 / (1.0 + var))
    ent = entropy_gray(vals.astype(np.uint8))

    # --- GLCM texture features on the crop, quantised to 64 gray levels ---
    q = (gray_roi // 4).astype(np.uint8)
    glcm = graycomatrix(q, distances=[1], angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
                        levels=64, symmetric=True, normed=True)

    # Each property is averaged over the four directions.
    cnt = float(np.mean(graycoprops(glcm, "contrast")))
    crl = float(np.mean(graycoprops(glcm, "correlation")))
    eg = float(np.mean(graycoprops(glcm, "energy")))

    P = glcm[:, :, 0, :]  # single distance -> (levels, levels, n_angles)
    n_levels = P.shape[0]
    i = np.arange(n_levels).reshape(-1, 1)
    j = np.arange(n_levels).reshape(1, -1)
    abs_diff = np.abs(i - j)
    hgn_weights = 1.0 / (1.0 + abs_diff)
    idm_weights = 1.0 / (1.0 + abs_diff ** 2)
    hgn = float(np.mean(np.sum(P * hgn_weights[:, :, None], axis=(0, 1))))
    idm = float(np.mean(np.sum(P * idm_weights[:, :, None], axis=(0, 1))))

    return {
        "CNT": cnt, "CRL": crl, "SKEN": sken, "KTS": kts, "VAR": var, "STD": std,
        "ENT": ent, "EG": eg, "MN": mn, "HGN": hgn, "RMS": rms, "SM": sm, "IDM": idm
    }

# 4. Process All Images & Extract Features

In [None]:
# One feature record per successfully processed image, plus counters for the run summary.
rows = []
failed = 0
skipped_no_annot = 0
skipped_bad_roi = 0

for r in tqdm(df.to_dict("records"), desc="Extracting features"):
    orig_p = r["orig_path"]
    prep_p = r["prep_path"]
    # Class name and numeric label, with fallbacks for alternative column names.
    cls = r.get("class", r.get("ClassName", ""))
    y = int(r["Output"]) if "Output" in r else int(r.get("label", 0))
    
    # Colour image (from orig_path) drives the segmentation ...
    bgr = read_bgr_300(orig_p)
    if bgr is None:
        failed += 1
        continue
    
    # ... while the features are computed on the grayscale image from prep_path.
    gray = read_gray_300(prep_p)
    if gray is None:
        failed += 1
        continue
    
    txt = find_annotation_txt(orig_p)
    
    if txt is None:
        # No annotation: use a full-image mask and leave the ROI quality metrics as NaN.
        # NOTE(review): these rows are kept, not skipped — `skipped_no_annot` counts
        # whole-image fallbacks, and their features are not ROI-restricted.
        roi_mask = np.ones_like(gray, dtype=np.uint8)
        lesion_mask = None
        best = None
        P2 = R2 = F12 = np.nan
        skipped_no_annot += 1
    else:
        roi_mask, lesion_mask, best, (P2, R2, F12) = segment_roi_mask_guided(bgr, txt, pad_ratio=PAD_RATIO)
        if roi_mask is None:
            # Segmentation produced no usable ROI: drop the image.
            skipped_bad_roi += 1
            continue
        
        # Align the masks with the grayscale image if their sizes differ
        # (nearest-neighbour keeps the mask binary).
        if roi_mask.shape != gray.shape:
            roi_mask = cv2.resize(roi_mask, (gray.shape[1], gray.shape[0]), interpolation=cv2.INTER_NEAREST)
            if lesion_mask is not None and lesion_mask.shape != gray.shape:
                lesion_mask = cv2.resize(lesion_mask, (gray.shape[1], gray.shape[0]), interpolation=cv2.INTER_NEAREST)
    
    # NOTE(review): lesion_mask is not used after this point in this cell.
    feats = extract_13_from_gray(gray, roi_mask)
    # Attach label, provenance and segmentation-quality columns to the 13 features.
    feats.update({
        "Output": y,
        "ClassName": cls,
        "orig_path": orig_p,
        "prep_path": prep_p,
        "chosen": str(best["chosen"]) if best else "",
        "f1_pick": float(best["f1"]) if best else np.nan,
        "p_final": P2,
        "r_final": R2,
        "f1_final": F12,
    })
    rows.append(feats)

# Build the table once from the collected records and persist it.
feat_df = pd.DataFrame(rows)
feat_df.to_csv(FEAT_OUT, index=False)

print(f"\n✓ Saved {len(feat_df)} features to {FEAT_OUT}")
print(f"  Failed: {failed} | No annotation: {skipped_no_annot} | Bad ROI: {skipped_bad_roi}")
feat_df.head()

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Extracting 13 features (Guided KMeans, paper 300x300): 100%|██████████| 724/724 [00:34<00:00, 20.74it/s]

Saved: E:\Kuliah\Pengenalan Pola\addressing_agricultural_challenges\outputs\extracted_features.csv
Shape: (724, 22) | failed: 0
no_annot fallback: 184
bad_roi skipped: 0





Unnamed: 0,CNT,CRL,SKEN,KTS,VAR,STD,ENT,EG,MN,HGN,...,IDM,Output,ClassName,orig_path,prep_path,chosen,f1_pick,p_final,r_final,f1_final
0,5.723888,0.987658,0.563714,3.367663,1334.582135,36.531933,6.670479,0.104164,51.389039,0.647028,...,0.647028,1,Anthracnose,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,[1],0.637451,0.680558,0.653265,0.666633
1,20.733715,0.950787,0.597947,3.122097,3613.386267,60.111449,7.421945,0.062476,97.628786,0.387375,...,0.387375,1,Anthracnose,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,"[1, 2]",0.547897,0.37544,1.0,0.54592
2,8.463067,0.984609,0.287901,1.867356,4393.23558,66.281487,7.779396,0.066144,135.33215,0.562726,...,0.562726,1,Anthracnose,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,"[0, 2]",0.512146,0.347382,1.0,0.51564
3,6.14452,0.982131,0.480171,2.136493,2108.565763,45.919122,7.306347,0.073773,123.62668,0.555171,...,0.555171,1,Anthracnose,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,"[0, 2]",0.565698,0.443959,0.940006,0.603085
4,6.912518,0.984983,0.344395,4.448925,807.453287,28.415723,6.518133,0.111556,49.570963,0.640639,...,0.640639,1,Anthracnose,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,E:\Kuliah\Pengenalan Pola\addressing_agricultu...,[0],0.746585,0.65497,0.919766,0.765105


# 5. Data Validation

In [None]:
# Sanity checks on the extracted table: column list and per-column NaN counts.
print("Feature Columns:")
print(feat_df.columns.tolist())

print("\nMissing Values:")
print(feat_df.isna().sum())

# f1_final is NaN for rows without an annotation, so drop those before summarising.
if "f1_final" in feat_df.columns:
    print("\nROI Segmentation Quality (f1_final) - Annotated Images Only:")
    print(feat_df["f1_final"].dropna().describe())

Columns: ['CNT', 'CRL', 'SKEN', 'KTS', 'VAR', 'STD', 'ENT', 'EG', 'MN', 'HGN', 'RMS', 'SM', 'IDM', 'Output', 'ClassName', 'orig_path', 'prep_path', 'chosen', 'f1_pick', 'p_final', 'r_final', 'f1_final']

Basic check (NaN counts):
CNT            0
CRL            0
SKEN           0
KTS            0
VAR            0
STD            0
ENT            0
EG             0
MN             0
HGN            0
RMS            0
SM             0
IDM            0
Output         0
ClassName      0
orig_path      0
prep_path      0
chosen         0
f1_pick      184
p_final      184
r_final      184
f1_final     184
dtype: int64

Guided ROI quality (f1_final) summary (only annotated rows):
count    540.000000
mean       0.468129
std        0.191010
min        0.000000
25%        0.336071
50%        0.527133
75%        0.590081
max        0.898746
Name: f1_final, dtype: float64


# 6. Summary Statistics

In [None]:
# Descriptive statistics of the feature table; the last expression is rendered as the cell output.
print("Feature Statistics:")
feat_df.describe()

NaN count per column:
CNT            0
CRL            0
SKEN           0
KTS            0
VAR            0
STD            0
ENT            0
EG             0
MN             0
HGN            0
RMS            0
SM             0
IDM            0
Output         0
ClassName      0
orig_path      0
prep_path      0
chosen         0
f1_pick      184
p_final      184
r_final      184
f1_final     184
dtype: int64

Feature summary:


Unnamed: 0,CNT,CRL,SKEN,KTS,VAR,STD,ENT,EG,MN,HGN,RMS,SM,IDM,Output,f1_pick,p_final,r_final,f1_final
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,540.0,540.0,540.0,540.0
mean,11.658385,0.980542,-0.004277,2.815559,3756.540921,59.574203,7.545861,0.065823,124.26598,0.521916,139.003867,0.999636,0.521916,0.812155,0.438724,0.360289,0.851751,0.468129
std,5.437984,0.009469,0.73215,1.757801,1647.86556,14.413263,0.424982,0.019635,39.481611,0.070391,37.883063,0.000315,0.070391,0.390859,0.174035,0.20351,0.197402,0.19101
min,2.278921,0.934789,-3.257981,1.296318,297.607039,17.251291,5.592859,0.042861,32.607309,0.355926,37.898978,0.996651,0.355926,0.0,0.040208,0.0,0.0,0.0
25%,7.641748,0.975399,-0.170029,1.805901,2393.717509,48.925632,7.364063,0.055014,98.644706,0.467314,115.22552,0.999582,0.467314,1.0,0.304857,0.210304,0.777638,0.336071
50%,10.697103,0.982577,-0.003296,2.176826,3605.248983,60.043722,7.678575,0.060747,128.497106,0.520522,148.037614,0.999723,0.520522,1.0,0.49004,0.369055,0.92685,0.527133
75%,14.838463,0.98733,0.309929,2.980084,5409.943329,73.552317,7.871845,0.069506,141.716106,0.568179,156.030597,0.999815,0.568179,1.0,0.554083,0.448375,0.98495,0.590081
max,40.490272,0.996708,2.161841,14.866644,8124.132022,90.133967,7.989966,0.239273,233.523389,0.75167,235.79682,0.999877,0.75167,1.0,0.827475,1.0,1.0,0.898746
