Import and Path Setup

In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm

from sklearn.cluster import KMeans
from scipy.stats import skew, kurtosis
from skimage.feature import graycomatrix, graycoprops

PROJECT_ROOT = Path("..").resolve()
OUT_DIR = PROJECT_ROOT / "outputs"

# index dari preprocessing CLAHE yang kemarin
INDEX_CSV = OUT_DIR / "preprocessed_clahe_index.csv"

# output fitur
FEAT_OUT = OUT_DIR / "extracted_features.csv"

RANDOM_STATE = 42


Load CSV (Index)

In [2]:
df = pd.read_csv(INDEX_CSV)
print(df.shape)
df.head()


(724, 7)


Unnamed: 0,orig_path,prep_path,class,Output,width,height,resize_mode
0,E:\Kuliah\Pengenalan Pola\final-project\data\C...,E:\Kuliah\Pengenalan Pola\final-project\output...,Anthracnose,1,480,360,width480
1,E:\Kuliah\Pengenalan Pola\final-project\data\C...,E:\Kuliah\Pengenalan Pola\final-project\output...,Anthracnose,1,480,640,width480
2,E:\Kuliah\Pengenalan Pola\final-project\data\C...,E:\Kuliah\Pengenalan Pola\final-project\output...,Anthracnose,1,480,627,width480
3,E:\Kuliah\Pengenalan Pola\final-project\data\C...,E:\Kuliah\Pengenalan Pola\final-project\output...,Anthracnose,1,480,500,width480
4,E:\Kuliah\Pengenalan Pola\final-project\data\C...,E:\Kuliah\Pengenalan Pola\final-project\output...,Anthracnose,1,480,566,width480


K-Means Segmentation + ROI MASK

In [3]:
def kmeans_labels_gray(gray, k=3):
    h, w = gray.shape
    X = gray.reshape(-1, 1).astype(np.float32)
    km = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init="auto")
    labels = km.fit_predict(X).reshape(h, w)
    centers = km.cluster_centers_.ravel()
    return labels, centers

def choose_background_cluster(labels):
    h, w = labels.shape
    k = int(labels.max()) + 1

    border = np.zeros_like(labels, dtype=bool)
    border[0,:]=border[-1,:]=True
    border[:,0]=border[:,-1]=True

    scores = []
    for kk in range(k):
        border_frac = np.mean(labels[border] == kk)
        area_frac   = np.mean(labels == kk)
        scores.append(0.8*border_frac + 0.2*area_frac)
    return int(np.argmax(scores))

def best_component(mask01):
    mask = (mask01 > 0).astype(np.uint8)
    kernel = np.ones((7,7), np.uint8)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel, iterations=2)
    mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN,  kernel, iterations=1)

    n, cc, stats, centroids = cv2.connectedComponentsWithStats(mask*255, connectivity=8)
    if n <= 1:
        return mask, None

    H, W = mask.shape
    cx0, cy0 = W/2, H/2
    best_i, best_score = None, -1e18

    for i in range(1, n):
        x,y,w,h,area = stats[i]
        cx,cy = centroids[i]
        touches = (x==0) or (y==0) or (x+w==W) or (y+h==H)
        dist = np.sqrt((cx-cx0)**2 + (cy-cy0)**2) / (np.sqrt(cx0**2+cy0**2)+1e-9)
        score = (np.log(area+1)*2.0) - (2.0*(1.0 if touches else 0.0)) - dist
        if score > best_score:
            best_score, best_i = score, i

    refined = (cc == best_i).astype(np.uint8)
    x,y,w,h,_ = stats[best_i]
    bbox = (int(x), int(y), int(w), int(h))
    return refined, bbox

def segment_roi_mask(gray):
    labels, centers = kmeans_labels_gray(gray, k=3)
    bg_k = choose_background_cluster(labels)
    raw = (labels != bg_k).astype(np.uint8)     # gabungan 2 cluster non-bg
    mask, bbox = best_component(raw)
    return mask, bbox


Features Extraction (GLCM & Stats)

In [4]:
def entropy_gray(vals_uint8):
    hist = np.bincount(vals_uint8, minlength=256).astype(np.float64)
    p = hist / (hist.sum() + 1e-12)
    p = p[p > 0]
    return float(-np.sum(p * np.log2(p)))

def extract_13_from_gray(gray, mask, bbox=None):
    # crop ROI untuk GLCM biar fokus
    if bbox is not None:
        x,y,w,h = bbox
        gray_roi = gray[y:y+h, x:x+w]
        mask_roi = mask[y:y+h, x:x+w]
    else:
        gray_roi = gray
        mask_roi = mask

    vals = gray_roi[mask_roi == 1].astype(np.float64)
    if vals.size == 0:
        vals = gray_roi.flatten().astype(np.float64)

    mn = float(np.mean(vals))
    var = float(np.var(vals))
    std = float(np.std(vals))
    sken = float(skew(vals)) if vals.size > 2 else 0.0
    kts  = float(kurtosis(vals, fisher=False)) if vals.size > 3 else 0.0
    rms  = float(np.sqrt(np.mean(vals**2)))
    sm   = float(1.0 - (1.0 / (1.0 + var)))
    ent  = entropy_gray(vals.astype(np.uint8))

    # GLCM: quantize 64 level
    q = (gray_roi // 4).astype(np.uint8)
    glcm = graycomatrix(
        q, distances=[1],
        angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
        levels=64, symmetric=True, normed=True
    )

    cnt = float(np.mean(graycoprops(glcm, "contrast")))
    crl = float(np.mean(graycoprops(glcm, "correlation")))
    eg  = float(np.mean(graycoprops(glcm, "energy")))
    hgn = float(np.mean(graycoprops(glcm, "homogeneity")))

    # IDM
    P = glcm[:, :, 0, :]
    L = P.shape[0]
    i = np.arange(L).reshape(-1,1)
    j = np.arange(L).reshape(1,-1)
    denom = 1.0 + (i-j)**2
    idm = float(np.mean(np.sum(P / denom[:, :, None], axis=(0,1))))

    return {
        "CNT": cnt, "CRL": crl, "SKEN": sken, "KTS": kts, "VAR": var, "STD": std,
        "ENT": ent, "EG": eg, "MN": mn, "HGN": hgn, "RMS": rms, "SM": sm, "IDM": idm
    }


Batch Extraction & Save

In [5]:
rows = []
failed = 0

for r in tqdm(df.to_dict("records"), desc="Extracting 13 features"):
    p = r["prep_path"]
    cls = r["class"]
    y = int(r["Output"])

    gray = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        failed += 1
        continue

    mask, bbox = segment_roi_mask(gray)
    feats = extract_13_from_gray(gray, mask, bbox)

    feats["Output"] = y
    feats["ClassName"] = cls
    feats["Path"] = p
    rows.append(feats)

feat_df = pd.DataFrame(rows)
feat_df.to_csv(FEAT_OUT, index=False)

print("Saved:", FEAT_OUT)
print("Shape:", feat_df.shape, "| Failed:", failed)
feat_df.head()


Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md

Extracting 13 features: 100%|██████████| 724/724 [01:11<00:00, 10.12it/s]

Saved: E:\Kuliah\Pengenalan Pola\final-project\outputs\extracted_features.csv
Shape: (724, 16) | Failed: 0





Unnamed: 0,CNT,CRL,SKEN,KTS,VAR,STD,ENT,EG,MN,HGN,RMS,SM,IDM,Output,ClassName,Path
0,4.446106,0.989337,0.169436,2.065414,2281.116409,47.761034,7.500692,0.078382,117.420382,0.647291,126.762228,0.999562,0.647291,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...
1,13.065036,0.971947,0.031212,2.277342,2822.893711,53.130911,7.707979,0.052744,108.77181,0.43117,121.054535,0.999646,0.43117,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...
2,8.353437,0.972498,0.575584,3.300861,1802.80796,42.459486,7.348934,0.0689,129.548861,0.554495,136.329436,0.999446,0.554495,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...
3,11.191044,0.975762,-0.597276,2.152992,4430.024685,66.558431,7.760315,0.055187,143.953244,0.463002,158.59559,0.999774,0.463002,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...
4,4.315616,0.988686,-0.070869,2.261092,1956.425998,44.231505,7.38991,0.076094,105.166916,0.662785,114.089904,0.999489,0.662785,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...


Re Checking Sanity

In [6]:
print("NaN count per column:")
print(feat_df.isna().sum())

print("\nFeature summary:")
display(feat_df.describe())


NaN count per column:
CNT          0
CRL          0
SKEN         0
KTS          0
VAR          0
STD          0
ENT          0
EG           0
MN           0
HGN          0
RMS          0
SM           0
IDM          0
Output       0
ClassName    0
Path         0
dtype: int64

Feature summary:


Unnamed: 0,CNT,CRL,SKEN,KTS,VAR,STD,ENT,EG,MN,HGN,RMS,SM,IDM,Output
count,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0,724.0
mean,10.89664,0.973109,0.12723,2.273893,2940.560755,53.699883,7.597901,0.064014,125.448018,0.50537,136.763262,0.99963,0.50537,0.812155
std,4.926713,0.011137,0.314265,0.425821,805.044469,7.547318,0.238788,0.024323,17.265393,0.07361,16.481299,0.000122,0.07361,0.390859
min,2.166272,0.934854,-1.837999,1.540552,965.805516,31.077412,4.768866,0.037407,71.418111,0.33547,88.496132,0.998966,0.33547,0.0
25%,7.34156,0.96712,-0.05617,2.01056,2350.847623,48.485539,7.509438,0.051925,112.458612,0.453119,123.834978,0.999575,0.453119,1.0
50%,9.96453,0.974784,0.120791,2.200013,2873.382668,53.603943,7.635367,0.057267,125.263255,0.489807,138.953163,0.999652,0.489807,1.0
75%,13.904566,0.981647,0.280634,2.418945,3511.473608,59.257687,7.752225,0.067754,136.267802,0.550238,147.279493,0.999715,0.550238,1.0
max,33.756247,0.995031,1.027631,6.156466,5332.878997,73.026564,7.89523,0.388557,229.55871,0.764343,232.881753,0.999813,0.764343,1.0
