Load Data

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

from sklearn.feature_selection import f_classif

# Project layout, resolved relative to the notebook's working directory.
PROJECT_ROOT = Path("..").resolve()
OUT_DIR = PROJECT_ROOT / "outputs"

# Input: the extracted-features table; outputs: rankings and per-k datasets.
IN_CSV = OUT_DIR / "extracted_features.csv"
RANK_DIR = OUT_DIR / "rankings"
DATASET_DIR = OUT_DIR / "datasets"

# Create both output folders up front so later cells can write unconditionally.
for _out_subdir in (RANK_DIR, DATASET_DIR):
    _out_subdir.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(IN_CSV)
df.head()


Unnamed: 0,CNT,CRL,SKEN,KTS,VAR,STD,ENT,EG,MN,HGN,RMS,SM,IDM,Output,ClassName,Path
0,4.446106,0.989337,0.169436,2.065414,2281.116409,47.761034,7.500692,0.078382,117.420382,0.647291,126.762228,0.999562,0.647291,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...
1,13.065036,0.971947,0.031212,2.277342,2822.893711,53.130911,7.707979,0.052744,108.77181,0.43117,121.054535,0.999646,0.43117,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...
2,8.353437,0.972498,0.575584,3.300861,1802.80796,42.459486,7.348934,0.0689,129.548861,0.554495,136.329436,0.999446,0.554495,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...
3,11.191044,0.975762,-0.597276,2.152992,4430.024685,66.558431,7.760315,0.055187,143.953244,0.463002,158.59559,0.999774,0.463002,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...
4,4.315616,0.988686,-0.070869,2.261092,1956.425998,44.231505,7.38991,0.076094,105.166916,0.662785,114.089904,0.999489,0.662785,1,Anthracnose,E:\Kuliah\Pengenalan Pola\final-project\output...


In [2]:
# The 13 feature columns that are ranked below; "Output" holds the class label.
feature_cols = [
    "CNT", "CRL", "SKEN", "KTS", "VAR", "STD", "ENT",
    "EG", "MN", "HGN", "RMS", "SM", "IDM",
]

# Feature matrix and integer label vector as NumPy arrays.
X = df.loc[:, feature_cols].to_numpy()
y = df["Output"].astype(int).to_numpy()

# Quick sanity check: matrix dimensions and per-class sample counts.
label_counts = pd.Series(y).value_counts()
print("X shape:", X.shape)
print("y counts:\n", label_counts)


X shape: (724, 13)
y counts:
 1    588
0    136
Name: count, dtype: int64


ANOVA Ranking & Save

In [3]:
# ANOVA F-test of every feature column in X against the class labels y.
scores, pvals = f_classif(X, y)

anova_rank = (
    pd.DataFrame({"feature": feature_cols, "score": scores, "p_value": pvals})
      # A NaN result means the test was undefined for that feature
      # (presumably a constant column; see the f_classif docs). Record it as
      # "no evidence" -- F = 0 and p = 1 -- instead of filling the whole frame
      # with 0, which would turn an undefined p-value into p = 0 (i.e. make
      # the feature look maximally significant).
      # An infinite F-score is deliberately left untouched so that such a
      # feature ranks first rather than being zeroed and pushed to the bottom.
      .fillna({"score": 0.0, "p_value": 1.0})
      # Stable sort: features with tied scores keep their feature_cols order,
      # so the ranking (and the top-k datasets built from it) is reproducible.
      .sort_values("score", ascending=False, kind="mergesort")
)

# Persist the full ranking, best feature first, without the positional index.
anova_rank_path = RANK_DIR / "anova_rank.csv"
anova_rank.to_csv(anova_rank_path, index=False)

anova_rank


Unnamed: 0,feature,score,p_value
6,ENT,60.289525,2.808214e-14
0,CNT,53.418612,7.162001e-13
11,SM,32.488209,1.747581e-08
1,CRL,30.512812,4.631888e-08
5,STD,19.67761,1.059613e-05
4,VAR,15.136834,0.0001092345
3,KTS,13.139266,0.0003094782
7,EG,8.393977,0.003878617
8,MN,1.94626,0.1634195
2,SKEN,1.482229,0.2238239


Top-K CSV (Top 5/7/9/10)

In [4]:
# Sizes of the feature subsets to export.
TOP_K_LIST = [5, 7, 9, 10]

# anova_rank is already ordered best-first, so a prefix of this list is a top-k set.
ranked_features = anova_rank["feature"].tolist()

for k in TOP_K_LIST:
    top_feats = ranked_features[:k]
    # Keep the label column next to the selected features.
    out_df = df.loc[:, top_feats + ["Output"]].copy()
    out_path = DATASET_DIR / f"data{k}A.csv"   # A = ANOVA
    out_df.to_csv(out_path, index=False)
    print("Saved:", out_path, "| cols:", len(out_df.columns))


Saved: E:\Kuliah\Pengenalan Pola\final-project\outputs\datasets\data5A.csv | cols: 6
Saved: E:\Kuliah\Pengenalan Pola\final-project\outputs\datasets\data7A.csv | cols: 8
Saved: E:\Kuliah\Pengenalan Pola\final-project\outputs\datasets\data9A.csv | cols: 10
Saved: E:\Kuliah\Pengenalan Pola\final-project\outputs\datasets\data10A.csv | cols: 11
