In [2]:
# Load required libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression


In [3]:
# Load the diabetes CSV, taking the column labels from `names`
# (eight predictor columns followed by the "class" column).
filename = "data/pima-indians-diabetes.data.csv"
names = ["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
data = pd.read_csv(filename, names=names)

# Split the raw value matrix: the first eight columns form the feature
# matrix X, and column index 8 is the target vector y.
array = data.to_numpy()
X, y = array[:, :8], array[:, 8]

### Univariate Selection

In [4]:
# Univariate selection: score every feature against y with the ANOVA F-test
# (f_classif) and keep the 4 highest-scoring ones.
# `fit` is the value returned by test.fit(); later cells read scores from it.
test = SelectKBest(f_classif, k=4)
fit = test.fit(X, y)

In [None]:
# Show the raw per-feature F-scores, rounded to 3 decimals in numpy output
np.set_printoptions(precision=3)
print(fit.scores_)

# Reduce X to the columns the selector kept (printed in a later cell)
features = fit.transform(X)

# One row per input feature: its F-score and whether the selector kept it
score_columns = {
    "Feature": names[:8],
    "ANOVA_F_Score": fit.scores_,
    "Selected": fit.get_support(),
}
fi = pd.DataFrame(score_columns)

# Highest-scoring features first
print(fi.sort_values("ANOVA_F_Score", ascending=False))

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]
  Feature  ANOVA_F_Score  Selected
1    plas     213.161752      True
5    mass      71.772072      True
7     age      46.140611      True
0    preg      39.670227      True
6    pedi      23.871300     False
4    test      13.281108     False
3    skin       4.304381     False
2    pres       3.256950     False


In [None]:
# Preview the first five rows of the reduced feature matrix
print(features[:5])

[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 [  1.   89.   28.1  21. ]
 [  0.  137.   43.1  33. ]]


### Recursive Feature Elimination

In [8]:
# Recursive Feature Elimination: repeatedly fit a logistic regression and
# discard the weakest feature until only 3 remain.
# NOTE(review): RFE ranks features from the fitted estimator's coefficients,
# which depend on each column's scale; X is used as loaded here (no scaling
# step) - confirm that is intended.
model = LogisticRegression(solver="liblinear")
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, y)

print(f"Num Features: {fit.n_features_}")
print(f"Selected Features: {fit.support_}")
print(f"Feature Ranking: {fit.ranking_}")

# Same information as a table, one row per feature, best rank first
rfe_columns = {
    "Feature": names[:8],
    "RFE_Selected": fit.support_,
    "RFE_Ranking": fit.ranking_,
}
rfe_results = pd.DataFrame(rfe_columns)
print(rfe_results.sort_values("RFE_Ranking"))

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]
  Feature  RFE_Selected  RFE_Ranking
0    preg          True            1
5    mass          True            1
6    pedi          True            1
1    plas         False            2
2    pres         False            3
7     age         False            4
3    skin         False            5
4    test         False            6


### Principal Component Analysis

In [None]:
# Feature extraction with PCA: fit the first three principal components of X
# NOTE(review): PCA is fit on X as loaded (no scaling step in this cell), so
# the loadings reflect each column's raw variance; consider standardizing the
# features first - confirm intent.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Numpy print precision for any array output from here on
np.set_printoptions(precision=8)

# Share of the total variance captured by each component
print("Explained Variance Ratio:")
for pc_number, ratio in enumerate(fit.explained_variance_ratio_, start=1):
    print(f"  PC{pc_number}: {ratio:.8f}")

# Loadings table: one row per component, one column per original feature
pc_labels = [f"PC{k}" for k in range(1, fit.n_components_ + 1)]
loadings = pd.DataFrame(fit.components_, index=pc_labels, columns=names[:8])

print("\nPCA Component Loadings:")
print(loadings)

# For each component, list the three features with the largest absolute
# loading, keeping the sign of the loading in the printed value
for pc_number, component_loadings in enumerate(fit.components_, start=1):
    strongest_first = np.argsort(np.abs(component_loadings))[::-1]
    print(f"\nTop features for PC{pc_number}:")
    for idx in strongest_first[:3]:
        print(f"  {names[idx]}: {component_loadings[idx]:.8f}")


Explained Variance Ratio:
  PC1: 0.88854663
  PC2: 0.06159078
  PC3: 0.02579012

PCA Component Loadings:
         preg      plas      pres      skin      test      mass      pedi  \
PC1 -0.002022  0.097812  0.016093  0.060757  0.993111  0.014011  0.000537   
PC2  0.022649  0.972210  0.141909 -0.057861 -0.094627  0.046973  0.000817   
PC3  0.022465 -0.143429  0.922467  0.307013 -0.020977  0.132445  0.000640   

          age  
PC1 -0.003565  
PC2  0.140168  
PC3  0.125454  

Top features for PC1:
  test: 0.99311084
  plas: 0.09781158
  skin: 0.06075669

Top features for PC2:
  plas: 0.97221004
  pres: 0.14190933
  age: 0.14016818

Top features for PC3:
  pres: 0.92246719
  skin: 0.30701306
  plas: -0.14342871


### Feature Importance

In [None]:
# Feature importance using an Extra Trees classifier.
# The forest is randomized, so random_state is pinned to make the importances
# (and therefore the printed ranking) reproducible across kernel restarts.
model = ExtraTreesClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Pair each importance with its feature name. `names` also holds the target
# label "class", so slice to the eight predictors explicitly instead of
# relying on zip() silently dropping the unmatched last name.
ranked_importances = sorted(
    zip(names[0:8], model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

# Most important feature first; name left-aligned in a 10-character column
for name, importance in ranked_importances:
    print(f"{name:10s} {importance:.6f}")

plas       0.230584
age        0.142884
mass       0.138659
pedi       0.120590
preg       0.112262
pres       0.096672
skin       0.080130
test       0.078218
