In [None]:
# Load required libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression


In [None]:
# Read the Pima Indians diabetes CSV; column labels are supplied through
# `names` (presumably the file has no header row -- verify against the data).
names = ["preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"]
filename = "data/pima-indians-diabetes.data.csv"
data = pd.read_csv(filename, names=names)

# Split the raw value matrix: columns 0-7 form the feature matrix X and
# column 8 (labelled "class") is the target vector y.
array = data.values
X, y = array[:, :8], array[:, 8]

### Univariate Selection

In [3]:
# Univariate selection: score every column of X against y with f_classif
# and keep the four highest-scoring features.
test = SelectKBest(
    k=4,
    score_func=f_classif,
)
fit = test.fit(X, y)

In [5]:
# Reduce X to the columns the fitted selector kept; a later cell previews them.
features = fit.transform(X)

# Show the per-feature scores from the selector, rounded to 3 decimals.
# Note: set_printoptions changes NumPy's print precision globally, so it also
# affects arrays printed by cells that run after this one.
np.set_printoptions(precision=3)
print(fit.scores_)

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]


In [None]:
# Preview the first five rows of the reduced feature matrix
print(features[:5])

[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 [  1.   89.   28.1  21. ]
 [  0.  137.   43.1  33. ]]


### Recursive Feature Elimination

In [None]:
# Recursive Feature Elimination wrapped around a logistic regression,
# pruning down to the three features it retains.
# NOTE(review): RFE ranks features using the fitted estimator's coefficients,
# and X is passed in unscaled here -- confirm that scale-dependent rankings
# are acceptable for this analysis.
model = LogisticRegression(solver="liblinear")
rfe = RFE(
    estimator=model,
    n_features_to_select=3,
)
fit = rfe.fit(X, y)

# Report how many features were kept, the boolean keep-mask, and the
# elimination rank of every feature (rank 1 marks a selected feature).
print(
    f"Num Features: {fit.n_features_}",
    f"Selected Features: {fit.support_}",
    f"Feature Ranking: {fit.ranking_}",
    sep="\n",
)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]


### Principal Component Analysis

In [13]:
# Print arrays with 8 significant decimals from here on (an earlier cell
# lowered the global NumPy print precision to 3).
np.set_printoptions(precision=8)

# Fit a 3-component PCA on the feature matrix.
# NOTE(review): X is used as loaded, without standardization, so components
# will favour high-variance columns -- confirm this is intended.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Report each component's share of the total variance, then the loading
# vectors (one row per component, one column per input feature).
print(f"Explained Variance: {fit.explained_variance_ratio_}")
print(f"Components: \n{fit.components_}")


Explained Variance: [0.88854663 0.06159078 0.02579012]
Components: 
[[-2.02176587e-03  9.78115765e-02  1.60930503e-02  6.07566861e-02
   9.93110844e-01  1.40108085e-02  5.37167919e-04 -3.56474430e-03]
 [ 2.26488861e-02  9.72210040e-01  1.41909330e-01 -5.78614699e-02
  -9.46266913e-02  4.69729766e-02  8.16804621e-04  1.40168181e-01]
 [ 2.24649003e-02 -1.43428710e-01  9.22467192e-01  3.07013055e-01
  -2.09773019e-02  1.32444542e-01  6.39983017e-04  1.25454310e-01]]


### Feature Importance

In [None]:
# Feature importance from an Extra Trees ensemble.
# The estimator is randomized, so it is seeded with a fixed random_state;
# without a seed the importances (and possibly their ordering) change on
# every run, which breaks Restart & Run All reproducibility.
model = ExtraTreesClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# `names` also holds the target label ("class") as its last entry. Drop it so
# each importance is paired with a predictor column explicitly, instead of
# relying on zip() silently truncating the longer list.
feature_names = names[:-1]

# Print predictors from most to least important.
for name, importance in sorted(
    zip(feature_names, model.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
):
    print(f"{name:10s} {importance:.6f}")

plas       0.230584
age        0.142884
mass       0.138659
pedi       0.120590
preg       0.112262
pres       0.096672
skin       0.080130
test       0.078218
