In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer
from sklearn.feature_selection import SelectKBest, chi2, RFE
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier

In [14]:
# Load the diabetes CSV. A forward slash is used as the separator because the
# backslash form produced the invalid escape sequence "\d" and only resolved
# on Windows; "/" is accepted on every platform.
pima_df = pd.read_csv("raw_data/diabetes.csv")

In [15]:
# Separate the target column from the predictors:
#   y -> the 'Outcome' column
#   X -> every remaining column
y = pima_df['Outcome']
X = pima_df.drop(columns='Outcome')

In [16]:
# Min-max scaling: learn each column's range, then map it onto [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
rescaled_X = scaler.transform(X)

In [17]:
# Preview the first five rescaled rows. The precision is scoped to this print
# so the display does not depend on print options set by some other cell
# (the recorded output uses three decimals, but nothing above configures it).
with np.printoptions(precision=3):
    print(rescaled_X[:5])

[[0.353 0.744 0.59  0.354 0.    0.501 0.234 0.483]
 [0.059 0.427 0.541 0.293 0.    0.396 0.117 0.167]
 [0.471 0.92  0.525 0.    0.    0.347 0.254 0.183]
 [0.059 0.447 0.541 0.232 0.111 0.419 0.038 0.   ]
 [0.    0.688 0.328 0.354 0.199 0.642 0.944 0.2  ]]


In [18]:
# Standardization: centre each feature on 0 and scale it to unit standard deviation.
scaler = StandardScaler()
rescaled_X = scaler.fit_transform(X)

# Show the first five transformed rows with two decimals.
# NOTE: set_printoptions changes NumPy's global print state for later cells too.
np.set_printoptions(precision=2)
print(rescaled_X[:5])

[[ 0.64  0.85  0.15  0.91 -0.69  0.2   0.47  1.43]
 [-0.84 -1.12 -0.16  0.53 -0.69 -0.68 -0.37 -0.19]
 [ 1.23  1.94 -0.26 -1.29 -0.69 -1.1   0.6  -0.11]
 [-0.84 -1.   -0.16  0.15  0.12 -0.49 -0.92 -1.04]
 [-1.14  0.5  -1.5   0.91  0.77  1.41  5.48 -0.02]]


In [19]:
# Normalization: rescale every sample (row) so that it has length 1.
scaler = Normalizer()
normalized_X = scaler.fit_transform(X)

# Show the first five normalized rows with two decimals.
np.set_printoptions(precision=2)
print(normalized_X[:5])

[[0.03 0.83 0.4  0.2  0.   0.19 0.   0.28]
 [0.01 0.72 0.56 0.24 0.   0.22 0.   0.26]
 [0.04 0.92 0.32 0.   0.   0.12 0.   0.16]
 [0.01 0.59 0.44 0.15 0.62 0.19 0.   0.14]
 [0.   0.6  0.17 0.15 0.73 0.19 0.01 0.14]]


In [20]:
# Binarization: values above the 0.0 threshold become 1, all others become 0.
binarizer = Binarizer(threshold=0.0)
binary_X = binarizer.fit_transform(X)

# Show the first five binarized rows.
np.set_printoptions(precision=2)
print(binary_X[:5])

[[1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 1. 1. 1. 1. 1.]]


### Univariate Selection

In [21]:
# Univariate selection: score every feature against the target with the
# chi-squared statistic and keep the four best-scoring ones.
test = SelectKBest(chi2, k=4)
fit = test.fit(X, y)

In [22]:
# Print the per-feature chi-squared scores (three decimals) and reduce X to
# the selected columns for the next cell.
np.set_printoptions(precision=3)
features = fit.transform(X)
print(fit.scores_)

[ 111.52  1411.887   17.605   53.108 2175.565  127.669    5.393  181.304]


In [23]:
# First five rows of the reduced feature matrix.
print(features[:5])

[[148.    0.   33.6  50. ]
 [ 85.    0.   26.6  31. ]
 [183.    0.   23.3  32. ]
 [ 89.   94.   28.1  21. ]
 [137.  168.   43.1  33. ]]


### Feature Selection with Recursive Feature Elimination (RFE)

In [12]:
# Recursive feature elimination: wrap a logistic regression in RFE and keep
# the three features it ranks highest. The cell was previously commented out
# even though its output is recorded below, so a fresh Restart & Run All could
# not reproduce that output; the code is restored here.
model = LogisticRegression(solver='lbfgs', max_iter=1000)
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, y)

# Report how many features were kept, the boolean mask of kept columns, and
# the rank assigned to every column (1 = selected).
print(f"Num Features: {fit.n_features_}")
print(f"Selected Features: {fit.support_}")
print(f"Feature Ranking: {fit.ranking_}")

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 4 6 5 1 1 3]


In [26]:
# Principal component analysis: fit a three-component decomposition of X.
pca = PCA(n_components=3)
pca.fit(X)
fit = pca  # same fitted estimator; later cells read it through `fit`

In [28]:
# Report the share of variance captured by each component, then the
# component vectors themselves (one row per component).
explained = fit.explained_variance_ratio_
print(f"Explained Variance: {explained}")
print(fit.components_)

Explained Variance: [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


### Feature Importance

In [31]:
# Feature importance from an extra-trees ensemble.
# The estimator builds randomized trees, so without a seed the printed
# importances change on every run; a fixed random_state makes them reproducible.
model = ExtraTreesClassifier(random_state=42)
model.fit(X, y)
print(model.feature_importances_)

[0.109 0.237 0.099 0.077 0.077 0.141 0.115 0.145]
