In [3]:
from numpy import set_printoptions
from sklearn.preprocessing import Binarizer
from pandas import read_csv
# Pima Indians diabetes CSV; column names are passed explicitly via `names`.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(url, names=names)
array = dataframe.values

# Split the columns: the first eight are the inputs, the ninth is the output.
X, Y = array[:, :8], array[:, 8]

# Fit a Binarizer with a threshold of 0.0 on the inputs, then apply it.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(X)
binaryX = binarizer.transform(X)

# Summarize the transformed data by showing its first five rows.
set_printoptions(precision=4)
print(binaryX[:5, :])

[[1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 1. 0. 1. 1. 1.]
 [1. 1. 1. 0. 0. 1. 1. 1.]
 [1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 1. 1. 1. 1. 1. 1. 1.]]


# CH 8: Feature Selection
## 3 Reasons For Feature Selection
1. Reduces Overfitting - less redundant data means the model is less likely to make decisions based on noise
2. Improves Accuracy - less noisy data means the data is closer to an accurate depiction
3. Reduces Resource Usage - less data means the algorithm can run faster


In [4]:
#8.2 Univariate Selection Feature Selection Tests 
from pandas import read_csv
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
# Pima Indians diabetes CSV; column names are passed explicitly via `names`.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(url, names=names)
array = dataframe.values

# Split the columns: the first eight are the inputs, the ninth is the output.
X, Y = array[:, :8], array[:, 8]

# Feature extraction: score every input column with f_classif and keep the
# four best-scoring ones.
test = SelectKBest(score_func=f_classif, k=4)
fit = test.fit(X, Y)

# Summarize the per-column scores.
set_printoptions(precision=3)
print(fit.scores_)

# Summarize the selected features by showing their first five rows.
features = fit.transform(X)
print(features[:5, :])

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]
[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 [  1.   89.   28.1  21. ]
 [  0.  137.   43.1  33. ]]


# CH 8.3 Recursive Feature Elimination

### To find the top features, we will use Recursive Feature Elimination (RFE) to find the combination of attributes that contributes the most to predicting the target attribute.

### The example below uses RFE with the logistic regression algorithm to select the top 3 features. It returns a True/False array marking which columns were selected, along with a ranking of every column.

In [8]:
#8.3 Recursive Feature Elimination (RFE) with Logistic Regression
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# Load data: Pima Indians diabetes CSV; column names are passed explicitly.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# Separate the array into input (first 8 columns) and output (9th column) components.
X = array[:, 0:8]
Y = array[:, 8]
# Feature extraction: wrap a logistic regression model in RFE and keep 3 features.
model = LogisticRegression(solver='liblinear')
# n_features_to_select is passed by keyword: recent scikit-learn releases
# accept it only as a keyword argument, so `RFE(model, 3)` raises a TypeError
# there. The keyword form also works on older releases.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
# Report how many features were kept, the boolean mask of kept columns, and
# the rank assigned to every column (selected columns have rank 1).
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

Num Features: 3
Selected Features: [ True False False False False  True  True False]
Feature Ranking: [1 2 3 5 6 1 1 4]




# CH 8.4 Principal Component Analysis
### PCA uses linear algebra to do data reduction by transforming the dataset into a compressed form. PCA allows you to pick the number of dimensions (principal components) in the transformed result. Today we are using 3.

In [10]:
#8.4 PCA: use linear algebra to do data reduction and keep the top 3 principal components
from pandas import read_csv
from sklearn.decomposition import PCA

# Load data: Pima Indians diabetes CSV; column names are passed explicitly.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
names = [
    'preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class',
]
dataframe = read_csv(url, names=names)
array = dataframe.values

# Split the columns: the first eight are the inputs, the ninth is the output.
X, Y = array[:, :8], array[:, 8]

# Feature extraction: fit a 3-component PCA on the inputs only.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Summarize components: the explained-variance ratio of each component,
# followed by the component matrix itself.
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

Explained Variance: [0.88854663 0.06159078 0.02579012]
[[-2.02176587e-03  9.78115765e-02  1.60930503e-02  6.07566861e-02
   9.93110844e-01  1.40108085e-02  5.37167919e-04 -3.56474430e-03]
 [-2.26488861e-02 -9.72210040e-01 -1.41909330e-01  5.78614699e-02
   9.46266913e-02 -4.69729766e-02 -8.16804621e-04 -1.40168181e-01]
 [-2.24649003e-02  1.43428710e-01 -9.22467192e-01 -3.07013055e-01
   2.09773019e-02 -1.32444542e-01 -6.39983017e-04 -1.25454310e-01]]


# 8.5 Feature Importance
### We will use an Extra Trees (decision-tree ensemble) classifier on our diabetes dataset to score feature importance. Remember: the larger a feature's score, the more important it is. Below it will be clear that plas, age, and mass are key.

In [12]:
#8.5 Feature importance using the Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

# Load data: Pima Indians diabetes CSV; column names are passed explicitly.
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv'
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
# Separate the array into input (first 8 columns) and output (9th column) components.
X = array[:, 0:8]
Y = array[:, 8]
# Feature extraction: fit an ensemble of 100 extra-trees and read off the
# per-column importance scores.
# random_state is fixed so the importances are identical on every
# Restart & Run All; without it the scores change from run to run.
model = ExtraTreesClassifier(n_estimators=100, random_state=7)
model.fit(X, Y)
print(model.feature_importances_)

[0.11142076 0.23352594 0.0994475  0.07891256 0.07571193 0.13813235
 0.12051233 0.14233662]


# 8.6 Summary on Feature building
1. Univariate Selection - Data scientists can use SelectKBest & f_classif to select which attributes are most important. Today we found the top 4: preg, plas, mass, age

2. Recursive Feature Elimination - Data scientists can use RFE to remove unnecessary attributes and build a model on those that remain. Combine RFE with logistic regression to get the top 3 features

3. Principal Component Analysis - Data science folks often need to take a dataset and do some data reduction. PCA lets you choose the number of dimensions (principal components) to transform the data into

4. Feature Importance - Data science folks can use ExtraTreesClassifier to find which attributes are most important. The bigger the score, the more likely the attribute is important