### 1. Importing Libraries

In [1]:
# Standard library
import os

# Third-party: CSV loading, array print formatting, univariate feature selection.
# (The module is `sklearn.feature_selection`; the previous spelling
# `fseature_selection` raised ModuleNotFoundError on a fresh kernel.)
from numpy import set_printoptions
from pandas import read_csv
from sklearn.feature_selection import SelectKBest, f_classif

### 2. Loading the dataset

In [4]:
# Load the diabetes CSV located in the current working directory.
# os.path.join uses the platform's path separator instead of a hard-coded '/'.
filename = os.path.join(os.getcwd(), 'datasets_228_482_diabetes.csv')
# Column labels are supplied explicitly, so read_csv treats every row of the file as data.
# NOTE(review): if the CSV ships with its own header row, that row would be read
# as a data record — confirm the file is header-less.
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(filename, names=names)
array = dataframe.values
# Columns 0-7 are the candidate features; column 8 ('class') is the target.
X = array[:,0:8]
Y = array[:,8]

### 3. Univariate Feature selection with SelectKBest

In [5]:
# Univariate selection: score each column of X against Y with f_classif and
# keep the four highest-scoring columns. Another score function (e.g. chi-squared)
# can be passed instead of f_classif.
test = SelectKBest(
    f_classif,
    k=4,
)
fit = test.fit(X, Y)

In [6]:
# Print the per-feature scores with three-decimal precision and reduce X to the
# columns chosen by the fitted selector.
set_printoptions(precision=3)
features = fit.transform(X)
print(fit.scores_)

[ 39.67  213.162   3.257   4.304  13.281  71.772  23.871  46.141]


In [7]:
# Preview the first five rows of the reduced feature matrix.
print(features[:5])

[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 [  1.   89.   28.1  21. ]
 [  0.  137.   43.1  33. ]]


### 4. Feature selection using Recursive Feature Elimination (RFE)

In [8]:
# RFE is the feature-elimination wrapper; LogisticRegression is the estimator it wraps below.
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import warnings
# NOTE(review): this suppresses every warning from here on, which would also hide
# any solver convergence warning raised by the model fit that follows — confirm
# that blanket suppression is intended rather than a targeted filter.
warnings.filterwarnings('ignore')

In [11]:
# Recursive feature elimination around a logistic-regression estimator.
# No n_features_to_select is given, so RFE uses its default feature count.
model = LogisticRegression(solver='lbfgs')
rfe = RFE(estimator=model)
fit = rfe.fit(X, Y)

# Report how many features were kept, the boolean keep-mask, and the ranking.
for label, value in (
    ('Num of Features:', fit.n_features_),
    ('Selected Features: ', fit.support_),
    ('Feature Ranking: ', fit.ranking_),
):
    print(label, value)

Num of Features: 4
Selected Features:  [ True  True False False False  True  True False]
Feature Ranking:  [1 1 3 4 5 1 1 2]


In [12]:
# Show the column labels: the first eight correspond, in order, to the columns of X;
# the last one ('class') is the target column.
print(names)

['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']


### 5. Feature importance using an ensemble of randomized decision trees (Extra Trees)

In [13]:
from sklearn.ensemble import ExtraTreesClassifier

In [14]:
# Fit an Extra Trees ensemble of 10 trees and print the importance it assigns to
# each feature (one value per column of X).
# random_state is pinned so the importances are reproducible on Restart & Run All;
# without it every run draws different random splits and prints different numbers.
model = ExtraTreesClassifier(n_estimators=10, random_state=42)
model.fit(X, Y)
print('Feature importances:',model.feature_importances_)

Feature importances: [0.113 0.25  0.107 0.076 0.066 0.136 0.113 0.139]


### 6. Reducing dimensionality using PCA

In [15]:
from sklearn.decomposition import PCA

In [17]:
# Fit PCA on the eight raw feature columns and keep the three leading components.
# NOTE(review): X is not standardized anywhere above, so large-valued columns
# can dominate the components — confirm this is intended.
pca = PCA(n_components=3)
fit = pca.fit(X)

# Share of total variance captured by each component, followed by the component
# vectors (one row per component, one column per original feature).
print('Explained Variance: ',fit.explained_variance_ratio_)
print(fit.components_)

Exaplained Variance:  [0.889 0.062 0.026]
[[-2.022e-03  9.781e-02  1.609e-02  6.076e-02  9.931e-01  1.401e-02
   5.372e-04 -3.565e-03]
 [-2.265e-02 -9.722e-01 -1.419e-01  5.786e-02  9.463e-02 -4.697e-02
  -8.168e-04 -1.402e-01]
 [-2.246e-02  1.434e-01 -9.225e-01 -3.070e-01  2.098e-02 -1.324e-01
  -6.400e-04 -1.255e-01]]


### 7. Implementing LASSO regression (uses an L1 penalty)

In [18]:
import matplotlib.pyplot as plt
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [19]:
# Grid-search the Lasso penalty strength (alpha) with 5-fold cross-validation,
# scored by negated mean squared error (values closer to 0 are better).
parameters = {'alpha': [0.01, 0.001, 0.0001]}
lasso_regressor = GridSearchCV(
    estimator=Lasso(),
    param_grid=parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_regressor.fit(X, Y)

# Best alpha found, then its mean cross-validated score.
for summary in (lasso_regressor.best_params_, lasso_regressor.best_score_):
    print(summary)

{'alpha': 0.0001}
-0.16252660359314186
