# 特徴選択について（勉強）

Reference: https://machinelearningmastery.com/feature-selection-machine-learning-python/

* Having irrelevant features in your data can decrease the accuracy of many models, especially linear algorithms like linear and logistic regression.

* Reduces Overfitting: Less redundant data means less opportunity to make decisions based on noise.
* Improves Accuracy: Less misleading data means modeling accuracy improves.
* Reduces Training Time: Less data means that algorithms train faster.


In [1]:
# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# Load the Pima Indians diabetes table straight from the remote CSV.
# NOTE(review): this cell needs network access; confirm the URL still resolves
# before relying on Restart & Run All.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
# Column labels are supplied here rather than read from the file;
# the last column, 'class', is used as the target further down.
names = [
    'preg', 'plas', 'pres', 'skin',
    'test', 'mass', 'pedi', 'age',
    'class',
]
# header=None is what pandas already infers when `names` is given; spelled out for clarity.
dataframe = pandas.read_csv(url, header=None, names=names)
# Plain NumPy view of the table, used for positional slicing into X / Y.
array = dataframe.values

In [2]:
# Preview the first rows to check that the supplied column names line up with the data.
dataframe.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Show the frame's underlying NumPy array; the printed form is abbreviated with "...".
dataframe.values

array([[   6.   ,  148.   ,   72.   , ...,    0.627,   50.   ,    1.   ],
       [   1.   ,   85.   ,   66.   , ...,    0.351,   31.   ,    0.   ],
       [   8.   ,  183.   ,   64.   , ...,    0.672,   32.   ,    1.   ],
       ..., 
       [   5.   ,  121.   ,   72.   , ...,    0.245,   30.   ,    0.   ],
       [   1.   ,  126.   ,   60.   , ...,    0.349,   47.   ,    1.   ],
       [   1.   ,   93.   ,   70.   , ...,    0.315,   23.   ,    0.   ]])

In [4]:
# Separate the table into model inputs and target.
array = dataframe.values
# Columns 0-7 are the eight input features; column 8 ('class') is the label.
X, Y = array[:, :8], array[:, 8]

In [5]:
"""
The scikit-learn library provides the SelectKBest class that 
can be used with a suite of different statistical tests to
select a specific number of features.
"""

# Background reading on SelectKBest:
# http://hayataka2049.hatenablog.jp/entry/2016/12/28/053956
# chi2 is the chi-squared test.

# Feature extraction.
# Written this way, the selector apparently keeps the 4 top-scoring features.
test = SelectKBest(chi2, k=4)
# The result of fit() is kept under the name `fit`; later cells read
# fit.scores_ and call fit.transform().
fit = test.fit(X, Y)

In [6]:
# Print floats with three decimals. This is a global NumPy setting, so it also
# affects array printing in later cells.
numpy.set_printoptions(precision=3)
# Chi-squared score for each input column of X, in column order.
print(fit.scores_)

[  111.52   1411.887    17.605    53.108  2175.565   127.669     5.393
   181.304]


In [7]:
# Reduce X to the four top-scoring features picked by the fitted selector.
features = fit.transform(X)

# Summarize the selected features: show the first five rows.
print(features[:5])

[[ 148.     0.    33.6   50. ]
 [  85.     0.    26.6   31. ]
 [ 183.     0.    23.3   32. ]
 [  89.    94.    28.1   21. ]
 [ 137.   168.    43.1   33. ]]


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import roc_curve,auc


In [9]:
# Comparison baseline: logistic regression trained on all eight input features.
# NOTE(review): LogisticRegression() relies on library defaults, so the recorded
# numbers may not reproduce across scikit-learn versions - confirm when re-running.

# 70/30 hold-out split with a fixed seed so the run is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=3
)

model = LogisticRegression()
model.fit(X_train, Y_train)

# Hard class predictions, compared against the held-out labels.
expected = Y_test
predicted = model.predict(X_test)
accuracy = metrics.accuracy_score(expected, predicted)
print("正解率 : {}".format(accuracy))

# Probability of the positive class (second column) drives the ROC curve.
probas_ = model.predict_proba(X_test)
positive_scores = probas_[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, positive_scores)
roc_auc_area = auc(fpr, tpr)
print("AUC : {}".format(roc_auc_area))


正解率 : 0.7445887445887446
AUC : 0.8274512812643854


In [10]:
# Logistic regression on the k=4 best chi-squared features, for comparison with
# the all-features model.
#
# The selector is fitted on the training rows only. Scoring features on the full
# data set (held-out rows and their labels included) before splitting leaks test
# information into the feature choice and biases the accuracy/AUC comparison.
# As a result the numbers printed here can differ from a run that selected
# features on all rows.

# Same split parameters as the all-features cell, applied to the raw inputs.
X_train_all, X_test_all, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=3
)

# Choose the features from the training portion, then apply that same choice
# to both portions.
selector = SelectKBest(score_func=chi2, k=4).fit(X_train_all, Y_train)
X_train = selector.transform(X_train_all)
X_test = selector.transform(X_test_all)

model = LogisticRegression()
model.fit(X_train, Y_train)

# Hard class predictions versus the held-out labels.
predicted = model.predict(X_test)
expected = Y_test

print("正解率 : {}".format(metrics.accuracy_score(expected, predicted)))

# Probability of the positive class (second column) feeds the ROC curve.
probas_ = model.predict_proba(X_test)

fpr, tpr, thresholds = roc_curve(Y_test, probas_[:, 1])
roc_auc_area = auc(fpr, tpr)

print("AUC : {}".format(roc_auc_area))


正解率 : 0.70995670995671
AUC : 0.8029768298296763
