# 1. 导入包

In [1]:
# -*- coding: utf-8 -*-
"""
Python 3.7.7
sklearn 0.23.1
使用 过滤法 对糖尿病数据集降维
"""

# 导入包
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# 2. 导入数据集

In [2]:
# Load the Pima Indians diabetes data from a CSV file in the working directory.
dataset = pd.read_csv('pima-indians-diabetes.csv')

# 3. 检测缺失值

In [3]:
# Count the missing entries in every column; the resulting Series is
# displayed as the cell output.
null_df = dataset.isna().sum()
null_df

preg     0
plas     0
pres     0
skin     0
test     0
mass     0
pedi     0
age      0
class    0
dtype: int64

# 4. 生成自变量和因变量

In [4]:
# Split the frame into NumPy arrays: the first eight columns are the
# predictors (X) and the column at position 8 is the target (y).
X = dataset.iloc[:, :8].values
y = dataset.iloc[:, 8].values

# 5. 使用不同的统计指标做特征选择

## 5.1 使用 ANOVA F-value 指标

In [5]:
# Filter-style feature selection scored with the ANOVA F-value.
# Only the 4 best-scoring features are kept.
test = SelectKBest(f_classif, k=4)
fit = test.fit(X, y)

In [6]:
# Print the p-value computed for each feature by the fitted ANOVA F-value selector.
print(fit.pvalues_) # the four smallest p-values belong to columns 0 (preg), 1 (plas), 5 (mass), and 7 (age)

[5.06512730e-10 8.93543165e-43 7.15139001e-02 3.83477048e-02
 2.86186460e-04 1.22980749e-16 1.25460701e-06 2.20997546e-11]


#### 找 pvalues_ 值小的字段。前4个字段为 0 (preg), 1 (plas), 5 (mass), and 7 (age)

In [7]:
# Print the ANOVA F-value score of each feature (a higher score pairs with a smaller p-value).
print(fit.scores_)

[ 39.67022739 213.16175218   3.2569504    4.30438091  13.28110753
  71.7720721   23.8713002   46.14061124]


In [8]:
# Reduce X to the k=4 columns chosen by the fitted ANOVA F-value selector.
features = fit.transform(X)

In [9]:
# Show the reduced feature matrix (NumPy abbreviates the middle rows of a large array).
print(features)

[[  6.  148.   33.6  50. ]
 [  1.   85.   26.6  31. ]
 [  8.  183.   23.3  32. ]
 ...
 [  5.  121.   26.2  30. ]
 [  1.  126.   30.1  47. ]
 [  1.   93.   30.4  23. ]]


#### features 存储着特征选择后的特征。后面构建模型时，自变量使用features，而不是X。注意：5.2 节会用卡方检验的结果覆盖 features，因此 6.2 节建模用到的是卡方检验选出的特征。

## 5.2 使用 卡方检验 指标

In [10]:
# Filter-style feature selection scored with the chi-squared statistic,
# again keeping the 4 best-scoring features. This rebinds `test` and `fit`.
test = SelectKBest(chi2, k=4)
fit = test.fit(X, y)

In [11]:
# Print the p-value computed for each feature by the fitted chi-squared selector.
print(fit.pvalues_) # the four smallest p-values belong to columns 1 (plas), 4 (test), 5 (mass), and 7 (age)

[4.55261043e-026 5.48728628e-309 2.71819252e-005 3.15697650e-013
 0.00000000e+000 1.32590849e-029 2.02213728e-002 2.51638830e-041]


#### 找 pvalues_ 值小的字段。前4个字段为 1 (plas), 4 (test), 5 (mass), and 7 (age)。这和用 ANOVA F-value 指标选取的字段不完全一致：卡方检验选中了 4 (test)，而 ANOVA F-value 选中的是 0 (preg)，其余三个字段相同。

In [12]:
# Print the chi-squared score of each feature (a higher score pairs with a smaller p-value).
print(fit.scores_)

[ 111.51969064 1411.88704064   17.60537322   53.10803984 2175.56527292
  127.66934333    5.39268155  181.30368904]


In [13]:
# Reduce X to the k=4 columns chosen by the fitted chi-squared selector.
# This rebinds `features`, replacing the ANOVA-based selection made earlier.
features = fit.transform(X)

# 6. 构建逻辑回归模型

## 6.1 使用原始数据构建

In [14]:
# Baseline: logistic regression trained on all eight original features.

# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Learn the scaling statistics from the training rows only, then apply the
# same scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

# L2-penalised logistic regression with balanced class weights.
classifier = LogisticRegression(
    penalty='l2', C=1, class_weight='balanced', random_state=0
).fit(X_train, y_train)

# Predict the held-out rows and report plain accuracy.
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.78125


## 6.2 使用特征选择后的数据构建

In [15]:
# Logistic regression trained on the 4 features chosen by the chi-squared filter.
#
# The data is split BEFORE feature selection and the selector is fitted on the
# training rows only. Selecting features on the full data set would let the
# held-out labels influence which columns are kept (data leakage) and would
# make this cell depend on whichever earlier cell last assigned `features`.
# Same test_size and seed as the baseline cell, so the comparison stays
# like-for-like.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

# Pick the 4 best columns using training data only, then keep those same
# columns in the held-out rows.
# NOTE: the columns chosen here can differ from those chosen on the full data,
# so the printed accuracy may differ from a previously recorded output.
selector = SelectKBest(score_func=chi2, k=4)
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# Feature scaling: statistics come from the training rows only.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Same model configuration as the baseline so only the feature set differs.
classifier = LogisticRegression(penalty='l2', C=1, class_weight='balanced', random_state=0)
classifier.fit(X_train, y_train)

# Predict the held-out rows.
y_pred = classifier.predict(X_test)

# Report plain accuracy on the held-out rows.
print(accuracy_score(y_test, y_pred))

0.734375


#### 降维不一定会带来模型性能的提升。