参考资料：https://www.cnblogs.com/jasonfreak/p/5448385.html

In [1]:
import pandas as pd

## 读取数据

1) csv 数据读取格式：

pd.read_csv("xxx.csv")

pd.read_csv("xxx.csv", encoding='gb18030')

pd.read_csv("xxx.csv", encoding='utf-8')

如果默认的 C 解析引擎读取失败（例如文件路径中包含中文），可添加参数 engine='python'

2) xlsx读取格式：

pd.read_excel("xxx.xlsx")

In [2]:
# Load the telecom customer-churn workbook from the working directory and
# preview the first rows as a quick sanity check.
workbook_path = '电信客户流失.xlsx'
df = pd.read_excel(workbook_path)
df.head()

Unnamed: 0,地区,用时,年龄,婚姻,住址,收入,学历,工龄,退休,性别,...,三方通话,手机支付,长途日志,免费服务日志,设备日志,电话卡日志,无线日志,收入日志,类型,流失
0,2,13,44,1,9,64,4,5,0,0,...,0,0,1.308333,,,2.014903,,4.158883,1,1
1,3,11,33,1,7,136,5,5,0,0,...,1,0,1.481605,3.032546,,2.72458,3.575151,4.912655,4,1
2,3,68,52,1,24,116,1,29,0,1,...,1,0,2.898671,2.890372,,3.409496,,4.75359,3,0
3,2,33,33,0,12,33,2,0,0,1,...,0,0,2.246015,,,,,3.496508,1,1
4,2,23,30,1,9,30,1,2,0,0,...,1,0,1.84055,,,,,3.401197,3,0


In [3]:
# List every column name so the feature/target choices in later cells can be
# checked against what the workbook actually contains.
df.columns

Index(['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别', '人口', '免费',
       '设备', '电话卡', '无线', '长途_近期', '免费_近期', '设备_近期', '电话卡_近期', '无线_近期',
       '长途_长期', '免费_长期', '设备_长期', '电话卡_长期', '无线_长期', '多线', '语音', '传真', '互联网',
       '来电显示', '来电等待', '回拨', '三方通话', '手机支付', '长途日志', '免费服务日志', '设备日志', '电话卡日志',
       '无线日志', '收入日志', '类型', '流失'],
      dtype='object')

## 过滤法

### 方差选择
使用方差选择法，先要计算各个特征的方差，然后根据阈值，选择方差大于阈值的特征。

只需修改参数threshold, 将筛选出方差大于threshold的变量

In [4]:
# Candidate features for the variance filter.
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]

# Variance cut-off: only features whose variance is above this value are kept.
threshold = 2

In [5]:
from sklearn.feature_selection import VarianceThreshold

# Fit the variance filter and keep its boolean support mask, then map the
# mask back to column names. Variance depends on each feature's scale, so the
# threshold is only comparable across features measured in similar units.
model = VarianceThreshold(threshold=threshold).fit(X)
kept_mask = model.get_support()
X.columns[kept_mask].tolist()

['用时', '年龄', '住址', '收入', '工龄']

### 卡方选择
Chi-squared stats of non-negative features for classification tasks.

只需修改参数k, 表示保留的变量个数

In [6]:
# Candidate features and the categorical target for the chi-squared filter.
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['类型']

# Number of features to keep.
k = 4

In [7]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Keep the k features with the highest chi-squared statistic against y.
# chi2 is only defined for non-negative feature values.
# `k` must be passed by keyword: it is keyword-only in current scikit-learn,
# so SelectKBest(chi2, k) raises a TypeError there.
model = SelectKBest(score_func=chi2, k=k)
model.fit(X, y)
list(X.columns[model.get_support().tolist()])



['用时', '住址', '收入', '工龄']

### 方差分析选择
只需修改参数k, 表示保留的变量个数

ANOVA F-value between label/feature for classification tasks.

In [8]:
# Candidate features and the categorical target for the ANOVA F-test filter.
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['类型']

# Number of features to keep.
k = 4

In [9]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Keep the k features with the highest ANOVA F-value against the class label.
# `k` must be passed by keyword: it is keyword-only in current scikit-learn,
# so SelectKBest(f_classif, k) raises a TypeError there.
model = SelectKBest(score_func=f_classif, k=k)
model.fit(X, y)
list(X.columns[model.get_support().tolist()])



['用时', '住址', '学历', '工龄']

F-value between label/feature for regression tasks.

In [10]:
# Candidate features and the continuous target for the regression F-test.
# NOTE(review): in the preview rows 收入日志 appears to equal ln(收入), so 收入
# is expected to dominate any ranking against this target - confirm this
# target/feature pairing is intended.
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['收入日志']

# Number of features to keep.
k = 4

In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# Keep the k features with the highest univariate regression F-value.
# `k` must be passed by keyword: it is keyword-only in current scikit-learn,
# so SelectKBest(f_regression, k) raises a TypeError there.
model = SelectKBest(score_func=f_regression, k=k)
model.fit(X, y)
list(X.columns[model.get_support().tolist()])



['用时', '年龄', '收入', '工龄']

### 互信息法
只需修改参数k, 表示保留的变量个数

Mutual information for a discrete target.

In [12]:
# Candidate features and the categorical target for the mutual-information
# filter (discrete target).
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['类型']

# Number of features to keep.
k = 4

In [13]:
from functools import partial

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif

# The mutual-information estimate is randomised, so pin random_state to make
# the selected features reproducible from run to run.
score_func = partial(mutual_info_classif, random_state=1)

# Keep the k features sharing the most mutual information with the class label.
# `k` must be passed by keyword: it is keyword-only in current scikit-learn.
model = SelectKBest(score_func=score_func, k=k)
model.fit(X, y)
list(X.columns[model.get_support().tolist()])



['用时', '收入', '学历', '工龄']

Mutual information for a continuous target.

In [14]:
# Candidate features and the continuous target for the mutual-information
# filter (continuous target).
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['收入日志']

# Number of features to keep.
k = 4

In [15]:
from functools import partial

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# The mutual-information estimate is randomised, so pin random_state to make
# the selected features reproducible from run to run.
score_func = partial(mutual_info_regression, random_state=1)

# Keep the k features sharing the most mutual information with the target.
# `k` must be passed by keyword: it is keyword-only in current scikit-learn.
model = SelectKBest(score_func=score_func, k=k)
model.fit(X, y)
list(X.columns[model.get_support().tolist()])



['用时', '年龄', '收入', '工龄']

### 相关系数法
只需修改参数k, 表示保留的变量个数

pearsonr

In [16]:
# Candidate features and the continuous target for the correlation ranking.
# These names are reused by both the Pearson and the Spearman cells.
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['收入日志']

# Number of features to keep.
k = 4

In [17]:
import heapq
import math
from scipy.stats import pearsonr

# Score each feature by the strength of its linear correlation with y.
# The absolute value is used so that a strongly negative correlation ranks as
# high as an equally strong positive one; ranking on signed r would never
# select negatively correlated features.
strengths = []
for col in X.columns:
    r, _ = pearsonr(X[col], y)
    # An undefined coefficient (NaN) is treated as "no correlation" so it
    # cannot disturb the ranking.
    strengths.append(0.0 if math.isnan(r) else abs(r))

# Rank column positions rather than score values: looking scores up with
# list.index() returns the same column twice when two features tie.
ind = heapq.nlargest(k, range(len(strengths)), key=strengths.__getitem__)
list(X.columns[ind])

['收入', '工龄', '年龄', '用时']

spearmanr

In [18]:
import heapq
import math
from scipy.stats import spearmanr

# Score each feature by the strength of its rank correlation with y.
# The absolute value is used so that a strongly negative correlation ranks as
# high as an equally strong positive one; ranking on signed r would never
# select negatively correlated features.
strengths = []
for col in X.columns:
    r, _ = spearmanr(X[col], y)
    # An undefined coefficient (NaN) is treated as "no correlation" so it
    # cannot disturb the ranking.
    strengths.append(0.0 if math.isnan(r) else abs(r))

# Rank column positions rather than score values: looking scores up with
# list.index() returns the same column twice when two features tie.
ind = heapq.nlargest(k, range(len(strengths)), key=strengths.__getitem__)
list(X.columns[ind])

['收入', '工龄', '年龄', '用时']

## 包装法

### 递归特征消除（需要选择不同的基模型）

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html

https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html#sklearn.feature_selection.RFE

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.ensemble

https://scikit-learn.org/stable/modules/classes.html#module-sklearn.linear_model

y是分类变量

In [19]:
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [20]:
# Number of features RFE should keep.
k = 4
X = df[['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']]
y = df['流失']

# Base model whose feature importances / coefficients drive the elimination.
# Swap in any of the alternatives below; the stochastic ones are seeded so the
# selected features are reproducible, matching the random_state=1 used by the
# embedded-method cells.
# estimator = LogisticRegression()
# estimator = SVC(kernel="linear", C=1)
# estimator = DecisionTreeClassifier(random_state=1)
# estimator = RandomForestClassifier(random_state=1)
estimator = GradientBoostingClassifier(random_state=1)

In [21]:
# Run recursive feature elimination with the chosen classifier until k
# features remain, then translate the support mask into column names.
model = RFE(estimator=estimator, n_features_to_select=k).fit(X, y)
kept_mask = model.get_support()
X.columns[kept_mask].tolist()

['用时', '年龄', '收入', '工龄']

In [22]:
# Cross-validated recursive feature elimination: the number of features to
# keep is chosen by 5-fold cross-validation instead of being fixed up front.
#
# random_state is dropped from KFold: without shuffle=True it never had any
# effect on the splits, and current scikit-learn raises a ValueError for that
# combination. Use KFold(n_splits=5, shuffle=True, random_state=1) instead if
# seeded shuffling is wanted.
cv = KFold(n_splits=5)

# Score with a classification metric. For a 0/1-coded target this ranks
# feature subsets exactly like the previous 'neg_mean_squared_error', but it
# stays meaningful if the labels are encoded differently.
model = RFECV(estimator=estimator, cv=cv, scoring='accuracy')
model.fit(X, y)
list(X.columns[model.get_support().tolist()])



['地区', '用时', '年龄', '住址', '收入', '学历', '工龄', '性别']

y是连续变量

In [23]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [24]:
# Candidate features and the continuous target for regression-based RFE.
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['收入日志']

# Number of features RFE should keep.
k = 4

In [25]:
# Base regressor whose feature importances drive the elimination. Seeded so
# the selected features are reproducible, matching the random_state=1 used by
# the embedded-method cells. Swap in one of the alternatives below to compare.
estimator = DecisionTreeRegressor(random_state=1)
# estimator = RandomForestRegressor(random_state=1)
# estimator = GradientBoostingRegressor(random_state=1)

In [26]:
# Run recursive feature elimination with the chosen regressor until k
# features remain, then translate the support mask into column names.
model = RFE(estimator=estimator, n_features_to_select=k).fit(X, y)
kept_mask = model.get_support()
X.columns[kept_mask].tolist()

['用时', '年龄', '住址', '收入']

In [27]:
# Cross-validated recursive feature elimination for a continuous target: the
# number of features to keep is the one with the best 5-fold negative MSE.
#
# random_state is dropped from KFold: without shuffle=True it never had any
# effect on the splits, and current scikit-learn raises a ValueError for that
# combination. Use KFold(n_splits=5, shuffle=True, random_state=1) instead if
# seeded shuffling is wanted.
cv = KFold(n_splits=5)
model = RFECV(estimator=estimator, cv=cv, scoring='neg_mean_squared_error')
model.fit(X, y)
list(X.columns[model.get_support().tolist()])



['收入']

## 嵌入法（基于模型的特征选择）

In [28]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import GradientBoostingClassifier

y是分类变量（可以修改分类器中的参数）

In [29]:
# Candidate features and the categorical target shared by the three
# classifier-based SelectFromModel cells that follow.
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['类型']

In [30]:
# L1-penalised logistic regression as the base model; SelectFromModel keeps
# the features that pass its default threshold on the fitted coefficients.
l1_logit = LogisticRegression(penalty="l1", C=0.01, solver='liblinear')
model = SelectFromModel(l1_logit).fit(X, y)
X.columns[model.get_support()].tolist()

['用时', '年龄', '住址', '收入', '学历', '工龄']

In [31]:
# Seeded random forest as the base model; SelectFromModel keeps the features
# that pass its default threshold on the fitted feature importances.
forest_clf = RandomForestClassifier(max_depth=10, random_state=1)
model = SelectFromModel(forest_clf).fit(X, y)
X.columns[model.get_support()].tolist()

['用时', '年龄', '住址', '收入', '学历', '工龄']

In [32]:
# Seeded gradient boosting as the base model; SelectFromModel keeps the
# features that pass its default threshold on the fitted feature importances.
boosted_clf = GradientBoostingClassifier(max_depth=10, random_state=1)
model = SelectFromModel(boosted_clf).fit(X, y)
X.columns[model.get_support()].tolist()

['用时', '年龄', '住址', '收入', '工龄']

y是连续变量（可以修改回归器中的参数）

In [33]:
# Candidate features and the continuous target shared by the three
# regressor-based SelectFromModel cells that follow.
# NOTE(review): in the preview rows 收入日志 appears to equal ln(收入), which
# would explain 收入 being the only feature the tree-based selectors keep -
# confirm this target/feature pairing is intended.
feature_cols = ['地区', '用时', '年龄', '婚姻', '住址', '收入', '学历', '工龄', '退休', '性别']
X = df[feature_cols]
y = df['收入日志']

In [34]:
# Lasso regression as the base model; SelectFromModel keeps the features that
# pass its default threshold on the fitted coefficients.
lasso_reg = Lasso(alpha=0.01)
model = SelectFromModel(lasso_reg).fit(X, y)
X.columns[model.get_support()].tolist()

['用时', '年龄', '住址', '收入', '学历', '工龄', '退休']

In [35]:
# Seeded random forest regressor as the base model; SelectFromModel keeps the
# features that pass its default threshold on the fitted feature importances.
forest_reg = RandomForestRegressor(max_depth=10, random_state=1)
model = SelectFromModel(forest_reg).fit(X, y)
X.columns[model.get_support()].tolist()

['收入']

In [36]:
# Seeded gradient boosting regressor as the base model; SelectFromModel keeps
# the features that pass its default threshold on the fitted importances.
boosted_reg = GradientBoostingRegressor(max_depth=10, random_state=1)
model = SelectFromModel(boosted_reg).fit(X, y)
X.columns[model.get_support()].tolist()

['收入']