In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.stats import pearsonr
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import RandomizedLasso
from sklearn.feature_selection import RFE

In [2]:
def _minmax_round(raw_scores, fill_nan=False):
	"""Rescale one scorer's per-feature scores with MinMaxScaler and round to 4 decimals.

	Args:
		raw_scores: 1-D array-like holding one score per feature.
		fill_nan: when True, NaN scores are replaced by 0 before scaling.

	Returns:
		1-D numpy array of the scaled scores, each rounded to 4 decimals.
	"""
	# Work on a private float copy: the input may be a read-only view
	# (e.g. ``Series.values``) and must not be modified in place.
	scores = np.array(raw_scores, dtype=float)
	if fill_nan:
		scores[np.isnan(scores)] = 0
	scaled = MinMaxScaler().fit_transform(scores.reshape(-1, 1)).ravel()
	# Build a real list before handing it to numpy: wrapping a lazy ``map``
	# object in ``np.array`` yields a 0-d object array on Python 3.
	return np.array([round(float(value), 4) for value in scaled])


def feature_sort(X,y,random_state=None):
	"""Rank the columns of ``X`` by the sum of eight normalised importance scores.

	Scorers: chi-squared, absolute Pearson correlation, L2- and L1-penalised
	logistic-regression coefficients, gradient-boosting and random-forest
	feature importances, RandomizedLasso stability scores and RFE ranking.
	Each scorer contributes a value per feature and the eight are summed.

	Args:
		X: pandas DataFrame of features (``X.apply`` and ``X.columns`` are used).
		y: target values aligned with the rows of ``X``.
		random_state: optional seed forwarded to the estimators that accept
			one; the default ``None`` leaves them unseeded as before.

	Returns:
		list of column names of ``X`` ordered from highest to lowest total score.
	"""
	# Chi-squared statistic (NaN scores count as 0).
	chi2_scores = _minmax_round(SelectKBest(chi2).fit(X, y).scores_, fill_nan=True)

	# Absolute Pearson correlation with the target (NaN counts as 0).
	pearson_raw = X.apply(lambda column: abs(pearsonr(column, y)[0])).values
	pearson_scores = _minmax_round(pearson_raw, fill_nan=True)

	# L2-regularised logistic regression: magnitude of the first coefficient row.
	l2_model = LogisticRegression(penalty="l2", C=1, random_state=random_state).fit(X, y)
	l2_scores = _minmax_round(abs(l2_model.coef_[0]))

	# L1-regularised logistic regression. The solver is named explicitly
	# because the l1 penalty is rejected by the lbfgs solver.
	l1_model = LogisticRegression(
		penalty="l1", C=1, solver="liblinear", random_state=random_state
	).fit(X, y)
	l1_scores = _minmax_round(abs(l1_model.coef_[0]))

	# Gradient-boosted trees feature importances.
	gbdt = GradientBoostingClassifier(random_state=random_state).fit(X, y)
	gbdt_scores = _minmax_round(gbdt.feature_importances_)

	# Random forest feature importances.
	# NOTE(review): a regressor is fitted on y here while the other models are
	# classifiers - confirm this is intended.
	forest = RandomForestRegressor(random_state=random_state).fit(X, y)
	forest_scores = _minmax_round(forest.feature_importances_)

	# Stability selection; by default alpha is chosen via AIC.
	# NOTE(review): RandomizedLasso is only available in older scikit-learn releases.
	rlasso = RandomizedLasso(random_state=random_state)
	rlasso.fit(X, y)
	stability_scores = _minmax_round(rlasso.scores_)

	# RFE: eliminate down to one feature, then map rank r (1 = best) of n
	# features to the score (n + 1 - r) / n.
	rfe = RFE(
		LogisticRegression(penalty="l1", solver="liblinear", random_state=random_state),
		n_features_to_select=1,
	)
	rfe.fit(X, y)
	ranking = rfe.ranking_
	n_features = len(ranking)
	rfe_scores = np.array(
		[round((n_features + 1 - rank) / float(n_features), 4) for rank in ranking]
	)

	# Total score per feature, highest first.
	total = (
		chi2_scores + pearson_scores + l2_scores + l1_scores
		+ gbdt_scores + forest_scores + stability_scores + rfe_scores
	)
	s_sum = pd.Series(total, index=X.columns).sort_values(ascending=False)
	return list(s_sum.index.values)

In [3]:
def model_selection(X_train,y_train,feature):
	"""Grid-search the ``C`` parameter of a LogisticRegression on selected columns.

	Args:
		X_train: training features; indexed as ``X_train[feature]``.
		y_train: training targets.
		feature: column selector passed to ``X_train[...]``.

	Returns:
		Tuple ``(best_score_, best_estimator_)`` of the fitted GridSearchCV.
	"""
	c_grid = {'C': [0.001, 0.01, 0.1, 1, 5, 10, 20]}
	search = GridSearchCV(
		LogisticRegression(),
		c_grid,
		scoring='accuracy',  # the original author noted roc_auc as an alternative
		cv=10,
	)
	training_subset = X_train[feature]
	search.fit(training_subset, y_train)
	best_score = search.best_score_
	best_model = search.best_estimator_
	return best_score, best_model

In [4]:
def feature_selection(X_train,y_train,sorted_feature):
	"""Greedy forward selection over a ranked list of feature names.

	Candidates are tried in the given order. A candidate is kept only when
	adding it to the features kept so far strictly improves the score returned
	by ``model_selection``.

	Args:
		X_train: training features, forwarded to ``model_selection``.
		y_train: training targets, forwarded to ``model_selection``.
		sorted_feature: iterable of feature names, best first. It is not modified.

	Returns:
		Tuple ``(selected_feature, score, clf)``: the kept feature names, the
		best score reached, and the estimator that achieved it. ``clf`` is
		``None`` when no candidate was accepted (for example on empty input).
	"""
	selected_feature = []
	score = 0
	clf = None  # stays None if no candidate ever beats the initial score
	# Iterate over a snapshot so the caller's list is left intact.
	for candidate in list(sorted_feature):
		trial = selected_feature + [candidate]
		score_new, clf_best = model_selection(X_train, y_train, trial)
		if score_new > score:
			selected_feature = trial
			score = score_new
			clf = clf_best
	return selected_feature, score, clf