-
Notifications
You must be signed in to change notification settings - Fork 0
/
FeatureFilter.py
110 lines (109 loc) · 4.21 KB
/
FeatureFilter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/python
#coding:utf-8
from sklearn.feature_selection import VarianceThreshold,chi2,SelectKBest,RFE,SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from scipy.stats import pearsonr
from minepy import MINE
from .FeatureProcess import FeatureProcess
import numpy as np
## feature selection
## 1.filter
# 1.1 remove by lower variance
class FeatureFilter(FeatureProcess):
    """Score and rank features with filter, wrapper, and embedded methods.

    Each check method prints its own per-method ranking and accumulates a
    vote/score per column in ``self.score_dict``; :meth:`collect` runs every
    method and stores the overall top-k column names in ``self.out_cols``.

    NOTE(review): despite the ``labels`` parameter, several methods still
    hard-code the column name ``'label'`` — verify before renaming the label.
    """

    def __init__(self, df, top_k, labels='label', params=None):
        """
        :param df: input DataFrame; must contain the label column.
        :param top_k: number of features to keep / report per method.
        :param labels: name of the label column (see class NOTE above).
        :param params: optional dict of ``XGBClassifier`` keyword arguments.
        """
        # super() keeps cooperative initialization intact if the MRO changes
        super().__init__(data=df, labels=labels,
                         normal_type='min_max', fillna_type='0')
        self.k = top_k
        # bug fix: the original did XGBClassifier(**params), which raised
        # TypeError whenever params was left at its default of None
        self.model = XGBClassifier(**(params or {}))
        self.pre_process()
        self.cols = self.data.columns.values
        # one accumulated score per column, updated by each check method
        self.score_dict = {col: 0 for col in self.cols}

    # ---- filter methods -------------------------------------------------
    def filter_variance(self):
        """Print the top-k columns ranked by spread.

        NOTE(review): despite the name this ranks by *standard deviation*
        (the ``'std'`` row of ``describe()``), not variance — the ordering
        is the same, but the printed magnitudes differ.
        """
        std_row = self.data.describe().loc['std', :]
        ranked = sorted(zip(self.cols, std_row),
                        key=lambda item: item[1], reverse=True)
        print(ranked[:self.k])

    def chiCheck(self):
        """Univariate chi-squared test; +1 vote for each selected column."""
        # keyword arg: positional k was deprecated and then removed from
        # recent scikit-learn releases
        selector = SelectKBest(chi2, k=self.k)
        # NOTE(review): the fit matrix still includes the label column as a
        # feature — presumably harmless since it trivially selects itself,
        # but confirm this is intentional.
        selector.fit_transform(self.data, self.data.label)
        for idx in selector.get_support(True):
            self.score_dict[self.cols[idx]] += 1
            print(self.cols[idx])

    def pearsonCheck(self):
        """Absolute Pearson correlation of each column with the label."""
        scores = []
        for col in self.cols:
            corr, _ = pearsonr(self.data[col], self.data.label)
            scores.append((col, abs(corr)))
            self.score_dict[col] += abs(corr)
        scores.sort(key=lambda item: item[1], reverse=True)
        print(scores[:self.k])

    def MICscore(self):
        """Maximal Information Coefficient of each column vs. the label."""
        mine = MINE()
        scores = []
        for col in self.cols:
            mine.compute_score(self.data[col], self.data.label)
            mic = abs(mine.mic())
            scores.append((col, mic))
            self.score_dict[col] += mic
        scores.sort(key=lambda item: item[1], reverse=True)
        print(scores[:self.k])

    # ---- model-based method ---------------------------------------------
    def CrossVal(self):
        """2-fold cross-validated r2 of the model on each single column."""
        y = self.data.label.values  # hoisted: invariant across columns
        scores = []
        for col in self.cols:
            X = self.data[col].values.reshape(-1, 1)
            fold_scores = cross_val_score(self.model, X, y,
                                          scoring="r2", cv=2)
            mean_score = np.mean(fold_scores)
            # keep the '.3f' string for display, as the original printed it
            scores.append((col, format(mean_score, '.3f')))
            self.score_dict[col] += abs(np.round(mean_score, 3))
        # bug fix: the original sorted the formatted *strings*, which orders
        # lexicographically and mis-ranks negative r2 values; sort numerically
        scores.sort(key=lambda item: float(item[1]), reverse=True)
        print(scores[:self.k])

    # ---- wrapper method --------------------------------------------------
    def RecursiveElim(self):
        """Recursive feature elimination down to the top-k features."""
        self.data = self.data.fillna(0)
        print(self.data.values)
        features = self.data.drop('label', axis=1)
        rfe = RFE(self.model, n_features_to_select=self.k)
        rfe.fit_transform(features, self.data.label)
        # bug fix: get_support indices refer to the label-dropped frame;
        # the original indexed self.cols (which still contains 'label'),
        # misattributing every column positioned after the label column
        feat_names = features.columns.values
        for idx in rfe.get_support(True):
            self.score_dict[feat_names[idx]] += 1
            print(feat_names[idx])

    # ---- embedded method -------------------------------------------------
    def LinearModelEm(self):
        """L1-penalized linear SVM selection (at most top-k features)."""
        features = self.data.drop('label', axis=1)
        lsvc = LinearSVC(C=0.01, penalty="l1", dual=False)
        lsvc.fit(features, self.data.label)
        selector = SelectFromModel(lsvc, prefit=True, max_features=self.k)
        # bug fix: same index-base correction as RecursiveElim — indices
        # must map into the label-dropped column list
        feat_names = features.columns.values
        for idx in selector.get_support(True):
            self.score_dict[feat_names[idx]] += 1
            print(feat_names[idx])

    def collect(self):
        """Run every selection method, then keep the top-k column names
        by accumulated score in ``self.out_cols``.

        Side effect (as in the original): ``self.score_dict`` is replaced
        by a score-sorted list of ``(column, score)`` tuples.
        """
        methods = [
            'filter_variance',
            'chiCheck',
            'pearsonCheck',
            'MICscore',
            'CrossVal',
            'RecursiveElim',
            'LinearModelEm',
        ]
        for name in methods:
            print(name)
            print("=" * 50)
            # getattr instead of eval: same dynamic dispatch without
            # executing a constructed code string
            getattr(self, name)()
            print("=" * 50)
        ranked = sorted(self.score_dict.items(),
                        key=lambda item: item[1], reverse=True)
        self.score_dict = ranked
        self.out_cols = []
        print("Total Weight")
        print("=" * 50)
        # bug fix: the original loop condition `index > self.k + 1` kept
        # k + 2 entries; top_k means exactly k
        for item in ranked[:self.k]:
            print(item, '\n')
            self.out_cols.append(item[0])