In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion
from sklearn2pmml import PMMLPipeline,sklearn2pmml
from sklearn2pmml.feature_extraction.text import Splitter

from xgboost import XGBClassifier
from xgboost import plot_importance


# Show the value of every top-level expression in a cell, not only the last
# one. Later cells rely on this to display several results from a single cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the data set, pick the feature column (X) and the label column (y),
# then hold out a stratified 20% test split.
train = pd.read_csv('qz_andr.csv')
X, y = train['tags'], train['is_reg']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=123,
)

# Quick checks: number of rows and the class balance of the label.
X.shape
y.value_counts(dropna=False)

(32552,)

1    23062
0     9490
Name: is_reg, dtype: int64

In [4]:
# Carve the validation set out of the 80% development partition.
#
# The development partition is re-derived from X / y here, with the same
# arguments and seed as the initial train/test split, so it contains exactly
# the same rows. The previous version split X_train and wrote the result back
# into X_train, so running the cell a second time silently shrank the training
# set. This version gives the same result however many times it is run.
X_dev, X_test, y_dev, y_test = train_test_split(
    X, y,
    stratify=y,
    test_size=0.2,
    random_state=123,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_dev, y_dev,
    stratify=y_dev,
    test_size=0.3,
    random_state=123,
)
X_train.shape, X_valid.shape

((18228,), (7813,))

In [5]:
# Learn the tag vocabulary from the training split only (tags are split on
# commas), then encode each split and the full data set with that vocabulary.
tag_splitter = Splitter(separator_re=',')
tag_count = CountVectorizer(tokenizer=tag_splitter).fit(X_train, y_train)

# Dense count matrices, produced in the order train, valid, test, full data.
features_train, features_valid, features_test, features = (
    tag_count.transform(texts).toarray()
    for texts in (X_train, X_valid, X_test, X)
)

features_train.shape, features_valid.shape, features.shape

((18228, 377), (7813, 377), (32552, 377))

In [12]:
# Baseline: L2-regularised logistic regression on all tag-count features.
# LogisticRegression comes from the explicit sklearn.linear_model import in the
# notebook's import cell, so this cell also runs on a fresh kernel.
clf = LogisticRegression(C=0.03, penalty='l2', max_iter=200, random_state=123)
clf.fit(features_train, y_train)

# ROC AUC per split, computed from the predicted probability of class 1.
# Returned as one labelled mapping so the three numbers cannot be mixed up.
{
    'train': roc_auc_score(y_train, clf.predict_proba(features_train)[:, 1]),
    'valid': roc_auc_score(y_valid, clf.predict_proba(features_valid)[:, 1]),
    'test': roc_auc_score(y_test, clf.predict_proba(features_test)[:, 1]),
}

LogisticRegression(C=0.03, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.76813233621172095

0.75031779568600487

0.75705398702571669

In [7]:
# NOTE(review): the wildcard import hides which names deap_v1 provides.
# FeatureSelectGA is the only one used in this cell; switch to an explicit
# import once it is confirmed that no other cell depends on the wildcard.
from deap_v1 import *

# Take the feature count from the matrix instead of hard-coding 377, so the
# call stays in sync with the fitted vocabulary if the data changes.
# Presumably the first argument is the number of candidate features, one gene
# per column - TODO confirm against deap_v1.
# NOTE(review): no random seed is set in this cell; check whether deap_v1 seeds
# the search, otherwise the printed log will not reproduce exactly.
ga = FeatureSelectGA(features_train.shape[1], features_train, y_train, features_valid, y_valid)

gen	nevals	max     
0  	200   	0.741114
1  	111   	0.742941
2  	110   	0.742941
3  	111   	0.743095
4  	110   	0.744686
5  	119   	0.746573
6  	113   	0.746573
7  	114   	0.747207
8  	108   	0.748869
9  	108   	0.749689
10 	109   	0.751306
11 	120   	0.751306
12 	111   	0.750316
13 	106   	0.751497
14 	102   	0.752295
15 	116   	0.7529  
16 	127   	0.7529  
17 	108   	0.7529  
18 	105   	0.752295
19 	121   	0.752644
20 	102   	0.753399
21 	110   	0.754169
22 	101   	0.754169
23 	97    	0.753707
24 	117   	0.753707
25 	113   	0.753412
26 	104   	0.753987
27 	104   	0.753987
28 	100   	0.753987
29 	106   	0.754091
30 	110   	0.753987
31 	108   	0.754385
32 	107   	0.754385
33 	112   	0.754008
34 	92    	0.754754
35 	101   	0.754754
36 	105   	0.754754
37 	97    	0.754258
38 	116   	0.754258
39 	143   	0.754406
40 	105   	0.754406
41 	101   	0.754589
42 	103   	0.754589
43 	101   	0.755013
44 	113   	0.755013
45 	121   	0.755121
46 	112   	0.755121
47 	100   	0.755121
48 	116   	0.755121


In [11]:
# Refit the baseline model using only the columns selected by the GA.
# ga[0] is read as a 0/1 mask with one entry per feature column; the columns
# flagged with 1 are kept.
individual = ga[0]
col_idx = [idx for idx, flag in enumerate(individual) if flag == 1]

# Same hyper-parameters as the all-features baseline, so the scores are
# directly comparable. LogisticRegression comes from the explicit
# sklearn.linear_model import in the notebook's import cell.
clf = LogisticRegression(C=0.03, penalty='l2', max_iter=200, random_state=123)
clf.fit(features_train[:, col_idx], y_train)

# ROC AUC per split, labelled so the numbers cannot be mixed up.
# NOTE(review): the validation split was passed to FeatureSelectGA, so the
# validation score is presumably optimistic; the test score is the fair one.
{
    'train': roc_auc_score(y_train, clf.predict_proba(features_train[:, col_idx])[:, 1]),
    'valid': roc_auc_score(y_valid, clf.predict_proba(features_valid[:, col_idx])[:, 1]),
    'test': roc_auc_score(y_test, clf.predict_proba(features_test[:, col_idx])[:, 1]),
}

LogisticRegression(C=0.03, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=200, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.77478427831165197

0.76256902955333317

0.75912617637834345