In [1]:
import pandas as pd
import numpy as np
import joblib
import time
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
from sklearn.model_selection import train_test_split, cross_val_predict, validation_curve
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_recall_curve, precision_score, recall_score, roc_curve, roc_auc_score

csv_path = './data_preprocessing/sequences_with_GC_8.csv'

origin_host_data = pd.read_csv(csv_path)
origin_host_data = origin_host_data.drop("Accession", axis=1).drop("Host", axis=1).drop("Sequence", axis=1)
origin_host_data.head()

Unnamed: 0,Length,Class,GC_content,AAAAAAAA_num,AAAAAAAT_num,AAAAAAAC_num,AAAAAAAG_num,AAAAAATA_num,AAAAAATT_num,AAAAAATC_num,...,GGGGGGTC_num,GGGGGGTG_num,GGGGGGCA_num,GGGGGGCT_num,GGGGGGCC_num,GGGGGGCG_num,GGGGGGGA_num,GGGGGGGT_num,GGGGGGGC_num,GGGGGGGG_num
0,1723,Mammalia|Aves,0.44,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,12063,Mammalia|Aves,0.42,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2330,Mammalia|Aves,0.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2364,Mammalia|Aves,0.4,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1775,Mammalia|Aves,0.44,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# 把Sequence使用one-hot编码

# host_data = origin_host_data.values
host_data = origin_host_data.copy()

# 把一条序列转换成one-hot编码
def encode_sequence_with_one_hot(sequence):
    bases_dict = dict()
    bases_dict['A'] = [1., 0., 0., 0.]
    bases_dict['T'] = [0., 1., 0., 0.]
    bases_dict['C'] = [0., 0., 1., 0.]
    bases_dict['G'] = [0., 0., 0., 1.]
    bases_dict['N'] = [0., 0., 0., 0.]

    one_hot_sequence = []
    for base in sequence:
        if base != 'A' and base != 'T' and base != 'C' and base != 'G':
            base = 'N'
        one_hot_sequence.append(bases_dict[base])
    one_hot_sequence = np.array(one_hot_sequence)
    return one_hot_sequence

# encode_sequence_with_one_hot(host_data[0, 27])
# 循环处理每一条序列并将host_data中的替换
# for i in range(len(host_data)):
#     host_data[i, 27] = encode_sequence_with_one_hot(host_data[i, 27])
# print(host_data[0])

In [3]:
# for i in range(len(host_data)):
#     if host_data[i, 1] == 'Mammalia' or host_data[i, 1] == 'Aves':
#         host_data[i, 1] = 'Mammalia|Aves'
# print(host_data[0])

In [4]:
train_set, test_set = train_test_split(host_data, test_size=0.2, random_state=42)

In [5]:
# 删除一部分others样本

# 筛选样例
# train_set_temp = train_set.iloc[::2] #奇数行 [start:end:step]
# train_set_temp = train_set.iloc[1::2] #偶数行

# 在train_set是DataFrame格式时的筛选
# 每三行删一行
train_set = train_set.drop(train_set.loc[(train_set["Class"] == "others")].iloc[::2].index)
train_set = train_set.drop(train_set.loc[(train_set["Class"] == "others")].iloc[::2].index)
print(len(train_set))
train_set.head()

# 在train_set是ndarray格式时的筛选
# bool_set = train_set[:, 1] != "others"
# # bool_set中筛选前926个True的是Mammalia, Aves，筛选前4923个False的是others
# i = 0
# for j, b in enumerate(bool_set):
#     if not b:
#         i += 1
#         # 四个others留一个，否则others的样例过多
#         if i % 4 == 0:
#             bool_set[j] = True
# train_set = train_set[bool_set]
# print(len(train_set))

2156


Unnamed: 0,Length,Class,GC_content,AAAAAAAA_num,AAAAAAAT_num,AAAAAAAC_num,AAAAAAAG_num,AAAAAATA_num,AAAAAATT_num,AAAAAATC_num,...,GGGGGGTC_num,GGGGGGTG_num,GGGGGGCA_num,GGGGGGCT_num,GGGGGGCC_num,GGGGGGCG_num,GGGGGGGA_num,GGGGGGGT_num,GGGGGGGC_num,GGGGGGGG_num
640,3970,others,0.45,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3396,2274,Mammalia|Aves,0.42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4504,2246,Mammalia|Aves,0.33,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7278,2724,others,0.4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2225,2712,Mammalia|Aves,0.52,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [6]:
# 在train_set是DataFrame格式时的
train_features = train_set.drop("Class", axis=1)
train_labels_named = train_set["Class"].copy()

# 在train_set是ndarray格式时的
# train_labels_named = train_set[:, 1].copy()
# train_features = np.delete(train_set, 1, axis=1)

In [7]:
# 在train_set是DataFrame格式时的
train_features.head()

# 在train_set是ndarray格式时的
# train_features[:5]

Unnamed: 0,Length,GC_content,AAAAAAAA_num,AAAAAAAT_num,AAAAAAAC_num,AAAAAAAG_num,AAAAAATA_num,AAAAAATT_num,AAAAAATC_num,AAAAAATG_num,...,GGGGGGTC_num,GGGGGGTG_num,GGGGGGCA_num,GGGGGGCT_num,GGGGGGCC_num,GGGGGGCG_num,GGGGGGGA_num,GGGGGGGT_num,GGGGGGGC_num,GGGGGGGG_num
640,3970,0.45,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3396,2274,0.42,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4504,2246,0.33,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7278,2724,0.4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2225,2712,0.52,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# 在train_set是DataFrame格式时的
train_labels_named.head()

# 在train_set是ndarray格式时的
# train_labels_named[:5]

640            others
3396    Mammalia|Aves
4504    Mammalia|Aves
7278           others
2225    Mammalia|Aves
Name: Class, dtype: object

In [9]:
# 给train_set的labels使用one-hot编码
train_labels = np.array(train_labels_named).reshape(len(train_labels_named), -1)
onehotencoder = OneHotEncoder()
onehotencoder.fit(train_labels)
train_labels = onehotencoder.transform(train_labels).toarray()
print(type(train_labels))
train_labels[:5]

<class 'numpy.ndarray'>


array([[0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [1., 0.]])

In [10]:
# 取测试集中的Mammalia数据进行测试

# 在test_set是DataFrame格式时的
test_set_mammalia = test_set.loc[test_set["Class"] == "Mammalia|Aves"]
# test_set_mammalia = test_set.loc[test_set["Class"] == "others"]
test_set_mammalia.head()

# 在test_set是ndarray格式时的
# test_set_mammalia = test_set[test_set[:, 1] == "Mammalia|Aves"]
# test_set_mammalia[:5]

Unnamed: 0,Length,Class,GC_content,AAAAAAAA_num,AAAAAAAT_num,AAAAAAAC_num,AAAAAAAG_num,AAAAAATA_num,AAAAAATT_num,AAAAAATC_num,...,GGGGGGTC_num,GGGGGGTG_num,GGGGGGCA_num,GGGGGGCT_num,GGGGGGCC_num,GGGGGGCG_num,GGGGGGGA_num,GGGGGGGT_num,GGGGGGGC_num,GGGGGGGG_num
6923,29751,Mammalia|Aves,0.41,4,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3473,4899,Mammalia|Aves,0.4,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4615,1798,Mammalia|Aves,0.47,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
132,2343,Mammalia|Aves,0.46,1,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2206,16146,Mammalia|Aves,0.43,0,3,0,1,6,0,0,...,0,0,0,0,0,0,1,0,0,1


In [11]:
# 在test_set是DataFrame格式时的
test_features = test_set.drop("Class", axis=1)
test_labels_named = test_set["Class"].copy()
# test_features = test_set_mammalia.drop("Class", axis=1)
# test_labels_named = test_set_mammalia["Class"].copy()

# 在test_set是ndarray格式时的
# test_labels_named = test_set[:, 1].copy()
# test_features = np.delete(test_set, 1, axis=1)
# test_labels_named = test_set_mammalia[:, 1].copy()
# test_features = np.delete(test_set_mammalia, 1, axis=1)

In [12]:
# 在test_set是DataFrame格式时的
test_features.head()

# 在test_set是ndarray格式时的
# test_features[:5]

Unnamed: 0,Length,GC_content,AAAAAAAA_num,AAAAAAAT_num,AAAAAAAC_num,AAAAAAAG_num,AAAAAATA_num,AAAAAATT_num,AAAAAATC_num,AAAAAATG_num,...,GGGGGGTC_num,GGGGGGTG_num,GGGGGGCA_num,GGGGGGCT_num,GGGGGGCC_num,GGGGGGCG_num,GGGGGGGA_num,GGGGGGGT_num,GGGGGGGC_num,GGGGGGGG_num
2572,50165,0.66,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1561,9214,0.48,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5893,9650,0.43,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4672,46917,0.44,0,2,1,0,4,1,1,2,...,0,0,0,0,0,1,0,0,0,0
5011,9847,0.43,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# 在test_set是DataFrame格式时的
test_labels_named.head()

# 在test_set是ndarray格式时的
# test_labels_named[:5]

2572    others
1561    others
5893    others
4672    others
5011    others
Name: Class, dtype: object

In [14]:
# 将test_labels使用one-hot编码
test_labels = np.array(test_labels_named).reshape(len(test_labels_named), -1)
test_labels = onehotencoder.transform(test_labels).toarray()
test_labels[:5]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [15]:
# 标准化训练集
transform_pipeline = Pipeline([
    ('std_scalar', StandardScaler())
])
# 有Sequence的时候
# train_features = np.c_[transform_pipeline.fit_transform(train_features[:, :-1]), train_features[:, -1]]
# 无Sequence的时候
train_features = transform_pipeline.fit_transform(train_features)
# train_features[:2]
# joblib.dump(transform_pipeline, './transform_pipeline_standardscaler.pkl')

In [16]:
# 标准化测试集
# 有Sequence的时候
# test_features = np.c_[transform_pipeline.transform(test_features[:, :-1]), test_features[:, -1]]
# 无Sequence的时候
test_features = transform_pipeline.transform(test_features)
test_features[:2]

array([[ 0.28483315,  2.49404443, -0.07847582, ..., -0.16073289,
        -0.12355787, -0.11441574],
       [-0.2685568 ,  0.33752564, -0.07847582, ..., -0.16073289,
        -0.12355787, -0.11441574]])

In [17]:
# # @Deprecated
# # softmax_reg = LogisticRegression(multi_class="multinomial", solver="lbfgs", max_iter=5000, C=100) # 0.8393711551606289
# # softmax_reg = LogisticRegression(multi_class="ovr", solver="lbfgs", max_iter=1000, C=1000) # 0.8380041011619959
# # softmax_reg = LogisticRegression(multi_class="ovr", solver="newton-cg", max_iter=1000, C=1000) # 0.8386876281613124

# # @Deprecated
# # 线性回归
# # lin_reg = LinearRegression()
# # lin_reg.fit(train_features, train_labels)

# # 逻辑回归
# # 完整训练集 0. 筛选训练集 0.7008196721311475
# log_reg = LogisticRegression(C=575, max_iter=1240, random_state=42, solver='newton-cg', n_jobs=-1)
# # 完整训练集 0. 筛选训练集 0.7008196721311475
# # log_reg = LogisticRegression(C=590, max_iter=1200, random_state=42, n_jobs=-1)
# log_reg.fit(train_features, train_labels_named)

# # 随机梯度下降
# sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, eta0=0.1)
# sgd_clf.fit(train_features, train_labels_named)

# # 支持向量机
# # poly_svc_clf = SVC(kernel="poly", degree=3, coef0=1, C=5)
# # rbf_svc_clf = SVC(kernel="rbf", gamma=5, C=0.001)
# svc_clf = SVC(kernel="rbf", gamma=5, C=1, probability=True)
# svc_clf.fit(train_features, train_labels_named)

# 随机森林
# 完整训练集 0. 筛选训练集 0.8565573770491803
# 1-mer
# rdf_clf = RandomForestClassifier(max_features=3, max_samples=0.9, n_estimators=3000, random_state=42, n_jobs=-1)
# 2-mer
# rdf_clf = RandomForestClassifier(max_features=7, max_samples=0.95, n_estimators=3000, random_state=42, n_jobs=-1)
# 3-mer
# rdf_clf = RandomForestClassifier(max_features=7, max_samples=0.95, n_estimators=2500, random_state=42, n_jobs=-1)
# 4-mer
# rdf_clf = RandomForestClassifier(max_features=9, max_samples=0.95, n_estimators=2500, random_state=42, n_jobs=-1)
# 5-mer
# rdf_clf = RandomForestClassifier(max_features=19, max_samples=0.9, n_estimators=1000, random_state=42, n_jobs=-1)
# 6-mer
# rdf_clf = RandomForestClassifier(max_features=19, max_samples=0.95, n_estimators=1000, random_state=42, n_jobs=-1)
# 7-mer
# rdf_clf = RandomForestClassifier(max_features=19, max_samples=0.9, n_estimators=2500, random_state=42, n_jobs=-1)
# 8-mer
rdf_clf = RandomForestClassifier(max_features=19, max_samples=0.95, n_estimators=2000, random_state=42, n_jobs=-1)
rdf_clf.fit(train_features, train_labels_named)
# print(rdf_clf.feature_importances_)
# joblib.dump(rdf_clf, './rdf_clf_'+str(time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()))+'.pkl')

# # K近邻算法
# # 默认 0.7786885245901639
# # 完整训练集 0. 筛选训练集 0.
# knn_clf = KNeighborsClassifier(leaf_size=10, n_neighbors=1, p=1)
# knn_clf.fit(train_features, train_labels_named)

# # 集成K近邻
# knn_clf_1 = KNeighborsClassifier(leaf_size=10, n_neighbors=1, p=1, n_jobs=-1)
# knn_clf_2 = KNeighborsClassifier(leaf_size=10, n_neighbors=2, p=1, n_jobs=-1)
# knn_clf_3 = KNeighborsClassifier(leaf_size=10, n_neighbors=3, p=1, n_jobs=-1)
# knn_clf_4 = KNeighborsClassifier(leaf_size=10, n_neighbors=5, p=1, n_jobs=-1)
# knn_clf_6 = KNeighborsClassifier(leaf_size=10, n_neighbors=1, p=2, n_jobs=-1)
# knn_clf_7 = KNeighborsClassifier(leaf_size=10, n_neighbors=2, p=2, n_jobs=-1)
# knn_clf_8 = KNeighborsClassifier(leaf_size=10, n_neighbors=3, p=2, n_jobs=-1)
# voting_knn_clf = VotingClassifier(
#     estimators=[
#         ('knn_clf_1', knn_clf_1),
#         ('knn_clf_2', knn_clf_2),
#         ('knn_clf_3', knn_clf_3),
#         ('knn_clf_4', knn_clf_4),
#         ('knn_clf_6', knn_clf_6),
#         ('knn_clf_7', knn_clf_7),
#         ('knn_clf_8', knn_clf_8),],
#     voting='soft')
# voting_knn_clf.fit(train_features, train_labels_named)

# # 集成
# voting_clf = VotingClassifier(
#     estimators=[('rdf', rdf_clf),
#                 ('knn_clf', knn_clf)],
#     voting='hard')
# voting_clf.fit(train_features, train_labels_named)
# # joblib.dump(voting_clf, './voting_clf_'+str(time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()))+'.pkl')

RandomForestClassifier(max_features=19, max_samples=0.95, n_estimators=2000,
                       n_jobs=-1, random_state=42)

In [18]:
# # 线性回归
# # lin_predicted_labels = lin_reg.predict(test_features)
# # print(lin_predicted_labels[:5])
# # 线性回归精度
# # print(accuracy_score(test_labels_named, lin_predicted_labels))

# # 逻辑回归
# log_predicted_labels = log_reg.predict(test_features)
# print(log_predicted_labels[:5])
# # 逻辑回归精度
# print('逻辑回归精度：'+str(accuracy_score(test_labels_named, log_predicted_labels)))

# log_predicted_probas = log_reg.predict_proba(test_features)
# log_predicted_probas = log_predicted_probas[:, 0]
# print("逻辑回归自定义阈值的精度："+str(np.sum(log_predicted_probas>=0.5)/len(log_predicted_probas)))

# print('------------------------------------------------')

# # 随机梯度下降
# sgd_predicted_labels = sgd_clf.predict(test_features)
# print(sgd_predicted_labels[:5])
# # 随机梯度下降精度
# print('随机梯度下降精度：'+str(accuracy_score(test_labels_named, sgd_predicted_labels)))

# print('------------------------------------------------')

# # 支持向量机
# svc_predicted_labels = svc_clf.predict(test_features)
# print(svc_predicted_labels[:5])
# # 支持向量机精度
# print('支持向量机精度：'+str(accuracy_score(test_labels_named, svc_predicted_labels)))

# svc_predicted_probas = svc_clf.predict_proba(test_features)
# svc_predicted_probas = svc_predicted_probas[:, 0]
# print("支持向量机自定义阈值的精度："+str(np.sum(svc_predicted_probas>=0.5)/len(svc_predicted_probas)))

# print('------------------------------------------------')

# 随机森林
rdf_predicted_labels = rdf_clf.predict(test_features)
print(rdf_predicted_labels[:5])
# 随机森林精度
print('随机森林精度：'+str(accuracy_score(test_labels_named, rdf_predicted_labels)))

rdf_predicted_probas = rdf_clf.predict_proba(test_features)
print(rdf_predicted_probas[:5])
rdf_proba = rdf_predicted_probas[:, 0]
print("随机森林自定义阈值0.4的精度："+str(np.sum(rdf_proba>=0.4)/len(rdf_proba)))
print("随机森林自定义阈值0.3的精度："+str(np.sum(rdf_proba>=0.3)/len(rdf_proba)))
print("随机森林自定义阈值0.2的精度："+str(np.sum(rdf_proba>=0.2)/len(rdf_proba)))
print("随机森林自定义阈值0.1的精度："+str(np.sum(rdf_proba>=0.1)/len(rdf_proba)))
print("随机森林mean: "+str(np.array(rdf_proba).mean()))
print("随机森林var: "+str(np.array(rdf_proba).var()))

# # predict_proba的输出有两个（由（长度2的数组）组成的数组）。这两个数组中长度2的数组的两个数是倒着放的。

# print('------------------------------------------------')

# # K近邻
# knn_predicted_labels = knn_clf.predict(test_features)
# print(knn_predicted_labels[:5])
# # K近邻精度
# print('K近邻精度：'+str(accuracy_score(test_labels_named, knn_predicted_labels)))

# knn_predicted_probas = knn_clf.predict_proba(test_features)
# knn_probas = knn_predicted_probas[:, 0]
# print("K近邻自定义阈值的精度："+str(np.sum(knn_probas>=0.5)/len(knn_probas)))

# print('------------------------------------------------')

# # 集成近邻
# voting_knn_predicted_labels = voting_knn_clf.predict(test_features)
# print(voting_knn_predicted_labels[:5])
# # 集成K近邻
# print('集成K近邻：'+str(accuracy_score(test_labels_named, voting_knn_predicted_labels)))

# voting_knn_predicted_probas = voting_knn_clf.predict_proba(test_features)
# voting_knn_probas = voting_knn_predicted_probas[:, 0]
# print("集成近邻自定义阈值的精度："+str(np.sum(voting_knn_probas>=0.2)/len(voting_knn_probas)))

# print('------------------------------------------------')

# # 集成学习
# voting_predicted_labels = voting_clf.predict(test_features)
# print(voting_predicted_labels[:5])
# # 集成学习精度
# print('集成学习精度：'+str(accuracy_score(test_labels_named, voting_predicted_labels)))

# # voting_predicted_probas = voting_clf.predict_proba(test_features)
# # voting_proba = voting_predicted_probas[:, 0]
# # print("集成学习自定义阈值的精度："+str(np.sum(voting_proba>=0.3)/len(voting_proba)))

['others' 'others' 'others' 'others' 'others']
随机森林精度：0.7887901572112098
[[0.1015 0.8985]
 [0.4515 0.5485]
 [0.436  0.564 ]
 [0.1985 0.8015]
 [0.422  0.578 ]]
随机森林自定义阈值0.4的精度：0.6384142173615858
随机森林自定义阈值0.3的精度：0.7019822282980178
随机森林自定义阈值0.2的精度：0.8140806561859193
随机森林自定义阈值0.1的精度：0.9589883800410116
随机森林mean: 0.39313465481886534
随机森林var: 0.02537740087013034


In [19]:
# for clf in (softmax_reg, rdf_clf, poly_svc_clf, rbf_svc_clf, voting_clf):
#     clf.fit(train_features, train_labels)
#     predicted_labels = clf.predict(test_features)
#     print(clf.__class__.__name__, accuracy_score(test_labels, predicted_labels))

In [20]:
test_labels[:5]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [21]:
# # 随机森林的GridSearch
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# param_grid = {
#     'n_estimators': [1000, 1500, 2000, 2500, 3000],
#     'max_features': [17, 19, 21],
#     'max_samples': [0.75, 0.8, 0.85, 0.9, 0.95, 1.0],
#     'random_state': [42]
# }

# # param_grid = {
# #   'n_estimators': range(2000, 5000, 2),
# #     'max_features': range(1, 26, 2),
# #     'max_samples': np.linspace(0.6, 0.99, 30)
# # }

# # RandomForestClassifier(max_features=5, max_samples=0.8, n_estimators=5000)
# # RandomForestClassifier(max_features=5, max_samples=0.85, n_estimators=4500)
# # RandomForestClassifier(max_features=3, max_samples=0.9, n_estimators=3500)
# # RandomForestClassifier(max_features=7, max_samples=0.95, n_estimators=2500)
# # RandomForestClassifier(max_features=5, max_samples=0.9, n_estimators=2500)
# randomforest_clf = RandomForestClassifier()

# grid_search = GridSearchCV(randomforest_clf, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, n_jobs=-1)
# # grid_search = RandomizedSearchCV(randomforest_clf, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, n_iter=40000, n_jobs=-1)
# grid_search.fit(train_features, train_labels)

# print(grid_search.best_params_)
# print(grid_search.best_estimator_)
# cvres = grid_search.cv_results_
# # for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
# #     print(np.sqrt(-mean_score), params)

In [22]:
# # 逻辑回归的GridSearch
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# param_grid = {
#     'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
#     'max_iter': range(1100, 2000, 100),
#     'random_state': [42],
#     'C': range(570, 600, 10)
# }

# # LogisticRegression(C=590, max_iter=1400, random_state=42, solver='newton-cg')
# # LogisticRegression(C=575, max_iter=1240, random_state=42, solver='newton-cg')
# logistic_reg = LogisticRegression()

# grid_search = RandomizedSearchCV(logistic_reg, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, n_iter=500, n_jobs=-1)
# grid_search.fit(train_features, train_labels[:, 0])

# print(grid_search.best_params_)
# print(grid_search.best_estimator_)

In [23]:
# # 支持向量机的GridSearch
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# param_grid = {
#     'kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
#     'gamma': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
#     'C': range(1, 1000, 10)
# }

# support_vector_machine = SVC()

# grid_search = RandomizedSearchCV(support_vector_machine, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, n_iter=500, n_jobs=-1)
# grid_search.fit(train_features, train_labels[:, 0])

# print(grid_search.best_params_)
# print(grid_search.best_estimator_)

In [24]:
# # K近邻的GridSearch
# from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# param_grid = {
#     'n_neighbors': range(1, 11, 1),
#     'algorithm': ['auto'],
#     'leaf_size': range(10, 110, 10),
#     'p': range(1, 11, 1)
# }

# kneighbors_clf = KNeighborsClassifier()

# grid_search = RandomizedSearchCV(kneighbors_clf, param_grid, cv=5, scoring='neg_mean_squared_error', return_train_score=True, n_iter=1000, n_jobs=-1)
# grid_search.fit(train_features, train_labels)

# print(grid_search.best_params_)
# print(grid_search.best_estimator_)

In [25]:
# # import matplotlib
# # matplotlib.rc("font",family='Times New Roman')

# clf_list = [log_reg, rdf_clf, voting_knn_clf, knn_clf, svc_clf]
# name_list = ['log_reg', 'rdf_clf', 'voting_knn_clf', 'knn_clf', 'svc_clf']
# color_list = ['b', 'm', 'c', 'g', 'y', 'r', 'c', 'k']
# plt.rcParams['figure.figsize'] = (25, 25)
# # plt.rc('font', family='Times New Roman')

# for i, model in enumerate(clf_list):
# #     y_probas_rdf = cross_val_predict(model, train_features, train_labels_named, cv=3, method="predict_proba")
# #     y_scores_rdf = y_probas_rdf[:, 0]
# #     precisions, recalls, thresholds = precision_recall_curve((train_labels_named != "others"), y_scores_rdf)
# #     plt.plot(recalls[:-1], precisions[:-1], color_list[i]+"--", label=name_list[i]+'_train')

#     y_probas_rdf = cross_val_predict(model, test_features, test_labels_named, cv=3, method="predict_proba")
#     y_scores_rdf = y_probas_rdf[:, 0]
#     precisions, recalls, thresholds = precision_recall_curve((test_labels_named != "others"), y_scores_rdf)
#     plt.plot(recalls[:-1], precisions[:-1], color_list[i]+"-", label=name_list[i]+'_test')

# # font = {
# #     'family':'Times New Roman',
# #     'weight':'normal',
# #     'size':10
# # }

# font = FontProperties(fname=r"/usr/share/fonts/Times-New-Roman/TIMES.TTF", size=25)

# plt.xlim((0, 1))
# plt.ylim((0, 1))
# plt.xticks(FontProperties=font)
# plt.yticks(FontProperties=font)
# plt.xlabel('Recall', FontProperties=font)
# plt.ylabel('Precision', FontProperties=font)
# plt.grid()
# plt.legend(loc='best', prop=font)
# plt.show()

In [26]:
# clf_list = [log_reg, rdf_clf, voting_knn_clf, knn_clf, svc_clf]
# name_list = ['log_reg', 'rdf_clf', 'voting_knn_clf', 'knn_clf', 'svc_clf']
# color_list = ['b', 'm', 'c', 'g', 'y', 'r', 'c', 'k']
# plt.rcParams['figure.figsize'] = (25, 25)
# # plt.rc('font', family='Times New Roman')

# for i, model in enumerate(clf_list):
# #     y_probas_rdf = cross_val_predict(model, train_features, train_labels_named, cv=3, method="predict_proba")
# #     y_scores_rdf = y_probas_rdf[:, 0]
# #     fpr, tpr, thresholds = roc_curve((train_labels_named != "others"), y_scores_rdf)
# #     auc = roc_auc_score((train_labels_named != "others"), y_scores_rdf)
# #     plt.plot(fpr, tpr, color_list[i]+"--", label=name_list[i]+'_train (auc=%.2f)' % auc)

#     y_probas_rdf = cross_val_predict(model, test_features, test_labels_named, cv=3, method="predict_proba")
#     y_scores_rdf = y_probas_rdf[:, 0]
#     fpr, tpr, thresholds = roc_curve((test_labels_named != "others"), y_scores_rdf)
#     auc = roc_auc_score((test_labels_named != "others"), y_scores_rdf)
#     plt.plot(fpr, tpr, color_list[i]+"-", label=name_list[i]+'_test (auc=%.2f)' % auc)

# # font = {
# #     'family':'Times New Roman',
# #     'weight':'normal',
# #     'size':10
# # }

# font = FontProperties(fname=r"/usr/share/fonts/Times-New-Roman/TIMES.TTF", size=25)

# plt.xlim((0, 1))
# plt.ylim((0, 1))
# plt.xticks(FontProperties=font)
# plt.yticks(FontProperties=font)
# plt.xlabel('1-Specificity (False Positive Rate)', FontProperties=font)
# plt.ylabel('Sensitivity (True Positive Rate) (Recall)', FontProperties=font)
# plt.grid()
# plt.legend(loc='best', prop=font)
# plt.show()

In [27]:
# param_range = range(100, 5000, 100)
# train_loss, test_loss = validation_curve(
#     RandomForestClassifier(max_features=7, max_samples=0.95, random_state=42, n_jobs=-1),
#     train_features,
#     train_labels,
#     param_name='n_estimators',
#     param_range=param_range,
#     cv=5,
#     scoring='neg_mean_squared_error',
#     n_jobs=-1
# )

# train_loss_mean = -np.mean(train_loss, axis=1)
# test_loss_mean = -np.mean(test_loss, axis=1)

# plt.plot(param_range, train_loss_mean, 'o-', color='r', label='Training')
# plt.plot(param_range, test_loss_mean, 'o-', color='g', label='Cross-validation')

# plt.xlabel('max features')
# plt.ylabel('loss')
# plt.legend(loc='best')