In [1]:
import numpy as np
import pandas as pd
# 导入当前项目路径以便找到自己写的包
import sys
sys.path.append("C:Users\perma\PycharmProjects\paper")
from algorithm.SMOTE import Smote
from kmeans_smote import KMeansSMOTE

In [2]:
def turn_name_to_num(name):
    names = ["ME2", "ME1", "EXC", "VAC", "POX", "ERL"]            
    if name in names:
        return 1.0
    else:
        return 0.0
# 处理数据的总函数
def process(data_path):
    names = ["Sequence Name", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "class"]
    data = pd.read_table(filepath_or_buffer=data_path, 
                         header=None, 
                         index_col=None, 
                         names=names, 
                         sep="\s+")
    data["class"] = data["class"].apply(turn_name_to_num)
    columns = ["mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc"]
    data_ = data[columns]
    label = data["class"]
    return data_, label
root_path = "C:\\Users\\perma\\PycharmProjects\\paper"
data_path = root_path + "\\data\\yeast.data"
data, label = process(data_path)
print(data.head(3))
print("data shape:", data.shape, "label shape", label.shape)
print(label.head(3))
c1 = 0 
c0 = 0
for c in label:
    if c == 1.0:
        c1 += 1
    else:
        c0 += 1
print("1_count", c1, "0_count", c0)

    mcg   gvh   alm   mit  erl  pox   vac   nuc
0  0.58  0.61  0.47  0.13  0.5  0.0  0.48  0.22
1  0.43  0.67  0.48  0.27  0.5  0.0  0.53  0.22
2  0.64  0.62  0.49  0.15  0.5  0.0  0.53  0.22
data shape: (1484, 8) label shape (1484,)
0    0.0
1    0.0
2    0.0
Name: class, dtype: float64
1_count 185 0_count 1299


  


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import train_test_split

In [4]:
knn = KNeighborsClassifier()
dc = DecisionTreeClassifier()
svm = SVC()
lg = LogisticRegression()
nb = GaussianNB()
algorithm = {"knn": knn, "tree": dc, "svm": svm, "lgs": lg, "nb": nb}
alg_params = {"knn": {"n_neighbors": 7}, 
              "tree": {}, 
              "svm": {}, 
              "lgs": {}, 
              "nb": {}}


In [5]:
# 这种实验有很多预测对的少数类样本是属于生成的样本，并不能代表对实际少数类的真实预测情况
# original的可以代表对真实的预测情况
# 所以决定用合成少数类来训练样本，把真实的样本留在预测
# 或者预留一部分作为测试集
# 设实验次数为n
# for i in range(n):
#    将数据集分为训练集和测试集，每个类样本的在训练集和测试集比例为3：1
#    在训练集上合成数据训练模型
#    在测试集上测试结果
#取n次实验结果的平均值为最终成绩

# 我自己的算法不一样需要改一下

In [8]:
# 定义gmean函数
def g_mean(ground_truth, prediction):
    matrix = confusion_matrix(y_true=ground_truth, y_pred=prediction)
    tpr = matrix[1, 1] / (matrix[1, 1] + matrix[1, 0])
    tpn = matrix[0, 0] / (matrix[0, 0] + matrix[0, 1])
    return np.sqrt(tpr * tpn)
# 存储每次结果
result = dict([(key, {"gmean": list(), "fscore": list()}) for key in algorithm.keys()])
# n 为实验次数
n = 100
major = []
minor = []
major_label = []
minor_label = []
# 先将样本分为多数类和少数类
for index in range(len(label)):
    if label[index] == 1.0:
        minor.append(data.iloc[index, :])
        minor_label.append(1.0)
    else:
        major.append(data.iloc[index, :])
        major_label.append(0.0)
for i in range(n):
    # 先将数据集分为训练和测试集
    major_train, major_test, major_label_train, major_label_test = train_test_split(major, major_label, 
                                                                                    test_size=0.25, shuffle=True)
    minor_train, minor_test, minor_label_train, minor_label_test = train_test_split(minor, minor_label, 
                                                                                    test_size=0.25, shuffle=True)
    train = np.concatenate((major_train, minor_train), axis=0)
    train_label = np.concatenate((major_label_train, minor_label_train), axis=0)
    test = np.concatenate((major_test, minor_test))
    test_label = np.concatenate((major_label_test, minor_label_test), axis=0)
    # smote生成合成样本
    kmsmote = KMeansSMOTE(kmeans_args={"n_clusters": 12}, smote_args={"sampling_strategy": {"1.0": 0, "0.0": 0}})
    # [print('Class {} has {} instances'.format(label, count))
    #  for label, count in zip(*np.unique(train_label, return_counts=True))]
    X_resampled, y_resampled = kmsmote.fit_sample(X=train, y=train_label)
    
    # [print('Class {} has {} instances after oversampling'.format(label, count))
    #  for label, count in zip(*np.unique(y_resampled, return_counts=True))]
    # 对每个算法都测试一遍
    for key in algorithm.keys():
        alg = algorithm[key]
        alg.set_params(**alg_params[key])
        alg.fit(X=X_resampled, y=y_resampled)
        prediction = alg.predict(X=test)
        gmean = g_mean(ground_truth=test_label, prediction=prediction)
        fscore = f1_score(y_true=test_label, y_pred=prediction) / 2
        result[key]["gmean"].append(gmean)
        result[key]["fscore"].append(fscore)
new_line = 0       
for alg_key in result.keys():
    new_line += 1
    alg = result[alg_key]
    for score_key in alg.keys():
        mean_score = np.mean(alg[score_key])
        print(alg_key, " : ", score_key, " : ", mean_score, end="-----------------")
    print("\n")


TypeError: __init__() got an unexpected keyword argument 'sampling_strategy'

In [7]:
# n_clusters = 1
# knn  :  gmean  :  0.8169121346575019-----------------knn  :  fscore  :  0.2806657313528035-----------------
# 
# tree  :  gmean  :  0.7882274761327897-----------------tree  :  fscore  :  0.3015642769259447-----------------
# 
# svm  :  gmean  :  0.8583466348159735-----------------svm  :  fscore  :  0.3441947665502315-----------------
# 
# lgs  :  gmean  :  0.8441549048323954-----------------lgs  :  fscore  :  0.3095844282370161-----------------
# 
# nb  :  gmean  :  0.8219789497921003-----------------nb  :  fscore  :  0.34656338566223915-----------------

# # n_clusters = 2
# knn  :  gmean  :  0.8140924428850602-----------------knn  :  fscore  :  0.27787785996091147-----------------
# 
# tree  :  gmean  :  0.7883566993625428-----------------tree  :  fscore  :  0.30369605324415705-----------------
# 
# svm  :  gmean  :  0.8459995712931879-----------------svm  :  fscore  :  0.3372046606130744-----------------
# 
# lgs  :  gmean  :  0.8349708056055607-----------------lgs  :  fscore  :  0.3032826709627724-----------------
# 
# nb  :  gmean  :  0.8177876420893133-----------------nb  :  fscore  :  0.34209945150205157-----------------

# n_clusters = 3
# knn  :  gmean  :  0.8177417782612008-----------------knn  :  fscore  :  0.2968072245584845-----------------
# 
# tree  :  gmean  :  0.785262551483971-----------------tree  :  fscore  :  0.3040813736840432-----------------
# 
# svm  :  gmean  :  0.8271859415284581-----------------svm  :  fscore  :  0.3331556544257589-----------------
# 
# lgs  :  gmean  :  0.8306021669179209-----------------lgs  :  fscore  :  0.31215104556132617-----------------
# 
# nb  :  gmean  :  0.8023185051804136-----------------nb  :  fscore  :  0.33746510232386184-----------------

# n_clusters = 4
# knn  :  gmean  :  0.8249917744585489-----------------knn  :  fscore  :  0.3279141985434134-----------------
# 
# tree  :  gmean  :  0.7888321821314982-----------------tree  :  fscore  :  0.3106584419675073-----------------
# 
# svm  :  gmean  :  0.7924976638209114-----------------svm  :  fscore  :  0.3227048422259556-----------------
# 
# lgs  :  gmean  :  0.8209244701226502-----------------lgs  :  fscore  :  0.3240113091365336-----------------
# 
# nb  :  gmean  :  0.7870747690555031-----------------nb  :  fscore  :  0.3307837168507179-----------------

# # n_clusters = 5
# knn  :  gmean  :  0.8240505766687832-----------------knn  :  fscore  :  0.3472241181124913-----------------
# 
# tree  :  gmean  :  0.7857159377250785-----------------tree  :  fscore  :  0.3119403625952194-----------------
# 
# svm  :  gmean  :  0.7853642813067685-----------------svm  :  fscore  :  0.32600069794133235-----------------
# 
# lgs  :  gmean  :  0.8174201594755452-----------------lgs  :  fscore  :  0.33496840820891066-----------------
# 
# nb  :  gmean  :  0.7801091503643919-----------------nb  :  fscore  :  0.3282101530251113-----------------


# # n_clusters = 6
# knn  :  gmean  :  0.8283590202399432-----------------knn  :  fscore  :  0.3618410465659084-----------------
# 
# tree  :  gmean  :  0.7834964828301112-----------------tree  :  fscore  :  0.31447866937396307-----------------
# 
# svm  :  gmean  :  0.790455756330983-----------------svm  :  fscore  :  0.3337750455118675-----------------
# 
# lgs  :  gmean  :  0.8187170345859778-----------------lgs  :  fscore  :  0.3443850618402976-----------------
# 
# nb  :  gmean  :  0.7844552017534018-----------------nb  :  fscore  :  0.33178830872861154-----------------

# n_clusters = 7
# 
# knn  :  gmean  :  0.8227873816851792-----------------knn  :  fscore  :  0.36142070063160653-----------------
# 
# tree  :  gmean  :  0.787063753391758-----------------tree  :  fscore  :  0.31631817376553817-----------------
# 
# svm  :  gmean  :  0.7559139306861798-----------------svm  :  fscore  :  0.3181865600614931-----------------
# 
# lgs  :  gmean  :  0.8072749688963242-----------------lgs  :  fscore  :  0.3415735015700721-----------------
# 
# nb  :  gmean  :  0.7655671828302792-----------------nb  :  fscore  :  0.3237989925350215-----------------

# # n_clusters = 8
# knn  :  gmean  :  0.8221095667145468-----------------knn  :  fscore  :  0.36731704217169564-----------------
# 
# tree  :  gmean  :  0.787082787012185-----------------tree  :  fscore  :  0.3173117513411237-----------------
# 
# svm  :  gmean  :  0.7277366011939228-----------------svm  :  fscore  :  0.30895516345703256-----------------
# 
# lgs  :  gmean  :  0.7962212186830838-----------------lgs  :  fscore  :  0.343464444611459-----------------
# 
# nb  :  gmean  :  0.7152197344965434-----------------nb  :  fscore  :  0.29794630025168023-----------------

# n_clusters = 9
# knn  :  gmean  :  0.8117734865482531-----------------knn  :  fscore  :  0.36196054264089794-----------------
# 
# tree  :  gmean  :  0.7781387683496215-----------------tree  :  fscore  :  0.31342947066674137-----------------
# 
# svm  :  gmean  :  0.7454608616476601-----------------svm  :  fscore  :  0.3171519591712211-----------------
# 
# lgs  :  gmean  :  0.7997390069707525-----------------lgs  :  fscore  :  0.3432967517519036-----------------
# 
# nb  :  gmean  :  0.7508764370221717-----------------nb  :  fscore  :  0.3176420271950022-----------------

In [None]:
# smote_args= 默认
# knn  :  gmean  :  0.8130865140303786-----------------knn  :  fscore  :  0.36419038980457336-----------------
# 
# tree  :  gmean  :  0.7840846639512865-----------------tree  :  fscore  :  0.31701783192664795-----------------
# 
# svm  :  gmean  :  0.7171187973637008-----------------svm  :  fscore  :  0.30243755052215243-----------------
# 
# lgs  :  gmean  :  0.7972277091475195-----------------lgs  :  fscore  :  0.3448256546379037-----------------
# 
# nb  :  gmean  :  0.7292785721226241-----------------nb  :  fscore  :  0.30911612313358416-----------------

