In [1]:
import numpy as np
import pandas as pd
# 导入当前项目路径以便找到自己写的包
import sys
sys.path.append("C:Users\perma\PycharmProjects\paper")
from algorithm.SMOTE import Smote
from algorithm.border_line_smote import BSMOTE

In [2]:
def turn_name_to_num(name):
    names = ["ME2", "ME1", "EXC", "VAC", "POX", "ERL"]            
    if name in names:
        return 1.0
    else:
        return 0.0
# 处理数据的总函数
def process(data_path):
    names = ["Sequence Name", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "class"]
    data = pd.read_table(filepath_or_buffer=data_path, 
                         header=None, 
                         index_col=None, 
                         names=names, 
                         sep="\s+")
    data["class"] = data["class"].apply(turn_name_to_num)
    columns = ["mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc"]
    data_ = data[columns]
    label = data["class"]
    return data_, label
root_path = "C:\\Users\\perma\\PycharmProjects\\paper"
data_path = root_path + "\\data\\yeast.data"
data, label = process(data_path)
print(data.head(3))
print("data shape:", data.shape, "label shape", label.shape)
print(label.head(3))
c1 = 0 
c0 = 0
for c in label:
    if c == 1.0:
        c1 += 1
    else:
        c0 += 1
print("1_count", c1, "0_count", c0)

    mcg   gvh   alm   mit  erl  pox   vac   nuc
0  0.58  0.61  0.47  0.13  0.5  0.0  0.48  0.22
1  0.43  0.67  0.48  0.27  0.5  0.0  0.53  0.22
2  0.64  0.62  0.49  0.15  0.5  0.0  0.53  0.22
data shape: (1484, 8) label shape (1484,)
0    0.0
1    0.0
2    0.0
Name: class, dtype: float64
1_count 185 0_count 1299


  


In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import train_test_split

In [4]:
knn = KNeighborsClassifier()
dc = DecisionTreeClassifier()
svm = SVC()
lg = LogisticRegression()
nb = GaussianNB()
algorithm = {"knn": knn, "tree": dc, "svm": svm, "lgs": lg, "nb": nb}
alg_params = {"knn": {"n_neighbors": 7}, 
              "tree": {}, 
              "svm": {}, 
              "lgs": {}, 
              "nb": {}}

In [5]:
# 这种实验有很多预测对的少数类样本是属于生成的样本，并不能代表对实际少数类的真实预测情况
# original的可以代表对真实的预测情况
# 所以决定用合成少数类来训练样本，把真实的样本留在预测
# 或者预留一部分作为测试集
# 设实验次数为n
# for i in range(n):
#    将数据集分为训练集和测试集，每个类样本的在训练集和测试集比例为3：1
#    在训练集上合成数据训练模型
#    在测试集上测试结果
#取n次实验结果的平均值为最终成绩

# 我自己的算法不一样需要改一下

In [8]:
# 定义gmean函数
def g_mean(ground_truth, prediction):
    matrix = confusion_matrix(y_true=ground_truth, y_pred=prediction)
    tpr = matrix[1, 1] / (matrix[1, 1] + matrix[1, 0])
    tpn = matrix[0, 0] / (matrix[0, 0] + matrix[0, 1])
    return np.sqrt(tpr * tpn)
# 存储每次结果
result = dict([(key, {"gmean": list(), "fscore": list()}) for key in algorithm.keys()])
# n 为实验次数
n = 100
major = []
minor = []
major_label = []
minor_label = []
# 先将样本分为多数类和少数类
for index in range(len(label)):
    if label[index] == 1.0:
        minor.append(data.iloc[index, :])
        minor_label.append(1.0)
    else:
        major.append(data.iloc[index, :])
        major_label.append(0.0)
for i in range(n):
    # 先将数据集分为训练和测试集
    major_train, major_test, major_label_train, major_label_test = train_test_split(major, major_label, 
                                                                                    test_size=0.25, shuffle=True)
    minor_train, minor_test, minor_label_train, minor_label_test = train_test_split(minor, minor_label, 
                                                                                    test_size=0.25, shuffle=True)
    train = np.concatenate((major_train, minor_train), axis=0)
    train_label = np.concatenate((major_label_train, minor_label_train), axis=0)
    test = np.concatenate((major_test, minor_test))
    test_label = np.concatenate((major_label_test, minor_label_test), axis=0)
    # smote生成合成样本
    bsmote = BSMOTE(data=data.values, label=label.tolist(), K=1, sample_ratio=700)
    syntheticed_samples = bsmote.over_sample()
    syntheticed_labels = len(syntheticed_samples) * [1.0]
    syntheticed_train = np.concatenate((train, syntheticed_samples), axis=0)
    syntheticed_train_label = np.concatenate((train_label, syntheticed_labels), axis=0)
    # 对每个算法都测试一遍
    for key in algorithm.keys():
        alg = algorithm[key]
        alg.set_params(**alg_params[key])
        alg.fit(X=syntheticed_train, y=syntheticed_train_label)
        prediction = alg.predict(X=test)
        gmean = g_mean(ground_truth=test_label, prediction=prediction)
        fscore = f1_score(y_true=test_label, y_pred=prediction) / 2
        result[key]["gmean"].append(gmean)
        result[key]["fscore"].append(fscore)
new_line = 0       
for alg_key in result.keys():
    new_line += 1
    alg = result[alg_key]
    for score_key in alg.keys():
        mean_score = np.mean(alg[score_key])
        print(alg_key, " : ", score_key, " : ", mean_score, end="-----------------")
    print("\n")


knn  :  gmean  :  0.8398893737570391-----------------knn  :  fscore  :  0.36734741095559864-----------------

tree  :  gmean  :  0.8152896506109473-----------------tree  :  fscore  :  0.30562540771322605-----------------

svm  :  gmean  :  0.8371460346475702-----------------svm  :  fscore  :  0.3747214546526932-----------------

lgs  :  gmean  :  0.8451608257854448-----------------lgs  :  fscore  :  0.36127969670961746-----------------

nb  :  gmean  :  0.7501498491145482-----------------nb  :  fscore  :  0.3310565152460449-----------------



In [9]:
# # K = 2
# sample_ratio = 700
# knn  :  gmean  :  0.8424752755807984-----------------knn  :  fscore  :  0.37828882426098615-----------------
# 
# tree  :  gmean  :  0.8318476022482021-----------------tree  :  fscore  :  0.33640695876489857-----------------
# 
# svm  :  gmean  :  0.8439824558899357-----------------svm  :  fscore  :  0.36216013952759546-----------------
# 
# lgs  :  gmean  :  0.8406605242457555-----------------lgs  :  fscore  :  0.35629085501026125-----------------
# 
# nb  :  gmean  :  0.8204527031844523-----------------nb  :  fscore  :  0.3616999670677615-----------------

In [None]:
# K = 1
# sample_ratio = 700
# knn  :  gmean  :  0.8398893737570391-----------------knn  :  fscore  :  0.36734741095559864-----------------
# 
# tree  :  gmean  :  0.8152896506109473-----------------tree  :  fscore  :  0.30562540771322605-----------------
# 
# svm  :  gmean  :  0.8371460346475702-----------------svm  :  fscore  :  0.3747214546526932-----------------
# 
# lgs  :  gmean  :  0.8451608257854448-----------------lgs  :  fscore  :  0.36127969670961746-----------------
# 
# nb  :  gmean  :  0.7501498491145482-----------------nb  :  fscore  :  0.3310565152460449-------------
# 
# 
# 
# 
# 
# 
# 
# 
# 
# 
# ---