In [1]:
import numpy as np
import pandas as pd
import sys
sys.path.append("C:Users\perma\PycharmProjects\paper")

In [2]:
# 将少数类样本的类标签变为数字
def turn_name_to_num(name):
    names = ["ME2", "ME1", "EXC", "VAC", "POX", "ERL"]            
    if name in names:
        return 1.0
    else:
        return 0.0
# 处理数据的总函数
def process(data_path):
    names = ["Sequence Name", "mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc", "class"]
    data = pd.read_table(filepath_or_buffer=data_path, 
                         header=None, 
                         index_col=None, 
                         names=names, 
                         sep="\s+")
    data["class"] = data["class"].apply(turn_name_to_num)
    columns = ["mcg", "gvh", "alm", "mit", "erl", "pox", "vac", "nuc"]
    data_ = data[columns]
    label = data["class"]
    return data_, label

In [3]:
root_path = "C:\\Users\\perma\\PycharmProjects\\paper"
data_path = root_path + "\\data\\yeast.data"
data, label = process(data_path)

  from ipykernel import kernelapp as app


In [4]:
print(data.head(3))
print("data shape:", data.shape, "label shape", label.shape)
print(label.head(3))
c1 = 0 
c0 = 0
for c in label:
    if c == 1.0:
        c1 += 1
    else:
        c0 += 1
print("1_count", c1, "0_count", c0)

    mcg   gvh   alm   mit  erl  pox   vac   nuc
0  0.58  0.61  0.47  0.13  0.5  0.0  0.48  0.22
1  0.43  0.67  0.48  0.27  0.5  0.0  0.53  0.22
2  0.64  0.62  0.49  0.15  0.5  0.0  0.53  0.22
data shape: (1484, 8) label shape (1484,)
0    0.0
1    0.0
2    0.0
Name: class, dtype: float64
1_count 185 0_count 1299


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import train_test_split

In [6]:
knn = KNeighborsClassifier()
dc = DecisionTreeClassifier()
svm = SVC()
lg = LogisticRegression()
nb = GaussianNB()
algorithm = {"knn": knn, "tree": dc, "svm": svm, "lgs": lg, "nb": nb}
alg_params = {"knn": {"n_neighbors": 7}, 
              "tree": {}, 
              "svm": {}, 
              "lgs": {}, 
              "nb": {}}


In [7]:
knn_params = {"n_neighbors": (3, 5, 7, 9, 13, 15, 17, 19)}
tree_params = {"criterion": ("gini", "entropy")}
svm_params = {"degree": (3, 4, 5, 6, 7, 8)}
lg_params = {"penalty": ("l1", "l2")}
nb_params = {}

In [8]:
# 得到knn算法最好的n_neighbor参数为7
grid = GridSearchCV(estimator=knn, param_grid=knn_params, scoring="f1", cv=5)
grid.fit(X=data, y=label)
grid.best_params_


In [11]:
# 将少数类和多数类分开提取出来
major = []
major_label = []
minor = []
minor_label = []
for label_index in range(len(label)):
    if label[label_index] == 1.0:
        minor.append(data.iloc[label_index, :])
        minor_label.append(label[label_index])
    else:
        major.append(data.iloc[label_index, :])
        major_label.append(label[label_index])
        

In [13]:
# 定义gmean函数
def g_mean(ground_truth, prediction):
    matrix = confusion_matrix(y_true=ground_truth, y_pred=prediction)
    tpr = matrix[1, 1] / (matrix[1, 1] + matrix[1, 0])
    tpn = matrix[0, 0] / (matrix[0, 0] + matrix[0, 1])
    return np.sqrt(tpr * tpn)
result = dict([(key, {"gmean": list(), "fscore": list()}) for key in algorithm.keys()])
# 实验次数
n=100
for i in range(n):
    for key in algorithm.keys():
        alg = algorithm[key]
        alg.set_params(**alg_params[key])
        major_train, major_test, \
        major_label_train, major_label_test \
            = train_test_split(major, major_label, shuffle=True, test_size=0.25)
        minor_train, minor_test, \
        minor_label_train, minor_label_test \
            = train_test_split(minor, minor_label, shuffle=True, test_size=0.25)
        data_train = np.concatenate((major_train, minor_train), axis=0)
        label_train = np.concatenate((major_label_train, minor_label_train), axis=0)
        data_test = np.concatenate((major_test, minor_test), axis=0)
        label_test = np.concatenate((major_label_test, minor_label_test), axis=0)
        alg.fit(X=data_train, y=label_train)
        prediction = alg.predict(X=data_test)
        gmean = g_mean(label_test, prediction)
        fscore = f1_score(y_true=label_test, y_pred=prediction) / 2
        result[key]["gmean"].append(gmean)
        result[key]["fscore"].append(fscore)
new_line = 0       
for alg_key in result.keys():
    new_line += 1
    alg = result[alg_key]
    for score_key in alg.keys():
        mean_score = np.mean(alg[score_key])
        print(alg_key, " : ", score_key, " : ", mean_score, end="-----------------")
    print("\n")
    
    

knn  :  gmean  :  0.7789863597508483-----------------knn  :  fscore  :  0.35595590780611425-----------------

tree  :  gmean  :  0.7757417570109364-----------------tree  :  fscore  :  0.31180409066606035-----------------

svm  :  gmean  :  0.37795288714941416-----------------svm  :  fscore  :  0.12716273733057648-----------------

lgs  :  gmean  :  0.624680951757748-----------------lgs  :  fscore  :  0.2773484905534051-----------------

nb  :  gmean  :  0.7551352636441806-----------------nb  :  fscore  :  0.3387135084266024-----------------



alg: knn   fscore:  0.35728248455488953
alg: tree   fscore:  0.30652186705583273
alg: svm   fscore:  0.131164988681498
alg: lgs   fscore:  0.2757027674457761
alg: nb   fscore:  0.34575272850003214


alg: knn   gmean_score:  0.7810731699159459
alg: tree   gmean_score:  0.7743889555021399
alg: svm   gmean_score:  0.38365806518827805
alg: lgs   gmean_score:  0.6213288758758633
alg: nb   gmean_score:  0.7662581729709167


3