In [1]:
import sys
# 直接导入Smote是不行的，必须将路径加入才可以
sys.path.append("C:Users\perma\PycharmProjects\paper")
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler 
from kmeans_smote import KMeansSMOTE
import warnings
warnings.filterwarnings('ignore')

In [2]:
names = ["Sex", "Length", "Diameter", "Height", "Whole", "Shucked", "Viscera", "Shell", "class"]
major_class = list(range(1, 14))
data_path = "C:\\Users\\perma\\PycharmProjects\\paper\\data\\abalone.data"

table = pd.read_table(filepath_or_buffer=data_path,
                      header=None,
                      index_col=None,
                      names=names,
                      sep=","
                      )
label = []
for l in table["class"]:
    if l in major_class:
        label.append(0.0)
    else:
        label.append(1.0)
dummies = pd.get_dummies(data=table, prefix=["Sex"], columns=["Sex"])
attributes = ["Sex_F", "Sex_I", "Sex_M", "Length", "Diameter", "Height", "Whole", "Shucked", "Viscera", "Shell"]    
data = dummies[attributes]

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import train_test_split

In [4]:
knn = KNeighborsClassifier()
dc = DecisionTreeClassifier()
svm = SVC()
lg = LogisticRegression()
nb = GaussianNB()
algorithm = {"knn": knn, "tree": dc, "svm": svm, "lgs": lg, "nb": nb}
alg_params = {"knn": {"n_neighbors": 7}, 
              "tree": {}, 
              "svm": {}, 
              "lgs": {}, 
              "nb": {}}

In [25]:
# 将少数类和多数类分开提取出来
major = []
major_label = []
minor = []
minor_label = []
for label_index in range(len(label)):
    if label[label_index] == 1.0:
        minor.append(data.iloc[label_index, :])
        minor_label.append(label[label_index])
    else:
        major.append(data.iloc[label_index, :])
        major_label.append(label[label_index])
# 定义gmean函数
def g_mean(ground_truth, prediction):
    matrix = confusion_matrix(y_true=ground_truth, y_pred=prediction)
    tpr = matrix[1, 1] / (matrix[1, 1] + matrix[1, 0])
    tpn = matrix[0, 0] / (matrix[0, 0] + matrix[0, 1])
    return np.sqrt(tpr * tpn)
result = dict([(key, {"gmean": list(), "fscore": list()}) for key in algorithm.keys()])
# 实验次数
n=100
for i in range(n):
    for key in algorithm.keys():
        alg = algorithm[key]
        alg.set_params(**alg_params[key])
        major_train, major_test, \
        major_label_train, major_label_test \
            = train_test_split(major, major_label, shuffle=True, test_size=0.25)
        minor_train, minor_test, \
        minor_label_train, minor_label_test \
            = train_test_split(minor, minor_label, shuffle=True, test_size=0.25)
        data_train = np.concatenate((major_train, minor_train), axis=0)
        label_train = np.concatenate((major_label_train, minor_label_train), axis=0)
        data_test = np.concatenate((major_test, minor_test), axis=0)
        label_test = np.concatenate((major_label_test, minor_label_test), axis=0)
        # smote生成合成样本
        kmsmote = KMeansSMOTE(kmeans_args={"n_clusters": 13})
        X_resampled, y_resampled = kmsmote.fit_sample(X=data_train, y=label_train)
        alg.fit(X=X_resampled, y=y_resampled)
        prediction = alg.predict(X=data_test)
        gmean = g_mean(label_test, prediction)
        fscore = f1_score(y_true=label_test, y_pred=prediction) / 2
        result[key]["gmean"].append(gmean)
        result[key]["fscore"].append(fscore)
new_line = 0       
for alg_key in result.keys():
    new_line += 1
    alg = result[alg_key]
    for score_key in alg.keys():
        mean_score = np.mean(alg[score_key])
        print(alg_key, " : ", score_key, " : ", mean_score, end="-----------------")
    print("\n")


knn  :  gmean  :  0.754662156163118-----------------knn  :  fscore  :  0.22225046846989108-----------------

tree  :  gmean  :  0.6451773725362391-----------------tree  :  fscore  :  0.19283284049713842-----------------

svm  :  gmean  :  0.7974828831103404-----------------svm  :  fscore  :  0.23859532612103265-----------------

lgs  :  gmean  :  0.7994636590671178-----------------lgs  :  fscore  :  0.24749471408067653-----------------

nb  :  gmean  :  0.6510763032002435-----------------nb  :  fscore  :  0.15029748958133843-----------------



In [None]:
# 默认参数
# knn  :  gmean  :  0.752408739660329-----------------knn  :  fscore  :  0.22107652908375006-----------------
# 
# tree  :  gmean  :  0.6446628695167719-----------------tree  :  fscore  :  0.19235649791945825-----------------
# 
# svm  :  gmean  :  0.7970035857252608-----------------svm  :  fscore  :  0.239429909909843-----------------
# 
# lgs  :  gmean  :  0.7999699416505978-----------------lgs  :  fscore  :  0.24767590254640715-----------------
# 
# nb  :  gmean  :  0.651140282766539-----------------nb  :  fscore  :  0.1500610413263033-----------------