In [6]:
import sys
# 直接导入Smote是不行的，必须将路径加入才可以
sys.path.append("C:Users\perma\PycharmProjects\paper")
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
from algorithm.SMOTE import Smote

In [2]:
names = ["Sex", "Length", "Diameter", "Height", "Whole", "Shucked", "Viscera", "Shell", "class"]
major_class = list(range(1, 14))
data_path = "C:\\Users\\perma\\PycharmProjects\\paper\\data\\abalone.data"

table = pd.read_table(filepath_or_buffer=data_path,
                      header=None,
                      index_col=None,
                      names=names,
                      sep=","
                      )
label = []
for l in table["class"]:
    if l in major_class:
        label.append(0.0)
    else:
        label.append(1.0)
dummies = pd.get_dummies(data=table, prefix=["Sex"], columns=["Sex"])
attributes = ["Sex_F", "Sex_I", "Sex_M", "Length", "Diameter", "Height", "Whole", "Shucked", "Viscera", "Shell"]    
data = dummies[attributes]

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import train_test_split

In [4]:
knn = KNeighborsClassifier()
dc = DecisionTreeClassifier()
svm = SVC()
lg = LogisticRegression()
nb = GaussianNB()
algorithm = {"knn": knn, "tree": dc, "svm": svm, "lgs": lg, "nb": nb}
alg_params = {"knn": {"n_neighbors": 7}, 
              "tree": {}, 
              "svm": {}, 
              "lgs": {}, 
              "nb": {}}


In [7]:
# 将少数类和多数类分开提取出来
major = []
major_label = []
minor = []
minor_label = []
for label_index in range(len(label)):
    if label[label_index] == 1.0:
        minor.append(data.iloc[label_index, :])
        minor_label.append(label[label_index])
    else:
        major.append(data.iloc[label_index, :])
        major_label.append(label[label_index])
# 定义gmean函数
def g_mean(ground_truth, prediction):
    matrix = confusion_matrix(y_true=ground_truth, y_pred=prediction)
    tpr = matrix[1, 1] / (matrix[1, 1] + matrix[1, 0])
    tpn = matrix[0, 0] / (matrix[0, 0] + matrix[0, 1])
    return np.sqrt(tpr * tpn)
result = dict([(key, {"gmean": list(), "fscore": list()}) for key in algorithm.keys()])
# 实验次数
n = 100
for i in range(n):
    for key in algorithm.keys():
        alg = algorithm[key]
        alg.set_params(**alg_params[key])
        major_train, major_test, \
        major_label_train, major_label_test \
            = train_test_split(major, major_label, shuffle=True, test_size=0.25)
        minor_train, minor_test, \
        minor_label_train, minor_label_test \
            = train_test_split(minor, minor_label, shuffle=True, test_size=0.25)
        data_train = np.concatenate((major_train, minor_train), axis=0)
        label_train = np.concatenate((major_label_train, minor_label_train), axis=0)
        data_test = np.concatenate((major_test, minor_test), axis=0)
        label_test = np.concatenate((major_label_test, minor_label_test), axis=0)
        # smote生成合成样本
        minor_samples = []
        for label_index in range(len(label_train)):
            if label_train[label_index] == 1.0:
                minor_samples.append(data_train[label_index])
        smote = Smote(minor_samples, 700, 3)
        smote.over_sampling()
        syntheticed_samples = smote.synthetic
        syntheticed_labels = len(syntheticed_samples) * [1.0]
        syntheticed_train = np.concatenate((data_train, syntheticed_samples), axis=0)
        syntheticed_train_label = np.concatenate((label_train, syntheticed_labels), axis=0)
        alg.fit(X=syntheticed_train, y=syntheticed_train_label)
        prediction = alg.predict(X=data_test)
        gmean = g_mean(label_test, prediction)
        fscore = f1_score(y_true=label_test, y_pred=prediction) / 2
        result[key]["gmean"].append(gmean)
        result[key]["fscore"].append(fscore)
new_line = 0       
for alg_key in result.keys():
    new_line += 1
    alg = result[alg_key]
    for score_key in alg.keys():
        mean_score = np.mean(alg[score_key])
        print(alg_key, " : ", score_key, " : ", mean_score, end="-----------------")
    print("\n")


knn  :  gmean  :  0.7504449668191557-----------------knn  :  fscore  :  0.21884566492182794-----------------

tree  :  gmean  :  0.6417807755386297-----------------tree  :  fscore  :  0.19371478231622322-----------------

svm  :  gmean  :  0.7953933344309189-----------------svm  :  fscore  :  0.23302021345215418-----------------

lgs  :  gmean  :  0.8016360129072083-----------------lgs  :  fscore  :  0.24366952339428594-----------------

nb  :  gmean  :  0.650675054362151-----------------nb  :  fscore  :  0.14990715285969874-----------------



In [None]:
# knn  :  gmean  :  0.7504449668191557-----------------knn  :  fscore  :  0.21884566492182794-----------------
# 
# tree  :  gmean  :  0.6417807755386297-----------------tree  :  fscore  :  0.19371478231622322-----------------
# 
# svm  :  gmean  :  0.7953933344309189-----------------svm  :  fscore  :  0.23302021345215418-----------------
# 
# lgs  :  gmean  :  0.8016360129072083-----------------lgs  :  fscore  :  0.24366952339428594-----------------
# 
# nb  :  gmean  :  0.650675054362151-----------------nb  :  fscore  :  0.14990715285969874-----------------