In [1]:
import sys
# 直接导入Smote是不行的，必须将路径加入才可以
sys.path.append("C:Users\perma\PycharmProjects\paper")
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler 
from kmeans_smote import KMeansSMOTE
import warnings
warnings.filterwarnings('ignore')
from algorithm.km_smote import Over_Sample
from algorithm.under_sample import Under_Sample

In [2]:
names = ["Sex", "Length", "Diameter", "Height", "Whole", "Shucked", "Viscera", "Shell", "class"]
major_class = list(range(1, 14))
data_path = "C:\\Users\\perma\\PycharmProjects\\paper\\data\\abalone.data"

table = pd.read_table(filepath_or_buffer=data_path,
                      header=None,
                      index_col=None,
                      names=names,
                      sep=","
                      )
label = []
for l in table["class"]:
    if l in major_class:
        label.append(0.0)
    else:
        label.append(1.0)
dummies = pd.get_dummies(data=table, prefix=["Sex"], columns=["Sex"])
attributes = ["Sex_F", "Sex_I", "Sex_M", "Length", "Diameter", "Height", "Whole", "Shucked", "Viscera", "Shell"]    
data = dummies[attributes]

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, make_scorer, f1_score
from sklearn.model_selection import train_test_split



In [4]:
# n 为实验次数
n = 100
# 集成的模型个数
ensemble = 3
keys = ["knn", "tree", "svm", "lgs", "nb"]
constructor = {"knn": KNeighborsClassifier, "tree": DecisionTreeClassifier, "svm": SVC, 
               "lgs": LogisticRegression, "nb": GaussianNB}
algorithm_args = {"knn": {"n_neighbors": 7}, "tree": {}, "svm": {}, "lgs": {}, "nb": {}}
algorithm = dict([(key, list()) for key in keys])
for key in keys:
    for i in range(ensemble):
        construc = constructor[key]
        args = algorithm_args[key]
        alg = construc(**args)
        algorithm[key].append(alg)


In [15]:
# 定义gmean函数
def g_mean(ground_truth, prediction):
    matrix = confusion_matrix(y_true=ground_truth, y_pred=prediction)
    tpr = matrix[1, 1] / (matrix[1, 1] + matrix[1, 0])
    tpn = matrix[0, 0] / (matrix[0, 0] + matrix[0, 1])
    return np.sqrt(tpr * tpn)
# 存储每次结果
result = dict([(key, {"gmean": list(), "fscore": list()}) for key in algorithm.keys()])
major = []
minor = []
major_label = []
minor_label = []
# 先将样本分为多数类和少数类
for index in range(len(label)):
    if label[index] == 1.0:
        minor.append(data.iloc[index, :].tolist())
        minor_label.append(1.0)
    else:
        major.append(data.iloc[index, :].tolist())
        major_label.append(0.0)
for i in range(n):
    if i % 25 == 0:
        print("round: ", i)
    # 先将数据集分为训练和测试集
    major_train, major_test, major_label_train, major_label_test = train_test_split(major, major_label, 
                                                                                    test_size=0.25, shuffle=True)
    minor_train, minor_test, minor_label_train, minor_label_test = train_test_split(minor, minor_label, 
                                                                                    test_size=0.25, shuffle=True)
    train = np.concatenate((major_train, minor_train), axis=0)
    train_label = np.concatenate((major_label_train, minor_label_train), axis=0)
    test = np.concatenate((major_test, minor_test))
    test_label = np.concatenate((major_label_test, minor_label_test), axis=0)
    predictions = dict((key, list()) for key in keys)
    for j in range(ensemble):
        kmeans_arg = {"n_clusters": 20}
        over_sampler = Over_Sample(data=train, label=train_label, n=4, categorical_features=[0, 1, 2], **kmeans_arg)
        syntheticed_samples = over_sampler.do_synthetic()
        syntheticed_labels = len(syntheticed_samples) * [1.0]
        under_sampler = Under_Sample(major=major_train, major_label=major_label_train, synthetics=syntheticed_samples,
                                     synthetics_label=syntheticed_labels, categorical_features=[0, 1, 2], rate=1.0,
                                     **kmeans_arg)
        under_samples = under_sampler.do_undersample()
        under_labels = len(under_samples) * [0.0]
        over_under_samples = np.concatenate((minor_train, syntheticed_samples, under_samples), axis=0)
        over_under_labels = np.concatenate((minor_label_train, syntheticed_labels, under_labels), axis=0)
        for key in keys:
            for alg in algorithm[key]:
                alg.fit(X=over_under_samples, y=over_under_labels)
                prediction = alg.predict(X=test)
                predictions[key].append(prediction)
    for key in keys:
        prediction = predictions[key]
        pre = []
        for z, x, c in zip(prediction[0], prediction[1], prediction[2]):
            if z == x:
                pre.append(z)
            else:
                pre.append(c)
        fscore = f1_score(y_true=test_label, y_pred=pre) / 2
        gmean = g_mean(ground_truth=test_label, prediction=pre)
        result[key]["fscore"].append(fscore)
        result[key]["gmean"].append(gmean)
new_line = 0       
for alg_key in result.keys():
    new_line += 1
    alg = result[alg_key]
    for score_key in alg.keys():
        mean_score = np.mean(alg[score_key])
        print(alg_key, " : ", score_key, " : ", mean_score, end="-----------------")
    print("\n")


round:  0


round:  25


round:  50


round:  75


knn  :  gmean  :  0.5797634314419094-----------------knn  :  fscore  :  0.1941089100329932-----------------

tree  :  gmean  :  0.4423392695868095-----------------tree  :  fscore  :  0.1342254246877389-----------------

svm  :  gmean  :  0.0-----------------svm  :  fscore  :  0.0-----------------

lgs  :  gmean  :  0.4532615035456588-----------------lgs  :  fscore  :  0.16039667055667844-----------------

nb  :  gmean  :  0.6360459488413123-----------------nb  :  fscore  :  0.14442101485899678-----------------

