<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
import math

from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

plt.rcParams['font.family']=['Songti SC']
plt.rcParams['axes.unicode_minus'] = False
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.options.display.max_rows = 200

np.set_printoptions(formatter={'float': '{: 0.3f}'.format})
np.set_printoptions(threshold=np.inf)
np.set_printoptions(linewidth=100, suppress=True)

In [28]:
new15 = pd.read_csv("../data/15.csv",index_col=0).round(2)
new21 = pd.read_csv("../data/21.csv",index_col=0).round(2)
new15_under = pd.read_csv("../data/15_under.csv",index_col=0).round(2)

In [29]:
feature_selected = ['environment_tmp', 'int_tmp','yaw_speed','wind_speed','pitch3_moto_tmp','pitch1_moto_tmp','pitch2_moto_tmp',
'power','pitch1_angle','pitch3_angle','pitch2_angle','generator_speed']
feature_gen = ['wind_speed.div.power', 'power.div.generator_speed','generator_speed.div.wind', 'mean_pitch_angle', 'tmp_diff']
feature_all = new15_under.columns.to_list()

In [30]:
feature_use = feature_selected + feature_gen

In [31]:
# 选择特征
new15f = new15_under[feature_use+['label']]
new21f = new21[feature_use+['label']]

In [32]:
# 强规则
new15fw = new15f[(new15f['power']<=2) & (new15f['environment_tmp']<=2)]
new21fw = new21f[(new21f['power']<=2) & (new21f['environment_tmp']<=2)]

# 分割
svr_new15fw = new15fw[new15fw['power']<=-0.975]
slt_new15fw = new15fw[new15fw['power']>-0.975]

svr_new21fw = new21fw[new21fw['power']<=-0.975]
slt_new21fw = new21fw[new21fw['power']>-0.975]

# svr_new15fw = new15fw[new15fw['wind_speed']<=-1]
# slt_new15fw = new15fw[new15fw['wind_speed']>-1]

# svr_new21fw = new21fw[new21fw['wind_speed']<=-1]
# slt_new21fw = new21fw[new21fw['wind_speed']>-1]

In [33]:
slt_new21fw.label.describe()

count   147392.000
mean         0.057
std          0.231
min          0.000
25%          0.000
50%          0.000
75%          0.000
max          1.000
Name: label, dtype: float64

In [34]:
names = ["Nearest Neighbors",
"Logistic Regression", 
"Decision Tree"
]

classifiers = [
    KNeighborsClassifier(n_jobs=-1,n_neighbors=3,weights="distance"),
    LogisticRegression(n_jobs=-1,random_state=7), #solver='sag',max_iter=10000,class_weight='balanced',
    DecisionTreeClassifier(random_state=0),
]

In [35]:
def competition_score(y_test, y_pred):
    N_fault = sum(y_test) # 标签为1的个数
    N_normal = len(y_test) - N_fault # 标签为0的个数
    alpha = float(N_normal/len(y_test)) # 0的所占比例
    # alpha = 0.5
    beta = 1 - alpha # 1的所占比例
    tn, fn, fp, tp = confusion_matrix(y_test, y_pred).ravel()
    # fp = 2871
    # fn = 8142
    cm_score = (1- alpha*fn/N_normal - beta*fp/N_fault )*100.
    print("len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp")
    print(len(y_test),N_fault,N_normal,alpha,beta, tn, fp, fn, tp)
    print()
    return cm_score

def cal_scores(names, classifiers,X_train, y_train,X_test,y_test):
    scores = pd.DataFrame(columns=['Score','Accuracy','Precision','Recall','AUC','Train time(s)','Test time(s)'], index=names)
    for name, clf in zip(names, classifiers):
        time1 = time.time()
        clf.fit(X_train, y_train)
        time2 = time.time()
        y_pred = clf.predict(X_test)
        time3 = time.time()

        scores.at[name,'Score'] = competition_score(y_test, y_pred)
        scores.at[name,'Accuracy'] = accuracy_score(y_test, y_pred)
        scores.at[name,'Precision'] = precision_score(y_test, y_pred)
        scores.at[name,'Recall'] = recall_score(y_test, y_pred)
        scores.at[name,'AUC'] = roc_auc_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred).ravel()
        scores.at[name,'tn'] = int(cm[0])
        scores.at[name,'fn'] = int(cm[1])
        scores.at[name,'fp'] = int(cm[2])
        scores.at[name,'tp'] = int(cm[3])

        scores.at[name,'Train time(s)'] = time2 - time1
        scores.at[name,'Test time(s)'] = time3 - time2

    return scores

def exp(X_train,X_test,y_train,y_test,names, classifiers):
    return cal_scores(names, classifiers,X_train, y_train,X_test,y_test)

In [36]:
# 对筛过特征、强规则、分割后的进行调优
X_slt_new15fw,y_slt_new15fw = slt_new15fw.drop(columns=['label']),slt_new15fw['label']
X_svr_new15fw,y_svr_new15fw = svr_new15fw.drop(columns=['label']),svr_new15fw['label']

X_slt_train,X_slt_test,y_slt_train,y_slt_test = train_test_split(X_slt_new15fw,y_slt_new15fw,test_size=0.3,random_state=7)
X_svr_train,X_svr_test,y_svr_train,y_svr_test = train_test_split(X_svr_new15fw,y_svr_new15fw,test_size=0.3,random_state=7)

# slt 对15号内部训练并测试
score_1515_slt = exp(X_slt_train,X_slt_test,y_slt_train,y_slt_test, names, classifiers)
score_1515_slt

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
10441 4364 6077 0.5820323723781247 0.41796762762187534 5811 148 266 4216

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
10441 4364 6077 0.5820323723781247 0.41796762762187534 5570 1044 507 3320

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
10441 4364 6077 0.5820323723781247 0.41796762762187534 6036 23 41 4341



Unnamed: 0,Score,Accuracy,Precision,Recall,AUC,Train time(s),Test time(s),tn,fn,fp,tp
Nearest Neighbors,96.035,0.96,0.941,0.966,0.961,0.027,0.116,5811.0,266.0,148.0,4216.0
Logistic Regression,85.145,0.851,0.868,0.761,0.839,0.358,0.001,5570.0,507.0,1044.0,3320.0
Decision Tree,99.387,0.994,0.991,0.995,0.994,0.167,0.002,6036.0,41.0,23.0,4341.0


In [37]:
# svr 对15号内部训练并测试
score_1515_svr = exp(X_svr_train,X_svr_test,y_svr_train,y_svr_test, names, classifiers)
score_1515_svr

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
3378 2769 609 0.1802841918294849 0.8197158081705151 601 0 8 2769

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
3378 2769 609 0.1802841918294849 0.8197158081705151 518 60 91 2709

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
3378 2769 609 0.1802841918294849 0.8197158081705151 605 1 4 2768



Unnamed: 0,Score,Accuracy,Precision,Recall,AUC,Train time(s),Test time(s),tn,fn,fp,tp
Nearest Neighbors,99.763,0.998,0.997,1.0,0.993,0.012,0.109,601.0,8.0,0.0,2769.0
Logistic Regression,95.53,0.955,0.968,0.978,0.914,0.782,0.001,518.0,91.0,60.0,2709.0
Decision Tree,99.852,0.999,0.999,1.0,0.997,0.044,0.001,605.0,4.0,1.0,2768.0


In [38]:
# slt 15号训练部分、21号测试
X_slt_new21fw,y_slt_new21fw = slt_new21fw.drop(columns=['label']),slt_new21fw['label']
score_1521_slt = exp(X_slt_train,X_slt_new21fw,y_slt_train,y_slt_new21fw, names, classifiers)
score_1521_slt

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
147392 8370 139022 0.9432126574033869 0.05678734259661311 129270 3734 9752 4636

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
147392 8370 139022 0.9432126574033869 0.05678734259661311 129885 2953 9137 5417

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
147392 8370 139022 0.9432126574033869 0.05678734259661311 133572 5269 5450 3101



Unnamed: 0,Score,Accuracy,Precision,Recall,AUC,Train time(s),Test time(s),tn,fn,fp,tp
Nearest Neighbors,90.85,0.909,0.322,0.554,0.742,0.04,2.184,129270.0,9752.0,3734.0,4636.0
Logistic Regression,91.797,0.918,0.372,0.647,0.791,0.353,0.003,129885.0,9137.0,2953.0,5417.0
Decision Tree,92.728,0.927,0.363,0.37,0.666,0.139,0.014,133572.0,5450.0,5269.0,3101.0


In [39]:
# svr 15号训练、21号测试
X_svr_new21fw, y_svr_new21fw = svr_new21fw.drop(columns=['label']),svr_new21fw['label']
score_1521_svr = exp(X_svr_train,X_svr_new21fw,y_svr_train,y_svr_new21fw, names, classifiers)
score_1521_svr

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
7003 2113 4890 0.6982721690703984 0.30172783092960165 3946 201 944 1912

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
7003 2113 4890 0.6982721690703984 0.30172783092960165 4199 237 691 1876

len(y_test), N_fault,N_normal,alpha,beta, tn, fp, fn, tp
7003 2113 4890 0.6982721690703984 0.30172783092960165 4173 579 717 1534



Unnamed: 0,Score,Accuracy,Precision,Recall,AUC,Train time(s),Test time(s),tn,fn,fp,tp
Nearest Neighbors,83.65,0.836,0.669,0.905,0.856,0.026,0.113,3946.0,944.0,201.0,1912.0
Logistic Regression,86.749,0.867,0.731,0.888,0.873,0.419,0.001,4199.0,691.0,237.0,1876.0
Decision Tree,81.494,0.815,0.681,0.726,0.79,0.043,0.001,4173.0,717.0,579.0,1534.0
