# Classification: Klassische Methoden
In diesem Notebook versuchen wir die Klassifizierung in "Failure"/"No Failure " mit klassischen machine learning Methoden durchzuführen.

In [1]:
import pandas as pd
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn import metrics
#import Preprocessing as pp
#import Helpers
#from keras.models import Sequential
#from keras.layers import Dense

rootdir_train = 'data/train/'
rootdir_test = 'data/test/'

train_labels_path = 'data/train_label.csv'
test_labels_path = 'data/test_label.csv'

feature_path = 'data/features/'
feature_path_test = 'data/features_test/'

feature_path_red = 'data/features_reduced/'
feature_path_test_red = 'data/features_reduced_test/'

# Merge labels and data
train_labels = pd.read_csv(train_labels_path, index_col=0) #Don't use index numbers per row but CSV file name as index

In [2]:
drift = ['pitch motor 1 current', 'pitch motor 2 current', 'Pitch motor 3 current', 'x direction vibration value', 'y direction vibration value', 'hydraulic brake pressure', 'generator current', 'Inverter inlet temperature', 'inverter outlet temperature', 'inverter inlet pressure', 'inverter outlet pressure', 'wind tower ambient temperature', 'Wheel temperature', 'Wheel control cabinet temperature', 'Cabin temperature', 'Cabin control cabinet temperature', 'vane 1 pitch motor temperature', 'blade 2 pitch motor temperature', 'blade 3 pitch motor temperature', 'blade 1 inverter box temperature', 'blade 2 inverter box temperature', 'blade 3 inverter box temperature','inverter grid side current', 'Inverter grid side active power', 'inverter generator side power', 'generator operating frequency','generator stator temperature 1', 'generator stator temperature 2',  'generator stator temperature 3', 'generator stator temperature 4', 'Generator stator temperature 5', 'generator stator temperature 6', 'generator air temperature 1', 'generator air temperature 2','main bearing temperature 1', 'main bearing temperature 2', 'Pitch motor 1 power estimation', 'Pitch motor 2 power estimation', 'Pitch motor 3 power estimation', 'blade 1 battery box temperature', 'blade 2 battery box temperature', 'blade 3 battery box temperature','Inverter INU temperature', 'Inverter ISU temperature','atmospheric pressure', 'reactive power control status', 'reactive power set value', 'Inverter INU RMIO temperature','blade 1 angle','blade 2 angle','blade 3 angle','inverter grid side voltage','inverter grid side reactive power']
strong_drift = ['pitch motor 1 current', 'pitch motor 2 current', 'Pitch motor 3 current', 'x direction vibration value', 'y direction vibration value', 'hydraulic brake pressure', 'generator current', 'Inverter inlet temperature', 'inverter outlet temperature', 'inverter inlet pressure', 'inverter outlet pressure', 'wind tower ambient temperature', 'Wheel temperature', 'Wheel control cabinet temperature', 'Cabin temperature', 'Cabin control cabinet temperature', 'vane 1 pitch motor temperature', 'blade 2 pitch motor temperature', 'blade 3 pitch motor temperature', 'blade 1 inverter box temperature', 'blade 2 inverter box temperature', 'blade 3 inverter box temperature']
print(75-len(drift))

22


In [3]:
def prepare_ml(estimator, x, y):
    clf = BaggingClassifier(base_estimator=estimator, n_estimators=10, random_state=0, n_jobs=-1)
    clf.fit(x,y)
    return clf
    

In [4]:
def prepare_data(path, file_name = 'mean.csv', red_drift = True):
    data = pd.read_csv(path + file_name, index_col=0)
    data.dropna(inplace=True)
    if red_drift:
        drl = list(set(data.columns.values).intersection(drift))
        data.drop(drl, axis = 1, inplace=True)
    label = data['label']
    data.drop(['area','label'], axis = 1, inplace=True)
    return data, label

In [5]:
def score_predict(test_label, prediction):
    print("Accuracy:",metrics.accuracy_score(test_label, prediction))
    print("F1:",metrics.f1_score(test_label, prediction))

In [6]:
features = ["mean.csv", "median.csv", "min.csv", "max.csv", "std.csv", "var.csv"]

In [7]:
def calculateImportance(bm):
    importance = [model.coef_ for model in bm.estimators_]
    # print(importance)
    m_importance = np.mean(importance,axis=0)
    # print(m_importance)
    return np.squeeze(m_importance)

def calculateMagnitude(bm, x):
    coeff = calculateImportance(bm)
    return np.std(x, 0) * coeff

In [9]:
sample_size = 2500

for f in features:
    print(f)
    data = pd.read_csv(feature_path + f, index_col=0)
    data = data.sample(sample_size, random_state=0)
    data.drop(strong_drift, axis = 1, inplace=True)
    labels = data['label']
    data.drop(['area','label'], axis = 1, inplace=True)
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=123)
    test_data, test_label = prepare_data(feature_path_test, file_name=f, red_drift=False)
    test_data.drop(strong_drift, axis = 1, inplace=True)

    print("linear SVM")
    svm_l = prepare_ml(SVC(kernel='linear'), X_train, y_train)
    #calculateMagnitude(svm_l,X_train)
    print("")
    print("TRAIN")
    y_pred_l_tr = svm_l.predict(X_test)
    score_predict(y_test, y_pred_l_tr)
    print("TEST")
    y_pred_l_te = svm_l.predict(test_data)
    score_predict(test_label, y_pred_l_te)
    print("*****")
    print("rbf SVM")
    svm_r = prepare_ml(SVC(kernel='rbf'), X_train, y_train)
    print("")
    print("TRAIN")
    y_pred_r_tr = svm_r.predict(X_test)
    score_predict(y_test, y_pred_r_tr)
    print("TEST")
    y_pred_r_te = svm_r.predict(test_data)
    score_predict(test_label, y_pred_r_te)
    print("-----"*5)

mean.csv
linear SVM

TRAIN
Accuracy: 0.704
F1: 0.6796536796536796
TEST
Accuracy: 0.5181599679529977
F1: 0.4756230473007338
*****
rbf SVM

TRAIN
Accuracy: 0.556
F1: 0.2838709677419355
TEST
Accuracy: 0.52009614100681
F1: 0.25404732254047324
-------------------------
median.csv
linear SVM

TRAIN
Accuracy: 0.702
F1: 0.6809421841541756
TEST
Accuracy: 0.5178929096007477
F1: 0.5063239215150065
*****
rbf SVM

TRAIN
Accuracy: 0.554
F1: 0.2736156351791531
TEST
Accuracy: 0.5189611430097476
F1: 0.2448380672885442
-------------------------
min.csv
linear SVM

TRAIN
Accuracy: 0.67
F1: 0.5965770171149144
TEST
Accuracy: 0.5219655494725598
F1: 0.21353251318101935
*****
rbf SVM

TRAIN
Accuracy: 0.526
F1: 0.3323943661971831
TEST
Accuracy: 0.5016691147015623
F1: 0.37737737737737737
-------------------------
max.csv
linear SVM

TRAIN
Accuracy: 0.69
F1: 0.6695095948827292
TEST
Accuracy: 0.5012685271731874
F1: 0.32532514450867056
*****
rbf SVM

TRAIN
Accuracy: 0.55
F1: 0.27184466019417475
TEST
Accuracy: 0.52

ValueError: The number of classes has to be greater than one; got 1 class

In [11]:
def classifyAfterMagnitudeEvaluation(data, labels, mag):
    thresh = 1
    #imp_cut = np.abs(mag.to_numpy()) > thresh
    ind = np.squeeze(np.argwhere(np.abs(mag.to_numpy()) > thresh))
    print(ind + 1)
    clf = BaggingClassifier(base_estimator=SVC(kernel='linear'), n_estimators=10, random_state=0, n_jobs=-1)

    #Train the model using the training sets
    clf.fit(data.iloc[:,ind], labels)

    #Predict the response for test dataset (here still part of training dataset to avoid concept drift)
    y_pred = clf.predict(test_data.iloc[:,ind])

    # Model Accuracy: how often is the classifier correct?
    print("Accuracy:",metrics.accuracy_score(test_label, y_pred))
    print("F1:",metrics.f1_score(test_label, y_pred))

In [12]:
#classifyAfterMagnitudeEvaluation()

In [15]:
from joblib import dump, load
import os.path

def save_model(model, name):
    path = os.path.join("models", name + ".model")
    dump(model, path)

def load_model(path):
    return load(path)

## Vollständiger Datensatz

Reduzierte Daten nach Data_Exploration_Importance_of_Features.ipynb

In [None]:
def prepare_svm(rd = True, k='linear'):
    svc = SVC(kernel=k)
    x, y = prepare_data(feature_path_red, red_drift = rd)
    svc = prepare_ml(svc, x, y)

    return svc

In [13]:
from sklearn.neighbors import KNeighborsClassifier

def prepare_knn(rd = True):
    knn = KNeighborsClassifier()
    x, y = prepare_data(feature_path_red, red_drift = rd)
    knn = prepare_ml(knn, x, y)

    return knn

### Ohne Drift

In [None]:
x_test, y_test = prepare_data(feature_path_test_red)

#### SVC

In [None]:
single_svc = prepare_svm()
save_model(single_svc,"SVC_reducedData")

In [65]:
y_predict_red = single_svc.predict(x_test)
score_predict(y_test,y_predict_red)

Accuracy: 0.5084123380958739
F1: 0.3411185682326621


#### KNN

In [66]:
single_knn = prepare_knn()
save_model(single_knn,"KNN_reducedData")

In [67]:
y_predict_red = single_svc.predict(x_test)
score_predict(y_test,y_predict_red)

Accuracy: 0.5084123380958739
F1: 0.3411185682326621


### Erkenntnis

Sowohl die SVM als auch der KNN raten nur das Ergebnis.
Anscheinend sind die Daten aus den unterschiedlichen Standorten untereinander zu unterschiedlich um gemeinsame Parameter zu finden.

### Mit Drift

In [9]:
x_test, y_test = prepare_data(feature_path_test_red, red_drift= False)

#### SVC

In [None]:
single_svc_wd = prepare_svm(False)
save_model(single_svc_wd, "SVC_reducedData_withDrift")

In [68]:
y_predict_red = single_svc_wd.predict(x_test)
score_predict(y_test,y_predict_red)

Accuracy: 0.5075443984510616
F1: 0.4366025053467767


#### KNN

In [16]:
single_knn = prepare_knn(False)
save_model(single_knn,"KNN_reducedData_withDrift")

In [17]:
y_predict_red = single_knn.predict(x_test)
score_predict(y_test,y_predict_red)

Accuracy: 0.5004673521164374
F1: 0.4504186866460996


## Conclusion

Ohne eine Verarbeitung der Daten bezüglich des Drifts, können keien verlässlichen Modelle trainiert werden.