# Classification: Klassische Methoden
In diesem Notebook versuchen wir, die Klassifizierung in "Failure" / "No Failure" mit der klassischen Machine-Learning-Methode SVM durchzuführen.

In [16]:
import glob
import os

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import svm
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import Preprocessing as pp
import Helpers
from Evaluator import Evaluator
# Turn off pandas' chained-assignment check for the whole kernel session.
pd.options.mode.chained_assignment = None  # default='warn'


# Root directory of the data set. Set the SDA_DATA_DIR environment variable to
# run the notebook on another machine; when it is unset the original local path
# is used, so existing setups behave exactly as before.
# os.path.join(..., '') appends a trailing separator only if one is missing --
# this matters because every path below is built by plain string concatenation.
path_data = os.path.join(
    os.environ.get('SDA_DATA_DIR', '/Users/marvinwoller/Desktop/SmartDataAnalytics/Blatt2/data/'),
    '',
)

# Directories of the train / test split
rootdir_train = path_data + 'train/'
rootdir_test = path_data + 'test/'

# Label files of the train / test split
train_labels_path = path_data + 'train_label.csv'
test_labels_path = path_data + 'test_label.csv'

# Directories with one CSV per feature (read as <dir> + <feature> + '.csv')
feature_path = path_data + 'features/'
feature_path_test = path_data + 'features_test/'

# Directories with one CSV per resampling variant (read the same way)
resampled_path = path_data + 'resampled/'
resampled_path_test = path_data + 'resampled_test/'

# Load the training labels. The first CSV column is used as the DataFrame index
# (per the original note: the CSV file name rather than a running row number).
train_labels = pd.read_csv(
    train_labels_path,
    index_col=0,
)




Evaluator started


In [3]:
def classification(X_train,y_train,X_test,y_test,name):
    """Fit a bagged linear SVM and print accuracy plus classification reports.

    The training data is split 80/20 with a fixed seed (random_state=123).
    The classifier is fitted on the 80% part and then evaluated twice:

    * on the held-out 20% of the training data, printed under "TRAIN"
      (per the original notes: data from the same source, without concept
      drift), and
    * on the test data passed in by the caller, printed under "TEST"
      (per the original notes: data with drift).

    Parameters
    ----------
    X_train, y_train
        Training samples and labels; anything ``train_test_split`` and the
        estimator's ``fit`` accept.
    X_test, y_test
        Test samples and labels used for the second evaluation.
    name : str
        Label inserted into the printed headers (concatenated with ``+``,
        so it must be a string).

    Returns
    -------
    None
        All results are written to stdout.
    """
    # Hold out 20% of the training data as a second evaluation set. Separate
    # names are used so the function arguments are not silently overwritten.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train, y_train, test_size=0.2, random_state=123
    )
    # The wrapped estimator is passed positionally on purpose: its keyword was
    # `base_estimator` up to scikit-learn 1.1 and is `estimator` from 1.2 on
    # (the old keyword was removed in 1.4). The first positional argument works
    # with both.
    clf = BaggingClassifier(SVC(kernel='linear'), n_estimators=25, random_state=0, n_jobs=-1)
    # Fit the ensemble on the 80% part only
    clf.fit(X_fit, y_fit)
    # Predict the held-out 20% of the training data
    y_pred_trainset = clf.predict(X_holdout)
    # Predict the caller-supplied test data
    y_pred = clf.predict(X_test)
    print("##################### " + name + " #####################")
    print("---------------- TRAIN ----------------")
    print("TRAIN Accuracy (" + name + "):",metrics.accuracy_score(y_holdout, y_pred_trainset))
    print(classification_report(y_holdout, y_pred_trainset))
    print("---------------- TEST ----------------")
    print("TEST Accuracy (" + name + "):",metrics.accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))


## Input data: Features

In [4]:
# Names of the extracted features used for classification; each name is also
# the base name of the CSV file that holds that feature.
features = [
    'mean',
    'median',
    'min',
    'max',
    'std',
    'var',
]
# Smaller selection containing only the spread-related features.
features2 = [
    'std',
    'var',
]

In [18]:
# Baseline run per feature. Preprocessing (per the original note): remove
# strong drift + scaling.
for feature in features:
    # Training split: load the feature CSV and preprocess it
    df = pd.read_csv(f'{feature_path}{feature}.csv', index_col=0)
    y_train, X_train = pp.preprocess(df, random_n=10000)
    # Test split: same feature, test-specific preprocessing
    df_test = pd.read_csv(f'{feature_path_test}{feature}.csv', index_col=0)
    y_test, X_test = pp.preprocess_test(df_test)
    # Train, evaluate and print the reports for this feature
    classification(X_train, y_train, X_test, y_test, feature)

##################### mean #####################
---------------- TRAIN ----------------
TRAIN Accuracy (mean): 0.704
              precision    recall  f1-score   support

         0.0       0.69      0.79      0.73      1032
         1.0       0.73      0.61      0.67       968

    accuracy                           0.70      2000
   macro avg       0.71      0.70      0.70      2000
weighted avg       0.71      0.70      0.70      2000

---------------- TEST ----------------
TEST Accuracy (mean): 0.5247028975831219
              precision    recall  f1-score   support

         0.0       0.52      0.89      0.65      7582
         1.0       0.57      0.15      0.24      7396

    accuracy                           0.52     14978
   macro avg       0.54      0.52      0.45     14978
weighted avg       0.54      0.52      0.45     14978

##################### median #####################
---------------- TRAIN ----------------
TRAIN Accuracy (median): 0.6755
              precision  

In [7]:
# Variant run per feature. Preprocessing (per the original note):
# "good_sensors" + remove strong drift + scaling. Only the training data is
# preprocessed with get_good=True; the test data uses the default settings.
for feature in features:
    # Training split: load the feature CSV and preprocess it
    df = pd.read_csv(f'{feature_path}{feature}.csv', index_col=0)
    y_train, X_train = pp.preprocess(df, random_n=10000, get_good=True)
    # Test split: same feature, test-specific preprocessing
    df_test = pd.read_csv(f'{feature_path_test}{feature}.csv', index_col=0)
    y_test, X_test = pp.preprocess_test(df_test)
    # Train, evaluate and print the reports for this feature
    classification(X_train, y_train, X_test, y_test, feature)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


##################### mean #####################
---------------- TRAIN ----------------
TRAIN Accuracy (mean): 0.6340625
              precision    recall  f1-score   support

         0.0       0.62      0.77      0.68      1647
         1.0       0.67      0.49      0.56      1553

    accuracy                           0.63      3200
   macro avg       0.64      0.63      0.62      3200
weighted avg       0.64      0.63      0.63      3200

---------------- TEST ----------------
TEST Accuracy (mean): 0.5116838029109361
              precision    recall  f1-score   support

         0.0       0.51      1.00      0.67      7582
         1.0       0.90      0.01      0.02      7396

    accuracy                           0.51     14978
   macro avg       0.71      0.51      0.35     14978
weighted avg       0.70      0.51      0.35     14978



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


##################### median #####################
---------------- TRAIN ----------------
TRAIN Accuracy (median): 0.623125
              precision    recall  f1-score   support

         0.0       0.59      0.81      0.68      1607
         1.0       0.69      0.44      0.54      1593

    accuracy                           0.62      3200
   macro avg       0.64      0.62      0.61      3200
weighted avg       0.64      0.62      0.61      3200

---------------- TEST ----------------
TEST Accuracy (median): 0.512084390439311
              precision    recall  f1-score   support

         0.0       0.51      1.00      0.67      7582
         1.0       0.89      0.01      0.03      7396

    accuracy                           0.51     14978
   macro avg       0.70      0.51      0.35     14978
weighted avg       0.70      0.51      0.35     14978



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


##################### min #####################
---------------- TRAIN ----------------
TRAIN Accuracy (min): 0.631875
              precision    recall  f1-score   support

           0       0.61      0.79      0.69      1670
           1       0.67      0.46      0.54      1530

    accuracy                           0.63      3200
   macro avg       0.64      0.62      0.62      3200
weighted avg       0.64      0.63      0.62      3200

---------------- TEST ----------------
TEST Accuracy (min): 0.5096808652690613
              precision    recall  f1-score   support

         0.0       0.51      0.99      0.67      7582
         1.0       0.68      0.01      0.03      7396

    accuracy                           0.51     14978
   macro avg       0.59      0.50      0.35     14978
weighted avg       0.59      0.51      0.35     14978



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


##################### max #####################
---------------- TRAIN ----------------
TRAIN Accuracy (max): 0.645625
              precision    recall  f1-score   support

           0       0.63      0.76      0.69      1669
           1       0.66      0.52      0.59      1531

    accuracy                           0.65      3200
   macro avg       0.65      0.64      0.64      3200
weighted avg       0.65      0.65      0.64      3200

---------------- TEST ----------------
TEST Accuracy (max): 0.511817332087061
              precision    recall  f1-score   support

         0.0       0.51      1.00      0.67      7582
         1.0       1.00      0.01      0.02      7396

    accuracy                           0.51     14978
   macro avg       0.75      0.51      0.35     14978
weighted avg       0.75      0.51      0.35     14978



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


##################### std #####################
---------------- TRAIN ----------------
TRAIN Accuracy (std): 0.5459375
              precision    recall  f1-score   support

           0       0.54      0.76      0.63      1643
           1       0.56      0.32      0.41      1557

    accuracy                           0.55      3200
   macro avg       0.55      0.54      0.52      3200
weighted avg       0.55      0.55      0.52      3200

---------------- TEST ----------------
TEST Accuracy (std): 0.5208305514754974
              precision    recall  f1-score   support

         0.0       0.52      0.88      0.65      7582
         1.0       0.55      0.16      0.24      7396

    accuracy                           0.52     14978
   macro avg       0.53      0.52      0.45     14978
weighted avg       0.53      0.52      0.45     14978



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


##################### var #####################
---------------- TRAIN ----------------
TRAIN Accuracy (var): 0.51875
              precision    recall  f1-score   support

           0       0.50      0.93      0.65      1556
           1       0.66      0.13      0.22      1644

    accuracy                           0.52      3200
   macro avg       0.58      0.53      0.44      3200
weighted avg       0.58      0.52      0.43      3200

---------------- TEST ----------------
TEST Accuracy (var): 0.5210976098277473
              precision    recall  f1-score   support

         0.0       0.51      0.93      0.66      7582
         1.0       0.59      0.10      0.17      7396

    accuracy                           0.52     14978
   macro avg       0.55      0.52      0.42     14978
weighted avg       0.55      0.52      0.42     14978



In [17]:
# Variant run per feature. Preprocessing (per the original note): remove
# correlation + remove strong drift. rem_corr=True is passed for both the
# training and the test data.
for feature in features:
    # Training split: load the feature CSV and preprocess it
    df = pd.read_csv(f'{feature_path}{feature}.csv', index_col=0)
    y_train, X_train = pp.preprocess(df, random_n=10000, rem_corr=True)
    # Test split: same feature, test-specific preprocessing
    df_test = pd.read_csv(f'{feature_path_test}{feature}.csv', index_col=0)
    y_test, X_test = pp.preprocess_test(df_test, rem_corr=True)
    # Train, evaluate and print the reports for this feature
    classification(X_train, y_train, X_test, y_test, feature)

KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
##################### mean #####################
---------------- TRAIN ----------------
TRAIN Accuracy (mean): 0.6875
              precision    recall  f1-score   support

         0.0       0.69      0.73      0.71      1052
         1.0       0.68      0.64      0.66       948

    accuracy                           0.69      2000
   macro avg       0.69      0.69      0.69      2000
weighted avg       0.69      0.69      0.69      2000

---------------- TEST ----------------
TEST Accuracy (mean): 0.49799706235812524
              precision    recall  f1-score   support

         0.0       0.52      0.11      0.19      7582
         1.0       0.50      0.89      0.64      7396

    accuracy                           0.50     14978
   macro avg       0.51      0.50      0.41  



##################### max #####################
---------------- TRAIN ----------------
TRAIN Accuracy (max): 0.663
              precision    recall  f1-score   support

           0       0.64      0.74      0.69       991
           1       0.70      0.59      0.64      1009

    accuracy                           0.66      2000
   macro avg       0.67      0.66      0.66      2000
weighted avg       0.67      0.66      0.66      2000

---------------- TEST ----------------
TEST Accuracy (max): 0.5084791026839365
              precision    recall  f1-score   support

         0.0       0.51      0.85      0.64      7582
         1.0       0.51      0.16      0.24      7396

    accuracy                           0.51     14978
   macro avg       0.51      0.50      0.44     14978
weighted avg       0.51      0.51      0.44     14978

KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyE



##################### std #####################
---------------- TRAIN ----------------
TRAIN Accuracy (std): 0.55
              precision    recall  f1-score   support

           0       0.57      0.67      0.61      1070
           1       0.52      0.42      0.46       930

    accuracy                           0.55      2000
   macro avg       0.54      0.54      0.54      2000
weighted avg       0.55      0.55      0.54      2000

---------------- TEST ----------------
TEST Accuracy (std): 0.5114835091467486
              precision    recall  f1-score   support

         0.0       0.51      0.69      0.59      7582
         1.0       0.51      0.33      0.40      7396

    accuracy                           0.51     14978
   macro avg       0.51      0.51      0.49     14978
weighted avg       0.51      0.51      0.49     14978

KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyEr

## Input data: Gekürzte Zeitserien

In [44]:
# Classify on the preprocessed (resampled) time series instead of single features.
# NOTE(review): this cell rebinds the notebook-wide names `features` and
# `features2`, which the feature-based cells also define -- re-running those
# cells after this one will iterate over the resampled names.
features2 = [
    'resampled_12H_mean',
    'resampled_12H_median',
    'resampled_6H_mean',
    'resampled_6H_median',
    'resampled_3H_mean',
    'resampled_3H_median',
]
# Only the 12H mean variant is actually evaluated below.
features = ['resampled_12H_mean']
for feature in features:
    # Training split: load the resampled series and preprocess them
    df = pd.read_csv(f'{resampled_path}{feature}.csv', index_col=0)
    y_train, X_train = pp.preprocess(df, random_n=10000)
    # Test split: same variant, test-specific preprocessing
    df_test = pd.read_csv(f'{resampled_path_test}{feature}.csv', index_col=0)
    y_test, X_test = pp.preprocess_test(df_test)
    # Train, evaluate and print the reports for this variant
    classification(X_train, y_train, X_test, y_test, feature)








# Classification: Lineares Modell

##################### resampled_12H_mean#####################
---------------- TRAIN ----------------
TRAIN Accuracy (resampled_12H_mean): 0.6743333333333333
              precision    recall  f1-score   support

           0       0.66      0.75      0.70      3032
           1       0.70      0.60      0.65      2968

    accuracy                           0.67      6000
   macro avg       0.68      0.67      0.67      6000
weighted avg       0.68      0.67      0.67      6000

---------------- TEST ----------------
TEST Accuracy (resampled_12H_mean): 0.5135184404516401
              precision    recall  f1-score   support

         0.0       0.51      0.96      0.67     44420
         1.0       0.57      0.06      0.11     43349

    accuracy                           0.51     87769
   macro avg       0.54      0.51      0.39     87769
weighted avg       0.54      0.51      0.39     87769



In [None]:
# Classification: Logistisches Modell
# Logistic regression with default settings.
# NOTE(review): X_train / y_train / X_test / y_test are whatever the most
# recently executed preprocessing loop left in the notebook namespace.
logreg = LogisticRegression().fit(X_train, y_train)  # fit() returns the estimator itself

# Predictions are kept in `pred`; the printed value is logreg.score on the test data.
pred = logreg.predict(X_test)
print(logreg.score(X_test, y_test))

# Classification: KNN

# Classification: Xgboost

# Classification: LightGBM

# Others ?


