In [1]:
import os

import pandas as pd
from sklearn import metrics
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

import Preprocessing as pp
# Disable pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None

# Root directory of the exercise data. The historical default is a user-local
# absolute path; set the SDA_DATA_DIR environment variable to run the notebook
# on another machine without editing this cell.
_DEFAULT_DATA_DIR = '/Users/marvinwoller/Desktop/SmartDataAnalytics/Blatt2/data/'
path_data = os.environ.get('SDA_DATA_DIR') or _DEFAULT_DATA_DIR
if not path_data.endswith('/'):
    # Every path below is built by plain string concatenation, so the root
    # must end with a separator.
    path_data += '/'

# Directories holding the train / test input files.
rootdir_train = path_data + 'train/'
rootdir_test = path_data + 'test/'

# Label tables for the train / test sets.
train_labels_path = path_data + 'train_label.csv'
test_labels_path = path_data + 'test_label.csv'

# Extracted feature tables (one CSV per feature name).
feature_path = path_data + 'features/'
feature_path_test = path_data + 'features_test/'

# Resampled time-series tables.
resampled_path = path_data + 'resampled/'
resampled_path_test = path_data + 'resampled_test/'

# Load the training label table. Per the original note, the first column holds
# the CSV file name and is used as the index instead of a positional row number.
train_labels = pd.read_csv(train_labels_path, index_col=0)

In [2]:
def svm_classification(X_train, y_train, X_test, y_test, name):
    """Fit a bagged polynomial-kernel SVM and print hold-out and test metrics.

    The training data is split 80/20 with a fixed seed. The classifier is
    fitted on the 80% part and then evaluated twice:

    * on the remaining 20% of the training data, printed under the "TRAIN"
      heading (per the notebook: data without concept drift);
    * on the separate test data, printed under the "TEST" heading (per the
      notebook: data with drift).

    Parameters
    ----------
    X_train, y_train
        Training samples and labels, in any form accepted by
        ``train_test_split`` and ``BaggingClassifier.fit``.
    X_test, y_test
        Test samples and their true labels.
    name : str
        Label inserted into the printed headings (e.g. the feature name).

    Returns
    -------
    None
        Accuracy and the classification report are printed, not returned.
    """
    # Hold out 20% of the training data as a second evaluation set.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train, y_train, test_size=0.2, random_state=123
    )

    # The wrapped estimator is passed positionally on purpose: the keyword was
    # renamed from `base_estimator` to `estimator` in scikit-learn 1.2 (and the
    # old name removed in 1.4), but it is the first positional parameter in
    # every version, so this form works across releases.
    clf = BaggingClassifier(
        SVC(kernel='poly'), n_estimators=25, random_state=0, n_jobs=-1
    )
    clf.fit(X_fit, y_fit)

    # Predictions for the 20% hold-out and for the separate test data.
    y_pred_holdout = clf.predict(X_holdout)
    y_pred = clf.predict(X_test)

    print("##################### " + name + " #####################")
    print("---------------- TRAIN ----------------")
    print("TRAIN Accuracy (" + name + "):", metrics.accuracy_score(y_holdout, y_pred_holdout))
    print(classification_report(y_holdout, y_pred_holdout))
    print("---------------- TEST ----------------")
    print("TEST Accuracy (" + name + "):", metrics.accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

## Input data: Features

In [3]:
# Names of the pre-extracted feature tables to classify on.
features = [
    'mean', 'median', 'min', 'max',
    'std', 'var',
]
# Subset containing only the last two entries ('std', 'var').
features2 = features[-2:]

In [4]:
# Baseline run, one classifier per feature table. Per the original note the
# default preprocessing removes strong drift and applies scaling.
for feature in features:
    # Train and test tables share a file name but live in separate directories.
    csv_name = feature + '.csv'
    df = pd.read_csv(feature_path + csv_name, index_col=0)
    df_test = pd.read_csv(feature_path_test + csv_name, index_col=0)
    y_train, X_train = pp.preprocess(df, random_n=10000)
    y_test, X_test = pp.preprocess_test(df_test)
    svm_classification(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        name=feature,
    )

##################### mean #####################
---------------- TRAIN ----------------
TRAIN Accuracy (mean): 0.5935
              precision    recall  f1-score   support

         0.0       0.57      0.87      0.69      1048
         1.0       0.67      0.29      0.41       952

    accuracy                           0.59      2000
   macro avg       0.62      0.58      0.55      2000
weighted avg       0.62      0.59      0.56      2000

---------------- TEST ----------------
TEST Accuracy (mean): 0.5341167044999332
              precision    recall  f1-score   support

         0.0       0.53      0.72      0.61      7582
         1.0       0.55      0.34      0.42      7396

    accuracy                           0.53     14978
   macro avg       0.54      0.53      0.52     14978
weighted avg       0.54      0.53      0.52     14978

##################### median #####################
---------------- TRAIN ----------------
TRAIN Accuracy (median): 0.6515
              precision 

  'precision', 'predicted', average, warn_for)


##################### var #####################
---------------- TRAIN ----------------
TRAIN Accuracy (var): 0.524
              precision    recall  f1-score   support

           0       0.52      1.00      0.69      1048
           1       0.00      0.00      0.00       952

    accuracy                           0.52      2000
   macro avg       0.26      0.50      0.34      2000
weighted avg       0.27      0.52      0.36      2000

---------------- TEST ----------------
TEST Accuracy (var): 0.5062091066898117
              precision    recall  f1-score   support

         0.0       0.51      1.00      0.67      7582
         1.0       0.00      0.00      0.00      7396

    accuracy                           0.51     14978
   macro avg       0.25      0.50      0.34     14978
weighted avg       0.26      0.51      0.34     14978



  'precision', 'predicted', average, warn_for)


In [5]:
# Variant run with get_good=True. Per the original note this restricts the
# data to the "good_sensors" in addition to strong-drift removal and scaling.
for feature in features:
    # Train and test tables share a file name but live in separate directories.
    csv_name = feature + '.csv'
    df = pd.read_csv(feature_path + csv_name, index_col=0)
    df_test = pd.read_csv(feature_path_test + csv_name, index_col=0)
    y_train, X_train = pp.preprocess(df, random_n=10000, get_good=True)
    y_test, X_test = pp.preprocess_test(df_test, get_good=True)
    svm_classification(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        name=feature,
    )

##################### mean #####################
---------------- TRAIN ----------------
TRAIN Accuracy (mean): 0.597
              precision    recall  f1-score   support

         0.0       0.58      0.85      0.69      1048
         1.0       0.66      0.32      0.43       952

    accuracy                           0.60      2000
   macro avg       0.62      0.58      0.56      2000
weighted avg       0.62      0.60      0.56      2000

---------------- TEST ----------------
TEST Accuracy (mean): 0.5268393644011217
              precision    recall  f1-score   support

         0.0       0.52      0.87      0.65      7582
         1.0       0.57      0.17      0.27      7396

    accuracy                           0.53     14978
   macro avg       0.54      0.52      0.46     14978
weighted avg       0.54      0.53      0.46     14978

##################### median #####################
---------------- TRAIN ----------------
TRAIN Accuracy (median): 0.5725
              precision  

  'precision', 'predicted', average, warn_for)


##################### var #####################
---------------- TRAIN ----------------
TRAIN Accuracy (var): 0.524
              precision    recall  f1-score   support

           0       0.52      1.00      0.69      1048
           1       0.00      0.00      0.00       952

    accuracy                           0.52      2000
   macro avg       0.26      0.50      0.34      2000
weighted avg       0.27      0.52      0.36      2000

---------------- TEST ----------------
TEST Accuracy (var): 0.5062091066898117
              precision    recall  f1-score   support

         0.0       0.51      1.00      0.67      7582
         1.0       0.00      0.00      0.00      7396

    accuracy                           0.51     14978
   macro avg       0.25      0.50      0.34     14978
weighted avg       0.26      0.51      0.34     14978



  'precision', 'predicted', average, warn_for)


In [6]:
# Variant run with rem_corr=True. Per the original note this removes
# correlated inputs in addition to strong-drift removal.
for feature in features:
    # Train and test tables share a file name but live in separate directories.
    csv_name = feature + '.csv'
    df = pd.read_csv(feature_path + csv_name, index_col=0)
    df_test = pd.read_csv(feature_path_test + csv_name, index_col=0)
    y_train, X_train = pp.preprocess(df, random_n=10000, rem_corr=True)
    y_test, X_test = pp.preprocess_test(df_test, rem_corr=True)
    svm_classification(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        name=feature,
    )

KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
##################### mean #####################
---------------- TRAIN ----------------
TRAIN Accuracy (mean): 0.6005
              precision    recall  f1-score   support

         0.0       0.59      0.81      0.68      1048
         1.0       0.64      0.37      0.47       952

    accuracy                           0.60      2000
   macro avg       0.61      0.59      0.57      2000
weighted avg       0.61      0.60      0.58      2000

---------------- TEST ----------------
TEST Accuracy (mean): 0.5327146481506209
              precision    recall  f1-score   support

         0.0       0.53      0.59      0.56      7582
         1.0       0.53      0.47      0.50      7396

    accuracy                           0.53     14978
   macro avg       0.53      0.53      0.53   



##################### median #####################
---------------- TRAIN ----------------
TRAIN Accuracy (median): 0.6095
              precision    recall  f1-score   support

         0.0       0.60      0.79      0.68      1048
         1.0       0.64      0.41      0.50       952

    accuracy                           0.61      2000
   macro avg       0.62      0.60      0.59      2000
weighted avg       0.62      0.61      0.59      2000

---------------- TEST ----------------
TEST Accuracy (median): 0.5336493523834958
              precision    recall  f1-score   support

         0.0       0.54      0.56      0.55      7582
         1.0       0.53      0.50      0.52      7396

    accuracy                           0.53     14978
   macro avg       0.53      0.53      0.53     14978
weighted avg       0.53      0.53      0.53     14978

KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skippi

  'precision', 'predicted', average, warn_for)


KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
KeyError skipping...
##################### var #####################
---------------- TRAIN ----------------
TRAIN Accuracy (var): 0.524
              precision    recall  f1-score   support

           0       0.52      1.00      0.69      1048
           1       0.00      0.00      0.00       952

    accuracy                           0.52      2000
   macro avg       0.26      0.50      0.34      2000
weighted avg       0.27      0.52      0.36      2000

---------------- TEST ----------------
TEST Accuracy (var): 0.5062091066898117
              precision    recall  f1-score   support

         0.0       0.51      1.00      0.67      7582
         1.0       0.00      0.00      0.00      7396

    accuracy                           0.51     14978
   macro avg       0.25      0.50      0.34     14

  'precision', 'predicted', average, warn_for)


Die Performance einer SVM mit polynomialem Kernel ist leider nicht besser als mit linearem Kernel. Für die Metrik „Accuracy“ erhalten wir einen maximalen Wert von 0,651 auf den zurückgehaltenen Testdaten aus dem Train-Set und von 0,539 auf den Testdaten aus dem Test-Set. Das zeigt, dass auch die SVM mit polynomialem Kernel nicht gut mit dem Drift der Testdaten umgehen kann.

## Input data: Gekürzte Zeitreihen
Wir betrachten nun, wie sich die SVM auf den resampelten Zeitreihendaten verhält.

In [7]:
# Resampled time-series tables that can be used as classifier input.
features2 = [
    'resampled_12H_mean',
    'resampled_12H_median',
    'resampled_6H_mean',
    'resampled_6H_median',
    'resampled_3H_mean',
    'resampled_3H_median',
]
# Only the first table (12H mean) is selected for the run.
features = features2[:1]
# One classifier per selected resampled table, using the default preprocessing.
for feature in features:
    # Train and test tables share a file name but live in separate directories.
    csv_name = feature + '.csv'
    df = pd.read_csv(resampled_path + csv_name, index_col=0)
    df_test = pd.read_csv(resampled_path_test + csv_name, index_col=0)
    y_train, X_train = pp.preprocess(df, random_n=10000)
    y_test, X_test = pp.preprocess_test(df_test)
    svm_classification(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        name=feature,
    )

##################### resampled_12H_mean #####################
---------------- TRAIN ----------------
TRAIN Accuracy (resampled_12H_mean): 0.586
              precision    recall  f1-score   support

           0       0.56      0.89      0.69      1037
           1       0.69      0.25      0.37       963

    accuracy                           0.59      2000
   macro avg       0.63      0.57      0.53      2000
weighted avg       0.62      0.59      0.54      2000

---------------- TEST ----------------
TEST Accuracy (resampled_12H_mean): 0.5265298681766911
              precision    recall  f1-score   support

         0.0       0.52      0.77      0.62     44420
         1.0       0.54      0.27      0.36     43349

    accuracy                           0.53     87769
   macro avg       0.53      0.52      0.49     87769
weighted avg       0.53      0.53      0.49     87769



Durch Verwendung der Zeitreihendaten kann das Ergebnis nicht verbessert werden (ausgewertet wurde hier nur `resampled_12H_mean`).