In [2]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
from scipy.signal import detrend, butter, lfilter
from scipy.signal.windows import tukey
from scipy.fft import fftfreq, fft
import joblib

Import data

In [30]:
# get the number of files located in the data folder
dir_path = r'./data/raw_data/single'
source_file_nr = len([entry for entry in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, entry))])

for number in range(source_file_nr):
        with open(f'./data/raw_data/single/single_source_scenario_{number + 1}.data', 'rb') as pickle_file:
                data = pickle.load(pickle_file)
                df_sorted = data[0]
                signals_single = data[1]
                df_plotting_single = df_sorted[['flattened_index', 'power_dbm', 'phase_absolute']].copy()
                df_plotting_single['cluster'] = np.full(len(signals_single), (number+1)) # Cluster labels should begin from 1

        if number == 0:
                signals = signals_single
                df_plotting = df_plotting_single.copy()
        else: 
                signals = np.concatenate((signals, signals_single))
                df_plotting = pd.concat((df_plotting, df_plotting_single))


## Classification tests

### Other classifiers test

Classification imports

In [37]:
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

- Training data

In [38]:
dir_path = r'./data/features/lab_single'
source_file_nr = len([entry for entry in os.listdir(dir_path) if os.path.isfile(os.path.join(dir_path, entry))])


for number in range(source_file_nr):
    
    with open(os.path.join(dir_path, f'onset_scenario_{number + 1}'), 'rb') as file:
        signal_extracted_scenario = pickle.load(file)
    # create a single array with all the feature files
    if number == 0:
        signal_extracted = signal_extracted_scenario
    else:
        signal_extracted = np.append(signal_extracted, signal_extracted_scenario, axis=0)

- Shuffle dataset and extract variables

In [39]:
np.random.shuffle(signal_extracted)

# Unpack the data
id_train = signal_extracted[:, 0]
sens_train = signal_extracted[:, 1]
cluster_train = signal_extracted[:, 2]
signals_train = signal_extracted[:, 3:]

In [40]:
sensors_sorted = np.sort(np.unique(sens_train))

- Prepare X_train and y_train (define the binary label)

    - First the Corona as PD - index 6 is corona


In [41]:
y_train = np.zeros(np.shape(cluster_train))
y_train[cluster_train > 5] = 1  # the last few labels are interference (1) the first few PD (0)
X_train = signals_train

In [42]:
scores = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'] # in the end the best model is chosen based on pure accuracy
classifiers = {
    'svc': SVC(),
    'rand_forest': RandomForestClassifier(criterion='log_loss'),
    'knn': KNeighborsClassifier(metric='minkowski'), 
    'gbc': HistGradientBoostingClassifier()}

search_spaces = [
                    {'kernel': ['poly'], 'gamma': ['scale', 'auto']},
                    {'n_estimators': [150, 175, 200], 'criterion': ['log_loss']},
                    {'n_neighbors': [3, 5, 10]},
                    {'learning_rate': [0.1, 0.2, 0.3]}
                ]

In [43]:
best_models_bin = []
results_bin = []

# One model per sensor is trained
for sensor in sensors_sorted:
    best_models_sens = []
    results_sens = []


    print(f'Train sens: {sensor}')
    X_train_sens = X_train[sens_train == sensor]
    y_train_sens = y_train[sens_train == sensor]

    # Training through cv grid search
    for i, clf_key in enumerate(list(classifiers.keys())):

        print(f'Fitting CV for: {clf_key}')
        gs_cv = GridSearchCV(estimator = classifiers[clf_key], param_grid = search_spaces[i], scoring=scores, refit='accuracy', cv = 5, verbose=3)
        gs_cv.fit(X_train_sens, y_train_sens)

        best_models_sens.append(gs_cv.best_estimator_)
        print(f'Best score: {gs_cv.best_score_}')
        print(f'Best params: {gs_cv.best_params_}')
        results_sens.append(gs_cv.best_score_)

    best_models_bin.append(best_models_sens)
    results_bin.append(results_sens)

results_avg_clf = np.average(np.array(results_bin), axis = 0)


Train sens: 0
Fitting CV for: svc
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5] END gamma=scale, kernel=poly; accuracy: (test=0.717) f1_macro: (test=0.632) precision_macro: (test=0.840) recall_macro: (test=0.647) total time=  10.7s
[CV 2/5] END gamma=scale, kernel=poly; accuracy: (test=0.718) f1_macro: (test=0.634) precision_macro: (test=0.840) recall_macro: (test=0.648) total time=   9.4s
[CV 3/5] END gamma=scale, kernel=poly; accuracy: (test=0.737) f1_macro: (test=0.665) precision_macro: (test=0.847) recall_macro: (test=0.671) total time=   9.7s
[CV 4/5] END gamma=scale, kernel=poly; accuracy: (test=0.729) f1_macro: (test=0.652) precision_macro: (test=0.844) recall_macro: (test=0.661) total time=  10.0s
[CV 5/5] END gamma=scale, kernel=poly; accuracy: (test=0.723) f1_macro: (test=0.641) precision_macro: (test=0.842) recall_macro: (test=0.654) total time=  10.2s
[CV 1/5] END gamma=auto, kernel=poly; accuracy: (test=0.969) f1_macro: (test=0.968) precision_macro: 

In [44]:
for i, clf in enumerate(classifiers.keys()):
    print(f'Average score over sensors for {clf}: {results_avg_clf[i]}')

Average score over sensors for svc: 0.9679268722775852
Average score over sensors for rand_forest: 0.9689068375802434
Average score over sensors for knn: 0.9799686526742138
Average score over sensors for gbc: 0.9778218548110476


- Prepare X_train and y_train (define the multiclass label)

In [45]:
y_train_mc = cluster_train
X_train_mc = signals_train

In [46]:
best_models_mc = []
results_mc = []

for sensor in sensors_sorted:
    best_models_sens = []
    results_sens = []


    print(f'Train sens: {sensor}')
    X_train_sens = X_train_mc[sens_train == sensor]
    y_train_sens = y_train_mc[sens_train == sensor]


    for i, clf_key in enumerate(list(classifiers.keys())):
        print(f'Fitting CV for: {clf_key}')
        gs_cv = GridSearchCV(estimator = classifiers[clf_key], param_grid = search_spaces[i], scoring=scores, refit='accuracy', cv = 5, verbose=2)
        gs_cv.fit(X_train_sens, y_train_sens)

        best_models_sens.append(gs_cv.best_estimator_)
        print(f'Best score: {gs_cv.best_score_}')
        print(f'Best params: {gs_cv.best_params_}')
        results_sens.append(gs_cv.best_score_)


    best_models_mc.append(best_models_sens)
    results_mc.append(results_sens)

results_avg_clf_mc = np.average(np.array(results_mc), axis = 0)

Train sens: 0
Fitting CV for: svc
Fitting 5 folds for each of 2 candidates, totalling 10 fits


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  12.3s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  12.6s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  12.1s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  12.6s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  11.8s
[CV] END ............................gamma=auto, kernel=poly; total time=   3.5s
[CV] END ............................gamma=auto, kernel=poly; total time=   3.3s
[CV] END ............................gamma=auto, kernel=poly; total time=   3.2s
[CV] END ............................gamma=auto, kernel=poly; total time=   3.6s
[CV] END ............................gamma=auto, kernel=poly; total time=   3.1s
Best score: 0.9563701573683424
Best params: {'gamma': 'auto', 'kernel': 'poly'}
Fitting CV for: rand_forest
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  16.1s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  16.2s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  16.1s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  16.1s
[CV] END .............

  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  10.8s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  11.3s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  10.0s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  12.7s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  10.9s
[CV] END ............................gamma=auto, kernel=poly; total time=   2.0s
[CV] END ............................gamma=auto, kernel=poly; total time=   1.7s
[CV] END ............................gamma=auto, kernel=poly; total time=   1.7s
[CV] END ............................gamma=auto, kernel=poly; total time=   2.2s
[CV] END ............................gamma=auto, kernel=poly; total time=   2.2s
Best score: 0.9771509395023884
Best params: {'gamma': 'auto', 'kernel': 'poly'}
Fitting CV for: rand_forest
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  13.4s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  13.4s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  15.3s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  12.9s
[CV] END .............

  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  14.5s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  14.6s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  14.6s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  14.7s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  14.3s
[CV] END ............................gamma=auto, kernel=poly; total time=  10.9s
[CV] END ............................gamma=auto, kernel=poly; total time=  10.4s
[CV] END ............................gamma=auto, kernel=poly; total time=  11.2s
[CV] END ............................gamma=auto, kernel=poly; total time=  10.6s
[CV] END ............................gamma=auto, kernel=poly; total time=  10.8s
Best score: 0.8602649610936904
Best params: {'gamma': 'auto', 'kernel': 'poly'}
Fitting CV for: rand_forest
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  18.3s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  18.5s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  18.2s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  18.3s
[CV] END .............

  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  19.4s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  16.9s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  16.6s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  17.2s


  _warn_prf(average, modifier, msg_start, len(result))


[CV] END ...........................gamma=scale, kernel=poly; total time=  16.9s
[CV] END ............................gamma=auto, kernel=poly; total time=   4.9s
[CV] END ............................gamma=auto, kernel=poly; total time=   5.5s
[CV] END ............................gamma=auto, kernel=poly; total time=   5.4s
[CV] END ............................gamma=auto, kernel=poly; total time=   5.1s
[CV] END ............................gamma=auto, kernel=poly; total time=   4.9s
Best score: 0.9548743570133189
Best params: {'gamma': 'auto', 'kernel': 'poly'}
Fitting CV for: rand_forest
Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  20.3s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  22.5s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  22.7s
[CV] END ...............criterion=log_loss, n_estimators=150; total time=  23.1s
[CV] END .............

In [47]:
# results_avg_clf_mc = np.average(np.array(results_mc, dtype=list).reshape(4, -1)[:, :-1], axis = 0)

for i, clf in enumerate(classifiers.keys()):
    print(f'Average score over sensors for {clf}: {results_avg_clf_mc[i]}')

Average score over sensors for svc: 0.9371651037444351
Average score over sensors for rand_forest: 0.9588393157115007
Average score over sensors for knn: 0.9664199463037649
Average score over sensors for gbc: 0.9652641301020828


In [48]:
best_models_bin

[[SVC(gamma='auto', kernel='poly'),
  RandomForestClassifier(criterion='log_loss', n_estimators=150),
  KNeighborsClassifier(n_neighbors=3),
  HistGradientBoostingClassifier(learning_rate=0.2)],
 [SVC(gamma='auto', kernel='poly'),
  RandomForestClassifier(criterion='log_loss', n_estimators=175),
  KNeighborsClassifier(n_neighbors=3),
  HistGradientBoostingClassifier(learning_rate=0.2)],
 [SVC(gamma='auto', kernel='poly'),
  RandomForestClassifier(criterion='log_loss', n_estimators=150),
  KNeighborsClassifier(n_neighbors=3),
  HistGradientBoostingClassifier(learning_rate=0.2)],
 [SVC(gamma='auto', kernel='poly'),
  RandomForestClassifier(criterion='log_loss', n_estimators=200),
  KNeighborsClassifier(n_neighbors=3),
  HistGradientBoostingClassifier(learning_rate=0.3)]]

In [49]:
best_models_mc 

[[SVC(gamma='auto', kernel='poly'),
  RandomForestClassifier(criterion='log_loss', n_estimators=175),
  KNeighborsClassifier(),
  HistGradientBoostingClassifier(learning_rate=0.3)],
 [SVC(gamma='auto', kernel='poly'),
  RandomForestClassifier(criterion='log_loss', n_estimators=200),
  KNeighborsClassifier(n_neighbors=3),
  HistGradientBoostingClassifier()],
 [SVC(gamma='auto', kernel='poly'),
  RandomForestClassifier(criterion='log_loss', n_estimators=150),
  KNeighborsClassifier(n_neighbors=3),
  HistGradientBoostingClassifier()],
 [SVC(gamma='auto', kernel='poly'),
  RandomForestClassifier(criterion='log_loss', n_estimators=200),
  KNeighborsClassifier(n_neighbors=3),
  HistGradientBoostingClassifier(learning_rate=0.2)]]

### Test

Fix the tests

In [50]:
dir_path_multi = r'./data/features/lab_multi'
test_file_names = ['onset_scenario_3', 'onset_scenario_9', 'onset_scenario_12']
test_cases_names = ['A', 'B', 'C']
test_cases = {}
pd_clusters = [[2], [3], [4, 5]]


for i, test_f in enumerate(test_file_names):
    
    with open(os.path.join(dir_path_multi, test_f), 'rb') as file:
        signal_extracted_scenario = pickle.load(file)
        
        # Cluster labels from clustering are unordered and do not correspond to the acual class. The proper PD-int class is assigned by study of the PRPD
        cluster_labels_old = signal_extracted_scenario[:, 2].copy()
        cluster_labels_new = np.ones(len(cluster_labels_old))
        cluster_labels_new[np.isin(cluster_labels_old, pd_clusters[i])] = 0

        signal_extracted_scenario[:, 2] = cluster_labels_new
        test_cases[test_cases_names[i]] = signal_extracted_scenario
    

- Binary test cases

In [51]:
for case in test_cases_names:

    data_case = test_cases[case]

    id_test = data_case[:, 0]
    sens_test = data_case[:, 1]
    cluster_test = data_case[:, 2]
    signals_test = data_case[:, 3:]
    

    # Scale the input and define binary label
    X_test = signals_test
    y_true = cluster_test

    # Create merged label by sensor (average target class for all events belonging to the same instance)
    unique_inst_id = np.sort(np.unique(id_test))
    y_true_merged = [int(np.average(y_true[id_test == i])) for i in unique_inst_id]

    
    # Iterate through classifiers
    for i, clf_key in enumerate(list(classifiers.keys())):

        classifiers_test = [best_models_bin[j][i] for j in range(4)] # list of 4 classifiers (1 per sensor)

        id_list = []
        y_list = []

        # Iterate through sensors
        for sens in sensors_sorted:

            # Extract X and id for the tried sensor
            X_test_sens = X_test[sens_test == sens]
            id_test_sens = id_test[sens_test == sens]

            # Predict
            y_test_sens = classifiers_test[sens].predict(X_test_sens)

            id_list.extend(id_test_sens)
            y_list.extend(y_test_sens)


        id_array = np.array(id_list)
        y_array = np.array(y_list)
        y_test_merged = [round(np.average(y_array[id_array == id])+0.01) for id in unique_inst_id]

        print(f'Accuracy for case: {case}, classifier {clf_key}: {accuracy_score(y_true_merged, y_test_merged)}')



Accuracy for case: A, classifier svc: 0.9588014981273408
Accuracy for case: A, classifier rand_forest: 0.8838951310861424
Accuracy for case: A, classifier knn: 0.9803370786516854
Accuracy for case: A, classifier gbc: 0.951310861423221
Accuracy for case: B, classifier svc: 0.9519852262234534
Accuracy for case: B, classifier rand_forest: 0.9556786703601108
Accuracy for case: B, classifier knn: 0.9778393351800554
Accuracy for case: B, classifier gbc: 0.9695290858725761
Accuracy for case: C, classifier svc: 0.8580527752502275
Accuracy for case: C, classifier rand_forest: 0.8252957233848953
Accuracy for case: C, classifier knn: 0.89171974522293
Accuracy for case: C, classifier gbc: 0.8844404003639672


- MC test cases

In [53]:
for case in test_cases_names:

    data_case = test_cases[case]

    id_test = data_case[:, 0]
    sens_test = data_case[:, 1]
    cluster_test = data_case[:, 2]
    signals_test = data_case[:, 3:]
    

    # Scale the input and define binary label
    X_test = signals_test
    y_true = cluster_test

    # Create merged label by sensor (average target class for all events belonging to the same instance)
    unique_inst_id = np.sort(np.unique(id_test))
    y_true_merged = [int(np.average(y_true[id_test == i])) for i in unique_inst_id]

    
    # Iterate through classifiers
    for i, clf_key in enumerate(list(classifiers.keys())):

        classifiers_test = [best_models_mc[j][i] for j in range(4)] # list of 4 classifiers (1 per sensor)

        id_list = []
        y_list = []

        # Iterate through sensors
        for sens in sensors_sorted:

            # Extract X and id for the tried sensor
            X_test_sens = X_test[sens_test == sens]
            id_test_sens = id_test[sens_test == sens]

            # Predict
            y_test_sens = classifiers_test[sens].predict(X_test_sens)

            id_list.extend(id_test_sens)
            y_list.extend(y_test_sens)



        id_array = np.array(id_list)
        y_array = np.array(y_list)
        y_array_merged = [round(np.average(y_array[id_array == id])+0.01) for id in unique_inst_id]
        y_test_merged = np.zeros(len(y_array_merged))
        y_test_merged[np.array(y_array_merged) > 5] = 1

        print(f'Accuracy for case: {case}, classifier {clf_key}: {accuracy_score(y_true_merged, y_test_merged)}')

Accuracy for case: A, classifier svc: 0.8848314606741573
Accuracy for case: A, classifier rand_forest: 0.9241573033707865
Accuracy for case: A, classifier knn: 0.947565543071161
Accuracy for case: A, classifier gbc: 0.9044943820224719
Accuracy for case: B, classifier svc: 0.9021237303785781
Accuracy for case: B, classifier rand_forest: 0.9362880886426593
Accuracy for case: B, classifier knn: 0.9122807017543859
Accuracy for case: B, classifier gbc: 0.9141274238227147
Accuracy for case: C, classifier svc: 0.8262056414922657
Accuracy for case: C, classifier rand_forest: 0.8762511373976342
Accuracy for case: C, classifier knn: 0.8835304822565969
Accuracy for case: C, classifier gbc: 0.8589626933575978


In [14]:
folder_path = r'.\data\test_hueng\input'
inputs_all = []

for file in os.listdir(folder_path):
    with open(os.path.join(folder_path, file), 'rb') as f:
        inputs = pickle.load(f)
        print(np.shape(inputs))
        inputs_all.extend(inputs)

np.shape(inputs_all)        

(131, 627)
(57, 627)
(46, 627)
(21, 627)
(133, 627)
(57, 627)
(389, 627)
(167, 627)


(1001, 627)

In [12]:
folder_path = r'.\data\test_hueng\target'
inputs_all = []

for file in os.listdir(folder_path):
    with open(os.path.join(folder_path, file), 'rb') as f:
        inputs = pickle.load(f)
        print(np.shape(inputs))
        inputs_all.append(inputs)

(131, 3)
(46, 3)
(133, 3)
(389, 3)
