## Model testing on prepared data

In [1]:
%reload_ext autoreload
%autoreload 2
import os
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go

# Resolve the directory two levels above the current working directory and
# register it on sys.path (once) so the project's `src` package is importable.
module_path = os.path.abspath('./../..')
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)
from src.multicovariate_models.gmm_functions import predict_groups_gmm, fit_gaussian_mixture_model
from src.multicovariate_models.GroupsModelWrapper import GroupsModelWrapper
from src.multicovariate_models.DataLoader import DataLoader

In [2]:
%%time

main_df = pd.read_csv("simulated_sensor_data_8_cols.csv")
main_df = main_df.drop(columns=["origin_file"]).drop_duplicates()
"""main_df = main_df[['Sensor1', 'Sensor2', 'Sensor3', 'Sensor4', 'J-Apollo', 'J-RN2',
       'J-RN1', 'node_with_leak']].round(3).drop_duplicates()"""
display(main_df)

Unnamed: 0,Sensor1,Sensor2,Sensor3,Sensor4,J-Apollo,J-RN2,J-RN1,node_with_leak,leak_amount
0,14.8043,15.9458,17.9639,19.3175,15.4271,15.9506,19.8981,Node_Jonctiune-J-34,0.5LPS
1,14.8043,15.9458,17.9639,19.3175,15.4271,15.9506,19.8981,Node_Jonctiune-J-34,0.501LPS
2,14.8043,15.9458,17.9639,19.3174,15.4271,15.9506,19.8981,Node_Jonctiune-J-34,0.502LPS
3,14.8043,15.9457,17.9639,19.3174,15.4271,15.9506,19.8981,Node_Jonctiune-J-34,0.503LPS
4,14.8043,15.9457,17.9639,19.3174,15.4271,15.9506,19.8981,Node_Jonctiune-J-34,0.504LPS
...,...,...,...,...,...,...,...,...,...
2876291,14.5295,15.5398,17.6939,19.2505,15.4271,15.9395,19.8981,Node_Jonctiune-267,10.095LPS
2876292,14.5295,15.5398,17.6939,19.2505,15.4271,15.9395,19.8981,Node_Jonctiune-267,10.096LPS
2876293,14.5295,15.5397,17.6939,19.2505,15.4271,15.9395,19.8981,Node_Jonctiune-267,10.097LPS
2876294,14.5294,15.5396,17.6938,19.2505,15.4271,15.9395,19.8981,Node_Jonctiune-267,10.098LPS


CPU times: user 6.45 s, sys: 732 ms, total: 7.18 s
Wall time: 7.66 s


## DataLoader
Here we prepare the data splits (`x_train`, `x_test`, `y_train`, `y_test`) that the models below are trained and evaluated on.

In [3]:
%%time

# prepare the data for all models
data_loader = DataLoader(main_df)
x_train, x_test, y_train, y_test, enc_node_dict = data_loader.get_random_data_split_by_node()
print(x_train, x_test, y_train, y_test)

Train data shape: (2583180, 10), test data shape: (287147, 10)
x_train shape: (2583180, 7), y_train shape: (2583180,)
[[14.6317 15.7578 17.6711 ... 15.4271 15.9355 19.8981]
 [14.6646 15.7933 17.7162 ... 15.4271 15.9378 19.8981]
 [14.7709 15.9058 17.8906 ... 15.4271 15.947  19.8981]
 ...
 [14.6437 15.7998 17.8381 ... 15.4271 15.8726 19.8981]
 [14.7886 15.9313 17.9503 ... 15.4271 15.9406 19.8981]
 [14.7655 15.9103 17.9323 ... 15.4271 15.9307 19.8981]] [[14.69   15.8199 17.754  ... 15.4271 15.9398 19.8981]
 [14.6796 15.8092 17.7381 ... 15.4271 15.9389 19.8981]
 [14.7999 15.9387 17.9446 ... 15.4271 15.95   19.8981]
 ...
 [14.6989 15.8498 17.8807 ... 15.4271 15.8999 19.8981]
 [14.6403 15.7968 17.8355 ... 15.4271 15.8709 19.8981]
 [14.8086 15.9494 17.9655 ... 15.4271 15.9491 19.8981]] [  0   0   0 ... 126 126 126] [  0   0   0 ... 126 126 126]
CPU times: user 8.89 s, sys: 776 ms, total: 9.66 s
Wall time: 9.66 s


In [6]:
%%time

x_train_v2, x_test_v2, y_train_v2, y_test_v2, enc_node_dict_v2 = \
                        data_loader.get_sequential_subsample_data_split_by_leaks(subsample_size=0.15)
print(x_train_v2, x_test_v2, y_train_v2, y_test_v2)

Train data shape: (387477, 11), test data shape: (43053, 11)
x_train shape: (387477, 7), y_train shape: (387477,)
[[14.8043 15.9458 17.9639 ... 15.4271 15.9506 19.8981]
 [14.8043 15.9458 17.9639 ... 15.4271 15.9506 19.8981]
 [14.8043 15.9458 17.9639 ... 15.4271 15.9506 19.8981]
 ...
 [14.7153 15.8299 17.8845 ... 15.4271 15.9465 19.8981]
 [14.7153 15.8298 17.8845 ... 15.4271 15.9465 19.8981]
 [14.7153 15.8298 17.8844 ... 15.4271 15.9465 19.8981]] [[14.8042 15.9457 17.9639 ... 15.4271 15.9506 19.8981]
 [14.804  15.9455 17.9638 ... 15.4271 15.9506 19.8981]
 [14.8038 15.9454 17.9637 ... 15.4271 15.9505 19.8981]
 ...
 [14.716  15.8307 17.885  ... 15.4271 15.9466 19.8981]
 [14.7157 15.8304 17.8848 ... 15.4271 15.9466 19.8981]
 [14.7154 15.83   17.8846 ... 15.4271 15.9466 19.8981]] [118 118 118 ...  55  55  55] [118 118 118 ...  55  55  55]
CPU times: user 3.01 s, sys: 168 ms, total: 3.18 s
Wall time: 3.18 s


## GMM model testing

In [5]:
%%time
# sklearn GroupShuffleSplit and pandas.cut may come in handy for splitting data sometime

# GMM model testing
model_random_s = GroupsModelWrapper(x_train, x_test, y_train, y_test, enc_node_dict, model_type="GMM")
#model_seq_s = GroupsModelWrapper(x_train_v2, x_test_v2, y_train_v2, y_test_v2, enc_node_dict_v2, 
#                                 model_type="GMM")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 64.6 µs


In [8]:
%%time

# Train the GMM wrapper held in model_seq_s (presumably the one built on the
# *_v2 sequential-subsample split — confirm against the setup cell).
# Cost note: the recorded run of this cell took about 1h 18min wall time.
model_seq_s.train()

CPU times: user 3h 25min 3s, sys: 10h 24min 32s, total: 13h 49min 35s
Wall time: 1h 18min 6s


In [9]:
%%time
# Evaluate the trained model_seq_s wrapper; in the recorded run this call
# printed accuracy, precision and recall and took about 2.5 minutes.
model_seq_s.evaluate_model_on_node_basis()


Accuracy: 0.013448540171416626
Precision: 0.014040441480686672, recall: 0.013448540171416626
CPU times: user 2min 29s, sys: 48 ms, total: 2min 29s
Wall time: 2min 29s


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
%%time

# Train and then evaluate the GMM wrapper built on the random split
# (x_train / x_test / y_train / y_test).
# Cost note: the recorded run of this cell took about 19h 53min wall time,
# so expect a full re-run of the notebook to be dominated by this cell.
model_random_s.train()
model_random_s.evaluate_model_on_node_basis()


Accuracy: 0.007717301591171072
Precision: 0.006841631437694475, recall: 0.007717301591171073
CPU times: user 1d 18h 30min 12s, sys: 3d 1h 2min 9s, total: 4d 19h 32min 22s
Wall time: 19h 52min 56s


## SVM model testing

In [10]:
%%time

# SVM model testing
svm_model_random_s = GroupsModelWrapper(x_train, x_test, y_train, y_test, enc_node_dict, model_type="SVM")
svm_model_seq_s = GroupsModelWrapper(x_train_v2, x_test_v2, y_train_v2, y_test_v2, enc_node_dict_v2, 
                                 model_type="SVM")

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 93 µs


In [11]:
%%time

# Train and then evaluate the SVM wrapper built on the *_v2 split.
# Cost note: the recorded run of this cell took about 2h 54min wall time.
# NOTE(review): in the recorded run accuracy == recall == 0.007874... (= 1/127)
# and precision == 6.2e-05 (= (1/127)^2). That pattern is what a classifier
# emitting one single label for every sample would give on 127 equally frequent
# labels — assumption to confirm by inspecting the predictions.
svm_model_seq_s.train()
svm_model_seq_s.evaluate_model_on_node_basis()


Accuracy: 0.007874015748031496
Precision: 6.2000124000248e-05, recall: 0.007874015748031496
CPU times: user 2h 49min 19s, sys: 4min 47s, total: 2h 54min 6s
Wall time: 2h 54min 6s


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
%%time

# Train and then evaluate the SVM wrapper built on the random split.
# NOTE(review): this cell has no recorded execution count or output, so it has
# not been run in the saved notebook; its runtime on the full training split
# is unknown.
svm_model_random_s.train()
svm_model_random_s.evaluate_model_on_node_basis()