In [2]:
%reload_ext autoreload
%autoreload 2
import os
import sys
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import plotly.graph_objects as go

# Put the directory two levels above the working directory on the import path
# so the `src.*` project imports that follow can be resolved.
module_path = os.path.abspath('./../..')
already_importable = module_path in sys.path
if not already_importable:
    sys.path.append(module_path)
from src.multicovariate_models.GroupsModelWrapper import GroupsModelWrapper
from src.multicovariate_models.DataLoader import DataLoader

In [4]:
%%time

base_dir = '../../'
simulations_csv_path = os.path.join(base_dir, "data/dataframes/simulations-step0.01.csv")

# Read the simulation table, discard the `origin_file` column and then
# collapse rows that are exact duplicates of each other.
df = pd.read_csv(simulations_csv_path)
df = df.drop(columns=["origin_file"])
df = df.drop_duplicates()

# Disabled experiment, kept for reference only:
# main_df = main_df[['Sensor1', 'Sensor2', 'Sensor3', 'Sensor4', 'J-Apollo', 'J-RN2',
#        'J-RN1', 'node_with_leak']].round(3).drop_duplicates()

# `leak_amount` is handled as text here: the last three characters are cut off
# (presumably a unit suffix -- confirm against the CSV) and the rest is parsed as float.
leak_amount_text = df['leak_amount'].str[:-3]
df['leak_amount'] = leak_amount_text.astype(float)

# drop all the `dpressure` columns
dp_col_names = [col_name for col_name in df.columns if col_name.endswith('_dpressure')]
df = df.drop(columns=dp_col_names)
display(df)

Unnamed: 0,Sensor1,Sensor2,Sensor3,Sensor4,J-Apollo,J-RN2,J-RN1,node_with_leak,leak_amount
0,14.0689,14.3960,17.4695,19.1754,15.4271,15.9280,19.8981,Node_275-A,19.500
1,14.0689,14.3958,17.4695,19.1754,15.4271,15.9280,19.8981,Node_275-A,19.501
2,14.0688,14.3957,17.4695,19.1754,15.4271,15.9280,19.8981,Node_275-A,19.502
3,14.0688,14.3956,17.4694,19.1754,15.4271,15.9280,19.8981,Node_275-A,19.503
4,14.0687,14.3955,17.4694,19.1754,15.4271,15.9280,19.8981,Node_275-A,19.504
...,...,...,...,...,...,...,...,...,...
1009579,14.6058,15.6646,17.7761,19.2672,15.4271,15.9421,19.8981,Node_Jonctiune-267,7.696
1009580,14.6057,15.6646,17.7761,19.2672,15.4271,15.9421,19.8981,Node_Jonctiune-267,7.697
1009581,14.6057,15.6645,17.7760,19.2672,15.4271,15.9421,19.8981,Node_Jonctiune-267,7.698
1009582,14.6057,15.6645,17.7760,19.2672,15.4271,15.9421,19.8981,Node_Jonctiune-267,7.699


CPU times: user 3.36 s, sys: 432 ms, total: 3.79 s
Wall time: 3.98 s


In [5]:
%%time

# Build the loader shared by all models and draw the train/test partition from it.
data_loader = DataLoader(df)
node_split = data_loader.get_random_data_split_by_node()
X_train, X_test, y_train, y_test, enc_node_dict = node_split
# NOTE(review): the split method is named "random" and no seed is passed here, so the
# partition presumably changes between runs -- confirm whether DataLoader seeds it.

Train data shape: (907200, 10), test data shape: (100944, 10)
x_train shape: (907200, 7), y_train shape: (907200,)
CPU times: user 2.95 s, sys: 336 ms, total: 3.28 s
Wall time: 3.29 s


In [6]:
import numpy as np
# Import the submodule explicitly. A bare `import sklearn` does not load
# `sklearn.neighbors`; the later `sklearn.neighbors.KNeighborsClassifier` lookups
# would then only work if some other import happened to pull the submodule in.
# This statement still binds the top-level name `sklearn`, so later cells are unaffected.
import sklearn.neighbors

# Sample counts; `n_test` is the denominator of the accuracy computed in later cells.
n_train = X_train.shape[0]
n_test = X_test.shape[0]

# Apply the loader's absolute-pressure preprocessing to both partitions.
X_train_preproc = data_loader.preprocess_abs_pressure(X_train)
X_test_preproc = data_loader.preprocess_abs_pressure(X_test)

# Quick sanity check: per-row sums of the preprocessed training features.
print(X_train_preproc.sum(axis=1))

[119.079 118.531 118.53  ... 117.882 118.905 117.73 ]


In [23]:
# Sweep k = 1..25 and print the classification accuracy (CA) on the test partition.
for n_neighbours in range(1, 26):
    classifier = sklearn.neighbors.KNeighborsClassifier(
        n_neighbors=n_neighbours,
        p=np.inf,  # Minkowski power parameter; infinity gives the max-coordinate distance
    )
    # `fit` returns the estimator itself, so training and prediction can be chained.
    y_hat = classifier.fit(X_train_preproc, y_train).predict(X_test_preproc)

    # Number of test samples whose predicted label differs from the true one.
    total_diff = np.sum(y_test != y_hat)
    ca = 1.0 - total_diff / n_test

    print(f'n_neighbours: {n_neighbours}, CA: {ca}')

n_neighbours: 1, CA: 0.5961027896655571
n_neighbours: 2, CA: 0.6253368204152798
n_neighbours: 3, CA: 0.5938837375178316
n_neighbours: 4, CA: 0.6183626565224283
n_neighbours: 5, CA: 0.5959541924235219
n_neighbours: 6, CA: 0.6183923759708353
n_neighbours: 7, CA: 0.5956173720082423
n_neighbours: 8, CA: 0.6165101442383896
n_neighbours: 9, CA: 0.6009569662387066
n_neighbours: 10, CA: 0.6163318275479474
n_neighbours: 11, CA: 0.6026113488666984
n_neighbours: 12, CA: 0.6166389285148202
n_neighbours: 13, CA: 0.6010064986527184
n_neighbours: 14, CA: 0.6152123949912822
n_neighbours: 15, CA: 0.6047412426692027
n_neighbours: 16, CA: 0.616777619274053
n_neighbours: 17, CA: 0.6075943097162784
n_neighbours: 18, CA: 0.6192740529402441
n_neighbours: 19, CA: 0.6075844032334761
n_neighbours: 20, CA: 0.6199080678395943
n_neighbours: 21, CA: 0.6102492471073071
n_neighbours: 22, CA: 0.6181249009351719
n_neighbours: 23, CA: 0.6166290220320177
n_neighbours: 24, CA: 0.6246136471707084
n_neighbours: 25, CA: 0.61

In [7]:
# Fit the model kept for the rest of the notebook: `classifier` and `y_hat`
# from this cell are what the later analysis and pickling cells read.
n_neighbours = 24

classifier = sklearn.neighbors.KNeighborsClassifier(
    n_neighbors=n_neighbours,
    p=np.inf,  # Minkowski power parameter; infinity gives the max-coordinate distance
)
# `fit` returns the estimator itself, so training and prediction can be chained.
y_hat = classifier.fit(X_train_preproc, y_train).predict(X_test_preproc)

# Classification accuracy = 1 - (mismatching predictions / number of test samples).
total_diff = np.sum(y_test != y_hat)
ca = 1.0 - total_diff / n_test

print(f'n_neighbours: {n_neighbours}, CA: {ca}')

n_neighbours: 24, CA: 0.6254457917261056


In [28]:
# Which nodes can I detect accurately?

import json
import wntr
import src.analytics.analytics as analytics

network_fname = '../../data/epanet_networks/Braila_V2022_2_2.inp'
water_network_model = wntr.network.WaterNetworkModel(network_fname)
network_graph = water_network_model.get_graph()

# Invert the label encoding (node id -> class number becomes class number -> node id).
# The first five characters of each encoded id are dropped so the result can be used as
# a key into `node_dist_map`; presumably that is the 'Node_' prefix of `node_with_leak`.
nodeN_to_node_id_map = {nodeN: node_id[5:] for node_id, nodeN in enc_node_dict.items()}

# precompute the distances
# For every graph node, store the two parallel vectors returned by the analytics helper:
# `node_id_vec[i]` is a node id and `path_len_vec[i]` the path length associated with it.
node_dist_map = {}
for node_id in network_graph.nodes():
    path_len_vec, node_id_vec = analytics.undir_dist_from_set_vec(network_graph, [node_id])
    node_dist_map[node_id] = {
        'path_len_vec': path_len_vec,
        'node_id_vec': node_id_vec
    }

# Per true node: hit/miss counters plus histograms of which node was predicted instead
# and how far (by path length) that wrong prediction was from the true node.
stats_map = {}

# node id -> {other node id: path length}. Built lazily, only for true nodes that are
# actually misclassified, so each miss costs one dict lookup instead of a Python-level
# scan over every node of the graph.
dist_lookup_cache = {}

for y_val_int64, y_hat_val in zip(y_test, y_hat):
    y_val_str = nodeN_to_node_id_map[y_val_int64]
    is_undetectable = 0

    if y_val_str not in stats_map:
        stats_map[y_val_str] = {
            'n_hits': 0,
            'n_misses': 0,
            'is_undetectable': is_undetectable,
            'misclassify_id_map': {},
            'misclassify_dist_map': {},
        }
    node_stats = stats_map[y_val_str]

    if y_val_int64 == y_hat_val:
        node_stats['n_hits'] += 1
        continue

    node_stats['n_misses'] += 1
    y_hat_val_str = nodeN_to_node_id_map[y_hat_val]

    # Count how often each wrong node is predicted for this true node.
    misclassify_id_map = node_stats['misclassify_id_map']
    misclassify_id_map[y_hat_val_str] = misclassify_id_map.get(y_hat_val_str, 0) + 1

    # get the distance between y and y_hat
    dist_by_node_id = dist_lookup_cache.get(y_val_str)
    if dist_by_node_id is None:
        node_dist_props = node_dist_map[y_val_str]
        dist_by_node_id = dict(zip(node_dist_props['node_id_vec'], node_dist_props['path_len_vec']))
        dist_lookup_cache[y_val_str] = dist_by_node_id

    # -1 marks "predicted node not present in the distance vector"; treated as a hard error.
    misclassify_dist = dist_by_node_id.get(y_hat_val_str, -1)
    assert misclassify_dist >= 0, f'no distance from {y_val_str} to {y_hat_val_str}'

    # Cap the distance at 10000 (presumably so very large or infinite path lengths
    # stay bounded in the histogram -- confirm the intended unit and limit).
    misclassify_dist = min(10000, misclassify_dist)

    # Histogram of misclassification distances for this true node.
    misclassify_dist_map = node_stats['misclassify_dist_map']
    misclassify_dist_map[misclassify_dist] = misclassify_dist_map.get(misclassify_dist, 0) + 1

# Derive per-node summary figures from the raw counters collected in `stats_map`.
for node_stats in stats_map.values():
    total_samples = node_stats['n_hits'] + node_stats['n_misses']
    node_stats['hit_perc'] = float(node_stats['n_hits']) / total_samples

    misclassify_dist_map = node_stats['misclassify_dist_map']
    total_misclassify_count = sum(misclassify_dist_map.values())

    # Accumulate distance * count and track the largest distance seen (0.0 when no misses).
    weighted_dist_sum = 0.0
    max_misclassify_dist = 0.0
    for misclassify_dist, misclassify_count in misclassify_dist_map.items():
        weighted_dist_sum += misclassify_dist * misclassify_count
        if misclassify_dist > max_misclassify_dist:
            max_misclassify_dist = misclassify_dist

    # NOTE(review): the mean is normalised by ALL samples of the node (hits count as
    # distance 0), not by `total_misclassify_count`, which is computed but never used.
    # Confirm which denominator is intended before changing it.
    mean_misclassify_dist = weighted_dist_sum / total_samples

    node_stats['mean_misclassify_dist'] = mean_misclassify_dist
    node_stats['max_misclassify_dist'] = max_misclassify_dist
    
# Serialise first, open second: opening with 'w' truncates the file immediately, so
# doing it before `json.dumps` would wipe an existing model_stats.json whenever the
# serialisation raises (for example on a key or value type json cannot encode).
json_str = json.dumps(stats_map)
with open(os.path.join(base_dir, 'model_stats.json'), 'w') as f_out:
    f_out.write(json_str)
    

  np.reciprocal(reached_indicator_vec, out=tmp_n)


In [26]:
import pickle as pkl

# store the model to disk
# Each entry pairs a path (relative to `base_dir`) with the object pickled into it:
# the fitted classifier and the data loader it was trained with.
pickle_targets = (
    ('data/models/model-knn-v2.pkl', classifier),
    ('data/models/model-knn-v2-data_loader.pkl', data_loader),
)
for rel_path, obj_to_store in pickle_targets:
    with open(os.path.join(base_dir, rel_path), 'wb') as f_out:
        pkl.dump(obj_to_store, f_out)

In [36]:
import pickle as pkl
import src.multicovariate_models.knn_functions as knn_functions

# Reload the classifier pickled earlier. Unpickling executes code embedded in the
# file, so only load model files produced by this project.
model_path = os.path.join(base_dir, 'data/models/model-knn-v2.pkl')
with open(model_path, 'rb') as model_file:
    model = pkl.load(model_file)

# Not referenced anywhere else in this cell; kept as in the original.
ftr_vec = [
    0.0, 21.424266250696373, 19.966521309192206, -24.27965348189415,
    0.0, 17.30824742268041, 19.29278350515464,
]

# Seven readings, in the feature-column order shown in the data table above.
target_node_ftrvec = [14.4490, 15.6394, 17.7471, 19.1062, 15.4271, 15.9166, 19.8981]

# Uses the in-memory `data_loader` (not the pickled copy) together with the reloaded model.
knn_functions.predict_knn(data_loader, model, target_node_ftrvec)

feature vector: [14.449, 15.6394, 17.7471, 19.1062, 15.4271, 15.9166, 19.8981]


{'0': ['Jonctiune-2739'],
 '1': ['Jonctiune-2207'],
 '2': ['-',
  '255-A',
  '258-A',
  '275-A',
  'J-1640',
  'J-Apollo',
  'J-RN1',
  'J-RN2',
  'Jonctiune-1225',
  'Jonctiune-1226',
  'Jonctiune-12372',
  'Jonctiune-12588',
  'Jonctiune-1405',
  'Jonctiune-1406',
  'Jonctiune-1407',
  'Jonctiune-1413',
  'Jonctiune-1414',
  'Jonctiune-1415',
  'Jonctiune-1419',
  'Jonctiune-1421',
  'Jonctiune-1610',
  'Jonctiune-1616',
  'Jonctiune-1628',
  'Jonctiune-1632',
  'Jonctiune-1634',
  'Jonctiune-1635',
  'Jonctiune-1636',
  'Jonctiune-1637',
  'Jonctiune-1638',
  'Jonctiune-1641',
  'Jonctiune-1642',
  'Jonctiune-1646',
  'Jonctiune-1872',
  'Jonctiune-1874',
  'Jonctiune-1875',
  'Jonctiune-1877',
  'Jonctiune-1995',
  'Jonctiune-1996',
  'Jonctiune-1997',
  'Jonctiune-1998',
  'Jonctiune-2176',
  'Jonctiune-2177',
  'Jonctiune-2180',
  'Jonctiune-2181',
  'Jonctiune-2182',
  'Jonctiune-2184',
  'Jonctiune-2185',
  'Jonctiune-2186',
  'Jonctiune-2187',
  'Jonctiune-2188',
  'Jonctiune-

In [56]:
import src.kafka.preprocessing_functions as preproc_fun
import src.configfile as config

def _make_kafka_msg(sensor_readings):
    """Build one sample message in the fixed 12-element layout used by this notebook.

    Only the feature vector differs between the sample messages; every other
    position holds the same constant. The positional layout is reproduced as-is
    (presumably it mirrors a Kafka consumer record -- confirm against the consumer).

    Args:
        sensor_readings: the seven readings placed after the leading ``0`` of
            ``ftr_vector``.

    Returns:
        A new list (with its own nested dict and lists) on every call.
    """
    return [
        "features_braila_leakage_detection_updated",
        0,
        1447,
        1664182828181,
        0,
        None,
        {
            "timestamp": 1663484448808,
            "ftr_vector": [0, *sensor_readings],
        },
        [],
        None,
        -1,
        157,
        -1,
    ]


kafka_msg1 = _make_kafka_msg([14.4490, 15.6394, 17.7471, 19.1062, 15.4271, 15.9166, 19.8981])
kafka_msg2 = _make_kafka_msg([14.5121, 15.6923, 17.7837, 19.1375, 15.4271, 15.9214, 19.8981])
kafka_msg3 = _make_kafka_msg([14.5122, 15.6924, 17.7837, 19.1376, 15.4271, 15.9214, 19.8981])
kafka_msg4 = _make_kafka_msg([14.7023, 15.8556, 17.8979, 19.2417, 15.4271, 15.9376, 19.8981])



# Pick which sample message to push through the processing function.
kafka_msg = kafka_msg4

# Network file location: project base directory joined with the configured V2 file name.
epanet_network_path = os.path.join(base_dir, config.EPANET_NETWORK_FILE_V2)
preproc_fun.process_kafka_msg(kafka_msg, model, data_loader, epanet_network_path)

feature vector: [14.7023, 15.8556, 17.8979, 19.2417, 15.4271, 15.9376, 19.8981]
GENERATING OUTPUT:


{'timestamp': '2022-09-18T09:00:48',
 'timestamp-processed-at': '2022-11-22T16:14:00.504221',
 'status': 200,
 'critical-sensor': 'Jonctiune-2739',
 'deviation': 0.0,
 'method': 'knn+jenks_natural_breaks',
 'epanet-file': 'Braila_V2022_2_2.inp',
 'data': [{'node-name': 'Jonctiune-2739',
   'latitude': 45.247530793505916,
   'longitude': 27.93977157478477,
   'group': 0},
  {'node-name': 'id-2207',
   'latitude': 45.24699441657496,
   'longitude': 27.939296359343107,
   'group': 1},
  {'node-name': 'Jonctiune-4618',
   'latitude': 45.24632184737663,
   'longitude': 27.938703814802007,
   'group': 1},
  {'node-name': '-',
   'latitude': 45.243644598761264,
   'longitude': 27.93380688959838,
   'group': 1},
  {'node-name': 'id-3462',
   'latitude': 45.24447258267515,
   'longitude': 27.935886251052974,
   'group': 1},
  {'node-name': 'id-3954',
   'latitude': 45.249529443723816,
   'longitude': 27.940376048195464,
   'group': 1},
  {'node-name': 'id-12482',
   'latitude': 45.2494436054527

In [55]:
# find feature vectors of node Jonctiune-2207
target_node_id = 'Jonctiune-2207'

# Labels in `node_with_leak` carry a 'Node_' prefix in front of the node id.
target_label = 'Node_' + target_node_id
rows_for_target = df[df['node_with_leak'] == target_label]

# Show the last 10 of the first 10000 rows recorded for that node.
rows_for_target.head(10000).tail(10)

Unnamed: 0,Sensor1,Sensor2,Sensor3,Sensor4,J-Apollo,J-RN2,J-RN1,node_with_leak,leak_amount,encoded_node_with_leak
997514,14.7024,15.8557,17.898,19.2417,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.691,66
997515,14.7024,15.8557,17.8979,19.2417,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.692,66
997516,14.7023,15.8557,17.8979,19.2417,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.693,66
997517,14.7023,15.8556,17.8979,19.2417,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.694,66
997518,14.7023,15.8556,17.8979,19.2417,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.695,66
997519,14.7023,15.8556,17.8979,19.2417,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.696,66
997520,14.7023,15.8556,17.8979,19.2416,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.697,66
997521,14.7023,15.8556,17.8979,19.2416,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.698,66
997522,14.7022,15.8556,17.8979,19.2416,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.699,66
997523,14.7022,15.8555,17.8979,19.2416,15.4271,15.9376,19.8981,Node_Jonctiune-2207,7.7,66
