In [1]:
import numpy as np
import pandas as pd
import os
import copy
import matplotlib.pylab as plt
#import seaborn as sbn
import pickle

from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
from sklearn.linear_model import LinearRegression,LogisticRegression
from itertools import permutations, combinations

In [2]:
# Parse each workbook exactly once. `sheet_name=None` returns a dict of
# DataFrames (one per sheet) that is stacked into a single frame; every
# frame below is derived from these two, instead of re-reading the same
# .xlsx file for each subset.
leak_train_all = pd.concat(pd.read_excel('data/leak_train_11.xlsx', sheet_name=None), ignore_index=True)
leak_test_all = pd.concat(pd.read_excel('data/leak_test_11.xlsx', sheet_name=None), ignore_index=True)

# Rows with leak_link == 0: used to fit the regression models and as the
# reference / recent pools when building the classification tables.
df_train_reg_model = leak_train_all[leak_train_all['leak_link'] == 0]
df_train_class_model_ref = df_train_reg_model.copy()
df_train_class_model_rec = df_train_reg_model.copy()

# Proxy for recent observations, those need to be tested for leak
df_test_class_model = leak_test_all[leak_test_all['leak_link'] == 0]

# Rows with leak_link > 0 (the original row index from the stacked frame is kept).
leak_training = leak_train_all[leak_train_all['leak_link'] > 0]
leak_testing = leak_test_all[leak_test_all['leak_link'] > 0]

# The stacked raw frames are no longer needed once the subsets exist.
del leak_train_all, leak_test_all

In [3]:
# Peek at the first five rows of the leak subset.
leak_training.head(n=5)


Unnamed: 0,leak_link,Node_head1,Node_head2,Node_head3,Node_head5,Node_head6,Node_head8,Node_head9,Node_head11,Node_head13,Link_flow1,Link_flow2,Link_flow3,Link_flow5,Link_flow6,Link_flow8,Link_flow9,Link_flow11,Link_flow13
7751,1,-0.07,3.2,0.01,0.0,0.11,-0.13,2.37,0.05,-0.07,4.17,-6.09,0.02,0.0,0.12,0.0,0.43,-7.96,0.0
7752,1,-0.07,3.2,0.01,0.0,0.11,-0.13,2.394444,0.05,-0.07,4.17,-6.09,0.02,0.0,0.12,0.0,0.434444,-7.96,0.0
7753,1,-0.07,3.2,0.01,0.0,0.11,-0.13,2.418889,0.05,-0.07,4.17,-6.09,0.02,0.0,0.12,0.0,0.438889,-7.96,0.0
7754,1,-0.07,3.2,0.01,0.0,0.111111,-0.13,2.443333,0.05,-0.07,4.17,-6.09,0.02,0.0,0.117778,0.0,0.443333,-7.96,0.0
7755,1,-0.07,3.2,0.01,0.0,0.112222,-0.13,2.467778,0.05,-0.07,4.167778,-6.09,0.02,0.0,0.115556,0.0,0.447778,-7.96,0.0


In [4]:
def training_data(df,link_names,head_names):
    """Extract regression inputs and target from a sensor DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in `link_names` and `head_names`.
    link_names : list of str
        Flow columns; returned as the model input exactly as stored
        (no unit conversion is applied here).
    head_names : list of str
        Head columns; only the first two are used.

    Returns
    -------
    tuple
        `(train_in, train_out)` where `train_in` is the 2-D array of the
        flow columns and `train_out` is the 1-D array of the first head
        column minus the second head column.
    """
    flows = np.array(df[link_names])
    heads = np.array(df[head_names])

    # Target is the head difference between the two selected nodes.
    head_difference = heads[:, 0] - heads[:, 1]

    return flows, head_difference

In [5]:
# Manually defined sensor table: each entry is [head id, flow id], and here
# both ids are the same number for every sensor.
sensor_list = [[sensor_id, sensor_id] for sensor_id in (1, 2, 3, 5, 6, 8, 9, 11, 13)]

# 1-based positions into sensor_list, and every unordered pair of positions.
sen_nums = np.arange(len(sensor_list)) + 1
combs = list(combinations(sen_nums, 2))
print(sen_nums)
combs

[1 2 3 4 5 6 7 8 9]


[(1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (4, 5),
 (4, 6),
 (4, 7),
 (4, 8),
 (4, 9),
 (5, 6),
 (5, 7),
 (5, 8),
 (5, 9),
 (6, 7),
 (6, 8),
 (6, 9),
 (7, 8),
 (7, 9),
 (8, 9)]

In [6]:
# Sanity check: show how `combs` is indexed when iterating over sensor pairs.
i = 2
print(len(combs))
print(combs[i])
for _label, _slot in (('first sensor is ', 0), ('second sensor is ', 1)):
    print(_label, combs[i][_slot])

36
(1, 4)
first sensor is  1
second sensor is  4


In [7]:
# function to extract input (flow1, flow2) and output (deltaH) data from datasets prepared earlier
# data without leaks

def data_in_out_noleak(sensor_pair,df):
    """Return (inputs, target) for one sensor pair taken from `df`.

    `sensor_pair` holds two 1-based positions into the module-level
    `sensor_list`. Each `sensor_list` entry supplies a head id (index 0)
    and a flow id (index 1), which are turned into 'Node_head<id>' and
    'Link_flow<id>' column names and passed to `training_data`.

    Returns the `(data_in, data_out)` tuple produced by `training_data`:
    the two flow columns and the head difference (first minus second).
    """
    first_sensor = sensor_list[sensor_pair[0]-1]
    second_sensor = sensor_list[sensor_pair[1]-1]
    chosen = (first_sensor, second_sensor)

    node_name = ['Node_head' + str(entry[0]) for entry in chosen]
    link_name = ['Link_flow' + str(entry[1]) for entry in chosen]

    return training_data(df, link_name, node_name)

In [8]:
# function to extract input and output data with leak

def data_in_out_withleak(sensor_pair,df):
    """Return (inputs, target) for one sensor pair from a frame with leaks.

    The column selection is identical to `data_in_out_noleak` (same
    'Link_flow<id>' / 'Node_head<id>' columns, same head-difference
    target), so this delegates to it instead of keeping a second copy of
    the same code. The separate name is kept so existing callers that
    pick the extractor by case type continue to work.

    Parameters
    ----------
    sensor_pair : sequence of two 1-based positions into `sensor_list`.
    df : pandas.DataFrame containing the required flow and head columns.

    Returns
    -------
    tuple
        `(data_in, data_out)` as returned by `data_in_out_noleak`.
    """
    return data_in_out_noleak(sensor_pair, df)

In [9]:
# Fit one degree-2 polynomial regression per sensor pair on the leak-free
# training rows and persist each fitted model as 'linmodel<a><b>.pkl'.
for comb in combs:
    xtrain, ytrain = data_in_out_noleak(comb, df_train_reg_model)

    # `poly` stays a module-level name: later cells may reference it.
    poly = PolynomialFeatures(degree=2)
    x_train_poly = poly.fit_transform(xtrain)
    lin_model = LinearRegression().fit(x_train_poly, ytrain)

    pkl_filename = 'linmodel' + ''.join(str(position) for position in comb) + '.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(lin_model, file)

In [10]:
# Build the result column names: four columns per sensor pair, in the
# order obs, prd, stat, pval, each suffixed with the two pair positions.
colnames = []
for comb in combs:
    pair_suffix = str(comb[0]) + str(comb[1])
    colnames.extend(prefix + pair_suffix for prefix in ('obs', 'prd', 'stat', 'pval'))
    

In [11]:
def datafortrees(numcases,df_recent,sample_len,casetype):
    """Build a table of per-sensor-pair residual statistics, one row per case.

    For each case a random sample of `df_recent` and a random sample of the
    module-level reference frame `df_train_class_model_ref` are drawn. For
    every sensor pair in `combs`, the pickled regression model for that pair
    ('linmodel<a><b>.pkl') predicts the head difference for both samples and
    the two residual distributions are compared with a two-sample
    Kolmogorov-Smirnov test.

    Parameters
    ----------
    numcases : int
        Number of rows (random cases) to generate.
    df_recent : pandas.DataFrame
        Pool from which the "recent" sample is drawn.
    sample_len : int
        Sizes the reference sample only: the reference fraction is
        `sample_len / len(df_train_class_model_ref)`. The recent sample is
        always a fixed 30% of `df_recent`, regardless of this value.
    casetype : str
        'noleak' extracts the recent data with `data_in_out_noleak`; any
        other value uses `data_in_out_withleak`.

    Returns
    -------
    pandas.DataFrame
        One row per case with the columns in `colnames`. For each pair:
        'obs<a><b>' is the mean residual on the reference sample,
        'prd<a><b>' is the mean residual on the recent sample,
        'stat<a><b>' / 'pval<a><b>' are the KS statistic and p-value.
    """
    # The recent fraction is hard-coded; it is printed so each call leaves a
    # trace in the cell output.
    fracsize_recent=0.3
    print(fracsize_recent)

    fracsize_reference = sample_len/len(df_train_class_model_ref)

    # Pick the extractor for the recent sample once instead of per pair.
    extract_recent = data_in_out_noleak if casetype=='noleak' else data_in_out_withleak

    # Local transformer matching the degree used when the models were fitted;
    # avoids depending on a `poly` object left behind by another cell.
    poly_features = PolynomialFeatures(degree=2)

    # Each model file is unpickled at most once per call rather than once per
    # case. NOTE: pickle.load can execute arbitrary code; these files are
    # expected to be the ones written by this notebook's training cell.
    model_cache = {}

    # Rows are accumulated in a list and turned into a DataFrame once at the
    # end: DataFrame.append was removed in pandas 2.0 and is quadratic when
    # called per row.
    rows = []

    # first loop to randomly select a sample test set
    for _ in range(numcases):
        df_recent_sample = df_recent.sample(frac=fracsize_recent)
        df_reference_sample = df_train_class_model_ref.sample(frac=fracsize_reference)

        # second loop to cover all sensor-pair combinations
        comb_data = []
        for comb in combs:
            xtest_rec, ytest_rec = extract_recent(comb,df_recent_sample)
            xtest_ref, ytest_ref = data_in_out_noleak(comb,df_reference_sample)

            # load the linear regression model (cached), make predictions
            pkl_filename = 'linmodel'+str(comb[0])+str(comb[1])+'.pkl'
            lin_model = model_cache.get(pkl_filename)
            if lin_model is None:
                with open(pkl_filename, 'rb') as file:
                    lin_model = pickle.load(file)
                model_cache[pkl_filename] = lin_model

            xtest_ref_poly = poly_features.fit_transform(xtest_ref)
            xtest_rec_poly = poly_features.fit_transform(xtest_rec)
            pred_ref = lin_model.predict(xtest_ref_poly).reshape(-1)
            pred_rec = lin_model.predict(xtest_rec_poly).reshape(-1)
            error_ref = (ytest_ref-pred_ref)
            error_rec = (ytest_rec-pred_rec)

            # Compare the two residual distributions.
            stat,pval = ks_2samp(error_ref,error_rec)
            comb_data.extend([np.mean(error_ref),np.mean(error_rec),stat,pval])

        rows.append(comb_data)

    return pd.DataFrame(rows, columns=colnames)

In [12]:
# Distinct leak_link labels present in the leak training rows.
leak_training['leak_link'].unique()


array([1, 2, 3])

In [13]:
## Defining size of training set.
import warnings
# NOTE(review): this silences every warning for the rest of the kernel
# session, not just the ones raised in this cell.
warnings.filterwarnings("ignore")
num_samples = 300                 # passed to datafortrees as sample_len
num_training_samples_each = 300   # cases generated per class (no leak + each leak link)

# Leak-free cases, labelled leak=0 / leak_num=0.
class_train_noleak = datafortrees(num_training_samples_each,df_train_class_model_rec,num_samples,'noleak')
class_train_noleak['leak']=0
class_train_noleak['leak_num']=0

# loop through all the links with leakage and prepare one frame per link.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0.
leak_frames = []
for i in leak_training.leak_link.unique():
    print(i)
    leak_df = leak_training[leak_training.leak_link == i].reset_index(drop=True)
    print(len(leak_df))
    class_train = datafortrees(num_training_samples_each,leak_df,num_samples,'leak')
    class_train['leak']=1
    class_train['leak_num']=i
    leak_frames.append(class_train)

# Fall back to an empty, correctly-shaped frame when there are no leak links.
if leak_frames:
    class_train_all_leaks = pd.concat(leak_frames)
else:
    class_train_all_leaks = pd.DataFrame(columns = class_train_noleak.columns)

data_train_classification = pd.concat([class_train_noleak,class_train_all_leaks],axis=0)

0.3


1
17813
0.3
2
20499
0.3


In [None]:
num_samples= 300               # passed to datafortrees as sample_len
num_test_samples_each = 300    # cases generated per class (no leak + each leak link)

# Leak-free cases, labelled leak=0 / leak_num=0.
class_test_noleak = datafortrees(num_test_samples_each,df_test_class_model,num_samples,'noleak')
class_test_noleak['leak']=0
class_test_noleak['leak_num']=0

# loop through all the links with leakage and prepare one frame per link.
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0.
leak_test_frames = []
for i in leak_testing.leak_link.unique():
    leak_df = leak_testing[leak_testing.leak_link == i].reset_index(drop=True)
    class_test = datafortrees(num_test_samples_each,leak_df,num_samples,'leak')
    class_test['leak']=1
    class_test['leak_num']=i
    leak_test_frames.append(class_test)

# Fall back to an empty, correctly-shaped frame when there are no leak links.
if leak_test_frames:
    class_test_all_leaks = pd.concat(leak_test_frames)
else:
    class_test_all_leaks = pd.DataFrame(columns = class_test_noleak.columns)

data_test_classification = pd.concat([class_test_noleak,class_test_all_leaks],axis=0)

0.3
0.3
0.3
0.3


In [None]:
# Display the assembled test classification table.
data_test_classification

Unnamed: 0,obs12,prd12,stat12,pval12,obs13,prd13,stat13,pval13,obs14,prd14,...,obs79,prd79,stat79,pval79,obs89,prd89,stat89,pval89,leak,leak_num
0,-0.014085,-0.519856,0.292813,1.113549e-18,0.026708,-0.059787,0.257829,1.549695e-14,0.077928,-0.163350,...,0.016593,0.213334,0.214075,3.881132e-10,-0.010450,-0.028343,0.098483,1.747016e-02,0,0
1,-0.022888,-0.544810,0.335227,1.851329e-24,-0.029695,-0.068948,0.277763,7.915718e-17,-0.072502,-0.193735,...,0.023319,0.209231,0.241063,9.514372e-13,-0.005722,-0.032730,0.080908,8.075314e-02,0,0
2,0.086162,-0.568583,0.381041,1.139242e-31,0.011066,-0.077949,0.310233,5.969975e-21,0.064107,-0.197450,...,-0.078731,0.255461,0.292713,1.148391e-18,-0.003727,-0.032245,0.084839,5.887592e-02,0,0
3,-0.111937,-0.586904,0.332735,4.268915e-24,-0.002733,-0.079940,0.292724,1.145486e-18,-0.040974,-0.209323,...,0.032106,0.246318,0.243511,5.282921e-13,0.000114,-0.027695,0.099568,1.573921e-02,0,0
4,0.015108,-0.570656,0.352691,4.366980e-27,0.009334,-0.076253,0.291063,1.855001e-18,0.053785,-0.185489,...,-0.049235,0.229976,0.265792,1.897251e-15,0.002635,-0.031231,0.112414,4.194854e-03,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.007058,-0.673597,0.313345,1.186179e-23,-0.019575,0.056850,0.217421,1.649160e-11,-0.022406,-0.875329,...,0.056825,-0.341627,0.368494,8.581577e-33,0.014165,-0.083719,0.445857,1.808537e-48,1,3
296,0.024580,-0.678626,0.305997,1.478377e-22,-0.012924,0.054379,0.243145,2.597938e-14,-0.010230,-0.872648,...,0.012055,-0.344229,0.344965,1.086312e-28,0.005849,-0.081894,0.441647,1.610344e-47,1,3
297,-0.052593,-0.706588,0.339077,1.028435e-27,0.007430,0.055252,0.214768,3.074408e-11,-0.036343,-0.904234,...,0.026576,-0.306527,0.322501,4.667279e-25,-0.001679,-0.084855,0.448488,4.664775e-49,1,3
298,-0.058877,-0.687858,0.339758,7.947447e-28,0.022399,0.056208,0.220239,8.434413e-12,0.015327,-0.893828,...,0.027834,-0.339058,0.357378,8.095815e-31,-0.017279,-0.086349,0.405183,8.499201e-40,1,3


In [None]:
# Write the test classification table to CSV (the row index is included).
data_test_classification.to_csv(path_or_buf='data_test_classification_1.csv')

In [None]:
# Write the training classification table to CSV (the row index is included).
data_train_classification.to_csv(path_or_buf='data_train_classification_1.csv')