In [1]:
import numpy as np
import pandas as pd
import os
import copy
import matplotlib.pylab as plt
#import seaborn as sbn
import pickle

from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix
from sklearn.linear_model import LinearRegression,LogisticRegression
from itertools import permutations, combinations

In [2]:
def _read_all_sheets(path):
    """Read every sheet of the workbook at `path` and stack them into one frame."""
    # sheet_name=None makes read_excel return one DataFrame per sheet;
    # concat stacks them and ignore_index renumbers the rows 0..n-1.
    return pd.concat(pd.read_excel(path, sheet_name=None), ignore_index=True)


# Parse each workbook once and split it by `leak_link`, instead of re-reading
# the same file for the no-leak and the leak subsets.
_train_all = _read_all_sheets('data/leak_train_11.xlsx')
_test_all = _read_all_sheets('data/leak_test_11.xlsx')

#df_train_reg_model = pd.read_csv('leak_train.csv')
# Rows with leak_link == 0 are the no-leak observations.
df_train_reg_model = _train_all[_train_all['leak_link']==0]
# Independent copies of the same no-leak rows (the original re-applied the
# identical filter, which also produced separate frames with the same content).
df_train_class_model_ref = df_train_reg_model.copy()
df_train_class_model_rec = df_train_reg_model.copy()

# Proxy for recent observations, those need to be tested for leak
#df_test_class_model = pd.read_csv('leak_test.csv')
df_test_class_model = _test_all[_test_all['leak_link']==0]

# Rows with leak_link > 0 are the leak observations; the value is the leak label.
#leak_training = pd.read_csv('leak_train.csv')
leak_training = _train_all[_train_all['leak_link']>0]
#leak_testing = pd.read_csv('leak_test.csv')
leak_testing = _test_all[_test_all['leak_link']>0]

In [3]:
# Preview the first rows of the leak cases (rows with leak_link > 0) from the training workbook.
leak_training.head()


Unnamed: 0,leak_link,Node_head1,Node_head2,Node_head3,Node_head5,Node_head6,Node_head8,Node_head9,Node_head11,Node_head13,Link_flow1,Link_flow2,Link_flow3,Link_flow5,Link_flow6,Link_flow8,Link_flow9,Link_flow11,Link_flow13
7751,1,-0.07,3.2,0.01,0.0,0.11,-0.13,2.37,0.05,-0.07,4.17,-6.09,0.02,0.0,0.12,0.0,0.43,-7.96,0.0
7752,1,-0.07,3.2,0.01,0.0,0.11,-0.13,2.394444,0.05,-0.07,4.17,-6.09,0.02,0.0,0.12,0.0,0.434444,-7.96,0.0
7753,1,-0.07,3.2,0.01,0.0,0.11,-0.13,2.418889,0.05,-0.07,4.17,-6.09,0.02,0.0,0.12,0.0,0.438889,-7.96,0.0
7754,1,-0.07,3.2,0.01,0.0,0.111111,-0.13,2.443333,0.05,-0.07,4.17,-6.09,0.02,0.0,0.117778,0.0,0.443333,-7.96,0.0
7755,1,-0.07,3.2,0.01,0.0,0.112222,-0.13,2.467778,0.05,-0.07,4.167778,-6.09,0.02,0.0,0.115556,0.0,0.447778,-7.96,0.0


In [4]:
def training_data(df,link_names,head_names):
    """Split a frame into regression inputs (flows) and target (head difference).

    Parameters
    ----------
    df : DataFrame holding the flow and head columns.
    link_names : list of flow column names; all of them become input features.
    head_names : list of head column names; only the first two are used.

    Returns
    -------
    (train_in, train_out)
        train_in  -- 2-D array of the selected flow columns, one row per record.
        train_out -- 1-D array, first head column minus second head column.

    Values are used exactly as stored; no unit conversion is applied here.
    """
    heads = np.array(df[head_names])
    flows = np.array(df[link_names])

    # deltaH between the two sensors of the pair
    head_difference = heads[:, 0] - heads[:, 1]

    return flows, head_difference

In [5]:
# Sensor locations, listed by hand. Each entry is [head_id, flow_id]: the first
# number is appended to 'Node_head' and the second to 'Link_flow' when column
# names are built by the data_in_out_* helpers.

sensor_list = [[sensor_id, sensor_id] for sensor_id in (1, 2, 3, 5, 6, 8, 9, 11, 13)]

# 1-based positions into sensor_list (not the sensor ids themselves) and every
# unordered pair of those positions.
sen_nums = np.arange(len(sensor_list)) + 1
combs = [pair for pair in combinations(sen_nums, 2)]
print(sen_nums)
combs

[1 2 3 4 5 6 7 8 9]


[(1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (1, 6),
 (1, 7),
 (1, 8),
 (1, 9),
 (2, 3),
 (2, 4),
 (2, 5),
 (2, 6),
 (2, 7),
 (2, 8),
 (2, 9),
 (3, 4),
 (3, 5),
 (3, 6),
 (3, 7),
 (3, 8),
 (3, 9),
 (4, 5),
 (4, 6),
 (4, 7),
 (4, 8),
 (4, 9),
 (5, 6),
 (5, 7),
 (5, 8),
 (5, 9),
 (6, 7),
 (6, 8),
 (6, 9),
 (7, 8),
 (7, 9),
 (8, 9)]

In [6]:
# Sanity check: show how one entry of `combs` yields the two sensor positions of a pair.
i = 2
print(len(combs))
print(combs[i])
first_sensor, second_sensor = combs[i][0], combs[i][1]
print('first sensor is ', first_sensor)
print('second sensor is ', second_sensor)

36
(1, 4)
first sensor is  1
second sensor is  4


In [7]:
def data_in_out_noleak(sensor_pair,df):
    """Extract model input (two flows) and output (deltaH) for one sensor pair.

    `sensor_pair` holds two 1-based positions into the notebook-level
    `sensor_list`; each entry there is [head_id, flow_id]. The matching
    'Link_flow<id>' and 'Node_head<id>' columns of `df` are handed to
    training_data(), whose (inputs, output) result is returned unchanged.
    Intended for data without leaks.
    """
    entry_first = sensor_list[sensor_pair[0]-1]
    entry_second = sensor_list[sensor_pair[1]-1]

    # index 1 of an entry is the flow id, index 0 is the head id
    flow_columns = ['Link_flow'+str(entry[1]) for entry in (entry_first, entry_second)]
    head_columns = ['Node_head'+str(entry[0]) for entry in (entry_first, entry_second)]

    inputs, output = training_data(df,flow_columns,head_columns)
    return inputs, output

In [8]:
def data_in_out_withleak(sensor_pair,df):
    """Extract model input (two flows) and output (deltaH) for one sensor pair from leak data.

    The column lookup is exactly the one used for no-leak data: the two 1-based
    positions in `sensor_pair` index the notebook-level `sensor_list`, and the
    matching 'Link_flow<id>' / 'Node_head<id>' columns of `df` are passed to
    training_data(). The previous body repeated data_in_out_noleak() statement
    for statement, so this now delegates to it to keep the two in sync.

    Returns the (inputs, output) pair produced by data_in_out_noleak().
    """
    return data_in_out_noleak(sensor_pair, df)

In [26]:
# Fit one degree-2 polynomial regression (flows -> deltaH) per sensor pair on the
# no-leak training rows and pickle each fitted model to 'linmodel<a><b>.pkl'.
# `poly` and the other loop names stay notebook-level so later cells that
# reference them keep working.
for comb in combs:
    xtrain, ytrain = data_in_out_noleak(comb, df_train_reg_model)

    poly = PolynomialFeatures(degree=2)
    x_train_poly = poly.fit_transform(xtrain)

    # fit() returns the estimator itself, so this is the fitted model
    lin_model = LinearRegression().fit(x_train_poly, ytrain)

    pair_tag = str(comb[0])+str(comb[1])
    pkl_filename = 'linmodel'+pair_tag+'.pkl'
    with open(pkl_filename, 'wb') as file:
        pickle.dump(lin_model, file)

In [27]:
# Column names for the classification table: four columns per sensor pair, in the
# order obs<a><b>, prd<a><b>, stat<a><b>, pval<a><b>, following the order of `combs`.
colnames = []
for comb in combs:
    pair_tag = str(comb[0])+str(comb[1])
    colnames.extend(prefix+pair_tag for prefix in ('obs', 'prd', 'stat', 'pval'))
    

In [28]:
def datafortrees(numcases,df_recent,sample_len,casetype):
    """Build a table of per-sensor-pair residual statistics, one row per random case.

    For each case a random "recent" sample is drawn from `df_recent` and a random
    "reference" sample from the notebook-level `df_train_class_model_ref`. For
    every sensor pair in `combs` the pickled regression model for that pair
    predicts deltaH on both samples, and the two residual distributions are
    compared with a two-sample Kolmogorov-Smirnov test.

    Parameters
    ----------
    numcases : int
        Number of random cases, i.e. rows of the returned frame.
    df_recent : DataFrame
        Observations to be tested; 30% of its rows are sampled per case.
    sample_len : int
        Number of reference rows sampled per case. It does NOT control the
        size of the recent sample (that fraction is fixed at 0.3 below).
    casetype : str
        'noleak' extracts the recent data with data_in_out_noleak(); any other
        value uses data_in_out_withleak().

    Returns
    -------
    DataFrame with columns `colnames`. Per pair the four values are, in order:
    mean reference residual (column 'obs..'), mean recent residual ('prd..'),
    KS statistic ('stat..') and KS p-value ('pval..').

    Notes
    -----
    Reads 'linmodel<a><b>.pkl' from the working directory. pickle.load executes
    arbitrary code from the file, so only load model files written by this
    notebook.
    """
    # Build the feature expansion locally rather than depending on a
    # notebook-level `poly` left behind by the model-fitting cell. degree=2
    # matches the expansion the pickled models were trained with.
    poly_features = PolynomialFeatures(degree=2)

    #fracsize_recent=sample_len/len(df_recent)
    fracsize_recent=0.3
    print(fracsize_recent)

    fracsize_reference = sample_len/len(df_train_class_model_ref)

    # The extractor for the recent sample depends only on casetype, so pick it once.
    if casetype=='noleak':
        extract_recent = data_in_out_noleak
    else:
        extract_recent = data_in_out_withleak

    # Each model is read from disk on first use and then reused for every case,
    # instead of being unpickled again for each (case, pair) combination.
    models = {}

    # Rows are gathered in a list and turned into a frame once at the end;
    # DataFrame.append was removed in pandas 2.0 and was quadratic in a loop.
    rows = []

    # first loop to randomly select a sample test set
    for _ in range(numcases):
        df_recent_sample = df_recent.sample(frac=fracsize_recent)
        df_reference_sample = df_train_class_model_ref.sample(frac=fracsize_reference)

        # second loop to cover all possible sensor pairs
        comb_data = []
        for comb in combs:
            xtest_rec, ytest_rec = extract_recent(comb,df_recent_sample)
            xtest_ref, ytest_ref = data_in_out_noleak(comb,df_reference_sample)

            if comb not in models:
                pkl_filename = 'linmodel'+str(comb[0])+str(comb[1])+'.pkl'
                with open(pkl_filename, 'rb') as file:
                    models[comb] = pickle.load(file)
            lin_model = models[comb]

            xtest_ref_poly = poly_features.fit_transform(xtest_ref)
            xtest_rec_poly = poly_features.fit_transform(xtest_rec)
            pred_ref = lin_model.predict(xtest_ref_poly).reshape(-1)
            pred_rec = lin_model.predict(xtest_rec_poly).reshape(-1)
            error_ref = (ytest_ref-pred_ref)
            error_rec = (ytest_rec-pred_rec)

            # Do the recent residuals follow the same distribution as the reference ones?
            stat,pval = ks_2samp(error_ref,error_rec)
            comb_data.extend([np.mean(error_ref),np.mean(error_rec),stat,pval])

        rows.append(comb_data)

    return pd.DataFrame(rows, columns=colnames)

In [29]:
# Distinct leak labels present in the leak training rows; the training-set cell loops over these.
leak_training.leak_link.unique()


array([1, 2, 3])

In [30]:
## Defining size of training set. Prefer choosing a multiple of 6 
import warnings
# NOTE(review): this silences every warning for the rest of the session; consider
# narrowing it to the specific category that is expected.
warnings.filterwarnings("ignore")
num_samples = 300
num_training_samples_each = 300
# num_training_sample_total = 3000
# num_training_sample_leak = int(num_training_sample_total/6)
# num_training_sample_noleak = num_training_sample_total - 3* (num_training_sample_leak)
#warning('off')

# No-leak cases: "recent" data is drawn from the no-leak training rows.
class_train_noleak = datafortrees(num_training_samples_each,df_train_class_model_rec,num_samples,'noleak')
class_train_noleak['leak']=0
class_train_noleak['leak_num']=0

# loop through all the links with leakage and prepare the dataframe
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
leak_case_frames = []
for i in leak_training.leak_link.unique():
    print(i)
    leak_df = leak_training[leak_training.leak_link == i].reset_index(drop=True)
    print(len(leak_df))
    class_train = datafortrees(num_training_samples_each,leak_df,num_samples,'leak')
    class_train['leak']=1
    class_train['leak_num']=i
    leak_case_frames.append(class_train)

if leak_case_frames:
    # Row labels of each frame are kept as-is, as the append-based version did.
    class_train_all_leaks = pd.concat(leak_case_frames)
else:
    # No leak labels at all: keep an empty frame with the expected columns.
    class_train_all_leaks = pd.DataFrame(columns = class_train_noleak.columns)

data_train_classification = pd.concat([class_train_noleak,class_train_all_leaks],axis=0)

0.3
(1, 144)
(2, 144)
(3, 144)
(4, 144)
(5, 144)
(6, 144)
(7, 144)
(8, 144)
(9, 144)
(10, 144)
(11, 144)
(12, 144)
(13, 144)
(14, 144)
(15, 144)
(16, 144)
(17, 144)
(18, 144)
(19, 144)
(20, 144)
(21, 144)
(22, 144)
(23, 144)
(24, 144)
(25, 144)
(26, 144)
(27, 144)
(28, 144)
(29, 144)
(30, 144)
(31, 144)
(32, 144)
(33, 144)
(34, 144)
(35, 144)


KeyboardInterrupt: 

In [21]:
# Inspect the no-leak training cases (feature columns plus the 'leak' and 'leak_num' labels).
class_train_noleak

Unnamed: 0,obs12,prd12,stat12,pval12,obs13,prd13,stat13,pval13,obs14,prd14,...,obs79,prd79,stat79,pval79,obs89,prd89,stat89,pval89,leak,leak_num
0,0.000481,0.028312,0.035699,0.874723,0.013993,0.011031,0.045054,0.636313,-0.058470,0.027690,...,-0.020465,-0.008644,0.028602,0.977169,-0.013116,-0.001280,0.043978,0.665890,0,0
1,-0.101974,0.005374,0.052796,0.434209,-0.052071,0.003125,0.092473,0.019877,-0.101030,0.003615,...,0.053336,-0.006653,0.042581,0.704180,0.003322,-0.003525,0.025484,0.993568,0,0
2,-0.107069,-0.014312,0.050645,0.487184,0.000114,-0.006548,0.051290,0.470964,-0.067394,-0.016559,...,-0.001909,0.007349,0.033656,0.913867,0.000894,0.004495,0.067419,0.170488,0,0
3,0.036892,0.003886,0.041398,0.736140,-0.010108,0.004022,0.044946,0.639269,0.125637,0.006923,...,0.038543,-0.000716,0.048495,0.543013,-0.009649,-0.003774,0.041075,0.744746,0,0
4,0.092321,-0.018695,0.071505,0.125745,0.003722,0.000877,0.025914,0.992118,0.010417,-0.026905,...,0.001004,0.001417,0.051183,0.473649,-0.001064,-0.004290,0.028172,0.980332,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,-0.063413,0.028670,0.059462,0.292513,-0.031732,0.000447,0.073226,0.110021,0.015494,-0.009324,...,0.031915,-0.037374,0.072473,0.116688,0.010819,-0.001907,0.041183,0.741884,0,0
296,0.045020,-0.003997,0.057634,0.327756,-0.024817,-0.003279,0.066452,0.182745,-0.023497,-0.011523,...,-0.078904,-0.003739,0.061075,0.263716,-0.007517,-0.000290,0.035484,0.879172,0,0
297,0.051424,-0.009880,0.038817,0.803005,0.025932,0.001374,0.049785,0.509209,0.078774,0.005739,...,-0.007483,0.012676,0.031398,0.948259,-0.002767,0.000348,0.040538,0.758958,0,0
298,-0.010666,0.010712,0.034624,0.896215,-0.046444,-0.011465,0.075914,0.088727,-0.026988,0.010098,...,-0.038835,-0.010090,0.041290,0.739015,-0.013638,-0.002579,0.041828,0.724584,0,0


In [14]:
num_samples= 300
num_test_samples_each = 300
# num_test_sample_leak = int(num_test_sample_total/6)
# num_test_sample_noleak = num_test_sample_total - 3* (num_test_sample_leak)

# No-leak cases: "recent" data is drawn from the no-leak rows of the test workbook.
class_test_noleak = datafortrees(num_test_samples_each,df_test_class_model,num_samples,'noleak')
class_test_noleak['leak']=0
class_test_noleak['leak_num']=0

# loop through all the links with leakage and prepare the dataframe
# Frames are collected in a list and concatenated once: DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
leak_case_frames = []
for i in leak_testing.leak_link.unique():
    leak_df = leak_testing[leak_testing.leak_link == i].reset_index(drop=True)
    class_test = datafortrees(num_test_samples_each,leak_df,num_samples,'leak')
    class_test['leak']=1
    class_test['leak_num']=i
    leak_case_frames.append(class_test)

if leak_case_frames:
    # Row labels of each frame are kept as-is, as the append-based version did.
    class_test_all_leaks = pd.concat(leak_case_frames)
else:
    # No leak labels at all: keep an empty frame with the expected columns.
    class_test_all_leaks = pd.DataFrame(columns = class_test_noleak.columns)

data_test_classification = pd.concat([class_test_noleak,class_test_all_leaks],axis=0)

0.3
0.3
0.3
0.3


In [15]:
# Inspect the combined test table (no-leak cases followed by the cases for each leak label).
(data_test_classification)

Unnamed: 0,obs12,prd12,stat12,pval12,obs13,prd13,stat13,pval13,obs14,prd14,...,obs79,prd79,stat79,pval79,obs89,prd89,stat89,pval89,leak,leak_num
0,-0.077475,-0.563435,0.338560,5.985949e-25,0.010461,-0.076746,0.283577,1.569757e-17,-0.004760,-0.191534,...,-0.041947,0.222516,0.244972,3.627021e-13,-0.005791,-0.037484,0.094950,2.434427e-02,0,0
1,-0.017819,-0.490643,0.308594,9.896209e-21,-0.007064,-0.078182,0.264452,2.812094e-15,0.003813,-0.152042,...,0.044228,0.193877,0.191717,3.310794e-08,0.006559,-0.020816,0.067043,2.187595e-01,0,0
2,-0.026496,-0.555832,0.317741,5.661111e-22,-0.027239,-0.065496,0.286135,7.609599e-18,-0.109457,-0.186570,...,0.082064,0.235568,0.195836,1.514727e-08,0.011583,-0.027068,0.100388,1.453509e-02,0,0
3,0.003480,-0.490349,0.308627,9.765636e-21,-0.003520,-0.067805,0.279502,4.881370e-17,-0.037842,-0.140029,...,0.018742,0.221403,0.221130,8.792696e-11,-0.014279,-0.028756,0.090886,3.512574e-02,0,0
4,0.055984,-0.576106,0.332746,4.248818e-24,-0.029428,-0.083032,0.269457,7.505463e-16,0.046109,-0.217097,...,-0.021681,0.214966,0.231838,8.072603e-12,0.002770,-0.030868,0.097962,1.836103e-02,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,-0.019824,-0.707461,0.330961,2.141432e-26,0.016454,0.051300,0.191776,4.886938e-09,0.016069,-0.922814,...,-0.070976,-0.338166,0.298581,1.769301e-21,0.012848,-0.084020,0.415212,7.699258e-42,1,3
296,-0.126010,-0.703405,0.299250,1.415781e-21,-0.027855,0.049401,0.252892,1.849889e-15,-0.068412,-0.885209,...,0.036627,-0.337624,0.362929,8.544299e-32,-0.004967,-0.086028,0.443169,7.372324e-48,1,3
297,0.042621,-0.657925,0.318642,1.848759e-24,-0.042778,0.059900,0.275370,2.749800e-18,-0.004065,-0.914407,...,-0.032709,-0.335258,0.316095,4.534545e-24,0.010726,-0.086069,0.429270,8.169891e-45,1,3
298,-0.142939,-0.699446,0.297366,2.636508e-21,-0.027182,0.057249,0.280326,6.044903e-19,-0.148618,-0.917737,...,0.020731,-0.338361,0.330051,3.005545e-26,0.009253,-0.081374,0.438640,7.490571e-47,1,3


In [16]:
#cmat_pd.to_csv('pp1.csv')
# Persist the test table. The row index is written as the first CSV column, and its
# labels repeat across groups because the concat that built this frame kept each
# group's own index. TODO(review): confirm whether readers of this file need that column.
data_test_classification.to_csv('data_test_classification_1.csv')

In [17]:
# Persist the training table; as with the test table, the (repeating) row index is
# written as the first CSV column.
data_train_classification.to_csv('data_train_classification_1.csv')

In [20]:
# Size check: 146 columns are expected = 36 sensor pairs x 4 statistics + 'leak' + 'leak_num'.
data_train_classification.shape

(1200, 146)