In [1]:
import numpy as np
import pandas as pd
import os
import copy
import matplotlib.pylab as plt
#import seaborn as sbn
import pickle
import sys

from scipy.stats import ks_2samp
from sklearn.preprocessing import MinMaxScaler, StandardScaler,PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from itertools import permutations, combinations
#from statsmodels.discrete.discrete_model import Logit

## Loading datafiles for classification model training and testing

In [2]:
# Getting path for the 'parent folder'.
# Only the disabled CSV loader below uses these; the live Excel loader reads
# from paths relative to the working directory.
path_cwd = os.getcwd()
path_parent = os.path.abspath(os.path.join(path_cwd, os.pardir))

# Disabled CSV loader, kept for reference:
# datafiles_folder_name = 'Datafiles3'
#
# datafile_training= 'data_training_classification_3_0.01.csv'
# datafile_test = 'data_testing_classification_3_0.01.csv'
#
# path_training = os.path.join(path_parent,datafiles_folder_name, datafile_training)
# path_test = os.path.join(path_parent, datafiles_folder_name, datafile_test)
#
# data_training = pd.read_csv(path_training)
# data_test = pd.read_csv(path_test)

# sheet_name=None makes read_excel return a dict {sheet name: DataFrame};
# pd.concat stacks every sheet row-wise and ignore_index=True renumbers the
# rows 0..n-1 so the per-sheet indices do not repeat.
data_training = pd.concat(pd.read_excel('data/leak_train_11.xlsx', sheet_name=None), ignore_index=True)
data_test = pd.concat(pd.read_excel('data/leak_test_11.xlsx', sheet_name=None), ignore_index=True)

In [3]:
# Peek at the first two rows to check the column layout of the training frame.
data_training.head(2)

Unnamed: 0,leak_link,Node_head1,Node_head2,Node_head3,Node_head5,Node_head6,Node_head8,Node_head9,Node_head11,Node_head13,Link_flow1,Link_flow2,Link_flow3,Link_flow5,Link_flow6,Link_flow8,Link_flow9,Link_flow11,Link_flow13
0,0,0.906364,3.175,0.042222,5.932,1.124,0.272727,3.482,0.764,0.94,16.858182,29.135,-0.055556,7.296,0.0,16.026364,4.612,-1.638,27.46
1,0,0.921818,3.176,0.043333,5.929,1.132,0.245455,3.48,0.77,0.952222,16.970909,29.164,-0.063333,7.272,0.0,15.942727,4.61,-1.62,27.378889


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
data_training.describe()

Unnamed: 0,leak_link,Node_head1,Node_head2,Node_head3,Node_head5,Node_head6,Node_head8,Node_head9,Node_head11,Node_head13,Link_flow1,Link_flow2,Link_flow3,Link_flow5,Link_flow6,Link_flow8,Link_flow9,Link_flow11,Link_flow13
count,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0,64298.0
mean,1.765467,0.998411,3.026405,0.616821,4.117108,1.340521,0.619822,2.074877,0.948424,1.008454,5.235631,3.526214,0.964003,1.388298,1.522424,4.576024,4.193718,-3.465848,1.183556
std,0.993902,1.308345,0.195334,0.98404,2.428319,1.484217,1.085673,1.331008,1.267963,1.252876,4.056622,18.502528,1.34321,2.463868,11.329107,7.988351,3.244175,5.2408,16.273572
min,0.0,-0.18,0.0,-0.21,0.0,-0.06,-0.23,-0.24,-0.4,-0.35,-8.94,-65.3,-0.98,-14.35,-279.91,-10.59,-5.75,-11.29,-40.3
25%,1.0,-0.07,2.885455,-0.07,3.527399,0.05,-0.07,0.623,-0.16,-0.11,3.854,-6.035556,0.106667,0.0,0.0,0.0,1.922222,-8.07,0.0
50%,2.0,0.359045,3.024,0.1,4.26,0.756833,-0.01,2.540455,0.41,0.512,4.08,-2.415182,0.41,1.194881,0.0,0.0,3.826833,-6.4915,0.0
75%,3.0,1.96,3.174545,0.995,5.72,2.458712,1.116273,3.28,2.054409,2.122,7.00475,14.975227,1.663333,1.895556,0.0,10.17,6.33,1.056917,4.75975
max,3.0,4.61,3.42,4.17,11.72,5.16,4.17,3.97,3.86,3.94,36.75,47.71,8.943333,25.42,82.24,248.43,14.6,18.29,31.63


In [5]:
# Selecting the features like means, ks_stat or ks_pval

def feature_selection(data_df,features,output_col):
    """Return the feature columns matching ``features`` followed by the label column(s).

    A column is selected as a feature when its name contains any of the
    ``features`` substrings. Features keep the order in which they are first
    matched (outer loop over ``features``, inner loop over the frame's
    columns); the ``output_col`` names are appended last.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame to select from.
    features : str or iterable of str
        Substring(s) to look for in the column names.
    output_col : str or iterable of str
        Exact name(s) of the label column(s), placed at the end of the result.

    Returns
    -------
    pandas.DataFrame
        ``data_df`` restricted to the selected columns.

    Raises
    ------
    KeyError
        If a name in ``output_col`` is not a column of ``data_df``.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(features, str):
        features = [features]
    if isinstance(output_col, str):
        output_col = [output_col]
    output_col = list(output_col)

    columns = []
    for feature in features:
        for col in data_df.columns:
            if feature not in col:
                continue
            # Skip columns already picked by an earlier substring (no duplicated
            # inputs) and never let a label column double as a feature.
            if col in columns or col in output_col:
                continue
            columns.append(col)

    columns.extend(output_col)
    return data_df[columns]

## Logistic Regression Model

In [6]:
def logistic_reg(output_type,
                 train_df_full,
                 test_df_full,
                 features, # 'obs','prd','stat','pval'
                 label_col=None):
    """Train a logistic-regression classifier and score it on the test frame.

    Parameters
    ----------
    output_type : str
        'binary' fits a default ``LogisticRegression``; any other value fits a
        class-balanced multinomial model with ``max_iter=5000``.
    train_df_full, test_df_full : pandas.DataFrame
        Frames holding both the feature columns and the label column.
    features : list of str
        Substrings handed to ``feature_selection``; columns whose names
        contain one of them become the model inputs.
    label_col : str, optional
        Name of the label column. When ``None`` the label is 'leak' for
        ``output_type == 'binary'`` and 'leak_num' otherwise.

    Returns
    -------
    conf_mat : numpy.ndarray
        Confusion matrix of true vs. predicted test labels.
    beta : numpy.ndarray
        Fitted coefficients (``model.coef_``).
    report_df : pandas.DataFrame
        ``classification_report`` output, transposed so each class/aggregate
        is a row.
    """
    is_binary = output_type == 'binary'
    if label_col is None:
        label_col = 'leak' if is_binary else 'leak_num'
    output_col = [label_col]

    train_df = feature_selection(train_df_full,features,output_col)
    test_df = feature_selection(test_df_full,features,output_col)

    # feature_selection appends the single label column last, so every column
    # before it is a feature. Slicing with -2 would drop the last feature and,
    # in the binary case, use a feature column as the label.
    train_arr = np.array(train_df)
    test_arr = np.array(test_df)
    xdata, ydata = train_arr[:, :-1], train_arr[:, -1]
    xdata_test, ydata_test = test_arr[:, :-1], test_arr[:, -1]

    # Fit the scaler on the training features only and reuse its statistics
    # for the test features.
    scaler = StandardScaler()
    xtrain_norm = scaler.fit_transform(xdata)
    xtest_norm = scaler.transform(xdata_test)

    if is_binary:
        model=LogisticRegression()
    else:
        model=LogisticRegression(multi_class='multinomial',class_weight='balanced',max_iter=5000)

    model.fit(xtrain_norm,ydata)
    beta = model.coef_
    ypred = model.predict(xtest_norm)
    report = classification_report(ydata_test,ypred,output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    conf_mat = confusion_matrix(ydata_test,ypred)

    return conf_mat, beta,report_df

In [7]:
def randomforest(output_type,
                 train_df_full,
                 test_df_full,
                 features, # 'obs','prd','stat','pval'
                 label_col=None,
                 random_state=None):
    """Train a random-forest classifier and score it on the test frame.

    Parameters
    ----------
    output_type : str
        Only used to pick the default label column: 'leak' for 'binary',
        'leak_num' for anything else. The model itself is the same either way.
    train_df_full, test_df_full : pandas.DataFrame
        Frames holding both the feature columns and the label column.
    features : list of str
        Substrings handed to ``feature_selection``; columns whose names
        contain one of them become the model inputs.
    label_col : str, optional
        Name of the label column; overrides the default derived from
        ``output_type``.
    random_state : int, optional
        Seed forwarded to ``RandomForestClassifier``. ``None`` (the default)
        leaves the forest unseeded, so repeated runs may differ.

    Returns
    -------
    conf_mat : numpy.ndarray
        Confusion matrix of true vs. predicted test labels.
    report_df : pandas.DataFrame
        ``classification_report`` output, transposed so each class/aggregate
        is a row.
    """
    if label_col is None:
        label_col = 'leak' if output_type == 'binary' else 'leak_num'
    output_col = [label_col]

    train_df = feature_selection(train_df_full,features,output_col)
    test_df = feature_selection(test_df_full,features,output_col)

    # feature_selection appends the single label column last, so every column
    # before it is a feature. Slicing with -2 would drop the last feature and,
    # in the binary case, use a feature column as the label.
    train_arr = np.array(train_df)
    test_arr = np.array(test_df)
    xdata, ydata = train_arr[:, :-1], train_arr[:, -1]
    xdata_test, ydata_test = test_arr[:, :-1], test_arr[:, -1]

    # Scaling is kept so both model helpers see identically prepared inputs;
    # the scaler is fit on the training features only.
    scaler = StandardScaler()
    xtrain_norm = scaler.fit_transform(xdata)
    xtest_norm = scaler.transform(xdata_test)

    model=RandomForestClassifier(random_state=random_state)

    model.fit(xtrain_norm,ydata)
    ypred = model.predict(xtest_norm)
    report = classification_report(ydata_test,ypred,output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    conf_mat = confusion_matrix(ydata_test,ypred)

    return conf_mat, report_df

## Results for Prediction Error based approach

* Test Set with normal/expected demand based 'no leak data' plus leak data

In [8]:
# Multiclass logistic regression on the columns whose names contain 'obs' or 'prd'.
# NOTE(review): the recorded run of this cell ends in a KeyError: logistic_reg asks
# for a 'leak_num' label column, while the frame previewed earlier only shows
# 'leak_link', 'Node_head*' and 'Link_flow*' columns (none containing 'obs'/'prd').
# Confirm which dataset / column names are intended before relying on this cell.
lrcmat_mean,lrbeta_mean,lrreport_mean = logistic_reg('multi',
                                                                    data_training,
                                                                    data_test,
                                                                   ['obs','prd'])

KeyError: "None of [Index(['leak_num'], dtype='object')] are in the [columns]"

In [9]:
# Print NumPy arrays in full: the summarization threshold is raised to
# sys.maxsize so large arrays are not abbreviated with '...'.
np.set_printoptions(threshold=sys.maxsize)

In [10]:
# Scratch check: label strings '0' followed by '6' through '97'.
# The list is only displayed, not assigned to a name.
[str(0)]+[str(i) for i in range(6,98)]

['0',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '26',
 '27',
 '28',
 '29',
 '30',
 '31',
 '32',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '40',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '51',
 '52',
 '53',
 '54',
 '55',
 '56',
 '57',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '91',
 '92',
 '93',
 '94',
 '95',
 '96',
 '97']

In [11]:
# Wrap the logistic-regression confusion matrix in a labelled DataFrame.
# Row and column labels are the class ids as strings: '0'..'9' and '11'..'118'
# (id 10 is skipped). Earlier variants of this cell used ids 0..34 and 0..429.
conf_mat_labels = [str(class_id) for class_id in range(0, 119) if class_id != 10]
lrcmat_pd = pd.DataFrame(data=lrcmat_mean, columns=conf_mat_labels, index=conf_mat_labels)

NameError: name 'lrcmat_mean' is not defined

In [12]:
# Write the labelled confusion matrix to a CSV file (path is relative to the
# working directory).
lrcmat_pd.to_csv('log_reg_conf_mat_m_3_0.01.csv')

NameError: name 'lrcmat_pd' is not defined

In [13]:
# Write the logistic-regression classification report to a CSV file (path is
# relative to the working directory).
lrreport_mean.to_csv('log_reg_acc_report_m_3_0.01.csv')

NameError: name 'lrreport_mean' is not defined

In [14]:
# Random forest on the columns whose names contain 'obs' or 'prd', using the
# multiclass label.
# NOTE(review): the recorded run of this cell ends in the same KeyError as the
# logistic-regression call ('leak_num' is not a column of the loaded frames);
# confirm the intended dataset / column names.
cmat_mean, report_mean = randomforest('multi',
                                                                    data_training,
                                                                    data_test,
                                                                   ['obs','prd'])

KeyError: "None of [Index(['leak_num'], dtype='object')] are in the [columns]"

In [15]:
# Display the random-forest classification report returned by randomforest().
report_mean

NameError: name 'report_mean' is not defined