In [60]:
import pandas as pd
import numpy as np
import glob
import os

In [61]:
def import_and_filter(dat_folder, string, keys):
    """Run the import -> descriptor selection -> alarm filtering pipeline.

    The raw files are read with data_import, reduced with data_BAS (no
    extra descriptors removed) and finally passed through alarm_filter.

    Input:
    dat_folder = folder containing raw data.
    string = prefix of the csv files to be imported.
    keys = file name from current directory containing the keys spreadsheet

    Output:
    bas1 = dataframe containing filtered plant data
    key = dataframe containing descriptor key
    """
    raw_data, key = data_import(dat_folder, string, keys)
    # data_BAS runs first; its result is what alarm_filter receives.
    filtered = alarm_filter(data_BAS(raw_data, key, dim_remove=[]), key)
    return filtered, key

In [62]:
def data_import(dat_folder, string, keys):
    """Import the plant data csv files and the descriptor key spreadsheet.

    Input:
    dat_folder = folder containing raw data.
    string = prefix of the csv files to be imported.
    keys = file name from current directory containing the keys spreadsheet
           (read with pandas.read_excel)

    Output:
    df = dataframe containing plant data: every matching file stacked
         row-wise in sorted file-name order with a fresh 0..n-1 index
         (an empty dataframe when no file matches)
    key = dataframe containing descriptor key
    """
    # glob returns matches in arbitrary, platform-dependent order; sorting
    # makes the row order of the combined frame reproducible.
    dat_list = sorted(glob.glob(os.path.join(dat_folder, string + '*')))
    print(dat_list)

    # Read all files first and concatenate once: growing a dataframe with
    # concat inside the loop copies the accumulated data on every pass.
    frames = [pd.read_csv(path) for path in dat_list]
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        # pd.concat raises on an empty list; keep returning an empty frame.
        df = pd.DataFrame()

    key = pd.read_excel(keys)

    return df, key

In [63]:
def alarm_filter(bas, key):
    """Remove datapoints with alarms going off or without optimum control.

    Input:
    bas = dataframe containing plant data; must contain 'OptimumControl'
    key = dataframe containing descriptor key; must contain the columns
          'Units' and 'DataPointName'

    Output:
    bas = rows of the input where every alarm descriptor equals 0 and
          'OptimumControl' equals 1 (the input dataframe is not modified)
    """
    # Descriptors whose unit is "Normal/Alarm"; na=False treats keys with a
    # missing unit as non-alarm descriptors.
    is_alarm = key['Units'].str.contains("Normal/Alarm", na=False)
    alarm_names = key.loc[is_alarm, 'DataPointName']

    # Iterate over the descriptor NAMES. Iterating over the key dataframe
    # itself yields its column headers, so no alarm column would ever be
    # matched. Only alarms that are columns of bas can be filtered on.
    alarm_cols = [name for name in alarm_names if name in bas.columns]

    for alm in alarm_cols:
        bas = bas[bas[alm] == 0]

    bas = bas[bas['OptimumControl'] == 1]

    return bas

In [64]:
def data_BAS(df, key, dim_remove=None):
    '''Filters out descriptors containing NaN values, calculated descriptors,
    and miscellaneous descriptors specified by the user.

    Input:
    df = dataframe containing plant data; must contain the columns
         'OptimumControl' and 'kW/Ton'
    key = dataframe containing descriptor key; must contain the columns
          'PointType' and 'DataPointName'
    dim_remove = list of descriptors to remove from dataset (default = None,
                 i.e. remove nothing)

    Output:
    bas = dataframe restricted to the selected descriptors plus
          'OptimumControl' and 'kW/Ton', with rows containing NaN dropped

    Raises:
    ValueError if an entry of dim_remove is not one of the selected
    descriptors.
    '''
    # None instead of a mutable [] default; copy so the caller's list is
    # never touched.
    dim_remove = [] if dim_remove is None else list(dim_remove)

    # finds keys from categories BAS, Condenser Water Pump, Cooling Tower Cell
    # and Chiller (in this order, which fixes the output column order)
    categories = (
        "BAS", "Condenser Water Pump", "Cooling Tower Cell", "Chiller"
    )
    point_types = key['PointType']
    val = []
    for category in categories:
        # na=False: keys without a PointType belong to no category
        in_category = point_types.str.contains(category, na=False)
        val.extend(key.loc[in_category, 'DataPointName'].tolist())

    # removes DataPointNames that contain 'kW' or start with the prefix CHWV;
    # non-string names (e.g. NaN from blank spreadsheet cells) are skipped
    # because they cannot be column headings to select
    vals = [
        x for x in val
        if isinstance(x, str) and 'kW' not in x and not x.startswith('CHWV')
    ]

    # optional dimension remover (removes every occurrence of each name)
    unknown = [dim for dim in dim_remove if dim not in vals]
    if unknown:
        raise ValueError(
            'dim_remove contains descriptors that are not selected: '
            + ', '.join(str(dim) for dim in unknown)
        )
    vals = [x for x in vals if x not in dim_remove]

    # tests whether all values from the point list spreadsheet are column
    # headings of the dataset. The list is not modified while it is being
    # scanned, so every missing descriptor is reported.
    print('Descriptors in the points list that are not in the datasets.')
    for x in vals:
        if x not in df.columns:
            print(x)

    vals_new = [x for x in vals if x in df.columns]

    # expresses data using columns specified by the vals list
    bas = df[vals_new + ['OptimumControl', 'kW/Ton']]

    print(
        'Original data contains ' + str(df.shape[0]) + ' points and '
        + str(df.shape[1]) + ' dimensions.'
    )

    return bas.dropna()

In [65]:
# PointType categories of the descriptor key whose data points are kept.
# Presumably meant to be passed as `key_list` to data_BAS - confirm at the
# call site.
key_list = ['BAS', 'Chiller', 'Condenser Water Pump', 'Cooling Tower Cell']

### Updated `data_BAS` function:

In [67]:
def data_BAS(df, key, key_list=None, dim_remove=None):
    '''Filters out descriptors outside the requested categories, calculated
    descriptors and data containing NaN values.

    Input:
    df = dataframe containing plant data; must contain the columns
         'OptimumControl' and 'kW/Ton'
    key = dataframe containing descriptor key; must contain the columns
          'PointType' and 'DataPointName'
    key_list = PointType categories to keep. Each entry is matched with
               Series.str.contains, i.e. as a regular expression searched
               anywhere in the PointType. Defaults to BAS, Chiller,
               Condenser Water Pump and Cooling Tower Cell.
    dim_remove = optional list of descriptors to exclude (default = None).
                 Accepted so that callers written for the earlier
                 data_BAS(df, key, dim_remove=[]) signature keep working.

    Output:
    bas = dataframe restricted to the selected descriptors plus
          'OptimumControl' and 'kW/Ton', with rows containing NaN dropped

    Raises:
    ValueError if an entry of dim_remove is not one of the selected
    descriptors.
    '''
    if key_list is None:
        key_list = ('BAS', 'Chiller', 'Condenser Water Pump',
                    'Cooling Tower Cell')
    elif isinstance(key_list, str):
        # a bare string would otherwise be iterated character by character
        key_list = [key_list]
    dim_remove = [] if dim_remove is None else list(dim_remove)

    # collects the DataPointNames of every requested category, in the order
    # the categories are given (this fixes the output column order)
    point_types = key['PointType']
    val = []
    for category in key_list:
        # na=False: keys without a PointType belong to no category
        in_category = point_types.str.contains(category, na=False)
        val.extend(key.loc[in_category, 'DataPointName'].tolist())

    # removes DataPointNames that contain 'kW' or start with the prefix CHWV;
    # non-string names (e.g. NaN from blank spreadsheet cells) are skipped
    # because they cannot be column headings to select
    vals = [
        x for x in val
        if isinstance(x, str) and 'kW' not in x and not x.startswith('CHWV')
    ]

    # optional dimension remover (removes every occurrence of each name)
    unknown = [dim for dim in dim_remove if dim not in vals]
    if unknown:
        raise ValueError(
            'dim_remove contains descriptors that are not selected: '
            + ', '.join(str(dim) for dim in unknown)
        )
    vals = [x for x in vals if x not in dim_remove]

    # tests whether all values from the point list spreadsheet are column
    # headings of the dataset and prints those that are not. The list is not
    # modified while it is being scanned, so none is skipped.
    for x in vals:
        if x not in df.columns:
            print(x)

    vals_new = [x for x in vals if x in df.columns]

    # expresses data using columns specified by the vals list
    bas = df[vals_new + ['OptimumControl', 'kW/Ton']]

    # drop incomplete rows once and reuse the result for report and return
    filtered = bas.dropna()

    print('Original data contains '+str(df.shape[0])+' points and '+str(df.shape[1])+ ' dimensions.')
    print('Filtered data contains '+str(filtered.shape[0])+' points and '+str(filtered.shape[1])+ ' dimensions.')
    return filtered

In [68]:
# Load every file in ../../../Plt1 whose name starts with 'Plt1 m' into df,
# and the points-list spreadsheet into key.
df, key = data_import('../../../Plt1', 'Plt1 m', '../../../Plt1/Plt1 Points List.xlsx')

['../../../Plt1\\Plt1 m 2016-11.csv', '../../../Plt1\\Plt1 m 2016-12.csv', '../../../Plt1\\Plt1 m 2017-01.csv', '../../../Plt1\\Plt1 m 2017-02.csv', '../../../Plt1\\Plt1 m 2017-03.csv', '../../../Plt1\\Plt1 m 2017-04.csv', '../../../Plt1\\Plt1 m 2017-05.csv', '../../../Plt1\\Plt1 m 2017-06.csv', '../../../Plt1\\Plt1 m 2017-07.csv', '../../../Plt1\\Plt1 m 2017-08.csv', '../../../Plt1\\Plt1 m 2017-09.csv', '../../../Plt1\\Plt1 m 2017-10.csv', '../../../Plt1\\Plt1 m 2017-12.csv', '../../../Plt1\\Plt1 m 2018-01.csv', '../../../Plt1\\Plt1 m 2018-02.csv', '../../../Plt1\\Plt1 m 2018-03.csv', '../../../Plt1\\Plt1 m 2018-04.csv']


In [69]:
# Dates 2017-06-07 through 2017-06-21; rows whose timestamp contains one of
# these strings are removed by time_filter.
time_list = ['2017-06-07', '2017-06-08', '2017-06-09', '2017-06-10', 
             '2017-06-11', '2017-06-12', '2017-06-13', '2017-06-14',
             '2017-06-15', '2017-06-16', '2017-06-17', '2017-06-18', 
             '2017-06-19', '2017-06-20', '2017-06-21']

### Function to remove the data recorded at a list of timestamps:

In [71]:
def time_filter(df, key, time_list):
    ''' Filters out the rows recorded at the specified timestamps

    Input:
    df = dataframe containing the plant data; must contain a 'timestamp'
         column of strings
    key = dataframe containing descriptor key (not used; kept so existing
          calls keep working)
    time_list = timestamps to be removed. The entries are joined with '|'
                and searched as a regular expression, so a row is removed
                when its timestamp contains any of them.

    Output:
    df = new dataframe without the matching rows; the input is not modified.
         Rows with a missing timestamp are kept.
    '''
    time_list = list(time_list)
    if not time_list:
        # '|'.join([]) is the empty pattern, which matches every string and
        # would drop the whole dataset; nothing to remove means keep all.
        return df.copy()

    # na=False: a missing timestamp matches nothing. Without it the mask
    # contains NaN and cannot be inverted with ~.
    to_remove = df['timestamp'].str.contains('|'.join(time_list), na=False)
    return df[~to_remove]

In [72]:
# Remove the rows whose 'timestamp' matches any entry of time_list; the
# result is bound to a new name and df is left unchanged.
dft = time_filter(df, key, time_list)

In [73]:
# Summary statistics of the time-filtered data.
dft.describe()

Unnamed: 0,BASMODE,CDWDT,CDWP3Failed,CDWP3HZ,CDWP3S,CDWP3SPD,CDWP3SPD_Alarm,CDWP3SS,CDWP3kW,CDWP4Failed,...,kW/Ton Delta,kW/Ton_InLoopFollow,kW/Ton_InLoopNotFollow,kW/Ton_OutLoop,kWh,kWh Delta,kWhSqFt,kWh_InLoopFollow,kWh_InLoopNotFollow,kWh_OutLoop
count,134725.0,135615.0,134896.0,134395.0,134891.0,134902.0,134906.0,134890.0,134355.0,134398.0,...,133928.0,113733.0,20041.0,29.0,134400.0,134337.0,134400.0,137702.0,137709.0,137709.0
mean,1.0,7.28355,0.0,9.806849,0.195143,16.277611,0.175189,0.195159,1.535175,0.0,...,0.411283,0.418362,0.310179,0.384555,101.879092,99.04042,0.000433,90.918045,8.561938,0.019129
std,0.0,2.021277,0.0,19.871663,0.396312,33.067898,0.38013,0.396325,3.135762,0.0,...,0.822621,0.247623,0.052926,0.053876,54.497382,26.756439,0.000232,65.066261,21.695747,1.353118
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-144.917925,0.174558,0.124685,0.295174,3.8,-71.5,1.6e-05,0.0,0.0,0.0
25%,1.0,6.099998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.369188,0.316382,0.271817,0.337893,57.0,84.047239,0.000242,45.399998,0.0,0.0
50%,1.0,7.099998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.425921,0.400126,0.301741,0.367749,86.099998,93.385253,0.000366,82.099998,0.0,0.0
75%,1.0,8.300003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.481777,0.502244,0.341668,0.436518,140.199997,107.229243,0.000596,138.699997,0.0,0.0
max,1.0,27.5,0.0,60.0,1.0,99.999603,1.0,1.0,14.5,0.0,...,0.736649,9.765633,0.84129,0.498266,331.5,284.253855,0.001409,331.5,241.800003,129.800003


In [74]:
# Summary statistics of the unfiltered data, for comparison with dft.
df.describe()

Unnamed: 0,BASMODE,CDWDT,CDWP3Failed,CDWP3HZ,CDWP3S,CDWP3SPD,CDWP3SPD_Alarm,CDWP3SS,CDWP3kW,CDWP4Failed,...,kW/Ton Delta,kW/Ton_InLoopFollow,kW/Ton_InLoopNotFollow,kW/Ton_OutLoop,kWh,kWh Delta,kWhSqFt,kWh_InLoopFollow,kWh_InLoopNotFollow,kWh_OutLoop
count,139042.0,139932.0,139213.0,138712.0,139208.0,139219.0,139223.0,139207.0,138672.0,138715.0,...,138209.0,118000.0,20041.0,33.0,138717.0,138653.0,138717.0,142019.0,142026.0,142026.0
mean,1.0,7.275589,0.0,9.855851,0.196052,16.361062,0.174124,0.195227,1.545879,0.0,...,0.388139,0.432501,0.310179,0.442133,106.620095,96.676552,0.000453,95.874996,8.301691,0.025554
std,0.0,2.000929,0.0,19.915299,0.39701,33.142225,0.379217,0.396377,3.149121,0.0,...,1.672081,0.256909,0.052926,0.165342,61.026688,32.277661,0.000259,70.950974,21.413976,1.876483
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-177.60138,0.174558,0.124685,0.295174,3.8,-216.699997,1.6e-05,0.0,0.0,0.0
25%,1.0,6.099998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.363328,0.319409,0.271817,0.337893,57.700001,83.311716,0.000245,46.599998,0.0,0.0
50%,1.0,7.099998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.423183,0.407123,0.301741,0.404766,88.800003,93.005837,0.000378,85.0,0.0,0.0
75%,1.0,8.300003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.480182,0.512096,0.341668,0.440106,145.800003,107.027919,0.00062,144.300003,0.0,0.0
max,1.0,27.5,0.0,60.0,1.0,99.999603,1.0,1.0,14.5,0.0,...,0.736649,9.765633,0.84129,0.873853,573.099976,400.590114,0.002437,573.099976,241.800003,259.299988
