In [98]:
import pandas as pd
import numpy as np
import glob
import os

In [99]:
def import_and_filter(dat_folder, string, keys):
    """Run the full load-and-clean pipeline for one plant.

    The stages are, in order: data_import -> time_filter -> data_BAS ->
    alarm_filter.

    Input:
    dat_folder = folder containing raw data.
    string = prefix of the csv files to be imported.
    keys = file name from current directory containing the keys spreadsheet

    Output:
    bas1 = dataframe containing filtered plant data
    key = dataframe containing descriptor key

    Note: the notebook-level names ``time_list`` and ``key_list`` are read
    when this function is called, so they must be defined before the call.
    """
    raw_data, key = data_import(dat_folder, string, keys)

    # Drop the rows whose timestamp matches an excluded date.
    without_excluded_days = time_filter(raw_data, key, time_list)

    # Keep only the selected point types, then apply the alarm filter.
    selected_points = data_BAS(without_excluded_days, key, key_list, dim_remove=[])
    bas1 = alarm_filter(selected_points, key)

    return bas1, key

In [100]:
def data_import(dat_folder, string, keys):
    """imports plant data and creates data frames with raw data and keys

    Every file in ``dat_folder`` whose name starts with ``string`` is read
    with ``pd.read_csv``; the files are processed in sorted path order and
    stacked into one dataframe with a fresh 0..n-1 index.

    dat_folder = folder containing raw data.
    string = prefix of the csv files to be imported.
    keys = file name from current directory containing the keys spreadsheet

    Output:
    df = dataframe containing plant data (empty dataframe if no file matches)
    key = dataframe containing descriptor key"""

    # TODO: validate the inputs (dat_folder exists, string is a str).

    # extracts file names; sorted so the row order of the result does not
    # depend on the order in which the file system lists the files
    dat_list = sorted(glob.glob(os.path.join(dat_folder, string + '*')))
    print(dat_list)

    # reads every file first and concatenates once: calling pd.concat inside
    # the loop re-copies all previously loaded rows on every iteration
    frames = [pd.read_csv(path) for path in dat_list]

    # pd.concat raises on an empty list, so keep returning an empty frame
    # when nothing matched the prefix
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    key = pd.read_excel(keys)

    return df, key

In [101]:
def alarm_filter(bas, key):
    """removes any datapoints with alarms going off or without optimum control

    A row is kept only if every alarm column present in ``bas`` equals 0 and
    its 'OptimumControl' value equals 1.  Alarm columns are the
    'DataPointName' entries of the key rows whose 'Units' contain
    "Normal/Alarm".

    bas = dataframe containing plant data (must have an 'OptimumControl' column)
    key = dataframe containing descriptor key (needs 'Units' and 'DataPointName')

    Output:
    bas = dataframe containing only the rows that pass both filters"""

    # filters keys to select those with alarm units; na=False treats key rows
    # without a unit as non-alarm
    is_alarm = key['Units'].str.contains("Normal/Alarm", na=False)

    # Iterating over a DataFrame yields its column labels, so the data point
    # names have to be read from the 'DataPointName' column explicitly.
    alarm_names = key.loc[is_alarm, 'DataPointName'].dropna()

    # keep only alarm points that are columns of bas (duplicates collapsed)
    vals = [name for name in dict.fromkeys(alarm_names) if name in bas.columns]

    for alm in vals:
        bas = bas[bas[alm] == 0]

    bas = bas[bas['OptimumControl'] == 1]

    return bas

### Updated `data_BAS` function:

In [102]:
# Point types to keep: data_BAS matches each entry against the key's
# 'PointType' column.
key_list = [
    'BAS',
    'Chiller',
    'Condenser Water Pump',
    'Cooling Tower Cell',
]

In [103]:
def data_BAS(df, key, key_list, dim_remove=None):
    '''Filters out descriptors of unwanted point types and data containing NaN values

    Keeps the columns of ``df`` whose name is listed in the key under one of
    the point types in ``key_list``, excluding names that contain 'kW' or
    start with 'CHWV'.  The 'OptimumControl' and 'kW/Ton' columns are always
    appended.  Rows with a NaN in any kept column are dropped.

    df = dataframe containing plant data
    key = dataframe containing descriptor key ('PointType' and 'DataPointName')
    key_list = point types to keep; each entry is matched against 'PointType'
               with Series.str.contains
    dim_remove = currently unused; kept so existing calls keep working

    Output:
    dataframe restricted to the selected columns, without NaN rows

    Side effects: prints every selected name that is not a column of ``df``,
    followed by the shape of the original and of the filtered data.'''

    # collects the DataPointName of every key row matching a requested point type
    names = []
    for point_type in key_list:
        matches = key['PointType'].str.contains(point_type, na=False)
        names.extend(key.loc[matches, 'DataPointName'].dropna().tolist())

    # a point can match more than one entry of key_list; selecting the same
    # column twice would duplicate it in the result
    names = list(dict.fromkeys(names))

    # removes DataPointNames that contain 'kW' or have the prefix CHWV
    vals = [x for x in names if 'kW' not in x and not x.startswith('CHWV')]

    # prints any string not found in the data; built as a separate list because
    # removing items from a list while looping over it skips the next element
    for x in vals:
        if x not in df.columns:
            print(x)

    # columns always kept; excluded from vals_new so they are not selected twice
    required = ['OptimumControl', 'kW/Ton']
    vals_new = [x for x in vals if x in df.columns and x not in required]

    # expresses data using columns specified by the vals list, without NaN rows
    bas = df[vals_new + required].dropna()

    print('Original data contains '+str(df.shape[0])+' points and '+str(df.shape[1])+ ' dimensions.')
    print('Filtered data contains '+str(bas.shape[0])+' points and '+str(bas.shape[1])+ ' dimensions.')
    return bas

### Function to remove data within a given set of timestamps:

In [104]:
# Dates excluded from the analysis: every day from 2017-06-07 through
# 2017-06-21, as 'YYYY-MM-DD' strings (time_filter matches them against the
# 'timestamp' column).
time_list = pd.date_range('2017-06-07', '2017-06-21', freq='D').strftime('%Y-%m-%d').tolist()

In [105]:
def time_filter(df, key, time_list):
    ''' Filters out the specified timestamps from the dataset

    A row is removed when its 'timestamp' string contains any entry of
    ``time_list``.  Entries are joined with '|' and handed to
    Series.str.contains, so they are interpreted as regular expressions.
    Rows with a missing timestamp are kept.

    df = dataframe containing the plant data (needs a 'timestamp' column)
    key = dataframe containing descriptor key (not used by this filter)
    time_list = timestamps to be removed

    Output:
    dataframe without the rows matching ``time_list``'''

    patterns = list(time_list)

    # An empty list would build the pattern '', which matches every string
    # and would therefore drop the whole dataset instead of nothing.
    if len(patterns) == 0:
        return df.copy()

    # na=False: a missing timestamp counts as "no match", which also keeps the
    # mask strictly boolean so that it can be inverted with ~
    matches = df['timestamp'].str.contains('|'.join(patterns), na=False)
    return df[~matches]

In [106]:
# Load the Plt1 csv files, apply every filter, and keep the descriptor key.
df, key = import_and_filter(
    dat_folder='../../../Plt1',
    string='Plt1 m',
    keys='../../../Plt1/Plt1 Points List.xlsx',
)

['../../../Plt1\\Plt1 m 2016-11.csv', '../../../Plt1\\Plt1 m 2016-12.csv', '../../../Plt1\\Plt1 m 2017-01.csv', '../../../Plt1\\Plt1 m 2017-02.csv', '../../../Plt1\\Plt1 m 2017-03.csv', '../../../Plt1\\Plt1 m 2017-04.csv', '../../../Plt1\\Plt1 m 2017-05.csv', '../../../Plt1\\Plt1 m 2017-06.csv', '../../../Plt1\\Plt1 m 2017-07.csv', '../../../Plt1\\Plt1 m 2017-08.csv', '../../../Plt1\\Plt1 m 2017-09.csv', '../../../Plt1\\Plt1 m 2017-10.csv', '../../../Plt1\\Plt1 m 2017-12.csv', '../../../Plt1\\Plt1 m 2018-01.csv', '../../../Plt1\\Plt1 m 2018-02.csv', '../../../Plt1\\Plt1 m 2018-03.csv', '../../../Plt1\\Plt1 m 2018-04.csv']
CommunicationFailure_COV
CH3COM1F
CH3Ready
CH4COM1F
CH4Ready
CH4SURGE
CH5COM1F
CH5Ready
Original data contains 138923 points and 414 dimensions.
Filtered data contains 131597 points and 193 dimensions.
