In [1]:
import pandas as pd
import os
import glob

# Specify the path to subfolders with CSV files
main_path = r'C:\ResearchFiles\AllFiles_Cleaned'

# List of buildings to include
selected_buildings = ['Atkinson_Hall', 'EBU3B']

# Recursively search for all CSV files in the subfolders of the selected buildings
all_files = []
for building in selected_buildings:
    building_path = os.path.join(main_path, building)
    all_files.extend(glob.glob(os.path.join(building_path, "*.csv")))

# Combine all files into a single DataFrame
df_list = []
for filename in all_files:
    df = pd.read_csv(filename)
    df['smart_plug'] = os.path.basename(filename).split('_')[1]  # Add a column for the smart plug identifier
    df['building'] = os.path.basename(os.path.dirname(filename))  # Add a column for the building
    df_list.append(df)

# Combine all the dataframes into one
combined_df = pd.concat(df_list, ignore_index=True)

In [2]:
# Drop unneccesary columns
combined_df = combined_df.drop(columns = ['analogInput_2', 'analogInput_4', 'analogInput_5', 'binaryInput_3', 'binaryValue_1', 'building'])

In [3]:
# Convert timestamp column to datetime
combined_df['time'] = pd.to_datetime(combined_df['time'])

In [4]:
# Create a new column for the 15 minute time interval
combined_df['15min'] = combined_df['time'].dt.floor('15T')

In [5]:
# Group by 'smart_plug' and '15min', then sum the power readings
min15_df = combined_df.groupby(['smart_plug', '15min']).agg({'analogInput_3': 'sum'}).reset_index()

# Rename 'analogInput_3' to 'average_power_usage'
min15_df.rename(columns={'analogInput_3': 'average_power_usage'}, inplace=True)

# The 'min15_df' now contains the 15 min interval total power usage for each plug in the selected buildings

In [6]:
min15_df

Unnamed: 0,smart_plug,15min,average_power_usage
0,180408,2023-09-14 18:15:00,26075.0
1,180408,2023-09-14 18:30:00,39545.0
2,180408,2023-09-14 18:45:00,192324.0
3,180408,2023-09-14 19:00:00,28517.0
4,180408,2023-09-14 19:15:00,28002.0
...,...,...,...
2480137,944300,2024-06-10 22:45:00,0.0
2480138,944300,2024-06-10 23:00:00,0.0
2480139,944300,2024-06-10 23:15:00,0.0
2480140,944300,2024-06-10 23:30:00,0.0


In [7]:
# Read in the helper data with building name, load type, and manual inspection status
plug_info = pd.read_csv(r'C:\Users\vaugh\Downloads\helper_spreadsheet(2).csv')

In [8]:
plug_info

Unnamed: 0,smart_plug,building_name,Load_Type,Inspection
0,291824,Atkinson Hall,Computer,disconnected
1,291956,Atkinson Hall,Printer,checked
2,292032,Atkinson Hall,Printer,checked
3,183436,Atkinson Hall,Water Dispenser,disconnected
4,181084,Atkinson Hall,TV,checked
...,...,...,...,...
112,283884,EBU3B,Printer,checked
113,288480,EBU3B,Printer,checked
114,291792,EBU3B,Printer,missing
115,565612,EBU3B,Printer,checked


In [9]:
# Convert the plug id columns for both dataframes to strings to prepare for merging
min15_df['smart_plug'] = min15_df['smart_plug'].astype(str)
plug_info['smart_plug'] = plug_info['smart_plug'].astype(str)

# Merge the aggregated data with the plug info data
merged_15min_total = pd.merge(min15_df, plug_info, on='smart_plug')
merged_15min_total = merged_15min_total[
    (merged_15min_total['Inspection'] == 'checked') & 
    (~merged_15min_total['smart_plug'].isin(['299184', '183436', '186204', '634584', '284068', '298808', 
                                       '180552', '944300', '186212', '291792', '283892', '288848', 
                                       '565996', '283884', '287196', '654200', '286460', '182136', 
                                       '297624', '181084', '291824', '284416', '293688', '291080', 
                                       '290240', '297984', '289840', '182552', '291728', '180672', 
                                       '183428', '284556', '285120', '291076', '291740', '183232', 
                                       '284372', '180940', '182268', '182584', '292032', '294576', 
                                       '296488', '298776', '299852', '639140', '651712', '652512', 
                                       '301192', '562240', '566540', '780512']))
]

In [10]:
merged_15min_total

Unnamed: 0,smart_plug,15min,average_power_usage,building_name,Load_Type,Inspection
0,180408,2023-09-14 18:15:00,26075.0,EBU3B,Printer,checked
1,180408,2023-09-14 18:30:00,39545.0,EBU3B,Printer,checked
2,180408,2023-09-14 18:45:00,192324.0,EBU3B,Printer,checked
3,180408,2023-09-14 19:00:00,28517.0,EBU3B,Printer,checked
4,180408,2023-09-14 19:15:00,28002.0,EBU3B,Printer,checked
...,...,...,...,...,...,...
2458847,782264,2024-06-10 22:45:00,1224031.0,EBU3B,Printer,checked
2458848,782264,2024-06-10 23:00:00,120531.0,EBU3B,Printer,checked
2458849,782264,2024-06-10 23:15:00,121818.0,EBU3B,Printer,checked
2458850,782264,2024-06-10 23:30:00,917158.0,EBU3B,Printer,checked


In [33]:
# Create a new df for each specific type of device 
merged_15min_AirPurifier = merged_15min_total[merged_15min_total['Load_Type'] == 'Air Purifier']
merged_15min_WaterDispenser  = merged_15min_total[merged_15min_total['Load_Type'] == 'Water Dispenser']
merged_15min_TV = merged_15min_total[merged_15min_total['Load_Type'] == 'TV']
merged_15min_Printer = merged_15min_total[merged_15min_total['Load_Type'] == 'Printer']
merged_15min_Computer = merged_15min_total[merged_15min_total['Load_Type'] == 'Computer']

# Create a dictionary of dataframes and their names so that we don't have to repeat code
merged_dfs = [merged_15min_total, merged_15min_AirPurifier, merged_15min_WaterDispenser, merged_15min_TV, merged_15min_Printer, merged_15min_Computer]

In [34]:
# Rename the time column more accurately for each df
for i in range(len(merged_dfs)):
    merged_dfs[i] = merged_dfs[i].rename(columns={'15min': 'Timestamp'})

# Reflect the changes back to the original variables
(merged_15min_total, merged_15min_AirPurifier, merged_15min_WaterDispenser, 
 merged_15min_TV, merged_15min_Printer, merged_15min_Computer) = merged_dfs

In [79]:
# Create new dfs with a column for the total building load from all plugs at each timestamp
total_building_plug_load_AllDevices = merged_15min_total.groupby(['building_name', 'Timestamp']).agg({'average_power_usage': 'sum'}).reset_index()
total_building_plug_load_AirPurifier = merged_15min_AirPurifier.groupby(['building_name', 'Timestamp']).agg({'average_power_usage': 'sum'}).reset_index()
total_building_plug_load_WaterDispenser = merged_15min_AirPurifier.groupby(['building_name', 'Timestamp']).agg({'average_power_usage': 'sum'}).reset_index()
total_building_plug_load_TV = merged_15min_TV.groupby(['building_name', 'Timestamp']).agg({'average_power_usage': 'sum'}).reset_index()
total_building_plug_load_Printer = merged_15min_Printer.groupby(['building_name', 'Timestamp']).agg({'average_power_usage': 'sum'}).reset_index()
total_building_plug_load_Computer = merged_15min_Computer.groupby(['building_name', 'Timestamp']).agg({'average_power_usage': 'sum'}).reset_index()

In [80]:
total_building_plug_load_AllDevices

Unnamed: 0,building_name,Timestamp,average_power_usage
0,Atkinson Hall,2023-09-14 18:15:00,7.185511e+05
1,Atkinson Hall,2023-09-14 18:30:00,2.008210e+06
2,Atkinson Hall,2023-09-14 18:45:00,3.941921e+06
3,Atkinson Hall,2023-09-14 19:00:00,8.282920e+05
4,Atkinson Hall,2023-09-14 19:15:00,1.705698e+06
...,...,...,...
45059,EBU3B,2024-06-10 22:45:00,1.085142e+07
45060,EBU3B,2024-06-10 23:00:00,1.425518e+07
45061,EBU3B,2024-06-10 23:15:00,9.822876e+06
45062,EBU3B,2024-06-10 23:30:00,1.360368e+07


In [81]:
# Read in the csv file containing the metering data we have on EBU3B and Atkinson Hall
building_metering = pd.read_csv(r'C:\Users\vaugh\Downloads\building_metering.csv')

In [82]:
# Drop unnecessary columns from the metering data
building_metering = building_metering.drop(columns = ['WARREN.CAL_IT_E2531#Real Power Mean#kW', 'WARREN.CAL_IT_E2532#Real Power Mean#kW', 'WARREN.CAL_IT_Gate_E2530#Real Power Mean#kW', 'WARREN.EBU3B_E2520#Real Power Mean#kW', 'WARREN.EBU3B_E2521#Real Power Mean#kW'])

In [83]:
building_metering

Unnamed: 0,Timestamp,EBU3B total,Atkinson total
0,2023-01-01 0:15:00,312.613174,632.773163
1,2023-01-01 0:30:00,313.511780,630.595501
2,2023-01-01 0:45:00,317.042526,631.453430
3,2023-01-01 1:00:00,318.071701,627.231457
4,2023-01-01 1:15:00,316.035248,634.164612
...,...,...,...
54521,2024-07-21 23:30:00,397.910767,670.800072
54522,2024-07-21 23:45:00,396.301117,667.563164
54523,2024-07-22 0:00:00,394.709076,666.877243
54524,,0.000000,0.000000


In [84]:
# Convert timestamp column to datetime
building_metering['Timestamp'] = pd.to_datetime(building_metering['Timestamp'])

In [85]:
# Merge the plug load dataframes with the metering dataframe
both_merged_15min_AllDevices = pd.merge(total_building_plug_load_AllDevices, building_metering, on='Timestamp')
both_merged_15min_AirPurifier = pd.merge(total_building_plug_load_AirPurifier, building_metering, on='Timestamp')
both_merged_15min_WaterDispenser = pd.merge(total_building_plug_load_WaterDispenser, building_metering, on='Timestamp')
both_merged_15min_TV = pd.merge(total_building_plug_load_TV, building_metering, on='Timestamp')
both_merged_15min_Printer = pd.merge(total_building_plug_load_Printer, building_metering, on='Timestamp')
both_merged_15min_Computer = pd.merge(total_building_plug_load_Computer, building_metering, on='Timestamp')

In [86]:
both_merged_15min_AllDevices

Unnamed: 0,building_name,Timestamp,average_power_usage,EBU3B total,Atkinson total
0,Atkinson Hall,2023-09-14 18:15:00,7.185511e+05,311.203552,705.908989
1,EBU3B,2023-09-14 18:15:00,9.310813e+06,311.203552,705.908989
2,Atkinson Hall,2023-09-14 18:30:00,2.008210e+06,303.462402,703.535606
3,EBU3B,2023-09-14 18:30:00,5.978398e+06,303.462402,703.535606
4,Atkinson Hall,2023-09-14 18:45:00,3.941921e+06,303.112823,703.993347
...,...,...,...,...,...
45059,EBU3B,2024-06-10 23:15:00,9.822876e+06,385.190674,696.803253
45060,Atkinson Hall,2024-06-10 23:30:00,2.160220e+06,381.697632,693.240364
45061,EBU3B,2024-06-10 23:30:00,1.360368e+07,381.697632,693.240364
45062,Atkinson Hall,2024-06-10 23:45:00,3.268262e+06,383.535980,691.132523


In [87]:
# Create a list of all the merged dfs to reduce repetition of code
final_merged_dfs = [both_merged_15min_AllDevices, both_merged_15min_AirPurifier, both_merged_15min_WaterDispenser, both_merged_15min_TV, both_merged_15min_Printer, both_merged_15min_Computer]

for i in range(len(final_merged_dfs)):
    # Convert the plug load data's milliWatt values to kilowatts for continuity with the metering data
    final_merged_dfs[i]['average_power_usage'] = final_merged_dfs[i]['average_power_usage'] / 1_000_000
    
    # Convert all power usage values from KW to KW/hour
    final_merged_dfs[i]['average_power_usage'] = final_merged_dfs[i]['average_power_usage'] / 4
    final_merged_dfs[i]['EBU3B total'] = final_merged_dfs[i]['EBU3B total'] / 4
    final_merged_dfs[i]['Atkinson total'] = final_merged_dfs[i]['Atkinson total'] / 4
    
    # Creating the 'building_load' column based on the building name
    final_merged_dfs[i]['building_load'] = final_merged_dfs[i].apply(
        lambda row: row['Atkinson total'] if row['building_name'] == 'Atkinson Hall' else row['EBU3B total'],
        axis=1
    )
    final_merged_dfs[i] = final_merged_dfs[i].drop(columns=['EBU3B total', 'Atkinson total'])
    
    # Set Timestamp as index
    final_merged_dfs[i] = final_merged_dfs[i].set_index('Timestamp')
    
    # Rename columns for better readability
    final_merged_dfs[i] = final_merged_dfs[i].rename(columns = {'average_power_usage': 'total_plug_load', 'building_load': 'total_building_load'})

# Reflect the changes back to the original variables
(both_merged_15min_AllDevices, both_merged_15min_AirPurifier, both_merged_15min_WaterDispenser, both_merged_15min_TV, both_merged_15min_Printer, both_merged_15min_Computer) = final_merged_dfs

In [88]:
both_merged_15min_AllDevices

Unnamed: 0_level_0,building_name,total_plug_load,total_building_load
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-09-14 18:15:00,Atkinson Hall,0.179638,176.477247
2023-09-14 18:15:00,EBU3B,2.327703,77.800888
2023-09-14 18:30:00,Atkinson Hall,0.502053,175.883902
2023-09-14 18:30:00,EBU3B,1.494600,75.865601
2023-09-14 18:45:00,Atkinson Hall,0.985480,175.998337
...,...,...,...
2024-06-10 23:15:00,EBU3B,2.455719,96.297668
2024-06-10 23:30:00,Atkinson Hall,0.540055,173.310091
2024-06-10 23:30:00,EBU3B,3.400919,95.424408
2024-06-10 23:45:00,Atkinson Hall,0.817066,172.783131


In [None]:
# Calculate the Pearson correlation coefficient between plug load and building metering for each df
AllDevices_correlation = both_merged_15min_AllDevices['total_plug_load'].corr(both_merged_15min_AllDevices['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for all devices is: {AllDevices_correlation}\n")

AirPurifier_correlation = both_merged_15min_AirPurifier['total_plug_load'].corr(both_merged_15min_AirPurifier['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for air purifiers is: {AirPurifier_correlation}\n")

WaterDispenser_correlation = both_merged_15min_WaterDispenser['total_plug_load'].corr(both_merged_15min_WaterDispenser['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for water dispensers is: {WaterDispenser_correlation}\n")

In [97]:
# Filter all the final Dataframes for only Atkinson Hall values
AllDevices_atkinson_merged_15min = both_merged_15min_AllDevices[both_merged_15min_AllDevices['building_name'] == 'Atkinson Hall']
AirPurifier_atkinson_merged_15min = both_merged_15min_AirPurifier[both_merged_15min_AirPurifier['building_name'] == 'Atkinson Hall']
WaterDispenser_atkinson_merged_15min = both_merged_15min_WaterDispenser[both_merged_15min_WaterDispenser['building_name'] == 'Atkinson Hall']
TV_atkinson_merged_15min = both_merged_15min_TV[both_merged_15min_TV['building_name'] == 'Atkinson Hall']
Printer_atkinson_merged_15min = both_merged_15min_Printer[both_merged_15min_Printer['building_name'] == 'Atkinson Hall']
Computer_atkinson_merged_15min = both_merged_15min_Computer[both_merged_15min_Computer['building_name'] == 'Atkinson Hall']

# Filter all the final Dataframes for only EBU3B values
AllDevices_ebu3b_merged_15min = both_merged_15min_AllDevices[both_merged_15min_AllDevices['building_name'] == 'EBU3B']
AirPurifier_ebu3b_merged_15min = both_merged_15min_AirPurifier[both_merged_15min_AirPurifier['building_name'] == 'EBU3B']
WaterDispenser_ebu3b_merged_15min = both_merged_15min_WaterDispenser[both_merged_15min_WaterDispenser['building_name'] == 'EBU3B']
TV_ebu3b_merged_15min = both_merged_15min_TV[both_merged_15min_TV['building_name'] == 'EBU3B']
Printer_ebu3b_merged_15min = both_merged_15min_Printer[both_merged_15min_Printer['building_name'] == 'EBU3B']
Computer_ebu3b_merged_15min = both_merged_15min_Computer[both_merged_15min_Computer['building_name'] == 'EBU3B']

In [98]:
# Print the Pearson correlation coefficient between plug load and building metering for each df in Atkinson Hall
print("Atkinson Hall:\n")

AllDevices_atkinson_correlation = AllDevices_atkinson_merged_15min['total_plug_load'].corr(AllDevices_atkinson_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for all devices in Atkinson Hall is: {AllDevices_atkinson_correlation}\n")

AirPurifier_atkinson_correlation = AirPurifier_atkinson_merged_15min['total_plug_load'].corr(AirPurifier_atkinson_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for air purifiers in Atkinson Hall is: {AirPurifier_atkinson_correlation}\n")

WaterDispenser_atkinson_correlation = WaterDispenser_atkinson_merged_15min['total_plug_load'].corr(WaterDispenser_atkinson_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for water dispensers in Atkinson Hall is: {WaterDispenser_atkinson_correlation}\n")

TV_atkinson_correlation = TV_atkinson_merged_15min['total_plug_load'].corr(TV_atkinson_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for TVs in Atkinson Hall is: {TV_atkinson_correlation}\n")

Printer_atkinson_correlation = Printer_atkinson_merged_15min['total_plug_load'].corr(Printer_atkinson_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for printers in Atkinson Hall is: {Printer_atkinson_correlation}\n")

Computer_atkinson_correlation = Computer_atkinson_merged_15min['total_plug_load'].corr(Computer_atkinson_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for computers in Atkinson Hall is: {Computer_atkinson_correlation}\n")

Atkinson Hall:

The Pearson correlation coefficient between total_plug_load and total_building_load for all devices in Atkinson Hall is: 0.16283566403865118

The Pearson correlation coefficient between total_plug_load and total_building_load for air purifiers in Atkinson Hall is: nan

The Pearson correlation coefficient between total_plug_load and total_building_load for water dispensers in Atkinson Hall is: nan

The Pearson correlation coefficient between total_plug_load and total_building_load for TVs in Atkinson Hall is: 0.0313703663440461

The Pearson correlation coefficient between total_plug_load and total_building_load for printers in Atkinson Hall is: -0.06471216335650273

The Pearson correlation coefficient between total_plug_load and total_building_load for computers in Atkinson Hall is: nan



In [99]:
# Print the Pearson correlation coefficient between plug load and building metering for each df in EBU3B
print("EBU3B:\n")

AllDevices_ebu3b_correlation = AllDevices_ebu3b_merged_15min['total_plug_load'].corr(AllDevices_ebu3b_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for all devices in EBU3B is: {AllDevices_ebu3b_correlation}\n")

AirPurifier_ebu3b_correlation = AirPurifier_ebu3b_merged_15min['total_plug_load'].corr(AirPurifier_ebu3b_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for air purifiers in EBU3B is: {AirPurifier_ebu3b_correlation}\n")

WaterDispenser_ebu3b_correlation = WaterDispenser_ebu3b_merged_15min['total_plug_load'].corr(WaterDispenser_ebu3b_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for water dispensers in EBU3B is: {WaterDispenser_ebu3b_correlation}\n")

TV_ebu3b_correlation = TV_ebu3b_merged_15min['total_plug_load'].corr(TV_ebu3b_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for TVs in EBU3B is: {TV_ebu3b_correlation}\n")

Printer_ebu3b_correlation = Printer_ebu3b_merged_15min['total_plug_load'].corr(Printer_ebu3b_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for printers in EBU3B is: {Printer_ebu3b_correlation}\n")

Computer_ebu3b_correlation = Computer_ebu3b_merged_15min['total_plug_load'].corr(Computer_ebu3b_merged_15min['total_building_load'])
print(f"The Pearson correlation coefficient between total_plug_load and total_building_load for computers in EBU3B is: {Computer_ebu3b_correlation}\n")

EBU3B:

The Pearson correlation coefficient between total_plug_load and total_building_load for all devices in EBU3B is: -0.16378572246478373

The Pearson correlation coefficient between total_plug_load and total_building_load for air purifiers in EBU3B is: -0.03877615691521168

The Pearson correlation coefficient between total_plug_load and total_building_load for water dispensers in EBU3B is: -0.03877615691521168

The Pearson correlation coefficient between total_plug_load and total_building_load for TVs in EBU3B is: -0.012309574080633155

The Pearson correlation coefficient between total_plug_load and total_building_load for printers in EBU3B is: -0.06244141230729659

The Pearson correlation coefficient between total_plug_load and total_building_load for computers in EBU3B is: 0.08251244589731531



In [90]:
# Saving all the finalized dataframes as csv files to my GitHub repository
both_merged_15min_AllDevices.to_csv(r'C:\Users\vaugh\Desktop\smart-plug-research\PlugLoad-BuildingMetering-AllDevices.csv')
both_merged_15min_AirPurifier.to_csv(r'C:\Users\vaugh\Desktop\smart-plug-research\PlugLoad-BuildingMetering-AirPurifiers.csv')
both_merged_15min_WaterDispenser.to_csv(r'C:\Users\vaugh\Desktop\smart-plug-research\PlugLoad-BuildingMetering-WaterDispensers.csv')
both_merged_15min_TV.to_csv(r'C:\Users\vaugh\Desktop\smart-plug-research\PlugLoad-BuildingMetering-TVs.csv')
both_merged_15min_Printer.to_csv(r'C:\Users\vaugh\Desktop\smart-plug-research\PlugLoad-BuildingMetering-Printers.csv')
both_merged_15min_Computer.to_csv(r'C:\Users\vaugh\Desktop\smart-plug-research\PlugLoad-BuildingMetering-Computers.csv')