In [1]:
import pandas as pd
import os
import glob

# Root folder that holds one subfolder of cleaned CSV exports per building
main_path = r'C:\ResearchFiles\AllFiles_Cleaned'

# List of buildings to include
selected_buildings = ['Atkinson_Hall', 'EBU3B']

# Collect the CSV files that sit directly inside each selected building's
# folder (the "*.csv" pattern does not descend into nested folders).
# glob returns matches in arbitrary order, so each building's files are
# sorted to make the row order of the combined frame reproducible.
all_files = []
for building in selected_buildings:
    building_path = os.path.join(main_path, building)
    all_files.extend(sorted(glob.glob(os.path.join(building_path, "*.csv"))))

# Fail early with an actionable message instead of the opaque
# "No objects to concatenate" error that pd.concat raises on an empty list.
if not all_files:
    raise FileNotFoundError(
        f"No CSV files found for buildings {selected_buildings} under {main_path!r}"
    )

# Read every file and tag its rows with where they came from
df_list = []
for filename in all_files:
    df = pd.read_csv(filename)
    # The plug identifier is the second underscore-separated token of the
    # file name; assumes names look like <prefix>_<plug id>_... -- TODO confirm
    df['smart_plug'] = os.path.basename(filename).split('_')[1]
    # The building is the name of the folder that contains the file
    df['building'] = os.path.basename(os.path.dirname(filename))
    df_list.append(df)

# Combine all the dataframes into one
combined_df = pd.concat(df_list, ignore_index=True)

In [2]:
# Discard the sensor channels and the building tag that the later cells
# do not reference.
# Note: this cell is not re-runnable on its own; a second run raises
# KeyError because the columns are already gone.
columns_to_drop = [
    'analogInput_2',
    'analogInput_4',
    'analogInput_5',
    'binaryInput_3',
    'binaryValue_1',
    'building',
]
combined_df = combined_df.drop(columns=columns_to_drop)

In [3]:
# Preview the combined readings after the column drop above
combined_df

Unnamed: 0,time,analogInput_3,smart_plug
0,2023-09-14T18:16,30392.714286,180940
1,2023-09-14T18:17,31416.000000,180940
2,2023-09-14T18:18,30044.000000,180940
3,2023-09-14T18:19,30740.000000,180940
4,2023-09-14T18:20,29952.000000,180940
...,...,...,...
36967007,2024-06-10T23:55,0.000000,944300
36967008,2024-06-10T23:56,0.000000,944300
36967009,2024-06-10T23:57,0.000000,944300
36967010,2024-06-10T23:58,0.000000,944300


In [4]:
# Parse the 'time' column into pandas datetimes so the .dt accessor
# can be used on it afterwards
parsed_time = pd.to_datetime(combined_df['time'])
combined_df['time'] = parsed_time

In [5]:
# Bucket each reading into the 15-minute window it falls in by flooring the
# timestamp to the window start. The 'min' alias is used because the 'T'
# alias for minutes is deprecated in recent pandas releases.
combined_df['15min'] = combined_df['time'].dt.floor('15min')

In [6]:
# Preview the frame with the new '15min' window column
combined_df

Unnamed: 0,time,analogInput_3,smart_plug,15min
0,2023-09-14 18:16:00,30392.714286,180940,2023-09-14 18:15:00
1,2023-09-14 18:17:00,31416.000000,180940,2023-09-14 18:15:00
2,2023-09-14 18:18:00,30044.000000,180940,2023-09-14 18:15:00
3,2023-09-14 18:19:00,30740.000000,180940,2023-09-14 18:15:00
4,2023-09-14 18:20:00,29952.000000,180940,2023-09-14 18:15:00
...,...,...,...,...
36967007,2024-06-10 23:55:00,0.000000,944300,2024-06-10 23:45:00
36967008,2024-06-10 23:56:00,0.000000,944300,2024-06-10 23:45:00
36967009,2024-06-10 23:57:00,0.000000,944300,2024-06-10 23:45:00
36967010,2024-06-10 23:58:00,0.000000,944300,2024-06-10 23:45:00


In [7]:
# Total the analogInput_3 readings for every (smart_plug, 15-minute window)
# pair. The result column keeps the name 'average_power_usage' because the
# later cells reference it, although the value is the SUM of the readings
# that fall in the window, not their mean.
# NOTE(review): windows with missing readings therefore get a smaller total;
# confirm a sum (rather than a mean) is what the analysis intends.
min15_df = (
    combined_df
    .groupby(['smart_plug', '15min'])
    .agg({'analogInput_3': 'sum'})
    .reset_index()
    .rename(columns={'analogInput_3': 'average_power_usage'})
)

In [8]:
# Preview the per-plug 15-minute aggregates
min15_df

Unnamed: 0,smart_plug,15min,average_power_usage
0,180408,2023-09-14 18:15:00,26075.0
1,180408,2023-09-14 18:30:00,39545.0
2,180408,2023-09-14 18:45:00,192324.0
3,180408,2023-09-14 19:00:00,28517.0
4,180408,2023-09-14 19:15:00,28002.0
...,...,...,...
2480137,944300,2024-06-10 22:45:00,0.0
2480138,944300,2024-06-10 23:00:00,0.0
2480139,944300,2024-06-10 23:15:00,0.0
2480140,944300,2024-06-10 23:30:00,0.0


In [9]:
# Load the plug metadata table; later cells use its smart_plug,
# building_name, Load_Type and Inspection columns.
plug_info_file = 'helper_spreadsheet(2).csv'
plug_info = pd.read_csv(plug_info_file)

In [10]:
# Preview the plug metadata table
plug_info

Unnamed: 0,smart_plug,building_name,Load_Type,Inspection
0,291824,Atkinson Hall,Computer,disconnected
1,291956,Atkinson Hall,Printer,checked
2,292032,Atkinson Hall,Printer,checked
3,183436,Atkinson Hall,Water Dispenser,disconnected
4,181084,Atkinson Hall,TV,checked
...,...,...,...,...
112,283884,EBU3B,Printer,checked
113,288480,EBU3B,Printer,checked
114,291792,EBU3B,Printer,missing
115,565612,EBU3B,Printer,checked


In [11]:
# Cast the join key to string on both sides so the merge compares like with like
min15_df['smart_plug'] = min15_df['smart_plug'].astype(str)
plug_info['smart_plug'] = plug_info['smart_plug'].astype(str)

# Plug ids that are left out of the analysis.
# NOTE(review): the reason for excluding these ids is not recorded here -- document it.
excluded_plugs = [
    '299184', '183436', '186204', '634584', '284068', '298808',
    '180552', '944300', '186212', '291792', '283892', '288848',
    '565996', '283884', '287196', '654200', '286460', '182136',
    '297624', '181084', '291824', '284416', '293688', '291080',
    '290240', '297984', '289840', '182552', '291728', '180672',
    '183428', '284556', '285120', '291076', '291740', '183232',
    '284372', '180940', '182268', '182584', '292032', '294576',
    '296488', '298776', '299852', '639140', '651712', '652512',
    '301192', '562240', '566540', '780512',
]

# Attach the plug metadata to every aggregated row (default inner join, so
# plugs missing from either table drop out)
merged_15min = pd.merge(min15_df, plug_info, on='smart_plug')

# Keep rows for plugs marked 'checked' that are not on the exclusion list
is_checked = merged_15min['Inspection'] == 'checked'
is_excluded = merged_15min['smart_plug'].isin(excluded_plugs)
merged_15min = merged_15min[is_checked & ~is_excluded]

In [12]:
# Narrow the analysis to rows whose load type is 'Air Purifier'
is_air_purifier = merged_15min['Load_Type'] == 'Air Purifier'
merged_15min = merged_15min[is_air_purifier]

In [13]:
# Name the window column 'Timestamp' so it matches the key column of the
# building metering table used in the later merge
timestamp_rename = {'15min': 'Timestamp'}
merged_15min = merged_15min.rename(columns=timestamp_rename)

In [14]:
# Preview the filtered air-purifier rows
merged_15min

Unnamed: 0,smart_plug,Timestamp,average_power_usage,building_name,Load_Type,Inspection
1060496,286456,2023-09-14 18:15:00,669143.571429,EBU3B,Air Purifier,checked
1060497,286456,2023-09-14 18:30:00,716445.000000,EBU3B,Air Purifier,checked
1060498,286456,2023-09-14 18:45:00,712120.000000,EBU3B,Air Purifier,checked
1060499,286456,2023-09-14 19:00:00,715228.000000,EBU3B,Air Purifier,checked
1060500,286456,2023-09-14 19:15:00,711481.000000,EBU3B,Air Purifier,checked
...,...,...,...,...,...,...
1083022,286456,2024-06-10 22:45:00,0.000000,EBU3B,Air Purifier,checked
1083023,286456,2024-06-10 23:00:00,0.000000,EBU3B,Air Purifier,checked
1083024,286456,2024-06-10 23:15:00,0.000000,EBU3B,Air Purifier,checked
1083025,286456,2024-06-10 23:30:00,0.000000,EBU3B,Air Purifier,checked


In [15]:
# Add up the plug-level values of all remaining plugs per building and
# timestamp. Despite its name, this frame holds summed PLUG load per
# building, not the metered building load.
per_building_window = merged_15min.groupby(['building_name', 'Timestamp'])
total_building_load = per_building_window.agg({'average_power_usage': 'sum'}).reset_index()

In [16]:
# Preview the plug load summed per building and timestamp
total_building_load

Unnamed: 0,building_name,Timestamp,average_power_usage
0,EBU3B,2023-09-14 18:15:00,669143.571429
1,EBU3B,2023-09-14 18:30:00,716445.000000
2,EBU3B,2023-09-14 18:45:00,712120.000000
3,EBU3B,2023-09-14 19:00:00,715228.000000
4,EBU3B,2023-09-14 19:15:00,711481.000000
...,...,...,...
22526,EBU3B,2024-06-10 22:45:00,0.000000
22527,EBU3B,2024-06-10 23:00:00,0.000000
22528,EBU3B,2024-06-10 23:15:00,0.000000
22529,EBU3B,2024-06-10 23:30:00,0.000000


In [17]:
# Load the EBU3B submetering table.
# NOTE(review): this frame is displayed but not referenced by the later
# cells visible here -- confirm it is still needed.
ebu3b_submetering_file = 'helper_spreadsheet(4).csv'
ebu3b_submetering = pd.read_csv(ebu3b_submetering_file)

In [18]:
# Preview the EBU3B submetering table
ebu3b_submetering

Unnamed: 0,Timestamp,Elevator,Lights 1st Floor,Lights 2nd Floor,Lights 3rd Floor,Lights 4th Floor,Total Lights,WARREN.EBU3B_1st_Floor_E2545#Real Power Mean#kW,WARREN.EBU3B_1st_Floor_E2546#Real Power Mean#kW,WARREN.EBU3B_1st_Floor_E2548#Real Power Mean#kW,WARREN.EBU3B_E2544#Real Power Mean#kW,WARREN.EBU3B_Panel_M_E2526#Real Power Mean#kW,Total Servers
0,2023-01-01 0:15:00,0.629064,3.595069,0.305746,3.992644,,7.893459,11.413555,25.305758,7.538886,36.427650,21.350590,102.036439
1,2023-01-01 0:30:00,0.658419,3.596011,0.305836,3.994483,,7.896329,11.610726,25.186268,7.627571,36.456684,21.353943,102.235192
2,2023-01-01 0:45:00,0.504297,3.622392,0.306269,3.995989,,7.924650,11.615798,25.464224,7.578872,36.442036,21.381977,102.482906
3,2023-01-01 1:00:00,5.773901,3.837311,0.307278,3.996098,,8.140687,11.529154,25.393629,7.614870,36.355682,21.401571,102.294906
4,2023-01-01 1:15:00,5.650762,3.613120,0.306951,3.998813,,7.918884,11.446468,25.374859,7.581429,36.251667,21.331484,101.985907
...,...,...,...,...,...,...,...,...,...,...,...,...,...
54521,2024-07-21 23:30:00,0.838737,7.310753,2.332439,1.529642,0.677863,11.850697,13.863780,33.737057,5.733397,20.453098,20.648443,94.435776
54522,2024-07-21 23:45:00,4.860091,7.273276,2.322540,1.533592,0.681411,11.810819,13.865835,33.922832,5.681652,20.426142,20.625990,94.522451
54523,2024-07-22 0:00:00,6.950442,7.206880,2.304355,1.536416,0.677346,11.724997,13.773886,34.070251,5.653042,20.459484,20.563271,94.519934
54524,,,,,,,0.000000,,,,,,0.000000


In [19]:
# Load the building-level meter readings that the plug load is compared against
building_metering_file = 'building_metering.csv'
building_metering = pd.read_csv(building_metering_file)

In [20]:
# Drop the individual meter channels; the later cells only use 'Timestamp'
# and the two per-building total columns.
# Note: re-running this cell on its own raises KeyError because the columns
# are already gone.
meter_columns_to_drop = [
    'WARREN.CAL_IT_E2531#Real Power Mean#kW',
    'WARREN.CAL_IT_E2532#Real Power Mean#kW',
    'WARREN.CAL_IT_Gate_E2530#Real Power Mean#kW',
    'WARREN.EBU3B_E2520#Real Power Mean#kW',
    'WARREN.EBU3B_E2521#Real Power Mean#kW',
]
building_metering = building_metering.drop(columns=meter_columns_to_drop)

In [21]:
# Preview the building metering table after dropping the individual meter columns
building_metering

Unnamed: 0,Timestamp,EBU3B total,Atkinson total
0,2023-01-01 0:15:00,312.613174,632.773163
1,2023-01-01 0:30:00,313.511780,630.595501
2,2023-01-01 0:45:00,317.042526,631.453430
3,2023-01-01 1:00:00,318.071701,627.231457
4,2023-01-01 1:15:00,316.035248,634.164612
...,...,...,...
54521,2024-07-21 23:30:00,397.910767,670.800072
54522,2024-07-21 23:45:00,396.301117,667.563164
54523,2024-07-22 0:00:00,394.709076,666.877243
54524,,0.000000,0.000000


In [22]:
# Parse 'Timestamp' into pandas datetimes so it can be joined against the
# datetime 'Timestamp' column of the plug-load frame
metering_timestamps = pd.to_datetime(building_metering['Timestamp'])
building_metering['Timestamp'] = metering_timestamps

In [23]:
# total_building_load is grouped by 'Timestamp' already (the '15min' column
# was renamed on merged_15min before the groupby), so this rename normally
# matches nothing. It is kept for backward compatibility in case the frame
# still carries a '15min' column.
total_building_load = total_building_load.rename(columns={'15min': 'Timestamp'})

# Make the requirement of the upcoming merge explicit instead of relying on
# a rename that silently does nothing.
assert 'Timestamp' in total_building_load.columns, (
    "total_building_load must have a 'Timestamp' column before merging with building_metering"
)

In [24]:
# Preview the per-building plug totals again before merging with the meter data
total_building_load

Unnamed: 0,building_name,Timestamp,average_power_usage
0,EBU3B,2023-09-14 18:15:00,669143.571429
1,EBU3B,2023-09-14 18:30:00,716445.000000
2,EBU3B,2023-09-14 18:45:00,712120.000000
3,EBU3B,2023-09-14 19:00:00,715228.000000
4,EBU3B,2023-09-14 19:15:00,711481.000000
...,...,...,...
22526,EBU3B,2024-06-10 22:45:00,0.000000
22527,EBU3B,2024-06-10 23:00:00,0.000000
22528,EBU3B,2024-06-10 23:15:00,0.000000
22529,EBU3B,2024-06-10 23:30:00,0.000000


In [25]:
# Line up plug totals with the building meter readings taken at the same
# timestamp (default inner join: timestamps missing on either side drop out)
both_merged_15min = total_building_load.merge(building_metering, on='Timestamp')

In [26]:
# Scale the plug totals down by a factor of one million.
# NOTE(review): presumably a unit conversion to bring the plug readings in
# line with the meter columns -- confirm the source units.
# Caution: re-running this cell divides the column again.
plug_totals = both_merged_15min['average_power_usage']
both_merged_15min['average_power_usage'] = plug_totals / 1_000_000

In [27]:
# Preview the merged frame after scaling average_power_usage by 1/1,000,000
both_merged_15min

Unnamed: 0,building_name,Timestamp,average_power_usage,EBU3B total,Atkinson total
0,EBU3B,2023-09-14 18:15:00,0.669144,311.203552,705.908989
1,EBU3B,2023-09-14 18:30:00,0.716445,303.462402,703.535606
2,EBU3B,2023-09-14 18:45:00,0.712120,303.112823,703.993347
3,EBU3B,2023-09-14 19:00:00,0.715228,307.819702,697.937096
4,EBU3B,2023-09-14 19:15:00,0.711481,304.548492,700.763321
...,...,...,...,...,...
22526,EBU3B,2024-06-10 22:45:00,0.000000,372.360016,734.441208
22527,EBU3B,2024-06-10 23:00:00,0.000000,381.564133,699.659142
22528,EBU3B,2024-06-10 23:15:00,0.000000,385.190674,696.803253
22529,EBU3B,2024-06-10 23:30:00,0.000000,381.697632,693.240364


In [28]:
# Divide the plug totals by 4.
# NOTE(review): presumably turns a 15-minute figure into a per-interval
# quantity (x 0.25 h); average_power_usage is a per-window SUM of readings,
# so confirm this factor yields the intended unit.
# Caution: re-running this cell divides the column again.
plug_totals = both_merged_15min['average_power_usage']
both_merged_15min['average_power_usage'] = plug_totals / 4

In [29]:
# Divide the EBU3B meter total by 4, mirroring the scaling applied to the
# plug totals.
# NOTE(review): presumably converts a 15-minute mean power into energy per
# interval -- confirm.
# Caution: re-running this cell divides the column again.
ebu3b_total = both_merged_15min['EBU3B total']
both_merged_15min['EBU3B total'] = ebu3b_total / 4

In [30]:
# Divide the Atkinson meter total by 4, mirroring the scaling applied to the
# plug totals.
# NOTE(review): presumably converts a 15-minute mean power into energy per
# interval -- confirm.
# Caution: re-running this cell divides the column again.
atkinson_total = both_merged_15min['Atkinson total']
both_merged_15min['Atkinson total'] = atkinson_total / 4

In [31]:
# Preview the merged frame after dividing all three load columns by 4
both_merged_15min

Unnamed: 0,building_name,Timestamp,average_power_usage,EBU3B total,Atkinson total
0,EBU3B,2023-09-14 18:15:00,0.167286,77.800888,176.477247
1,EBU3B,2023-09-14 18:30:00,0.179111,75.865601,175.883902
2,EBU3B,2023-09-14 18:45:00,0.178030,75.778206,175.998337
3,EBU3B,2023-09-14 19:00:00,0.178807,76.954926,174.484274
4,EBU3B,2023-09-14 19:15:00,0.177870,76.137123,175.190830
...,...,...,...,...,...
22526,EBU3B,2024-06-10 22:45:00,0.000000,93.090004,183.610302
22527,EBU3B,2024-06-10 23:00:00,0.000000,95.391033,174.914785
22528,EBU3B,2024-06-10 23:15:00,0.000000,96.297668,174.200813
22529,EBU3B,2024-06-10 23:30:00,0.000000,95.424408,173.310091


In [32]:
# Pick, for every row, the metered total that belongs to the row's building:
# 'Atkinson total' for 'Atkinson Hall' rows and 'EBU3B total' for all others.
# Series.where is vectorised and, unlike a row-wise apply(axis=1), also
# produces a plain column when the frame has no rows.
is_atkinson = both_merged_15min['building_name'] == 'Atkinson Hall'
both_merged_15min['building_load'] = both_merged_15min['Atkinson total'].where(
    is_atkinson, both_merged_15min['EBU3B total']
)

# The two per-building total columns are now redundant
both_merged_15min = both_merged_15min.drop(columns=['EBU3B total', 'Atkinson total'])

In [33]:
# Use the timestamp as the row index.
# Note: not re-runnable on its own, since 'Timestamp' stops being a column.
index_column = 'Timestamp'
both_merged_15min = both_merged_15min.set_index(index_column)

In [34]:
# Give the two load columns the names used by the correlation cells
load_column_names = {
    'average_power_usage': 'total_plug_load',
    'building_load': 'total_building_load',
}
both_merged_15min = both_merged_15min.rename(columns=load_column_names)

In [35]:
# Preview the final frame: Timestamp index, total_plug_load, total_building_load
both_merged_15min

Unnamed: 0_level_0,building_name,total_plug_load,total_building_load
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-09-14 18:15:00,EBU3B,0.167286,77.800888
2023-09-14 18:30:00,EBU3B,0.179111,75.865601
2023-09-14 18:45:00,EBU3B,0.178030,75.778206
2023-09-14 19:00:00,EBU3B,0.178807,76.954926
2023-09-14 19:15:00,EBU3B,0.177870,76.137123
...,...,...,...
2024-06-10 22:45:00,EBU3B,0.000000,93.090004
2024-06-10 23:00:00,EBU3B,0.000000,95.391033
2024-06-10 23:15:00,EBU3B,0.000000,96.297668
2024-06-10 23:30:00,EBU3B,0.000000,95.424408


In [36]:
# Pearson correlation (the default method of Series.corr) between plug load
# and building load over every row of both_merged_15min, regardless of building
plug_load = both_merged_15min['total_plug_load']
building_load = both_merged_15min['total_building_load']
correlation = plug_load.corr(building_load)

print(f"The Pearson correlation coefficient between total_plug_load and total_building_load is: {correlation}")

The Pearson correlation coefficient between total_plug_load and total_building_load is: -0.03877615691521168


In [37]:
# Rows that belong to Atkinson Hall
is_atkinson_row = both_merged_15min['building_name'] == 'Atkinson Hall'
atkinson_merged_15min = both_merged_15min[is_atkinson_row]

In [38]:
# Rows that belong to EBU3B
is_ebu3b_row = both_merged_15min['building_name'] == 'EBU3B'
ebu3b_merged_15min = both_merged_15min[is_ebu3b_row]

In [39]:
# Pearson correlation between plug load and building load for Atkinson Hall.
# NOTE(review): the recorded result is nan, which is what Series.corr gives
# when there are no rows to correlate -- confirm that any Atkinson Hall
# plugs survive the upstream inspection / exclusion / load-type filters.
atkinson_plug_load = atkinson_merged_15min['total_plug_load']
atkinson_building_load = atkinson_merged_15min['total_building_load']
atkinson_correlation = atkinson_plug_load.corr(atkinson_building_load)

print(f"The Pearson correlation coefficient between total_plug_load and total_building_load in Atkinson Hall is: {atkinson_correlation}")

The Pearson correlation coefficient between total_plug_load and total_building_load in Atkinson Hall is: nan


In [40]:
# Pearson correlation between plug load and building load for EBU3B
ebu3b_plug_load = ebu3b_merged_15min['total_plug_load']
ebu3b_building_load = ebu3b_merged_15min['total_building_load']
ebu3b_correlation = ebu3b_plug_load.corr(ebu3b_building_load)

print(f"The Pearson correlation coefficient between total_plug_load and total_building_load in EBU3B is: {ebu3b_correlation}")

The Pearson correlation coefficient between total_plug_load and total_building_load in EBU3B is: -0.03877615691521168
