In [1]:
import pandas as pd
import os
import glob

# Root folder that holds one subfolder per building, each containing that building's CSV files
main_path = r'C:\ResearchFiles\AllFiles_Cleaned'

# List of buildings you want to include
selected_buildings = ['EBU3B']

# Collect the CSV files that sit directly inside each selected building's folder.
# The "*.csv" pattern does not descend into nested subfolders.
# Sorting makes the row order of the combined frame independent of the OS directory order.
all_files = []
for building in selected_buildings:
    building_path = os.path.join(main_path, building)
    all_files.extend(sorted(glob.glob(os.path.join(building_path, "*.csv"))))

# Fail early with a readable message: pd.concat on an empty list only reports
# "No objects to concatenate", which hides the real cause (wrong path or building name).
if not all_files:
    raise ValueError(f"No CSV files found for buildings {selected_buildings} under {main_path}")

# Read every file and tag its rows with the plug and building they came from
df_list = []
for filename in all_files:
    df = pd.read_csv(filename)
    # The smart plug identifier is the second '_'-separated token of the file name
    df['smart_plug'] = os.path.basename(filename).split('_')[1]
    # The building is the name of the folder that contains the file
    df['building'] = os.path.basename(os.path.dirname(filename))
    df_list.append(df)

# Combine all the dataframes into one, with a fresh 0..n-1 index
combined_df = pd.concat(df_list, ignore_index=True)

In [2]:
# Drop unnecessary columns.
# errors='ignore' keeps this cell re-runnable: combined_df is reassigned here, so on a
# second run the columns are already gone and a strict drop would raise KeyError.
combined_df = combined_df.drop(
    columns=['analogInput_2', 'analogInput_4', 'analogInput_5', 'binaryInput_3', 'binaryValue_1', 'building'],
    errors='ignore',
)

# Convert timestamp column to datetime
combined_df['time'] = pd.to_datetime(combined_df['time'])

# Create a new column holding the start of the 15 minute interval each reading falls in.
# '15min' is the same frequency as the older '15T' alias, which recent pandas versions deprecate.
combined_df['15min'] = combined_df['time'].dt.floor('15min')

In [3]:
# Average the 'analogInput_3' readings of each smart plug within every 15 minute interval,
# then expose the result under the clearer name 'average_plug_usage'
min15_df = (
    combined_df
    .groupby(['smart_plug', '15min'])['analogInput_3']
    .mean()
    .reset_index()
    .rename(columns={'analogInput_3': 'average_plug_usage'})
)

In [4]:
# Display the per-plug 15 minute averages (columns: smart_plug, 15min, average_plug_usage)
min15_df

Unnamed: 0,smart_plug,15min,average_plug_usage
0,180408,2023-09-14 18:15:00,1862.500000
1,180408,2023-09-14 18:30:00,2636.333333
2,180408,2023-09-14 18:45:00,12821.600000
3,180408,2023-09-14 19:00:00,1901.133333
4,180408,2023-09-14 19:15:00,1866.800000
...,...,...,...
1726493,944300,2024-06-10 22:45:00,0.000000
1726494,944300,2024-06-10 23:00:00,0.000000
1726495,944300,2024-06-10 23:15:00,0.000000
1726496,944300,2024-06-10 23:30:00,0.000000


In [6]:
# Helper spreadsheet listing, for each smart plug, its floor and its manual inspection status
plug_info_path = r'C:\Users\vaugh\Downloads\helper_spreadsheet(5).csv'
plug_info = pd.read_csv(plug_info_path)

In [7]:
# Display the helper data (columns: smart_plug, plug_floor, Inspection)
plug_info

Unnamed: 0,smart_plug,plug_floor,Inspection
0,182288,1,checked
1,286268,1,checked
2,286456,1,checked
3,186316,1,missing
4,291880,1,checked
...,...,...,...
76,283884,4,checked
77,288480,4,checked
78,291792,4,missing
79,565612,4,checked


In [12]:
# The merge key must have the same dtype on both sides, so cast both plug id columns to strings
min15_df['smart_plug'] = min15_df['smart_plug'].astype(str)
plug_info['smart_plug'] = plug_info['smart_plug'].astype(str)

# Plug ids that are removed from the analysis regardless of their inspection status
excluded_plugs = [
    '299184', '183436', '186204', '634584', '284068', '298808',
    '180552', '944300', '186212', '291792', '283892', '288848',
    '565996', '283884', '287196', '654200', '286460', '182136',
    '297624', '181084', '291824', '284416', '293688', '291080',
    '290240', '297984', '289840', '182552', '291728', '180672',
    '183428', '284556', '285120', '291076', '291740', '183232',
    '284372', '180940', '182268', '182584', '292032', '294576',
    '296488', '298776', '299852', '639140', '651712', '652512',
    '301192', '562240', '566540', '780512',
]

# Attach floor and inspection status to every 15 minute reading.
# This is pd.merge's default inner join, so plugs absent from plug_info are dropped.
merged_15min = pd.merge(min15_df, plug_info, on='smart_plug')

# Keep only plugs whose inspection status is 'checked' and that are not on the exclusion list
is_checked = merged_15min['Inspection'] == 'checked'
is_excluded = merged_15min['smart_plug'].isin(excluded_plugs)
merged_15min = merged_15min[is_checked & ~is_excluded]

# Give the 15 minute interval column the more accurate name 'Timestamp'
merged_15min = merged_15min.rename(columns={'15min': 'Timestamp'})

In [13]:
# Display the filtered readings joined with each plug's floor and inspection status
merged_15min

Unnamed: 0,smart_plug,Timestamp,average_plug_usage,plug_floor,Inspection
0,180408,2023-09-14 18:15:00,1862.500000,2,checked
1,180408,2023-09-14 18:30:00,2636.333333,2,checked
2,180408,2023-09-14 18:45:00,12821.600000,2,checked
3,180408,2023-09-14 19:00:00,1901.133333,2,checked
4,180408,2023-09-14 19:15:00,1866.800000,2,checked
...,...,...,...,...,...
1705203,782264,2024-06-10 22:45:00,81602.066667,4,checked
1705204,782264,2024-06-10 23:00:00,8035.400000,4,checked
1705205,782264,2024-06-10 23:15:00,8121.200000,4,checked
1705206,782264,2024-06-10 23:30:00,61143.866667,4,checked


In [15]:
# For every floor and timestamp, add up the 15 minute averages of all plugs on that floor.
# The summed column keeps the name 'average_plug_usage'.
total_floor_load = (
    merged_15min
    .groupby(['plug_floor', 'Timestamp'])['average_plug_usage']
    .sum()
    .reset_index()
)

In [16]:
# Display the summed plug load per floor and timestamp
total_floor_load

Unnamed: 0,plug_floor,Timestamp,average_plug_usage
0,1,2023-09-14 18:15:00,115509.581633
1,1,2023-09-14 18:30:00,110912.600000
2,1,2023-09-14 18:45:00,115569.800000
3,1,2023-09-14 19:00:00,114158.933333
4,1,2023-09-14 19:15:00,113398.133333
...,...,...,...
90121,4,2024-06-10 22:45:00,285569.466667
90122,4,2024-06-10 23:00:00,269492.266667
90123,4,2024-06-10 23:15:00,271619.400000
90124,4,2024-06-10 23:30:00,383998.066667


In [17]:
# Load the EBU3B submetering data
submetering_path = r'C:\Users\vaugh\Downloads\helper_spreadsheet(6).csv'
ebu3b_submetering = pd.read_csv(submetering_path)

# Parse the 'Timestamp' column into datetimes so it can be matched against the plug load timestamps
ebu3b_submetering = ebu3b_submetering.assign(
    Timestamp=pd.to_datetime(ebu3b_submetering['Timestamp'])
)

In [23]:
# Join the per-floor plug load with the submetering readings on their shared 'Timestamp'
# (default inner join), then use 'Timestamp' as the index for better readability
all_merged = (
    total_floor_load
    .merge(ebu3b_submetering, on='Timestamp')
    .set_index('Timestamp')
)

In [24]:
# Display the merged plug load / submetering frame, indexed by Timestamp
all_merged

Unnamed: 0_level_0,plug_floor,average_plug_usage,lights_1,lights_2,lights_3,lights_4
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-09-14 18:15:00,1,115509.581633,8.946934,2.344829,5.794400,4.893859
2023-09-14 18:15:00,2,122467.596939,8.946934,2.344829,5.794400,4.893859
2023-09-14 18:15:00,3,13687.142857,8.946934,2.344829,5.794400,4.893859
2023-09-14 18:15:00,4,413393.768622,8.946934,2.344829,5.794400,4.893859
2023-09-14 18:30:00,1,110912.600000,8.904392,2.237408,5.617190,4.782036
...,...,...,...,...,...,...
2024-06-10 23:30:00,4,383998.066667,7.439948,3.264275,5.114317,0.788109
2024-06-10 23:45:00,1,160159.600000,7.352859,3.259342,5.539901,0.804296
2024-06-10 23:45:00,2,130514.533333,7.352859,3.259342,5.539901,0.804296
2024-06-10 23:45:00,3,141740.933333,7.352859,3.259342,5.539901,0.804296


In [25]:
# Lighting submeter column that belongs to each floor of EBU3B
light_columns = {1: 'lights_1', 2: 'lights_2', 3: 'lights_3', 4: 'lights_4'}


def _floor_frame(floor_number):
    """Return the rows of all_merged for one floor, without the other floors' lights columns."""
    other_lights = [column for floor, column in light_columns.items() if floor != floor_number]
    on_this_floor = all_merged['plug_floor'] == floor_number
    return all_merged[on_this_floor].drop(columns=other_lights)


# One dataframe per floor, each keeping only that floor's own lights column
first_floor = _floor_frame(1)
second_floor = _floor_frame(2)
third_floor = _floor_frame(3)
fourth_floor = _floor_frame(4)

In [26]:
# Display the first floor frame (plug_floor, average_plug_usage and lights_1 only)
first_floor

Unnamed: 0_level_0,plug_floor,average_plug_usage,lights_1
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-09-14 18:15:00,1,115509.581633,8.946934
2023-09-14 18:30:00,1,110912.600000,8.904392
2023-09-14 18:45:00,1,115569.800000,8.873688
2023-09-14 19:00:00,1,114158.933333,8.839038
2023-09-14 19:15:00,1,113398.133333,8.852958
...,...,...,...
2024-06-10 22:45:00,1,267746.000000,7.390546
2024-06-10 23:00:00,1,291958.333333,7.299205
2024-06-10 23:15:00,1,132128.000000,7.396194
2024-06-10 23:30:00,1,164812.466667,7.439948


In [28]:
# First floor: Pearson correlation between the summed plug load and the lights submeter
plug_load_1 = first_floor['average_plug_usage']
lights_load_1 = first_floor['lights_1']
floor1_correlation = plug_load_1.corr(lights_load_1, method='pearson')

print(f"The Pearson correlation coefficient between the average plug load and the lights submetering on first floor is: {floor1_correlation}")

The Pearson correlation coefficient between the average plug load and the lights submetering on first floor is: 0.04877739575235005


In [29]:
# Second floor: Pearson correlation between the summed plug load and the lights submeter
plug_load_2 = second_floor['average_plug_usage']
lights_load_2 = second_floor['lights_2']
floor2_correlation = plug_load_2.corr(lights_load_2, method='pearson')

print(f"The Pearson correlation coefficient between the average plug load and the lights submetering on second floor is: {floor2_correlation}")

The Pearson correlation coefficient between the average plug load and the lights submetering on second floor is: 0.1353938210052898


In [30]:
# Third floor: Pearson correlation between the summed plug load and the lights submeter
plug_load_3 = third_floor['average_plug_usage']
lights_load_3 = third_floor['lights_3']
floor3_correlation = plug_load_3.corr(lights_load_3, method='pearson')

print(f"The Pearson correlation coefficient between the average plug load and the lights submetering on third floor is: {floor3_correlation}")

The Pearson correlation coefficient between the average plug load and the lights submetering on third floor is: 0.03891315944881746


In [31]:
# Fourth floor: Pearson correlation between the summed plug load and the lights submeter
plug_load_4 = fourth_floor['average_plug_usage']
lights_load_4 = fourth_floor['lights_4']
floor4_correlation = plug_load_4.corr(lights_load_4, method='pearson')

print(f"The Pearson correlation coefficient between the average plug load and the lights submetering on fourth floor is: {floor4_correlation}")

The Pearson correlation coefficient between the average plug load and the lights submetering on fourth floor is: 0.11596153814313159


In [32]:
# Write the merged plug load / lights dataframe (all floors) to a csv file in the
# local GitHub repository folder for future reference
output_csv_path = r'C:\Users\vaugh\Desktop\smart-plug-research\PlugLoad-Lights-by-floor.csv'
all_merged.to_csv(output_csv_path)