In [2]:
import pandas as pd
import os
import glob

# Root folder that holds one subfolder of cleaned CSV exports per building
main_path = r'C:\ResearchFiles\AllFiles_Cleaned'

# Buildings to include in the analysis
selected_buildings = ['EBU3B']

# Collect the CSV files that sit directly inside each selected building's folder.
# NOTE: the glob pattern below is NOT recursive, so nested subfolders are not searched.
all_files = []
for building in selected_buildings:
    building_path = os.path.join(main_path, building)
    # Sort so the concatenation order does not depend on filesystem listing order
    all_files.extend(sorted(glob.glob(os.path.join(building_path, "*.csv"))))

# Fail early with a readable message instead of pd.concat's
# "No objects to concatenate" when the path or the building list is wrong
if not all_files:
    raise FileNotFoundError(
        f"No CSV files found for buildings {selected_buildings} under {main_path!r}"
    )

# Read every file and tag its rows with where they came from
df_list = []
for filename in all_files:
    df = pd.read_csv(filename)
    # The plug identifier is the second underscore-separated token of the file name
    df['smart_plug'] = os.path.basename(filename).split('_')[1]
    # The building is the name of the folder that contains the file
    df['building'] = os.path.basename(os.path.dirname(filename))
    df_list.append(df)

# Stack the per-file frames into one DataFrame with a fresh 0..n-1 index
combined_df = pd.concat(df_list, ignore_index=True)

# Optional: save the combined DataFrame to a CSV file
# combined_df.to_csv(r'C:\ResearchFiles\combined_data_selected_buildings.csv', index=False)

In [3]:
# Discard the input channels that no later cell reads, plus the 'building' tag
columns_to_discard = [
    'analogInput_2',
    'analogInput_4',
    'analogInput_5',
    'binaryInput_3',
    'binaryValue_1',
    'building',
]
combined_df = combined_df.drop(columns=columns_to_discard)

In [4]:
# Parse the raw 'time' values into pandas datetimes so the .dt accessor can be used on them
parsed_time = pd.to_datetime(combined_df['time'])
combined_df['time'] = parsed_time

In [5]:
# Bucket each reading into its 15-minute interval (timestamp floored to the interval start).
# The '15min' alias is used because the older '15T' spelling is deprecated in recent pandas.
combined_df['15min'] = combined_df['time'].dt.floor('15min')

In [6]:
# Group by 'smart_plug' and the 15-minute bucket, then sum the analogInput_3 readings.
# The result column must be called 'average_power_usage': the later floor-level
# aggregation looks it up under exactly that name.
# NOTE(review): despite the word "average", the values are SUMS of the readings that
# fall in each interval -- confirm whether a mean was intended.
min15_df = (
    combined_df
    .groupby(['smart_plug', '15min'])
    .agg({'analogInput_3': 'sum'})
    .reset_index()
    .rename(columns={'analogInput_3': 'average_power_usage'})
)

In [8]:
# Preview the per-plug, per-15-minute aggregate
min15_df

Unnamed: 0,smart_plug,15min,average_power_usage
0,180408,2023-09-14 18:15:00,26075.0
1,180408,2023-09-14 18:30:00,39545.0
2,180408,2023-09-14 18:45:00,192324.0
3,180408,2023-09-14 19:00:00,28517.0
4,180408,2023-09-14 19:15:00,28002.0
...,...,...,...
1726493,944300,2024-06-10 22:45:00,0.0
1726494,944300,2024-06-10 23:00:00,0.0
1726495,944300,2024-06-10 23:15:00,0.0
1726496,944300,2024-06-10 23:30:00,0.0


In [16]:
# Load the plug helper sheet (the preview shows smart_plug, plug_floor and Inspection columns)
plug_info_csv = 'helper_spreadsheet(5).csv'
plug_info = pd.read_csv(plug_info_csv)

In [17]:
# Preview the plug helper sheet
plug_info

Unnamed: 0,smart_plug,plug_floor,Inspection
0,182288,1,checked
1,286268,1,checked
2,286456,1,checked
3,186316,1,missing
4,291880,1,checked
...,...,...,...
76,283884,4,checked
77,288480,4,checked
78,291792,4,missing
79,565612,4,checked


In [18]:
# Use strings for the join key on both sides so the merge compares like with like
min15_df['smart_plug'] = min15_df['smart_plug'].astype(str)
plug_info['smart_plug'] = plug_info['smart_plug'].astype(str)

# Plugs that are left out of the analysis even when their inspection status is 'checked'
excluded_plugs = [
    '299184', '183436', '186204', '634584', '284068', '298808',
    '180552', '944300', '186212', '291792', '283892', '288848',
    '565996', '283884', '287196', '654200', '286460', '182136',
    '297624', '181084', '291824', '284416', '293688', '291080',
    '290240', '297984', '289840', '182552', '291728', '180672',
    '183428', '284556', '285120', '291076', '291740', '183232',
    '284372', '180940', '182268', '182584', '292032', '294576',
    '296488', '298776', '299852', '639140', '651712', '652512',
    '301192', '562240', '566540', '780512',
]

# Attach floor and inspection info to every 15-minute row (default inner join on the plug id)
merged_15min = min15_df.merge(plug_info, on='smart_plug')

# Keep only rows of inspected plugs that are not on the exclusion list
is_checked = merged_15min['Inspection'] == 'checked'
is_excluded = merged_15min['smart_plug'].isin(excluded_plugs)
merged_15min = merged_15min[is_checked & ~is_excluded]

In [20]:
# Rename the bucket column to 'Timestamp', the key name used when joining the submetering data
merged_15min = merged_15min.rename({'15min': 'Timestamp'}, axis='columns')

In [21]:
# Preview the filtered per-plug rows with their floor and inspection status
merged_15min

Unnamed: 0,smart_plug,Timestamp,average_power_usage,plug_floor,Inspection
0,180408,2023-09-14 18:15:00,26075.0,2,checked
1,180408,2023-09-14 18:30:00,39545.0,2,checked
2,180408,2023-09-14 18:45:00,192324.0,2,checked
3,180408,2023-09-14 19:00:00,28517.0,2,checked
4,180408,2023-09-14 19:15:00,28002.0,2,checked
...,...,...,...,...,...
1705203,782264,2024-06-10 22:45:00,1224031.0,4,checked
1705204,782264,2024-06-10 23:00:00,120531.0,4,checked
1705205,782264,2024-06-10 23:15:00,121818.0,4,checked
1705206,782264,2024-06-10 23:30:00,917158.0,4,checked


In [30]:
# Total plug load per floor for every 15-minute timestamp (summed over that floor's plugs)
total_floor_load = (
    merged_15min
    .groupby(['plug_floor', 'Timestamp'])['average_power_usage']
    .sum()
    .reset_index()
)

In [32]:
# Preview the per-floor, per-timestamp totals
total_floor_load

Unnamed: 0,plug_floor,Timestamp,average_power_usage
0,1,2023-09-14 18:15:00,1.617134e+06
1,1,2023-09-14 18:30:00,1.663689e+06
2,1,2023-09-14 18:45:00,1.733547e+06
3,1,2023-09-14 19:00:00,1.712384e+06
4,1,2023-09-14 19:15:00,1.700972e+06
...,...,...,...
90121,4,2024-06-10 22:45:00,4.283542e+06
90122,4,2024-06-10 23:00:00,4.042384e+06
90123,4,2024-06-10 23:15:00,4.074291e+06
90124,4,2024-06-10 23:30:00,5.759971e+06


In [22]:
# Load the lighting submetering sheet (the preview shows lights_1..lights_4 and Timestamp columns)
submetering_csv = 'helper_spreadsheet(6).csv'
ebu3b_submetering = pd.read_csv(submetering_csv)

In [24]:
# Parse 'Timestamp' into pandas datetimes so it can be joined against the datetime plug-load key
parsed_timestamp = pd.to_datetime(ebu3b_submetering['Timestamp'])
ebu3b_submetering['Timestamp'] = parsed_timestamp

In [35]:
# Preview the lighting submetering readings
ebu3b_submetering

Unnamed: 0,lights_1,lights_2,lights_3,lights_4,Timestamp
0,3.595069,0.305746,3.992644,,2023-01-01 00:15:00
1,3.596011,0.305836,3.994483,,2023-01-01 00:30:00
2,3.622392,0.306269,3.995989,,2023-01-01 00:45:00
3,3.837311,0.307278,3.996098,,2023-01-01 01:00:00
4,3.613120,0.306951,3.998813,,2023-01-01 01:15:00
...,...,...,...,...,...
54519,7.406206,2.331642,1.522662,0.678084,2024-07-21 23:00:00
54520,7.370774,2.330738,1.526133,0.676441,2024-07-21 23:15:00
54521,7.310753,2.332439,1.529642,0.677863,2024-07-21 23:30:00
54522,7.273276,2.322540,1.533592,0.681411,2024-07-21 23:45:00


In [33]:
# Inner-join the floor totals with the lighting readings on the shared timestamp;
# each floor row receives all four lights_* columns for its interval
all_merged = total_floor_load.merge(ebu3b_submetering, how='inner', on='Timestamp')

In [34]:
# Preview the floor totals joined with the lighting readings
all_merged

Unnamed: 0,plug_floor,Timestamp,average_power_usage,lights_1,lights_2,lights_3,lights_4
0,1,2023-09-14 18:15:00,1.617134e+06,8.946934,2.344829,5.794400,4.893859
1,2,2023-09-14 18:15:00,1.714546e+06,8.946934,2.344829,5.794400,4.893859
2,3,2023-09-14 18:15:00,1.916200e+05,8.946934,2.344829,5.794400,4.893859
3,4,2023-09-14 18:15:00,5.787513e+06,8.946934,2.344829,5.794400,4.893859
4,1,2023-09-14 18:30:00,1.663689e+06,8.904392,2.237408,5.617190,4.782036
...,...,...,...,...,...,...,...
90121,4,2024-06-10 23:30:00,5.759971e+06,7.439948,3.264275,5.114317,0.788109
90122,1,2024-06-10 23:45:00,2.402394e+06,7.352859,3.259342,5.539901,0.804296
90123,2,2024-06-10 23:45:00,1.957718e+06,7.352859,3.259342,5.539901,0.804296
90124,3,2024-06-10 23:45:00,2.126114e+06,7.352859,3.259342,5.539901,0.804296


In [36]:
# Floor 1 rows only, keeping just that floor's lighting column (lights_1)
is_first_floor = all_merged['plug_floor'] == 1
first_floor = all_merged[is_first_floor].drop(columns=['lights_2', 'lights_3', 'lights_4'])

In [37]:
# Preview the floor-1 plug load next to the lights_1 readings
first_floor

Unnamed: 0,plug_floor,Timestamp,average_power_usage,lights_1
0,1,2023-09-14 18:15:00,1.617134e+06,8.946934
4,1,2023-09-14 18:30:00,1.663689e+06,8.904392
8,1,2023-09-14 18:45:00,1.733547e+06,8.873688
12,1,2023-09-14 19:00:00,1.712384e+06,8.839038
16,1,2023-09-14 19:15:00,1.700972e+06,8.852958
...,...,...,...,...
90106,1,2024-06-10 22:45:00,4.016190e+06,7.390546
90110,1,2024-06-10 23:00:00,4.379375e+06,7.299205
90114,1,2024-06-10 23:15:00,1.981920e+06,7.396194
90118,1,2024-06-10 23:30:00,2.472187e+06,7.439948


In [38]:
# Floor 2 rows only, keeping just that floor's lighting column (lights_2)
is_second_floor = all_merged['plug_floor'] == 2
second_floor = all_merged[is_second_floor].drop(columns=['lights_1', 'lights_3', 'lights_4'])

In [39]:
# Floor 3 rows only, keeping just that floor's lighting column (lights_3)
is_third_floor = all_merged['plug_floor'] == 3
third_floor = all_merged[is_third_floor].drop(columns=['lights_1', 'lights_2', 'lights_4'])

In [40]:
# Floor 4 rows only, keeping just that floor's lighting column (lights_4)
is_fourth_floor = all_merged['plug_floor'] == 4
fourth_floor = all_merged[is_fourth_floor].drop(columns=['lights_1', 'lights_2', 'lights_3'])

In [41]:
# Pearson correlation between the floor-1 plug load and the floor-1 lighting readings
floor1_plug_load = first_floor['average_power_usage']
floor1_lights = first_floor['lights_1']
floor1_correlation = floor1_plug_load.corr(floor1_lights, method='pearson')

print(f"The Pearson correlation coefficient between the average plug load and the lights submetering on first floor is: {floor1_correlation}")

The Pearson correlation coefficient between the average plug load and the lights submetering on first floor is: 0.04822566104792325


In [42]:
# Pearson correlation between the floor-2 plug load and the floor-2 lighting readings
floor2_plug_load = second_floor['average_power_usage']
floor2_lights = second_floor['lights_2']
floor2_correlation = floor2_plug_load.corr(floor2_lights, method='pearson')

print(f"The Pearson correlation coefficient between the average plug load and the lights submetering on second floor is: {floor2_correlation}")

The Pearson correlation coefficient between the average plug load and the lights submetering on second floor is: 0.13349100007551243


In [43]:
# Pearson correlation between the floor-3 plug load and the floor-3 lighting readings
floor3_plug_load = third_floor['average_power_usage']
floor3_lights = third_floor['lights_3']
floor3_correlation = floor3_plug_load.corr(floor3_lights, method='pearson')

print(f"The Pearson correlation coefficient between the average plug load and the lights submetering on third floor is: {floor3_correlation}")

The Pearson correlation coefficient between the average plug load and the lights submetering on third floor is: 0.038126330281296084


In [44]:
# Pearson correlation between the floor-4 plug load and the floor-4 lighting readings
floor4_plug_load = fourth_floor['average_power_usage']
floor4_lights = fourth_floor['lights_4']
floor4_correlation = floor4_plug_load.corr(floor4_lights, method='pearson')

print(f"The Pearson correlation coefficient between the average plug load and the lights submetering on fourth floor is: {floor4_correlation}")

The Pearson correlation coefficient between the average plug load and the lights submetering on fourth floor is: 0.11399619158391724
