In [1]:
#!pip install tqdm

In [2]:
import math
import pandas as pd
import matplotlib.pyplot as plt
import csv
import GPy
import numpy as np
import pickle

from datetime import datetime
from glob import glob
from scipy import stats

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from scipy.stats import norm, percentileofscore
from tqdm import tqdm





# Load cleaned dataset

In [3]:
cleaned_meters_path = "..\\data\\meters\\cleaned\\"

# files in directory
files = glob(cleaned_meters_path + "*.csv")


In [4]:
dfs = [] # empty list of the dataframes to create
for file in files: # for each file in directory
    meter_type = file.split("\\")[4].split(".")[0] # meter_type to rename the value feature
    meter = pd.read_csv(file) # load the dataset
    meter = pd.melt(meter, id_vars = "timestamp", var_name = "building_id", value_name = "meter_reading") # melt dataset
    meter["meter"] = str(meter_type) # adds column with the meter type
    dfs.append(meter) # append to list
complete_data_cleaned = pd.concat(dfs, axis=0, ignore_index=True) # concatenate all meter
del(dfs, meter, file, files, meter_type)

complete_data_cleaned.head()


Unnamed: 0,timestamp,building_id,meter_reading,meter
0,2016-01-01 00:00:00,Panther_office_Clementine,,chilledwater_cleaned
1,2016-01-01 01:00:00,Panther_office_Clementine,,chilledwater_cleaned
2,2016-01-01 02:00:00,Panther_office_Clementine,,chilledwater_cleaned
3,2016-01-01 03:00:00,Panther_office_Clementine,,chilledwater_cleaned
4,2016-01-01 04:00:00,Panther_office_Clementine,,chilledwater_cleaned


In [5]:
# Note this cell might take some time to finish

# Convert timestamp field from string into pd.datetime object
complete_data_cleaned['timestamp'] = pd.to_datetime(complete_data_cleaned['timestamp'])

# Add column indicating the year, month and dayOfTheWeek for that timestamp
complete_data_cleaned['date'] = complete_data_cleaned['timestamp'].dt.date
complete_data_cleaned['year'] = complete_data_cleaned['timestamp'].dt.year
complete_data_cleaned['month'] = complete_data_cleaned['timestamp'].dt.month
complete_data_cleaned['dayOfWeek'] = complete_data_cleaned['timestamp'].dt.dayofweek


# Load benchmarks

In [6]:
cleaned_meters_path = "..\\data\\"

# files in directory
files = glob(cleaned_meters_path + "*.csv")


benchmark = pd.read_csv(files[0]) # load the dataset

benchmark

Unnamed: 0,name,building_id,RMSE,MAE,horizon
0,Bear_utility_Sidney,utility,1.157131,0.846614,hourly
1,Bear_utility_Sidney,utility,1.255013,0.86239,daily
2,Bear_utility_Sidney,utility,1.851878,1.167219,weekly
3,Cockatoo_religion_Diedre,religion,1.475301,1.018945,hourly
4,Cockatoo_religion_Diedre,religion,2.34936,1.820794,daily
5,Cockatoo_religion_Diedre,religion,2.833513,1.958076,weekly
6,Cockatoo_science_Rex,science,7.304536,5.529282,hourly
7,Cockatoo_science_Rex,science,10.882962,7.975783,daily
8,Cockatoo_science_Rex,science,12.667458,8.26134,weekly
9,Eagle_education_Teresa,education,8.286079,5.855556,hourly


In [7]:
buildingNames = benchmark['name'].unique()

# Specify constants

In [8]:
# We load the first 
building_name = buildingNames[0]
representative_df = complete_data_cleaned.loc[(complete_data_cleaned['building_id'] == building_name)
                                               & (complete_data_cleaned['meter'] == 'electricity_cleaned')]

In [9]:
TRAIN_TEST_SPLIT = 0.3
DATAPOINS_PER_BUILDING_AND_METER_TYPE = representative_df.shape[0]
SPLIT_INDEX = int(DATAPOINS_PER_BUILDING_AND_METER_TYPE * (1 - TRAIN_TEST_SPLIT))
SPLIT_TIMESTAMP =  representative_df.iloc[SPLIT_INDEX]['timestamp']
TRAIN_DIMENSIONS = 5
SAMPLE_SIZE = 2000
DATAPOINTS_ONE_WEEK = 24 * 7

# Intermediate computations and ordering of dataframes

In [10]:
# This cell might take some time to finish

# Group the DataFrame by 'building_id' and 'meter_type'
complete_data_grouped = complete_data_cleaned.groupby(['building_id', 'meter'])

# Initialize an empty list to store the smaller DataFrames
dfs = []
df_keys = []

# Iterate over the groups and create smaller DataFrames
for group_key, group in complete_data_grouped:
    df_keys.append(group_key)
    dfs.append(group.copy()) 

In [11]:
group_keys = np.array(df_keys)

In [13]:
# Can be moved closer to the heading

# Split a dataframe into train and test data according to a split_date


def split_df_to_train_and_test(df, split_date):
    df_train = df.loc[df['timestamp'] < split_date].copy().reset_index(drop=True) 
    df_test  = df.loc[df['timestamp'] >= split_date].copy().reset_index(drop=True)
    return df_train, df_test

In [14]:
dfs_train = []
dfs_test = []

for df in dfs: 
    
    temp_train_df, temp_test_df = split_df_to_train_and_test(df, SPLIT_TIMESTAMP)
    dfs_train.append(temp_train_df)
    dfs_test.append(temp_test_df)
        

# Helper functions

In [15]:
'''
Computation of Spearmas Rank Correlation
@param  target: a pandas.dataframe (target) for wich we want to calculate the correlation matrix 
        possible_features: a list of pandas dataframes that are our features
        keys: a helping list, that manages the building_names and keeps track of the order in our possible_features 
        name: the building_name of the current dataframe we are predicting
@returns spearmans_matrix: a numpy.ndarray that contains the correlation value as first entry in each row and the corresponding p-value as a second element in each row. 
                            the order of stations in the matrix is consistent to the order in the dataset:
'''
def spearmansCorrelation (target, possible_features, name, keys = df_keys): 
    y_target = target['meter_reading']
    spearmans_matrix = np.zeros((len(possible_features), 2))
    for index, feature in enumerate(possible_features): 
        if ((target.shape[0] == feature.shape[0]) 
            and (keys[index] != (name, 'electricity_cleaned'))
            and ((feature['meter_reading'].isnull().sum()/feature.shape[0]) < 0.7)):
            
            # if shapes match, we are not at our target dataframe and the corresponding feature has sufficient data
            # we can compute the spearmans correlation
            correlation, pval = stats.spearmanr(y_target, feature['meter_reading'], nan_policy='omit')
            spearmans_matrix[index] = correlation, pval   

        else: 
            spearmans_matrix[index] = 0,100 
            continue
            
    return spearmans_matrix

In [16]:
# Not needed any longer

# For given dataset (list of pd.Dataframes) we compute a subsample of those dataframes such that we obtain a list of dfs
# that do not fit the given building_name and are not of meter_type 'electricity_cleaned'
# Also we obtain the index/position of the dataframe that matches the building_id and is of type electricity_cleaned
def get_all_features(dataset, building_name):
    key_to_exclude = [building_name, 'electricity_cleaned']
    features = []
    index_of_df = np.nan
    for index, key in enumerate(group_keys):
        # as the key composes of two elements, the buildling_id and the meter_type we compare two arrays of strings, therefore we need the .all() 
        if(key == key_to_exclude).all():
            index_of_df = index
            continue
        features.append(dataset[index])
            
    return features, index_of_df

# Computation of the Spearmans correlation matrix, with all other possible features

In [17]:
buildingNames[14:]

array(['Rat_public_Loretta', 'Wolf_retail_Marcella'], dtype=object)

In [None]:
correlation_matrices = []
for name in tqdm(buildingNames[14:]): 
    print(name)
    
    # Get dataframe that matches our building_id
    df = complete_data_cleaned.loc[(complete_data_cleaned['building_id']  == name) 
                                   & (complete_data_cleaned['meter'] == 'electricity_cleaned') ]
    
    # For later reconstruction we split the electricity meter reading into train and test set
    y_train_df, y_test_df = split_df_to_train_and_test(df, SPLIT_TIMESTAMP)
    
    
    # Compute the spearmans correlation between all those features in order to pick the most relevant ones
    spearmans_matrix = spearmansCorrelation(y_train_df, dfs_train, name)
    
    
    with open ('../data/correlation_matrix_{}.pkl'.format(name), 'wb') as outp:
        pickle.dump(spearmans_matrix, outp, pickle.HIGHEST_PROTOCOL )
    
    correlation_matrices.append((spearmans_matrix, name))

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Rat_public_Loretta




In [None]:

with open ('../data/correlation_matrices.pkl', 'wb') as outp:
    pickle.dump(correlation_matrices, outp, pickle.HIGHEST_PROTOCOL )
    
        