In [1]:
import pandas as pd
import numpy as np

from gridmeter._utils.data_processing import Data
from gridmeter._utils.data_processing_settings import Data_Settings
from gridmeter._utils import const as _const

# Data class in Gridmeter

The data class takes input in two formats - time series and loadshapes (both stacked and unstacked versions)
It returns an aggregated loadshape output based on the settings provided.

The usage is as follows:

```python

from gridmeter._utils.data_processing import Data
from gridmeter._utils.data_processing_settings import Data_Settings
from gridmeter._utils import const as _const

# Specify a time settings object (setting the time period to be seasonal day of week, i.e. 7 days * 3 seasons = 21 data points)
s = Data_Settings(TIME_PERIOD=_const.TimePeriod.SEASONAL_DAY_OF_WEEK)

# Use this time settings to create a data settings object
# df here is your input dataframe (more explained later on)
data = Data(time_series_df=df, settings=s)

# Check the output
data.get_loadshape()
    
```

Lets look at a few examples into the different types of input and possible outputs/return values.

## Time Series loadshapes

In [2]:
# Create a testing dataframe having an id, datetime of 15 min intervals, observed and modeled values
num_intervals = 4 * 24 * 365  # 4 intervals/hour * 24 hours/day * 365 days

# Create a DataFrame with 'id', 'datetime', 'observed', and 'modeled' columns
df = pd.DataFrame(
    {
        "id": np.repeat(
            ["id1", "id2", "id3"], num_intervals
        ),  # only 3 ids for easier comparison
        "datetime": pd.date_range(
            start="2023-01-01", periods=num_intervals, freq="15T"
        ).tolist()
        * 3,
        "observed": np.random.rand(num_intervals * 3),  # randomized
        "modeled": np.random.rand(num_intervals * 3),  # randomized
    }
)

# Convert 'datetime' column to datetime type
# df["datetime"] = pd.to_datetime(df["datetime"])

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.691112,0.545348
1,id1,2023-01-01 00:15:00,0.610815,0.873962
2,id1,2023-01-01 00:30:00,0.996261,0.607106
3,id1,2023-01-01 00:45:00,0.940431,0.613202
4,id1,2023-01-01 01:00:00,0.900356,0.061997
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.079176,0.894832
105116,id3,2023-12-31 23:00:00,0.405985,0.110396
105117,id3,2023-12-31 23:15:00,0.305425,0.250386
105118,id3,2023-12-31 23:30:00,0.194651,0.327459


In [3]:
# We can have Settings as None, which will use default settings


data1 = Data(time_series_df=df, settings=None)
data1.get_loadshape()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,495,496,497,498,499,500,501,502,503,504
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id1,0.514055,0.477364,0.477831,0.539715,0.567861,0.506745,0.468062,0.452307,0.481449,0.43649,...,0.492292,0.50429,0.502829,0.498376,0.533456,0.513615,0.545623,0.477422,0.539941,0.502864
id2,0.567966,0.496539,0.490891,0.422085,0.518143,0.474334,0.497741,0.493772,0.447418,0.509262,...,0.482465,0.481872,0.504239,0.531533,0.559018,0.506391,0.556891,0.498433,0.497808,0.538282
id3,0.521322,0.468946,0.489004,0.511848,0.475637,0.475067,0.474296,0.548275,0.485713,0.52023,...,0.51182,0.508375,0.516688,0.427903,0.472425,0.555548,0.533555,0.456264,0.481152,0.485573


In [4]:
# try a different Time Period setting

s = Data_Settings(TIME_PERIOD=_const.TimePeriod.SEASONAL_DAY_OF_WEEK)

data = Data(time_series_df=df, settings=s)
data.get_loadshape()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id1,0.502609,0.502005,0.498371,0.499097,0.510716,0.50181,0.504768,0.487633,0.499268,0.493135,...,0.510496,0.486733,0.489239,0.500365,0.501316,0.491926,0.500458,0.501307,0.494313,0.498394
id2,0.494351,0.498003,0.507347,0.498671,0.501594,0.49631,0.502153,0.496029,0.502887,0.492916,...,0.497044,0.494701,0.510158,0.500409,0.502636,0.49855,0.49435,0.498848,0.496146,0.494562
id3,0.499806,0.497734,0.500596,0.503229,0.492626,0.501964,0.492338,0.49751,0.485345,0.501507,...,0.51296,0.499171,0.486227,0.498991,0.496104,0.500238,0.497498,0.503219,0.492727,0.498534


#### We can also join two loadshapes if they have the same time_period classification

In [5]:
extended_df = pd.DataFrame(
    {
        "id": np.repeat(
            ["id4", "id5", "id6"], num_intervals
        ),  # only 3 ids for easier comparison
        "datetime": pd.date_range(
            start="2023-01-01", periods=num_intervals, freq="15T"
        ).tolist()
        * 3,
        "observed": np.random.rand(num_intervals * 3),  # randomized
        "modeled": np.random.rand(num_intervals * 3),  # randomized
    }
)

extended_df

Unnamed: 0,id,datetime,observed,modeled
0,id4,2023-01-01 00:00:00,0.924995,0.744391
1,id4,2023-01-01 00:15:00,0.948419,0.290004
2,id4,2023-01-01 00:30:00,0.859966,0.942396
3,id4,2023-01-01 00:45:00,0.410339,0.266863
4,id4,2023-01-01 01:00:00,0.702261,0.273207
...,...,...,...,...
105115,id6,2023-12-31 22:45:00,0.458919,0.893919
105116,id6,2023-12-31 23:00:00,0.145100,0.726999
105117,id6,2023-12-31 23:15:00,0.130032,0.081769
105118,id6,2023-12-31 23:30:00,0.358015,0.538163


In [6]:
s = Data_Settings(TIME_PERIOD=_const.TimePeriod.SEASONAL_DAY_OF_WEEK)
data_extended = Data(time_series_df=extended_df, settings=s)
data_extended.get_loadshape()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id4,0.492724,0.505538,0.498322,0.513031,0.499838,0.493808,0.503069,0.506053,0.493747,0.477565,...,0.505571,0.503961,0.506472,0.497519,0.50397,0.499071,0.498664,0.510987,0.511689,0.501406
id5,0.49463,0.494407,0.492638,0.49555,0.498049,0.505141,0.509758,0.489805,0.504258,0.491661,...,0.492175,0.499907,0.490875,0.4986,0.496115,0.507523,0.500375,0.48637,0.502198,0.518461
id6,0.512707,0.504244,0.497784,0.510918,0.505547,0.49756,0.490573,0.49986,0.495031,0.49741,...,0.505332,0.490492,0.494559,0.490126,0.509567,0.507522,0.514557,0.498515,0.486401,0.50857


In [7]:
data_extended.extend(data)
data_extended.get_loadshape()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id4,0.492724,0.505538,0.498322,0.513031,0.499838,0.493808,0.503069,0.506053,0.493747,0.477565,...,0.505571,0.503961,0.506472,0.497519,0.50397,0.499071,0.498664,0.510987,0.511689,0.501406
id5,0.49463,0.494407,0.492638,0.49555,0.498049,0.505141,0.509758,0.489805,0.504258,0.491661,...,0.492175,0.499907,0.490875,0.4986,0.496115,0.507523,0.500375,0.48637,0.502198,0.518461
id6,0.512707,0.504244,0.497784,0.510918,0.505547,0.49756,0.490573,0.49986,0.495031,0.49741,...,0.505332,0.490492,0.494559,0.490126,0.509567,0.507522,0.514557,0.498515,0.486401,0.50857
id1,0.502609,0.502005,0.498371,0.499097,0.510716,0.50181,0.504768,0.487633,0.499268,0.493135,...,0.510496,0.486733,0.489239,0.500365,0.501316,0.491926,0.500458,0.501307,0.494313,0.498394
id2,0.494351,0.498003,0.507347,0.498671,0.501594,0.49631,0.502153,0.496029,0.502887,0.492916,...,0.497044,0.494701,0.510158,0.500409,0.502636,0.49855,0.49435,0.498848,0.496146,0.494562
id3,0.499806,0.497734,0.500596,0.503229,0.492626,0.501964,0.492338,0.49751,0.485345,0.501507,...,0.51296,0.499171,0.486227,0.498991,0.496104,0.500238,0.497498,0.503219,0.492727,0.498534


## INTERPOLATION

We interpolate values that are missing in the dataframe linearly, given that the amount of missing data is lower than the interpolation threshold (default is 20% of the total data)

In [8]:
# Set the missing data to lower than 80% threshold. It should return a loadshape which has been interpolated
## Create a boolean mask for Wednesdays
day_mask = df['datetime'].dt.dayofweek.isin([2])

# # Set 'observed' and 'modeled' values to NaN for all Mondays and Wednesdays
# df.loc[day_mask, ['observed', 'modeled']] = np.nan

# # Remove all rows with NaN values
df = df.loc[~day_mask]

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.691112,0.545348
1,id1,2023-01-01 00:15:00,0.610815,0.873962
2,id1,2023-01-01 00:30:00,0.996261,0.607106
3,id1,2023-01-01 00:45:00,0.940431,0.613202
4,id1,2023-01-01 01:00:00,0.900356,0.061997
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.079176,0.894832
105116,id3,2023-12-31 23:00:00,0.405985,0.110396
105117,id3,2023-12-31 23:15:00,0.305425,0.250386
105118,id3,2023-12-31 23:30:00,0.194651,0.327459


In [9]:
s = Data_Settings(TIME_PERIOD=_const.TimePeriod.DAY_OF_WEEK)
data = Data(time_series_df=df, settings=s)
data.get_loadshape()

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,0.496691,0.500832,0.498657,0.507568,0.49443,0.497329,0.497329
id2,0.496912,0.501208,0.497309,0.499209,0.49573,0.502293,0.502293
id3,0.498744,0.492913,0.495881,0.502737,0.498031,0.492367,0.492367


In [10]:
# Now , Create a boolean mask for Mondays and Wednesdays , will give ValueError at 80% threshold
day_mask = df['datetime'].dt.dayofweek.isin([0,2])

# # Set 'observed' and 'modeled' values to NaN for all Mondays and Wednesdays
# df.loc[day_mask, ['observed', 'modeled']] = np.nan

# # Remove all rows with NaN values
df = df.loc[~day_mask]

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.691112,0.545348
1,id1,2023-01-01 00:15:00,0.610815,0.873962
2,id1,2023-01-01 00:30:00,0.996261,0.607106
3,id1,2023-01-01 00:45:00,0.940431,0.613202
4,id1,2023-01-01 01:00:00,0.900356,0.061997
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.079176,0.894832
105116,id3,2023-12-31 23:00:00,0.405985,0.110396
105117,id3,2023-12-31 23:15:00,0.305425,0.250386
105118,id3,2023-12-31 23:30:00,0.194651,0.327459


In [11]:
# use the above missing data to try to create a loadshape. Since we have 2 days missing out of 7 for every id, it returns None

s = Data_Settings(TIME_PERIOD=_const.TimePeriod.DAY_OF_WEEK)
data = Data(time_series_df=df, settings=s)
data.get_loadshape()

## Unstacked loadshapes

In [12]:
# Assuming ids is a list of unique ids
ids = ["id1", "id2", "id3"]

# Create a range of values
values = range(1, _const.time_period_row_counts["day_of_week"] + 1)
row_cnt_per_id = 1

# Repeat each id len(values) times and tile values len(ids) times
df_new = pd.DataFrame({
    'id': np.repeat(ids, row_cnt_per_id),
    **{str(i): np.random.randint(1, 100, len(ids) * row_cnt_per_id) for i in range(1, len(values) + 1)}
})

# Create a boolean mask with True values representing 10% of the total number of elements
mask = np.random.choice([True, False], size=df_new.drop('id', axis=1).shape, p=[0.2, 0.8])

# Use the mask to set 10% of the values in df_new to NaN, excluding 'id' column
df_new.loc[:, df_new.columns != 'id'] = df_new.drop('id', axis=1).where(~mask, np.nan)

df_new

Unnamed: 0,id,1,2,3,4,5,6,7
0,id1,24,50.0,,,49.0,48,29
1,id2,73,26.0,85.0,4.0,,94,91
2,id3,98,,20.0,15.0,40.0,25,65


#### AGG_TYPE, LOADSHAPE_TYPE and TIME_PERIOD must be set to None if we're using loadshapes.
They're only required for time series data.

In [13]:
s = Data_Settings(AGG_TYPE=None, LOADSHAPE_TYPE=None, TIME_PERIOD=None, INTERPOLATE_MISSING=True)
unstack_df = df_new.copy()
data_new = Data(loadshape_df=unstack_df, settings=s)
data_new.get_loadshape()

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,24.0,50.0,49.666667,49.333333,49.0,48.0,29.0
id2,73.0,26.0,85.0,4.0,49.0,94.0,91.0
id3,98.0,59.0,20.0,15.0,40.0,25.0,65.0


## Normal Loadshapes

In [14]:
ids = ["id1", "id2", "id3"]

# Given count number of time values
count = 7

# Create a DataFrame
loadshape_df = pd.DataFrame({
    'id': np.repeat(ids, count),
    'time': np.tile(range(1, count + 1), len(ids)),
    'loadshape': np.random.rand(len(ids) * count)
})

loadshape_df

Unnamed: 0,id,time,loadshape
0,id1,1,0.140626
1,id1,2,0.679542
2,id1,3,0.896955
3,id1,4,0.9968
4,id1,5,0.775457
5,id1,6,0.090019
6,id1,7,0.584716
7,id2,1,0.817156
8,id2,2,0.177374
9,id2,3,0.649277


In [15]:
s = Data_Settings(AGG_TYPE=None, LOADSHAPE_TYPE=None, TIME_PERIOD=None, INTERPOLATE_MISSING=True)
data_loadshape = Data(loadshape_df=loadshape_df, settings=s)
data_loadshape.get_loadshape()

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,0.817156,0.177374,0.649277,0.743633,0.891627,0.789059,0.212736
id2,0.085384,0.962816,0.646956,0.983562,0.893703,0.965946,0.876026
id3,0.140626,0.679542,0.896955,0.9968,0.775457,0.090019,0.584716


# Features input

id cooling_load heating_load 