In [1]:
import pandas as pd
import numpy as np

from eemeter import gridmeter as gm

# Data class in Gridmeter

The `Data` class accepts input in two formats: time series and loadshapes (in both stacked and unstacked form).
It returns an aggregated loadshape based on the settings provided.

The usage is as follows:

```python

from eemeter.gridmeter import Data
from eemeter.gridmeter import Data_Settings

# Create a settings object (a "seasonal_day_of_week" time period gives 7 days * 3 seasons = 21 data points)
s = Data_Settings(time_period="seasonal_day_of_week")

# Use these settings to create a Data object
# df here is your input dataframe (explained in more detail below)
data = Data(time_series_df=df, settings=s)

# Check the output (loadshape is an attribute, not a method)
data.loadshape
    
```

Let's look at a few examples of the different types of input and the possible outputs/return values.

## Time Series loadshapes

In [2]:
# Build a synthetic test frame: one year of 15-minute interval data for three ids,
# with random 'observed' and 'modeled' values.
num_intervals = 4 * 24 * 365  # 4 intervals/hour * 24 hours/day * 365 days

# Long-format frame with 'id', 'datetime', 'observed', and 'modeled' columns
df = pd.DataFrame(
    {
        "id": np.repeat(
            ["id1", "id2", "id3"], num_intervals
        ),  # only 3 ids for easier comparison
        # "15min" is the supported spelling; the "15T" alias is deprecated since pandas 2.2.
        # date_range already yields Timestamps, so no further datetime conversion is needed.
        "datetime": pd.date_range(
            start="2023-01-01", periods=num_intervals, freq="15min"
        ).tolist()
        * 3,
        "observed": np.random.rand(num_intervals * 3),  # randomized
        "modeled": np.random.rand(num_intervals * 3),  # randomized
    }
)

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.327113,0.589891
1,id1,2023-01-01 00:15:00,0.344199,0.326629
2,id1,2023-01-01 00:30:00,0.199584,0.596843
3,id1,2023-01-01 00:45:00,0.939975,0.477415
4,id1,2023-01-01 01:00:00,0.261389,0.569062
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.336365,0.795474
105116,id3,2023-12-31 23:00:00,0.721162,0.966619
105117,id3,2023-12-31 23:15:00,0.277900,0.518129
105118,id3,2023-12-31 23:30:00,0.100645,0.306255


In [3]:
# Passing settings=None makes Data use its default settings
data1 = gm.Data(settings=None, time_series_df=df)

data1.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,495,496,497,498,499,500,501,502,503,504
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id1,0.528494,0.492601,0.472181,0.478608,0.494427,0.52773,0.427164,0.462045,0.53309,0.484445,...,0.541254,0.511396,0.490161,0.53475,0.493523,0.49591,0.535936,0.567332,0.483759,0.461309
id2,0.505981,0.418072,0.57037,0.471573,0.525652,0.536976,0.480128,0.522971,0.497713,0.517967,...,0.518566,0.497728,0.435952,0.513424,0.498654,0.474231,0.478662,0.539358,0.514203,0.504923
id3,0.48118,0.497967,0.530252,0.484291,0.519658,0.499551,0.502505,0.507182,0.470282,0.538326,...,0.454649,0.470523,0.490471,0.494344,0.503403,0.564843,0.514715,0.504742,0.465539,0.529272


In [4]:
# Same input, but aggregated with the "seasonal_day_of_week" time period
s = gm.Data_Settings(time_period="seasonal_day_of_week")
data = gm.Data(settings=s, time_series_df=df)

data.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id1,0.501786,0.503863,0.50178,0.50027,0.498012,0.496989,0.495403,0.491625,0.502143,0.491896,...,0.486971,0.494411,0.499357,0.490427,0.49236,0.496698,0.493651,0.489039,0.500328,0.496098
id2,0.498329,0.517931,0.492527,0.50362,0.493508,0.48007,0.501268,0.492751,0.48348,0.500921,...,0.506942,0.490336,0.500354,0.498767,0.502509,0.505887,0.496987,0.498651,0.500087,0.498458
id3,0.500224,0.493457,0.513093,0.499354,0.492187,0.50025,0.501698,0.504127,0.506387,0.501489,...,0.496307,0.49764,0.500398,0.490347,0.497695,0.503311,0.505062,0.496127,0.510957,0.494556


#### We can also join two loadshapes if they have the same time_period classification

In [5]:
# Second synthetic frame with the same layout and time range as the first one,
# but for three different ids (id4-id6), so the two loadshapes can be joined.
extended_df = pd.DataFrame(
    {
        "id": np.repeat(
            ["id4", "id5", "id6"], num_intervals
        ),  # only 3 ids for easier comparison
        # "15min" is the supported spelling; the "15T" alias is deprecated since pandas 2.2
        "datetime": pd.date_range(
            start="2023-01-01", periods=num_intervals, freq="15min"
        ).tolist()
        * 3,
        "observed": np.random.rand(num_intervals * 3),  # randomized
        "modeled": np.random.rand(num_intervals * 3),  # randomized
    }
)

extended_df

Unnamed: 0,id,datetime,observed,modeled
0,id4,2023-01-01 00:00:00,0.349616,0.917743
1,id4,2023-01-01 00:15:00,0.578194,0.112682
2,id4,2023-01-01 00:30:00,0.085235,0.141976
3,id4,2023-01-01 00:45:00,0.574407,0.742973
4,id4,2023-01-01 01:00:00,0.563275,0.933492
...,...,...,...,...
105115,id6,2023-12-31 22:45:00,0.379929,0.479875
105116,id6,2023-12-31 23:00:00,0.508261,0.581318
105117,id6,2023-12-31 23:15:00,0.315751,0.190065
105118,id6,2023-12-31 23:30:00,0.686306,0.936760


In [6]:
# Use the same time period as `data` so that the two loadshapes can be joined
s = gm.Data_Settings(time_period="seasonal_day_of_week")
data_extended = gm.Data(settings=s, time_series_df=extended_df)

data_extended.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id4,0.503927,0.489571,0.504885,0.514186,0.497363,0.506407,0.493552,0.511097,0.509242,0.496894,...,0.497949,0.488821,0.512267,0.494475,0.512989,0.491917,0.498666,0.504802,0.50161,0.487297
id5,0.494927,0.511243,0.503142,0.500871,0.502218,0.497958,0.50626,0.507607,0.490261,0.515331,...,0.493606,0.499366,0.489313,0.507349,0.49514,0.495074,0.498511,0.491467,0.510916,0.495086
id6,0.494572,0.503764,0.496194,0.494443,0.507165,0.505038,0.502347,0.510037,0.505132,0.49306,...,0.496424,0.513391,0.496308,0.490498,0.512599,0.502856,0.509116,0.508791,0.510747,0.501319


In [7]:
# Join the loadshapes held by `data` onto `data_extended`.
# NOTE(review): extend() appears to modify data_extended in place (no reassignment is needed),
# so re-running this cell may not be idempotent — confirm against the gridmeter API.
data_extended.extend(data)
data_extended.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id4,0.503927,0.489571,0.504885,0.514186,0.497363,0.506407,0.493552,0.511097,0.509242,0.496894,...,0.497949,0.488821,0.512267,0.494475,0.512989,0.491917,0.498666,0.504802,0.50161,0.487297
id5,0.494927,0.511243,0.503142,0.500871,0.502218,0.497958,0.50626,0.507607,0.490261,0.515331,...,0.493606,0.499366,0.489313,0.507349,0.49514,0.495074,0.498511,0.491467,0.510916,0.495086
id6,0.494572,0.503764,0.496194,0.494443,0.507165,0.505038,0.502347,0.510037,0.505132,0.49306,...,0.496424,0.513391,0.496308,0.490498,0.512599,0.502856,0.509116,0.508791,0.510747,0.501319
id1,0.501786,0.503863,0.50178,0.50027,0.498012,0.496989,0.495403,0.491625,0.502143,0.491896,...,0.486971,0.494411,0.499357,0.490427,0.49236,0.496698,0.493651,0.489039,0.500328,0.496098
id2,0.498329,0.517931,0.492527,0.50362,0.493508,0.48007,0.501268,0.492751,0.48348,0.500921,...,0.506942,0.490336,0.500354,0.498767,0.502509,0.505887,0.496987,0.498651,0.500087,0.498458
id3,0.500224,0.493457,0.513093,0.499354,0.492187,0.50025,0.501698,0.504127,0.506387,0.501489,...,0.496307,0.49764,0.500398,0.490347,0.497695,0.503311,0.505062,0.496127,0.510957,0.494556


## INTERPOLATION

Missing values are filled by linear interpolation, provided that the share of missing data is below the interpolation threshold (20% of the total data by default).

In [8]:
# Drop every Wednesday (dayofweek == 2). One of the seven days of the week is then missing,
# which keeps the share of missing data low enough for the loadshape to be interpolated.
day_mask = df["datetime"].dt.dayofweek.eq(2)

# Keep only the rows that do not fall on a Wednesday
df = df[~day_mask]

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.327113,0.589891
1,id1,2023-01-01 00:15:00,0.344199,0.326629
2,id1,2023-01-01 00:30:00,0.199584,0.596843
3,id1,2023-01-01 00:45:00,0.939975,0.477415
4,id1,2023-01-01 01:00:00,0.261389,0.569062
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.336365,0.795474
105116,id3,2023-12-31 23:00:00,0.721162,0.966619
105117,id3,2023-12-31 23:15:00,0.277900,0.518129
105118,id3,2023-12-31 23:30:00,0.100645,0.306255


In [9]:
# Build a day-of-week loadshape from the frame that no longer contains Wednesdays
s = gm.Data_Settings(time_period="day_of_week")
data = gm.Data(settings=s, time_series_df=df)

data.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,0.496541,0.500964,0.496667,0.499581,0.48997,0.500003,0.500003
id2,0.494555,0.499507,0.497769,0.491469,0.497238,0.496982,0.496982
id3,0.498346,0.499318,0.496471,0.494822,0.502897,0.498831,0.498831


In [10]:
# Now mask Mondays (0) as well as Wednesdays (2). With two of the seven days missing,
# too much data is gone for interpolation, so no loadshape can be produced from this frame.
day_mask = df["datetime"].dt.dayofweek.isin([0, 2])

# Keep only the rows that fall on neither a Monday nor a Wednesday
df = df[~day_mask]

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.327113,0.589891
1,id1,2023-01-01 00:15:00,0.344199,0.326629
2,id1,2023-01-01 00:30:00,0.199584,0.596843
3,id1,2023-01-01 00:45:00,0.939975,0.477415
4,id1,2023-01-01 01:00:00,0.261389,0.569062
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.336365,0.795474
105116,id3,2023-12-31 23:00:00,0.721162,0.966619
105117,id3,2023-12-31 23:15:00,0.277900,0.518129
105118,id3,2023-12-31 23:30:00,0.100645,0.306255


In [11]:
# Try to build a loadshape from the frame above. Two of the seven days are missing
# for every id, so the resulting loadshape is None.
s = gm.Data_Settings(time_period="day_of_week")
data = gm.Data(settings=s, time_series_df=df)

data.loadshape

## Unstacked loadshapes

In [12]:
# Build a small unstacked loadshape frame: one row per id, one column per time slot,
# with roughly 20% of the values knocked out so they can be interpolated later.

# Assuming ids is a list of unique ids
ids = ["id1", "id2", "id3"]

# Time-slot labels; their string form becomes the column names "1".."7"
values = range(1, 8)
row_cnt_per_id = 1

# One random integer in [1, 99] per (row, time slot). Stored as float up front so that
# NaN can be inserted without relying on pandas' deprecated silent int -> float upcast.
loadshape_values = pd.DataFrame(
    {str(i): np.random.randint(1, 100, len(ids) * row_cnt_per_id) for i in values}
).astype(float)

# Boolean mask in which each cell is True with probability 0.2 (about 20% of the values)
mask = np.random.choice([True, False], size=loadshape_values.shape, p=[0.2, 0.8])

# Blank out the masked cells, then put the 'id' column (each id repeated row_cnt_per_id times) in front
df_new = loadshape_values.mask(mask)
df_new.insert(0, "id", np.repeat(ids, row_cnt_per_id))

df_new

Unnamed: 0,id,1,2,3,4,5,6,7
0,id1,97,32.0,75.0,72,43,64,1
1,id2,37,,,55,41,66,25
2,id3,95,,41.0,18,21,32,74


#### AGG_TYPE, LOADSHAPE_TYPE and TIME_PERIOD must be set to None if we're using loadshapes.
They're only required for time series data.

In [13]:
# Loadshape input: the time-series-only settings are set to None, interpolation stays on
s = gm.Data_Settings(
    AGG_TYPE=None,
    LOADSHAPE_TYPE=None,
    TIME_PERIOD=None,
    INTERPOLATE_MISSING=True,
)

# Work on a copy so that df_new itself is not handed to Data
unstack_df = df_new.copy()
data_new = gm.Data(settings=s, loadshape_df=unstack_df)

data_new.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,97.0,32.0,75.0,72.0,43.0,64.0,1.0
id2,37.0,43.0,49.0,55.0,41.0,66.0,25.0
id3,95.0,68.0,41.0,18.0,21.0,32.0,74.0


## Normal Loadshapes

In [14]:
ids = ["id1", "id2", "id3"]

# Number of time slots per id
count = 7

# Stacked loadshape frame: one row per (id, time) pair with a random loadshape value
loadshape_df = pd.DataFrame(
    {
        "id": np.repeat(ids, count),
        "time": np.tile(np.arange(1, count + 1), len(ids)),
        "loadshape": np.random.rand(len(ids) * count),
    }
)

loadshape_df

Unnamed: 0,id,time,loadshape
0,id1,1,0.514292
1,id1,2,0.300549
2,id1,3,0.679693
3,id1,4,0.27517
4,id1,5,0.554288
5,id1,6,0.231464
6,id1,7,0.733246
7,id2,1,0.195782
8,id2,2,0.020061
9,id2,3,0.267702


In [15]:
# Stacked loadshape input: the time-series-only settings are set to None, interpolation stays on
s = gm.Data_Settings(
    AGG_TYPE=None,
    LOADSHAPE_TYPE=None,
    TIME_PERIOD=None,
    INTERPOLATE_MISSING=True,
)
data_loadshape = gm.Data(settings=s, loadshape_df=loadshape_df)

data_loadshape.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,0.895797,0.881562,0.568781,0.655742,0.666111,0.676829,0.166042
id2,0.514292,0.300549,0.679693,0.27517,0.554288,0.231464,0.733246
id3,0.195782,0.020061,0.267702,0.650348,0.623385,0.838512,0.062307


# Features input

id cooling_load heating_load 