In [8]:
import pandas as pd
import numpy as np

from gridmeter import Data
from gridmeter import Data_Settings

# Data class in Gridmeter

The `Data` class accepts input in two formats: time series and loadshapes (either stacked or unstacked).
It returns an aggregated loadshape based on the settings provided.

The usage is as follows:

```python

from gridmeter import Data
from gridmeter import Data_Settings

# Create a settings object (here the time period is seasonal day of week, i.e. 7 days * 3 seasons = 21 data points)
s = Data_Settings(time_period="seasonal_day_of_week")

# Use these settings to create a Data object
# df here is your input dataframe (explained in more detail later on)
data = Data(time_series_df=df, settings=s)

# Check the output (loadshape is accessed as an attribute, not called)
data.loadshape
    
```

Let's look at a few examples of the different types of input and the possible outputs/return values.

## Time Series loadshapes

In [9]:
# Build a synthetic year of 15-minute interval data so the examples below have
# something to aggregate: one row per (id, datetime) with random values.
num_intervals = 4 * 24 * 365  # 4 intervals/hour * 24 hours/day * 365 days
meter_ids = ["id1", "id2", "id3"]  # only 3 ids for easier comparison

# "15min" is the supported spelling of the 15-minute offset alias; the older
# "15T" alias is deprecated in recent pandas releases and emits a FutureWarning.
timestamps = pd.date_range(start="2023-01-01", periods=num_intervals, freq="15min")

# Create a DataFrame with 'id', 'datetime', 'observed', and 'modeled' columns
df = pd.DataFrame(
    {
        # Each id owns one contiguous run of num_intervals rows.
        "id": np.repeat(meter_ids, num_intervals),
        # Every id shares the same timestamps, so tile them once per id.
        "datetime": np.tile(timestamps.to_numpy(), len(meter_ids)),
        "observed": np.random.rand(num_intervals * len(meter_ids)),  # randomized
        "modeled": np.random.rand(num_intervals * len(meter_ids)),  # randomized
    }
)

# date_range already yields datetime64 values, so no pd.to_datetime conversion
# of the 'datetime' column is needed.

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.822414,0.940254
1,id1,2023-01-01 00:15:00,0.902174,0.766168
2,id1,2023-01-01 00:30:00,0.071087,0.266810
3,id1,2023-01-01 00:45:00,0.369226,0.750511
4,id1,2023-01-01 01:00:00,0.277626,0.677455
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.638390,0.797873
105116,id3,2023-12-31 23:00:00,0.216757,0.073988
105117,id3,2023-12-31 23:15:00,0.917763,0.424074
105118,id3,2023-12-31 23:30:00,0.732220,0.551153


In [10]:
# Passing settings=None makes Data use its default settings.
data1 = Data(
    settings=None,
    time_series_df=df,
)

# Display the resulting loadshape table (one row per id).
data1.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,495,496,497,498,499,500,501,502,503,504
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id1,0.496175,0.4123,0.47298,0.496397,0.478349,0.506713,0.547918,0.51566,0.522835,0.502346,...,0.461981,0.472822,0.550722,0.454546,0.493359,0.492814,0.519986,0.512396,0.483791,0.522764
id2,0.52543,0.51307,0.48867,0.49821,0.508017,0.46319,0.484805,0.473393,0.580223,0.523918,...,0.523459,0.563412,0.504596,0.436554,0.470347,0.447998,0.520917,0.452835,0.514982,0.506226
id3,0.475286,0.458318,0.462285,0.469689,0.504085,0.50879,0.518214,0.480204,0.491616,0.492032,...,0.533087,0.46224,0.471672,0.553996,0.532797,0.518138,0.495298,0.535476,0.457287,0.444617


In [11]:
# Same input frame, but aggregated with a non-default time period:
# "seasonal_day_of_week" yields the 21 columns shown in the output.
s = Data_Settings(
    time_period="seasonal_day_of_week",
)

data = Data(
    settings=s,
    time_series_df=df,
)
data.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id1,0.490453,0.501772,0.508006,0.500192,0.495496,0.483809,0.510846,0.507784,0.497371,0.51016,...,0.496683,0.500776,0.495564,0.501429,0.502685,0.49087,0.491888,0.499636,0.49962,0.503204
id2,0.493228,0.494474,0.499823,0.494885,0.511489,0.501737,0.497094,0.502381,0.505228,0.500759,...,0.50538,0.503882,0.501647,0.513627,0.508932,0.504353,0.50953,0.493524,0.490292,0.497448
id3,0.510044,0.493884,0.511619,0.499133,0.50335,0.497913,0.495915,0.512684,0.493036,0.498387,...,0.49427,0.506879,0.501461,0.502114,0.495706,0.492193,0.510976,0.503638,0.510795,0.490929


#### We can also join two loadshapes if they have the same time_period classification

In [12]:
# Second synthetic data set with different ids (id4-id6), built the same way as
# the first one so the two loadshapes can be joined afterwards.
extended_ids = ["id4", "id5", "id6"]  # only 3 ids for easier comparison

# "15min" is the supported spelling of the 15-minute offset alias; the older
# "15T" alias is deprecated in recent pandas releases and emits a FutureWarning.
extended_timestamps = pd.date_range(
    start="2023-01-01", periods=num_intervals, freq="15min"
)

extended_df = pd.DataFrame(
    {
        # Each id owns one contiguous run of num_intervals rows.
        "id": np.repeat(extended_ids, num_intervals),
        # Every id shares the same timestamps, so tile them once per id.
        "datetime": np.tile(extended_timestamps.to_numpy(), len(extended_ids)),
        "observed": np.random.rand(num_intervals * len(extended_ids)),  # randomized
        "modeled": np.random.rand(num_intervals * len(extended_ids)),  # randomized
    }
)

extended_df

Unnamed: 0,id,datetime,observed,modeled
0,id4,2023-01-01 00:00:00,0.337992,0.206913
1,id4,2023-01-01 00:15:00,0.013197,0.711018
2,id4,2023-01-01 00:30:00,0.266650,0.772050
3,id4,2023-01-01 00:45:00,0.823958,0.928811
4,id4,2023-01-01 01:00:00,0.995731,0.080494
...,...,...,...,...
105115,id6,2023-12-31 22:45:00,0.251279,0.726956
105116,id6,2023-12-31 23:00:00,0.198860,0.453431
105117,id6,2023-12-31 23:15:00,0.716030,0.039478
105118,id6,2023-12-31 23:30:00,0.251781,0.950448


In [13]:
# Aggregate the second data set with the same time period as `data`, so that
# the two loadshapes share the same time_period classification.
s = Data_Settings(
    time_period="seasonal_day_of_week",
)
data_extended = Data(
    settings=s,
    time_series_df=extended_df,
)
data_extended.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id4,0.499503,0.498587,0.500289,0.497377,0.505518,0.497346,0.511655,0.49081,0.499294,0.501529,...,0.503974,0.507059,0.499004,0.488221,0.518188,0.497938,0.490391,0.494757,0.488572,0.494529
id5,0.496893,0.50358,0.489901,0.492844,0.503286,0.494735,0.504907,0.499432,0.497893,0.501732,...,0.505882,0.503876,0.503243,0.504648,0.491393,0.513741,0.515771,0.498497,0.485493,0.510548
id6,0.496357,0.501121,0.51189,0.499817,0.493633,0.495784,0.497669,0.488727,0.486208,0.488446,...,0.513569,0.502705,0.48853,0.501792,0.50062,0.498469,0.507697,0.512294,0.51015,0.492901


In [14]:
# Join the loadshapes held by `data` onto `data_extended`. In the recorded
# output the original ids (id4-id6) come first, followed by id1-id3.
# NOTE(review): the return value is not used, so extend() appears to modify
# data_extended in place; re-running this cell may try to add `data` a second
# time — confirm how the library handles that.
data_extended.extend(data)
data_extended.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,12,13,14,15,16,17,18,19,20,21
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
id4,0.499503,0.498587,0.500289,0.497377,0.505518,0.497346,0.511655,0.49081,0.499294,0.501529,...,0.503974,0.507059,0.499004,0.488221,0.518188,0.497938,0.490391,0.494757,0.488572,0.494529
id5,0.496893,0.50358,0.489901,0.492844,0.503286,0.494735,0.504907,0.499432,0.497893,0.501732,...,0.505882,0.503876,0.503243,0.504648,0.491393,0.513741,0.515771,0.498497,0.485493,0.510548
id6,0.496357,0.501121,0.51189,0.499817,0.493633,0.495784,0.497669,0.488727,0.486208,0.488446,...,0.513569,0.502705,0.48853,0.501792,0.50062,0.498469,0.507697,0.512294,0.51015,0.492901
id1,0.490453,0.501772,0.508006,0.500192,0.495496,0.483809,0.510846,0.507784,0.497371,0.51016,...,0.496683,0.500776,0.495564,0.501429,0.502685,0.49087,0.491888,0.499636,0.49962,0.503204
id2,0.493228,0.494474,0.499823,0.494885,0.511489,0.501737,0.497094,0.502381,0.505228,0.500759,...,0.50538,0.503882,0.501647,0.513627,0.508932,0.504353,0.50953,0.493524,0.490292,0.497448
id3,0.510044,0.493884,0.511619,0.499133,0.50335,0.497913,0.495915,0.512684,0.493036,0.498387,...,0.49427,0.506879,0.501461,0.502114,0.495706,0.492193,0.510976,0.503638,0.510795,0.490929


## INTERPOLATION

Values missing from the dataframe are linearly interpolated, provided that the share of missing data is below the interpolation threshold (default: 20% of the total data).

In [15]:
# Remove every Wednesday (dayofweek == 2) from the time series. Dropping one
# weekday removes roughly 1/7 of the rows, which is below the 20% default
# interpolation threshold described above, so the loadshape built from this
# frame should come back with the gap interpolated.
day_mask = df["datetime"].dt.dayofweek == 2

# Keep only the non-Wednesday rows (the rows are dropped, not set to NaN).
# Note: this rebinds `df`, so later cells see the filtered frame.
df = df[~day_mask]

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.822414,0.940254
1,id1,2023-01-01 00:15:00,0.902174,0.766168
2,id1,2023-01-01 00:30:00,0.071087,0.266810
3,id1,2023-01-01 00:45:00,0.369226,0.750511
4,id1,2023-01-01 01:00:00,0.277626,0.677455
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.638390,0.797873
105116,id3,2023-12-31 23:00:00,0.216757,0.073988
105117,id3,2023-12-31 23:15:00,0.917763,0.424074
105118,id3,2023-12-31 23:30:00,0.732220,0.551153


In [16]:
# Aggregate the Wednesday-free frame by day of week; the output still has all
# 7 columns, i.e. the missing day has been filled in.
s = Data_Settings(
    time_period="day_of_week",
)
data = Data(
    settings=s,
    time_series_df=df,
)
data.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,0.503065,0.502923,0.504053,0.503619,0.498696,0.49876,0.49876
id2,0.508365,0.494186,0.502535,0.500476,0.505056,0.496105,0.496105
id3,0.500041,0.500547,0.496966,0.497238,0.494525,0.50306,0.50306


In [17]:
# Now build a mask for Mondays (0) and Wednesdays (2). With both weekdays
# removed, roughly 2/7 of the data is missing, which exceeds the 20% default
# interpolation threshold described above, so no loadshape can be interpolated.
# NOTE(review): the original comment said this gives a ValueError, while the
# next cell says the result is None — confirm which one applies.
weekday = df["datetime"].dt.dayofweek
day_mask = (weekday == 0) | (weekday == 2)

# Keep only the rows outside the masked weekdays (the rows are dropped, not
# set to NaN). Note: this rebinds `df` again.
df = df[~day_mask]

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.822414,0.940254
1,id1,2023-01-01 00:15:00,0.902174,0.766168
2,id1,2023-01-01 00:30:00,0.071087,0.266810
3,id1,2023-01-01 00:45:00,0.369226,0.750511
4,id1,2023-01-01 01:00:00,0.277626,0.677455
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.638390,0.797873
105116,id3,2023-12-31 23:00:00,0.216757,0.073988
105117,id3,2023-12-31 23:15:00,0.917763,0.424074
105118,id3,2023-12-31 23:30:00,0.732220,0.551153


In [18]:
# Try to build a loadshape from the frame with 2 of the 7 weekdays missing for
# every id. Per the original notes, the result is None in this case.
s = Data_Settings(
    time_period="day_of_week",
)
data = Data(
    settings=s,
    time_series_df=df,
)
data.loadshape

## Unstacked loadshapes

In [19]:
# Build a small unstacked ("wide") loadshape frame: one row per id and one
# column per time value, with a fraction of the entries blanked out as NaN.

# Assuming ids is a list of unique ids
ids = ["id1", "id2", "id3"]

# Time values that become the column labels "1" .. "7"
values = range(1, 8)
row_cnt_per_id = 1

# Probability that any single entry is replaced by NaN
missing_fraction = 0.2

# Each id appears row_cnt_per_id times (once here)
id_column = np.repeat(ids, row_cnt_per_id)
value_columns = [str(v) for v in values]

# Random integers in [1, 100), stored as floats from the start so that every
# value column can hold NaN and all of them share one dtype.
loadshape_values = pd.DataFrame(
    np.random.randint(1, 100, size=(len(id_column), len(value_columns))),
    columns=value_columns,
).astype(float)

# Boolean mask over the value columns only; True marks an entry to blank out
mask = np.random.choice(
    [True, False],
    size=loadshape_values.shape,
    p=[missing_fraction, 1 - missing_fraction],
)

# Blank out the masked entries, then put the 'id' column back in front
df_new = loadshape_values.mask(mask)
df_new.insert(0, "id", id_column)

df_new

Unnamed: 0,id,1,2,3,4,5,6,7
0,id1,45.0,,67,68.0,81.0,36.0,6.0
1,id2,,11.0,22,56.0,3.0,,
2,id3,53.0,9.0,7,,,23.0,46.0


#### AGG_TYPE, LOADSHAPE_TYPE and TIME_PERIOD must be set to None if we're using loadshapes.
They're only required for time series data.

In [20]:
# Loadshape input: the aggregation-related settings are set to None and
# interpolation of missing values is switched on.
s = Data_Settings(
    AGG_TYPE=None,
    LOADSHAPE_TYPE=None,
    TIME_PERIOD=None,
    INTERPOLATE_MISSING=True,
)

# Work on a copy so df_new itself is not handed to Data.
unstack_df = df_new.copy()

data_new = Data(
    settings=s,
    loadshape_df=unstack_df,
)
data_new.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,45.0,56.0,67.0,68.0,81.0,36.0,6.0
id2,11.0,11.0,22.0,56.0,3.0,3.0,3.0
id3,53.0,9.0,7.0,12.333333,17.666667,23.0,46.0


## Normal Loadshapes

In [21]:
# Build a stacked ("long") loadshape frame: one row per (id, time) pair with a
# random loadshape value.
ids = ["id1", "id2", "id3"]

# Number of time values per id
count = 7

# Time values 1..count, repeated in full for each id
time_steps = np.arange(1, count + 1)
total_rows = len(ids) * count

loadshape_df = pd.DataFrame(
    {
        "id": np.repeat(ids, count),
        "time": np.tile(time_steps, len(ids)),
        "loadshape": np.random.rand(total_rows),
    }
)

loadshape_df

Unnamed: 0,id,time,loadshape
0,id1,1,0.563458
1,id1,2,0.532356
2,id1,3,0.197222
3,id1,4,0.450618
4,id1,5,0.307108
5,id1,6,0.793405
6,id1,7,0.940847
7,id2,1,0.296756
8,id2,2,0.483339
9,id2,3,0.581278


In [22]:
# Stacked loadshape input: same settings as for the unstacked case, with the
# aggregation-related settings set to None.
s = Data_Settings(
    AGG_TYPE=None,
    LOADSHAPE_TYPE=None,
    TIME_PERIOD=None,
    INTERPOLATE_MISSING=True,
)
data_loadshape = Data(
    settings=s,
    loadshape_df=loadshape_df,
)

# NOTE(review): in the recorded output the id1 row holds the values that the
# input frame lists under id2 (and vice versa) — confirm whether ids and rows
# can get misaligned when a stacked loadshape is loaded.
data_loadshape.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
id1,0.296756,0.483339,0.581278,0.817598,0.018408,0.137025,0.855369
id2,0.563458,0.532356,0.197222,0.450618,0.307108,0.793405,0.940847
id3,0.404228,0.497281,0.591521,0.968508,0.769953,0.453242,0.682078


# Features input

id cooling_load heating_load 