In [None]:
from gridmeter._utils.data_processing import Data
from gridmeter._utils.data_processing_settings import Data_Settings
from gridmeter._utils import const as _const
import pandas as pd
import numpy as np

# Data class in Gridmeter

The `Data` class accepts input in two formats: time series and loadshapes (both stacked and unstacked versions).
It returns an aggregated loadshape based on the settings provided.

The usage is as follows:

```python

from gridmeter._utils.data_processing import Data
from gridmeter._utils.data_processing_settings import Data_Settings
from gridmeter._utils import const as _const

# Create a settings object (here the time period is seasonal day of week, i.e. 7 days * 3 seasons = 21 data points)
settings = Data_Settings(TIME_PERIOD=_const.TimePeriod.SEASONAL_DAY_OF_WEEK)

# Use these settings to create a Data object
# df here is your input dataframe (explained in more detail later on)
data = Data(settings).set_data(time_series_df=df)

# Check the output
data.loadshape
    
```

Let's look at a few examples of the different types of input and the possible outputs/return values.

## Time Series loadshapes

In [None]:
# Build a synthetic test dataframe: one year of 15-minute readings for three
# ids, with random 'observed' and 'modeled' values.
num_intervals = 4 * 24 * 365  # 4 intervals/hour * 24 hours/day * 365 days

# Create a DataFrame with 'id', 'datetime', 'observed', and 'modeled' columns
df = pd.DataFrame(
    {
        # Only 3 ids for easier comparison; each id is repeated once per interval
        "id": np.repeat(["id1", "id2", "id3"], num_intervals),
        # Every id shares the same timestamps, so the range is repeated 3 times.
        # "15min" is used instead of the offset alias "15T", which pandas has
        # deprecated in favour of "min".
        "datetime": pd.date_range(
            start="2023-01-01", periods=num_intervals, freq="15min"
        ).tolist()
        * 3,
        "observed": np.random.rand(num_intervals * 3),  # randomized
        "modeled": np.random.rand(num_intervals * 3),  # randomized
    }
)

# No pd.to_datetime() conversion is needed here: the column is built from
# pandas Timestamps and is therefore already a datetime column.
df

In [None]:
# Settings may be passed as None, in which case the default settings are used.
data1 = (
    Data(None)
    .set_data(time_series_df=df)
)

# Show the loadshape computed from the time series
data1.get_loadshape()

In [None]:
# Same input as before, now with an explicit (non-default) TIME_PERIOD setting.
settings = Data_Settings(
    TIME_PERIOD=_const.TimePeriod.SEASONAL_DAY_OF_WEEK,
)

data = (
    Data(settings)
    .set_data(time_series_df=df)
)

# Show the loadshape computed with the seasonal day-of-week time period
data.get_loadshape()

#### We can also join two loadshapes if they have the same time_period classification

In [None]:
# Build a second synthetic dataframe with the same layout and timestamps as
# `df`, but for three different ids. `num_intervals` comes from the cell that
# builds `df`.
extended_df = pd.DataFrame(
    {
        # Only 3 ids for easier comparison; each id is repeated once per interval
        "id": np.repeat(["id4", "id5", "id6"], num_intervals),
        # "15min" is used instead of the offset alias "15T", which pandas has
        # deprecated in favour of "min".
        "datetime": pd.date_range(
            start="2023-01-01", periods=num_intervals, freq="15min"
        ).tolist()
        * 3,
        "observed": np.random.rand(num_intervals * 3),  # randomized
        "modeled": np.random.rand(num_intervals * 3),  # randomized
    }
)

extended_df



In [None]:
# Build a Data object for the second set of ids, using the same TIME_PERIOD
# setting as `data` so that the two can be joined afterwards.
settings = Data_Settings(
    TIME_PERIOD=_const.TimePeriod.SEASONAL_DAY_OF_WEEK,
)
data_extended = (
    Data(settings)
    .set_data(time_series_df=extended_df)
)
data_extended.get_loadshape()

In [None]:
# Join the loadshapes held by `data` into `data_extended`. Both objects were
# created with TIME_PERIOD=SEASONAL_DAY_OF_WEEK, matching the requirement that
# joined loadshapes share the same time_period classification.
# NOTE(review): extend() appears to update data_extended in place, since its
# return value is not used here — confirm against the Data class.
data_extended.extend(data)
data_extended.get_loadshape()

## INTERPOLATION

Missing values in the dataframe are interpolated linearly, provided that the share of missing data is below the interpolation threshold (the default is 20% of the total data).

In [None]:
# Remove one weekday out of seven (about 14% of the rows) for every id. This
# stays below the default 20% missing-data threshold, so a loadshape built from
# the result is expected to be interpolated.

# Boolean mask that is True for Wednesdays (pandas dayofweek: Monday=0, so Wednesday=2)
day_mask = df["datetime"].dt.dayofweek == 2

# Drop the Wednesday rows entirely (rather than setting their values to NaN)
df = df[~day_mask]

df


In [None]:
# Build a day-of-week loadshape from the dataframe that has no Wednesday rows.
settings = Data_Settings(
    TIME_PERIOD=_const.TimePeriod.DAY_OF_WEEK,
)
data = (
    Data(settings)
    .set_data(time_series_df=df)
)
data.get_loadshape()

In [None]:
# Now remove Mondays as well as Wednesdays, so that two weekdays out of seven
# (about 29% of the rows) are missing for every id, which is above the default
# 20% missing-data threshold.
# NOTE(review): the original notes say this "will give ValueError" here and
# "returns None" in the following cell — confirm which one the library does.

# Boolean mask that is True for Mondays (0) and Wednesdays (2)
weekday = df["datetime"].dt.dayofweek
day_mask = (weekday == 0) | (weekday == 2)

# Drop the masked rows entirely (rather than setting their values to NaN)
df = df[~day_mask]

df

In [None]:
# Try to create a loadshape from the data with two of the seven weekdays
# missing for every id.
# NOTE(review): the original note here says the call returns None, while the
# previous cell's note mentions a ValueError — confirm the actual behaviour.
settings = Data_Settings(
    TIME_PERIOD=_const.TimePeriod.DAY_OF_WEEK,
)
data = (
    Data(settings)
    .set_data(time_series_df=df)
)
data.get_loadshape()

## Unstacked loadshapes

In [None]:
# Build an unstacked loadshape dataframe: one row per id and one column per
# time period (labelled "1", "2", ...), with some values randomly missing.

# Assuming ids is a list of unique ids
ids = ["id1", "id2", "id3"]

# Time period labels: one per row expected for the "day_of_week" time period
# (presumably 7 — taken from _const.time_period_row_counts).
values = range(1, _const.time_period_row_counts["day_of_week"] + 1)
row_cnt_per_id = 1
n_rows = len(ids) * row_cnt_per_id

# Random integer-valued loads, stored as floats from the start so that NaN can
# be written into them without relying on pandas silently upcasting integer
# columns during assignment (deprecated behaviour).
loads = np.random.randint(1, 100, size=(n_rows, len(values))).astype(float)

# Boolean mask in which each cell is independently True with probability 0.2,
# i.e. roughly 20% of the values are selected.
mask = np.random.choice([True, False], size=loads.shape, p=[0.2, 0.8])

# Blank out the selected values; the 'id' column is added afterwards and is
# therefore never affected.
loads[mask] = np.nan

df_new = pd.DataFrame(loads, columns=[str(i) for i in values])
df_new.insert(0, "id", np.repeat(ids, row_cnt_per_id))

df_new

#### AGG_TYPE, LOADSHAPE_TYPE and TIME_PERIOD must be set to None if we're using loadshapes.
They're only required for time series data.

In [None]:
# Loadshape input: the time-series-only settings are set to None, and
# interpolation of missing values is switched on.
settings = Data_Settings(
    AGG_TYPE=None,
    LOADSHAPE_TYPE=None,
    TIME_PERIOD=None,
    INTERPOLATE_MISSING=True,
)

# Work on a copy so that df_new itself is left untouched
unstack_df = df_new.copy()
data_new = (
    Data(settings)
    .set_data(loadshape_df=unstack_df)
)
data_new.get_loadshape()

## Normal (stacked) Loadshapes

In [None]:
# Build a stacked loadshape dataframe: one row per (id, time) pair with a
# random loadshape value.
ids = ["id1", "id2", "id3"]

# Number of time values per id
count = 7

# Start from the id column: each id appears `count` times in a row
loadshape_df = pd.DataFrame(
    {"id": [meter_id for meter_id in ids for _ in range(count)]}
)
# Time values 1..count, repeated once for every id
loadshape_df["time"] = np.tile(np.arange(1, count + 1), len(ids))
# One random value in [0, 1) per row
loadshape_df["loadshape"] = np.random.rand(len(ids) * count)

loadshape_df

In [None]:
# Stacked loadshape input: as with the unstacked case, the time-series-only
# settings are set to None and interpolation of missing values is switched on.
settings = Data_Settings(
    AGG_TYPE=None,
    LOADSHAPE_TYPE=None,
    TIME_PERIOD=None,
    INTERPOLATE_MISSING=True,
)
data_loadshape = (
    Data(settings)
    .set_data(loadshape_df=loadshape_df)
)
data_loadshape.get_loadshape()

# Features input

id cooling_load heating_load 