In [1]:
import subprocess
import sys

# Make sure each third-party package listed here is importable; anything that
# fails to import is installed with pip using the interpreter running this kernel.
required_packages = ('gridmeter', 'pandas', 'plotnine')

for package in required_packages:
    try:
        __import__(package)
    except ImportError:
        install_command = [sys.executable, "-m", "pip", "install", package]
        subprocess.check_call(install_command)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotnine as pn
import gridmeter as gm

In [2]:
# Loadshape and feature data

n_treatment = 100
n_pool = 1000


def treatment_filter(df):
    """Keep rows whose summer and winter usage are both above their median.

    Defined as a named function (rather than a lambda bound to ``filter``) so
    the ``filter`` builtin is not shadowed for the rest of the notebook.
    """
    above_summer_median = df.summer_usage > df.summer_usage.quantile(0.5)
    above_winter_median = df.winter_usage > df.winter_usage.quantile(0.5)
    return df[above_summer_median & above_winter_median]


def features_and_loadshape_for(set_name, df_features, df_loadshape):
    """Return ``(features, long_loadshape)`` for one value of the 'set' column.

    Parameters
    ----------
    set_name : value matched against ``df_features['set']`` ('pool' or 'treatment' here).
    df_features : feature frame with 'meter_id' and 'set' columns.
    df_loadshape : wide loadshape frame that can be row-selected by meter id via ``.loc``.
        NOTE(review): the rename below presumes its index is named 'meter_id'
        and its column axis 'month' -- confirm against gridmeter.

    Returns
    -------
    features : the rows of the requested set with 'meter_id' renamed to 'id'
        and the 'set' column dropped.
    long_loadshape : the matching loadshape rows stacked to long format with
        columns renamed to 'id', 'time' and 'loadshape'.
    """
    features = (
        df_features[df_features['set'] == set_name]
        .rename(columns={'meter_id': 'id'})
        .drop(columns=['set'])
    )
    long_loadshape = (
        df_loadshape.loc[features["id"]]
        .stack()
        .reset_index()
        .rename(columns={'meter_id': 'id', 'month': 'time', 0: 'loadshape'})
    )
    return features, long_loadshape


population = gm.DummyTreatmentPoolPopulation(n_treatment=n_treatment, n_pool=n_pool,
                                             treatment_filter_function=treatment_filter)

df_features = population.features()
df_loadshape = population.features_monthly()

# Same preparation for both groups; only the 'set' label differs.
df_features_pool, df_ls_pool = features_and_loadshape_for('pool', df_features, df_loadshape)
df_features_treatment, df_ls_treatment = features_and_loadshape_for('treatment', df_features, df_loadshape)

# Fake time series data
# Build a testing dataframe with one row per (treatment id, month): the id, a
# month-end datetime, and random observed and modeled values.
num_intervals = 12  # 1 per month

# Size every column from the ids actually present so all columns have the same
# length even if the number of treatment meters differs from n_treatment.
treatment_ids = df_features_treatment["id"].values
n_treatment_ids = len(treatment_ids)

# Seeded generator so the fake observed/modeled values are reproducible on re-run.
rng = np.random.default_rng(42)

# Explicit MonthEnd offset yields the same month-end dates as the 'M' alias,
# without relying on an alias that newer pandas versions deprecate.
month_ends = pd.date_range(start='2023-01-01', periods=num_intervals, freq=pd.offsets.MonthEnd())

# Create a DataFrame with 'id', 'datetime', 'observed', and 'modeled' columns
df_ts_treatment = pd.DataFrame({
    'id': np.repeat(treatment_ids, num_intervals),  # each id repeated once per month
    'datetime': month_ends.tolist() * n_treatment_ids,  # the 12 months tiled once per id
    'observed': rng.random(num_intervals * n_treatment_ids),  # randomized
    'modeled': rng.random(num_intervals * n_treatment_ids)  # randomized
})

INFO:root:Caching objects to .cache


In [3]:
# Set data classes
# For the purposes of this tutorial, both loadshapes and features are supplied.

# A single settings object is passed to both the pool and treatment containers.
data_settings = gm.Data_Settings(
    AGG_TYPE=None,
    LOADSHAPE_TYPE=None,
    time_period=None,
)

# Comparison pool
pool_data = gm.Data(data_settings)
pool_data.set_data(
    loadshape_df=df_ls_pool,
    features_df=df_features_pool,
)

# Treatment group; the value of this last call is what the cell displays.
treatment_data = gm.Data(data_settings)
treatment_data.set_data(
    loadshape_df=df_ls_treatment,
    features_df=df_features_treatment,
)

<gridmeter._utils.data_processing.Data at 0x7f477ee9da10>

In [4]:
# Clustering settings are constructed with no arguments.
clustering_settings = gm.Clustering_Settings()
clustering = gm.Clustering(clustering_settings)

# get_comparison_group takes the treatment data first, then the pool data, and
# its result is unpacked into two objects.
(df_cg, df_t_coeffs) = clustering.get_comparison_group(treatment_data, pool_data)

# Show the comparison-group frame (indexed by id, with a 'cluster' column).
df_cg

Unnamed: 0_level_0,cluster
id,Unnamed: 1_level_1
meter_730,-1
meter_738,-1
meter_739,-1
meter_743,-1
meter_744,-1
...,...
meter_520,25
meter_805,25
meter_305,25
meter_82,25
