In [58]:
# %matplotlib
# %matplotlib inline
# %matplotlib notebook

import pandas as pd
import numpy as np
import random
import os
import warnings
from datetime import datetime, timedelta, timezone
warnings.simplefilter("ignore")
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.ticker import AutoMinorLocator
from matplotlib.ticker import FormatStrFormatter
import re
import math
from copy import deepcopy
from numba_stats import t
import scipy.stats as stats

# IMM
from gridmeter.individual_meter_matching.create_comparison_groups import (
    Individual_Meter_Matching as IMM,
)
from gridmeter.individual_meter_matching import settings as IMM_settings

# Clustering
from gridmeter.clustering import (
    settings as clustering_settings,
    cluster as clustering,
    transform as clustering_transform,
)

from IPython.display import Image, Markdown, display
plt.ion()
plt.rcParams['figure.figsize'] = [24, 16]
plt.rcParams['figure.dpi'] = 300

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [59]:
df_ls_t = pd.read_csv("/app/.recurve_cache/clustering/example_dfs/df_ls_t.csv")
df_ls_cp = pd.read_csv("/app/.recurve_cache/clustering/example_dfs/df_ls_cp.csv")

In [60]:
# Test IMM

df_ls_cp_mod = df_ls_cp.set_index(["id", "hour"]).unstack()
df_ls_cp_mod.columns = df_ls_cp_mod.columns.droplevel(0)

df_ls_t_mod = df_ls_t.set_index(["id", "hour"]).unstack()
df_ls_t_mod.columns = df_ls_t_mod.columns.droplevel(0)

imm_settings = IMM_settings.Settings()
df_cg, df_t_coeffs = IMM(imm_settings).get_comparison_group(df_ls_t_mod, df_ls_cp_mod)
df_cg

Unnamed: 0_level_0,treatment,distance,duplicated,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
None-3234991905-3234991905,None-1094275585-1094275585,1.488844,False,1
None-1518448307-1518448307,None-1397301805-1397301805,4.454165,False,1
None-3517431310-3517431310,None-1432022910-1432022910,4.414845,False,1
None-1908503005-1908503005,None-1469355610-1469355610,1.487711,False,1
None-3529331605-3529331605,None-1504812305-1504812305,0.969864,False,1
...,...,...,...,...
None-1802814010-1802814010,None-5519977972-5519977972,80.664935,False,1
None-1646832205-1646832205,None-8098217928-8098217928,18.023014,False,1
None-0603795751-0603795751,None-8313277985-8313277985,9.958277,False,1
None-3831190605-3831190605,None-8860891437-8860891437,15.392707,False,1


In [61]:
from gridmeter.utils.calculate_distances import calculate_distances

def TestDistanceMatching(
    df_ls_t,
    df_ls_c,
    n_matches_per_treatment=4,
    distance_metric="euclidean",
    allow_duplicate_match=False,
    replace_duplicate_method=None,  # currently unused [None, "closest_to_meter", "closest_global"]
    max_distance_threshold=None,
    n_match_multiplier=None,
    n_meters_per_chunk=10000,
):
    ls_t = df_ls_t.to_numpy()
    ls_cp = df_ls_c.to_numpy()

    # Calculate closest distances
    if n_match_multiplier is None:
        n_matches_per_treatment = None

    if n_match_multiplier is not None:
        if (not allow_duplicate_match and replace_duplicate_method is not None) or max_distance_threshold is not None:
            n_matches_per_treatment *= n_match_multiplier

        if n_matches_per_treatment > ls_cp.shape[0]:
            n_matches_per_treatment = ls_cp.shape[0]

    cp_id_idx, dist = calculate_distances(
        ls_t, ls_cp, distance_metric, n_matches_per_treatment, n_meters_per_chunk
    )

    # create dataframes
    id_t = df_ls_t.index.values
    id_c = df_ls_c.index.values

    series_t = pd.Series(np.repeat(id_t, dist.shape[1]), name="treatment")
    series_cp = pd.Series(id_c[cp_id_idx.flatten()], name="id")
    clusters = pd.DataFrame(
        dist.flatten(), index=[series_t, series_cp], columns=["distance"]
    )
    clusters = clusters.reset_index()
    clusters["duplicated"] = clusters.duplicated(subset=["id"])
    clusters["cluster"] = 1
    clusters = clusters.set_index("id")

    if not allow_duplicate_match:
        # drop duplicate index
        clusters = clusters[~clusters.index.duplicated(keep='first')]

        # get count of treatment
        print(clusters['treatment'])
        counts = clusters['treatment'].value_counts()
        print(counts)

        if replace_duplicate_method is not None:
            raise NotImplementedError(
                "'replace_duplicate_meters': True not implemented"
            )

    return clusters


def get_comparison_group(df_ls_t, df_ls_cp, weights=None):
    df_cg = TestDistanceMatching(df_ls_t, df_ls_cp)

    # Create df_t_coeffs
    t_ids = df_ls_t.index.unique()
    coeffs = np.ones(t_ids.values.size)

    df_t_coeffs = pd.DataFrame(coeffs, index=t_ids, columns=["pct_cluster_1"])
    df_t_coeffs.index.name = "id"

    return df_cg, df_t_coeffs

In [62]:
df_cg, df_t_coeffs = get_comparison_group(df_ls_t_mod, df_ls_cp_mod)
df_cg

id
None-0344817417-0344817417    None-1094275585-1094275585
None-0356376858-0356376858    None-1094275585-1094275585
None-0490610165-0490610165    None-1094275585-1094275585
None-0497060249-0497060249    None-1094275585-1094275585
None-0567408984-0567408984    None-1094275585-1094275585
                                         ...            
None-9467861419-9467861419    None-1094275585-1094275585
None-9552364372-9552364372    None-1094275585-1094275585
None-9576055824-9576055824    None-1094275585-1094275585
None-9808840282-9808840282    None-1094275585-1094275585
None-9963816099-9963816099    None-1094275585-1094275585
Name: treatment, Length: 1000, dtype: object
treatment
None-1094275585-1094275585    1000
Name: count, dtype: int64


Unnamed: 0_level_0,treatment,distance,duplicated,cluster
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
None-0344817417-0344817417,None-1094275585-1094275585,1.689297,False,1
None-0356376858-0356376858,None-1094275585-1094275585,1.710408,False,1
None-0490610165-0490610165,None-1094275585-1094275585,7.299609,False,1
None-0497060249-0497060249,None-1094275585-1094275585,19.960519,False,1
None-0567408984-0567408984,None-1094275585-1094275585,1.583618,False,1
...,...,...,...,...
None-9467861419-9467861419,None-1094275585-1094275585,1354.884431,False,1
None-9552364372-9552364372,None-1094275585-1094275585,1.770874,False,1
None-9576055824-9576055824,None-1094275585-1094275585,7.978226,False,1
None-9808840282-9808840282,None-1094275585-1094275585,56.881127,False,1


In [None]:
df_cg.sort_values(["treatment", "distance"])

In [None]:
# Test Clustering

clustering_settings = clustering_settings.Settings()
matcher = clustering.ClusterResult.from_comparison_pool_loadshapes_and_settings(
    df_cp_ls=df_ls_cp, s=clustering_settings
)
treatment_df = matcher.get_match_treatment_to_cluster_df(
    treatment_loadshape_df=df_ls_t
)

print(matcher.cluster_df)
print(treatment_df)

In [None]:
from utils.data_settings import Settings

class Data:
    def __init__(self, settings : Settings | None):
        if settings is None:
            settings = Settings()
        else:
            self.settings = {"agg_type": "mean",
                            "loadshape_type": "observed", # ["observed", "modeled", "error"]
                            "time_period": "hour", #["hour", "week", "season"]
                            }

    def convert_timeseries_to_loadshape(self, df):
        # Check columns missing in time_series_df
        expected_columns = ["id", "datetime", self.settings["loadshape_type"]] # except error which requires both observed and modeled
        missing_columns = [c for c in expected_columns if c not in time_series_df.columns]
        
        if missing_columns:
            raise ValueError(f"Missing columns in time_series_df: {missing_columns}")
        
        # df_type
        if self.settings["loadshape_type"] == "observed":
            df_type = "observed"
        elif self.settings["loadshape_type"] == "modeled":
            df_type = "modeled"
        elif self.settings["loadshape_type"] == "error":
            df_type = "error"
        else:
            raise ValueError(f"Invalid loadshape_type: {self.settings['loadshape_type']}")
            

        if df_type == "error":
            pass # calculate error

        # Aggregate the input time_series based on time_period
        time_series_df = time_series_df.set_index("datetime")
        time_series_df = time_series_df.groupby(["id", pd.Grouper(freq=self.settings["time_period"])])[df_type].agg(self.settings["agg_type"])
        time_series_df = time_series_df.reset_index()
        time_series_df = time_series_df.rename(columns={"datetime": "time"})
        time_series_df["loadshape"] = time_series_df[df_type]
        time_series_df = time_series_df.drop(columns=[df_type])
        time_series_df = time_series_df.set_index(["id", "time"])
        loadshape_df = time_series_df
        
        return loadshape_df
    
    def _validate(self, df):
        pass

    
    def set_data(self, loadshape_df=None, time_series_df=None):
        """

        Args:
            Loadshape_df: columns = [id, time, loadshape]

            Time_series_df: columns = [id, datetime, observed, observed_error, modeled, modeled_error]

        Output:
            loadshape: idx = id, columns = time, columns = [loadshape]

            
        """
        if loadshape_df is None and time_series_df is None:
            raise ValueError("Either loadshape_dataframe or time_series_dataframe must be provided.")


        if loadshape_df is not None:
            # Check columns missing in loadshape_df
            expected_columns = ["id", "time", "loadshape"]
            missing_columns = [c for c in expected_columns if c not in loadshape_df.columns]
            
            if missing_columns:
                raise ValueError(f"Missing columns in time_series_df: {missing_columns}")

            
            #Aggregate the input loadshape based on time_period
            output_loadshape = loadshape_df.groupby(["id", pd.Grouper(freq=self.settings["time_period"])])["loadshape"].agg(self.settings["agg_type"])
            
        elif time_series_df is not None:
            output_loadshape = convert_timeseries_to_loadshape(time_series_df)

        
        self.loadshape = output_loadshape


        return self.loadshape

        

In [None]:
df_ls_t_mod.head()