In [1]:
# %matplotlib
# %matplotlib inline
# %matplotlib notebook

import pandas as pd
import numpy as np
import random
import os
import warnings
from datetime import datetime, timedelta, timezone
warnings.simplefilter("ignore")
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.ticker import AutoMinorLocator
from matplotlib.ticker import FormatStrFormatter
import re
import math
from copy import deepcopy
from numba_stats import t
import scipy.stats as stats

# Data
from gridmeter import Data
from gridmeter import Data_Settings

# IMM
from gridmeter import IMM
from gridmeter import IMM_Settings

# Clustering
from gridmeter import Clustering
from gridmeter import Clustering_Settings

from IPython.display import Image, Markdown, display
# Switch matplotlib to interactive mode.
plt.ion()
# Notebook-wide figure defaults: 24 x 16 inches at 300 dpi, i.e. 7200 x 4800
# pixels per figure.
plt.rcParams['figure.figsize'] = [24, 16]
plt.rcParams['figure.dpi'] = 300

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

%load_ext autoreload
%autoreload 2

In [2]:
# Read the two example loadshape tables and normalise their column names in one
# chained step each: "hour" becomes "time" and "ls" becomes "loadshape".
# NOTE(review): "_t" / "_cp" presumably stand for treatment / comparison pool
# (that is how the frames are passed to get_comparison_group later) - confirm.
df_ls_t = pd.read_csv(
    "/app/.recurve_cache/clustering/example_dfs/df_ls_t.csv"
).rename(columns={"hour": "time", "ls": "loadshape"})

df_ls_cp = pd.read_csv(
    "/app/.recurve_cache/clustering/example_dfs/df_ls_cp.csv"
).rename(columns={"hour": "time", "ls": "loadshape"})

In [3]:
# Test Data with ls input
#
# Hand the renamed loadshape table (columns "id", "time", "loadshape") to Data
# through `loadshape_df`. Per the saved output below, `data.loadshape` is shown
# as a wide frame: one row per id, one column per time value (1..504).

data = Data(None)
data.set_data(loadshape_df=df_ls_t)
data.loadshape

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,495,496,497,498,499,500,501,502,503,504
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
None-1094275585-1094275585,-0.004892,0.014424,0.024523,0.009783,0.002749,-0.018928,0.000177,-0.022380,-0.015270,-0.015309,...,-0.072218,-0.048825,-0.046101,-0.036118,-0.021544,0.041530,0.057548,0.083471,0.063300,0.073510
None-1397301805-1397301805,-0.040984,-0.017669,-0.000320,0.023877,0.026313,-0.035445,0.017402,0.097946,0.067614,0.139950,...,0.065461,0.161034,0.069464,-0.014588,0.022082,0.088210,-0.039004,0.088822,-0.021783,0.020953
None-1432022910-1432022910,0.005879,0.007890,-0.005875,0.002565,0.007195,-0.019074,-0.007493,-0.007334,-0.018340,-0.040356,...,0.103594,0.077896,0.055153,0.013053,-0.002923,0.015454,0.037913,0.038603,0.019493,0.046673
None-1469355610-1469355610,-0.005751,-0.002495,0.000473,0.001568,-0.008138,-0.027553,-0.007281,0.051084,-0.008178,0.020144,...,0.001396,-0.059681,0.053881,0.055815,0.067706,0.049554,0.029438,0.031970,0.144222,0.123939
None-1504812305-1504812305,0.042894,0.010071,-0.028050,-0.014719,-0.021681,-0.016993,-0.045222,-0.004814,-0.008289,-0.002940,...,0.161389,0.164025,0.106125,0.026877,0.010182,0.031870,-0.010856,0.013376,0.037637,0.020543
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
None-5519977972-5519977972,0.012554,-0.010461,-0.007778,-0.005589,-0.000990,-0.011222,-0.011654,0.013016,-0.009513,-0.031281,...,-0.006706,0.026516,0.079070,0.061120,0.017507,0.024078,-0.006222,0.004099,0.013716,0.035427
None-8098217928-8098217928,-0.004565,-0.023354,-0.050385,-0.034883,-0.022048,-0.008940,-0.031168,-0.017703,-0.012403,-0.224249,...,0.152574,0.155358,0.105045,0.089779,0.096124,0.105736,0.068235,0.022936,0.027400,0.055927
None-8313277985-8313277985,0.011474,0.014000,-0.019039,-0.017233,0.018185,-0.013543,0.004347,-0.000793,-0.016210,-0.046111,...,-0.004876,-0.041127,-0.035425,-0.058451,-0.000260,0.017771,0.019968,0.006349,0.000319,-0.000629
None-8860891437-8860891437,-0.001721,0.025028,0.015406,0.014048,-0.017247,0.000452,-0.025423,0.013604,-0.019193,-0.020456,...,-0.079701,-0.059249,-0.021311,-0.003690,-0.026972,-0.073921,-0.043149,-0.029514,-0.027741,-0.016276


In [36]:
# Test Data with ts input

# Seeded generator so the synthetic data (and therefore every output of this
# cell) is identical on each Restart & Run All.
rng = np.random.default_rng(42)

# One year of 15-minute intervals per id.
num_intervals = 4 * 24 * 365  # 4 intervals/hour * 24 hours/day * 365 days
ids = ['id1', 'id2', 'id3']  # only 3 ids for easier comparison

# Long-format time series with 'id', 'datetime', 'observed', and 'modeled'
# columns; every id shares the same datetime grid.
df = pd.DataFrame({
    'id': np.repeat(ids, num_intervals),
    # '15min' is the non-deprecated spelling of the old '15T' offset alias.
    'datetime': pd.date_range(start='2023-01-01', periods=num_intervals, freq='15min').tolist() * len(ids),
    'observed': rng.random(num_intervals * len(ids)),  # uniform in [0, 1)
    'modeled': rng.random(num_intervals * len(ids)),  # uniform in [0, 1)
})

df_features = pd.DataFrame({
    'id': ids,
    'feature1': rng.random(len(ids)),
    # The first id deliberately gets a missing feature value so that the
    # exclusion path is exercised (saved output: id1, "null values in features_df").
    'feature2': [None, *rng.random(len(ids) - 1)],
})

data = Data()
data.set_data(time_series_df=df, features_df=df_features)

# A bare expression in the middle of a cell is never rendered, so show the
# loadshape explicitly and leave excluded_ids as the cell's final value.
display(data.loadshape)
data.excluded_ids

Unnamed: 0,id,reason
0,id1,null values in features_df


In [None]:
# Test IMM

# Pivot the long loadshape tables to wide form: one row per id, one column per
# time value. The time column is named "time" at this point because the loading
# cell renames "hour" -> "time"; indexing on "hour" raises KeyError when the
# notebook is run top to bottom on a fresh kernel.
df_ls_t_mod = df_ls_t.set_index(["id", "time"]).unstack()
# unstack() leaves a two-level column index (value column, time); keep only time.
df_ls_t_mod.columns = df_ls_t_mod.columns.droplevel(0)

df_ls_cp_mod = df_ls_cp.set_index(["id", "time"]).unstack()
df_ls_cp_mod.columns = df_ls_cp_mod.columns.droplevel(0)

# Default-constructed settings; the wide frames are reused by the later cells.
imm_settings = IMM_Settings()
df_cg, df_t_coeffs = IMM(imm_settings).get_comparison_group(df_ls_t_mod, df_ls_cp_mod)
df_cg

In [None]:
from gridmeter._utils.calculate_distances import calculate_distances
from copy import deepcopy as copy

def TestDistanceMatching(
    df_ls_t,
    df_ls_c,
    n_matches_per_treatment=4,
    distance_metric="euclidean",
    allow_duplicate_match=True,
    replace_duplicate_method=None,  # only None is implemented [None, "closest_to_meter", "closest_global"]
    max_distance_threshold=None,
    n_match_multiplier=None,
    n_meters_per_chunk=10000,
):
    """Match each treatment loadshape to its nearest comparison-pool loadshapes.

    Parameters
    ----------
    df_ls_t, df_ls_c : pandas.DataFrame
        Wide loadshape frames for the treatment group and the comparison pool.
        The index supplies the ids; the values are passed to
        ``calculate_distances`` as numpy arrays.
    n_matches_per_treatment : int
        Number of closest matches kept per treatment id when
        ``allow_duplicate_match`` is True.
    distance_metric : str
        Forwarded unchanged to ``calculate_distances``.
    allow_duplicate_match : bool
        True: keep the ``n_matches_per_treatment`` smallest distances per
        treatment id; a comparison id may appear for several treatments.
        False: keep only the first row found for each comparison id.
    replace_duplicate_method : None or str
        Must be None when ``allow_duplicate_match`` is False; anything else
        raises ``NotImplementedError``.
    max_distance_threshold : float or None
        Only checked for ``is not None``: when set together with
        ``n_match_multiplier`` the number of requested matches is multiplied.
        No distance filtering is applied in this function.
    n_match_multiplier : number or None
        None: ``None`` is passed to ``calculate_distances`` as the match count.
        Otherwise the match count is ``n_matches_per_treatment`` (optionally
        multiplied, see above) capped at the comparison-pool size.
    n_meters_per_chunk : int
        Forwarded unchanged to ``calculate_distances``.

    Returns
    -------
    pandas.DataFrame
        Indexed by comparison ``id`` with columns ``treatment``, ``distance``,
        ``duplicated`` and ``cluster`` (always 1).

    Raises
    ------
    NotImplementedError
        If ``allow_duplicate_match`` is False and ``replace_duplicate_method``
        is not None.
    """
    # Fail fast: this combination is unsupported, so reject it before paying
    # for the distance computation.
    if not allow_duplicate_match and replace_duplicate_method is not None:
        raise NotImplementedError(
            f"replace_duplicate_method={replace_duplicate_method!r} is not implemented"
        )

    ls_t = df_ls_t.to_numpy()
    ls_cp = df_ls_c.to_numpy()

    # Decide how many matches to request from calculate_distances.
    if n_match_multiplier is None:
        n_matches_per_chunk = None
    else:
        n_matches_per_chunk = n_matches_per_treatment
        # The first clause cannot be true while the guard above exists; it is
        # kept so the over-fetch rule is already in place once duplicate
        # replacement is implemented.
        if (
            not allow_duplicate_match and replace_duplicate_method is not None
        ) or max_distance_threshold is not None:
            n_matches_per_chunk *= n_match_multiplier

        # Never ask for more matches than there are comparison-pool rows.
        n_matches_per_chunk = min(n_matches_per_chunk, ls_cp.shape[0])

    # NOTE(review): assumes calculate_distances returns (positions into ls_cp,
    # distances) as 2-D arrays with one row per treatment meter - inferred from
    # how they are flattened and repeated below; confirm against the helper.
    cp_id_idx, dist = calculate_distances(
        ls_t, ls_cp, distance_metric, n_matches_per_chunk, n_meters_per_chunk
    )

    # Build a long table: one row per (treatment id, matched comparison id).
    id_t = df_ls_t.index.values
    id_c = df_ls_c.index.values

    series_t = pd.Series(np.repeat(id_t, dist.shape[1]), name="treatment")
    series_cp = pd.Series(id_c[cp_id_idx.flatten()], name="id")
    clusters = pd.DataFrame(
        dist.flatten(), index=[series_t, series_cp], columns=["distance"]
    )
    clusters = clusters.reset_index()
    # Flag comparison ids that already appeared in an earlier row (row order
    # here is still the order returned by calculate_distances).
    clusters["duplicated"] = clusters.duplicated(subset=["id"])
    clusters["cluster"] = 1
    clusters = clusters.set_index("id")

    if allow_duplicate_match:
        # Keep the n_matches_per_treatment smallest distances per treatment id.
        clusters = clusters.sort_values(by=["treatment", "distance"])
        clusters = clusters.groupby("treatment").head(n_matches_per_treatment)
    else:
        # Keep only the first occurrence of every comparison id.
        # NOTE(review): rows are not sorted by distance and not limited to
        # n_matches_per_treatment in this branch - confirm that is intended.
        clusters = clusters[~clusters.index.duplicated(keep="first")]

    return clusters


def get_comparison_group(df_ls_t, df_ls_cp, weights=None, **kwargs):
    """Run distance matching and build the accompanying coefficient table.

    ``df_ls_t`` and ``df_ls_cp`` are handed to ``TestDistanceMatching``
    together with any extra keyword arguments. ``weights`` is accepted but not
    used.

    Returns a ``(df_cg, df_t_coeffs)`` tuple: the matching result, and a frame
    indexed by the unique ids of ``df_ls_t`` (index name ``"id"``) holding 1.0
    in a single ``pct_cluster_1`` column.
    """
    matches = TestDistanceMatching(df_ls_t, df_ls_cp, **kwargs)

    # Every distinct treatment id gets a coefficient of one.
    treatment_ids = df_ls_t.index.unique()
    df_t_coeffs = pd.DataFrame(
        np.ones(treatment_ids.values.size),
        index=treatment_ids,
        columns=["pct_cluster_1"],
    )
    df_t_coeffs.index.name = "id"

    return matches, df_t_coeffs

In [None]:
# Run the local distance-matching prototype on the wide frames. Because
# n_match_multiplier is set (and max_distance_threshold is not), the match count
# passed to calculate_distances is n_matches_per_treatment capped at the pool
# size instead of None.
df_cg, df_t_coeffs = get_comparison_group(df_ls_t_mod, df_ls_cp_mod, allow_duplicate_match=True, n_match_multiplier=2)
# Move the "id" index back into a column and order the rows for inspection.
df_cg.reset_index().sort_values(by=["treatment", "id", "distance"])

In [None]:
# Same result, ordered so each treatment's matches run from closest to farthest.
df_cg.sort_values(["treatment", "distance"])

In [None]:
# Long-format input table, shown for comparison with the round-tripped frame in
# the next cell.
df_ls_t

In [None]:
# Round-trip: stack the wide frame back to long form. The stacked values land in
# a column labelled 0, which is renamed to "ls".
df_ls_t_mod.stack().reset_index().rename(columns={0: "ls"})

In [None]:
# Test Clustering
#
# Same wide frames as the IMM test, this time through Clustering with
# default-constructed settings. Rebinds df_cg and df_t_coeffs from earlier cells.

clustering_settings = Clustering_Settings()
df_cg, df_t_coeffs = Clustering(clustering_settings).get_comparison_group(df_ls_t_mod, df_ls_cp_mod)
df_cg

In [None]:
# Second value returned by the most recently run get_comparison_group call
# (the Clustering test when the notebook is run top to bottom).
df_t_coeffs