In [1]:
# %matplotlib
# %matplotlib inline
# %matplotlib notebook

import pandas as pd
import numpy as np
import random
import os
import warnings
from datetime import datetime, timedelta, timezone
warnings.simplefilter("ignore")
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.ticker import AutoMinorLocator
from matplotlib.ticker import FormatStrFormatter
import re
import math
from copy import deepcopy
from numba_stats import t
import scipy.stats as stats


from IPython.display import Image, Markdown, display
plt.ion()
plt.rcParams['figure.figsize'] = [24, 16]
plt.rcParams['figure.dpi'] = 300

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

%load_ext autoreload
%autoreload 2

In [4]:
df_ls_t = pd.read_csv("/app/.recurve_cache/example_dfs/df_ls_t.csv")
df_ls_cp = pd.read_csv("/app/.recurve_cache/example_dfs/df_ls_cp.csv")

In [2]:
from gridmeter.utils.calculate_distances import calculate_distances

def TestDistanceMatching(
    df_ls_t,
    df_ls_c,
    n_matches_per_treatment=4,
    distance_metric="euclidean",
    allow_duplicate_match=False,
    replace_duplicate_method=None,  # currently unused [None, "closest_to_meter", "closest_global"]
    max_distance_threshold=None,
    n_match_multiplier=None,
    n_meters_per_chunk=10000,
):
    ls_t = df_ls_t.to_numpy()
    ls_cp = df_ls_c.to_numpy()

    # Calculate closest distances
    if n_match_multiplier is None:
        n_matches_per_treatment = None

    if n_match_multiplier is not None:
        if (not allow_duplicate_match and replace_duplicate_method is not None) or max_distance_threshold is not None:
            n_matches_per_treatment *= n_match_multiplier

        if n_matches_per_treatment > ls_cp.shape[0]:
            n_matches_per_treatment = ls_cp.shape[0]

    cp_id_idx, dist = calculate_distances(
        ls_t, ls_cp, distance_metric, n_matches_per_treatment, n_meters_per_chunk
    )

    # create dataframes
    id_t = df_ls_t.index.values
    id_c = df_ls_c.index.values

    series_t = pd.Series(np.repeat(id_t, dist.shape[1]), name="treatment")
    series_cp = pd.Series(id_c[cp_id_idx.flatten()], name="id")
    clusters = pd.DataFrame(
        dist.flatten(), index=[series_t, series_cp], columns=["distance"]
    )
    clusters = clusters.reset_index()
    clusters["duplicated"] = clusters.duplicated(subset=["id"])
    clusters["cluster"] = 1
    clusters = clusters.set_index("id")

    if not allow_duplicate_match:
        # drop duplicate index
        clusters = clusters[~clusters.index.duplicated(keep='first')]

        # get count of treatment
        print(clusters['treatment'])
        counts = clusters['treatment'].value_counts()
        print(counts)

        if replace_duplicate_method is not None:
            raise NotImplementedError(
                "'replace_duplicate_meters': True not implemented"
            )

    return clusters


def get_comparison_group(df_ls_t, df_ls_cp, weights=None):
    df_cg = TestDistanceMatching(df_ls_t, df_ls_cp)

    # Create df_t_coeffs
    t_ids = df_ls_t.index.unique()
    coeffs = np.ones(t_ids.values.size)

    df_t_coeffs = pd.DataFrame(coeffs, index=t_ids, columns=["pct_cluster_1"])
    df_t_coeffs.index.name = "id"

    return df_cg, df_t_coeffs

In [164]:
from utils.data_settings import Settings
import pandas as pd

class Data:
    def __init__(self, settings : Settings):
        if settings is None:
            self.settings = Settings()
        
        
        self.settings = {   "agg_type": "mean",
                            "loadshape_type": "observed", # ["observed", "modeled", "error"]
                            "time_period": "season_day_of_week", #["hour", "day_of_week", "weekday_weekend", "month", "season_day_of_week", "season_weekday_weekend"]
                            "seasons" : {   # 0 = summer, 1 = shoulder, 2 = winter
                                        1: "winter",
                                        2: "winter",
                                        3: "shoulder",
                                        4: "shoulder",
                                        5: "shoulder",
                                        6: "summer",
                                        7: "summer",
                                        8: "summer",
                                        9: "summer",
                                        10: "shoulder",
                                        11: "winter",
                                        12: "winter",
                                    }
                        }
        
    
    def _find_groupby_columns(self):
        cols = ['id']

        if self.settings['time_period'] == 'season_day_of_week':
            cols.extend(['season', 'day_of_week'])

        elif self.settings['time_period'] == 'season_weekday_weekend':
            cols.extend(['season', 'weekday_weekend'])

        elif self.settings['time_period'] != 'hour':
            cols.append(self.settings['time_period'])

        if self.settings['time_period'] != 'month':
            cols.append('hour')

        return cols
    
    def _add_index_columns_from_datetime(self, df : pd.DataFrame):

        # Add hour column
        df['hour'] = df.index.hour

        # Add month column
        if self.settings['time_period'] == 'month':
            df['month'] = df.index.month

        # Add day_of_week column
        if 'day_of_week' in self.settings['time_period']:
            df['day_of_week'] = df.index.dayofweek
        
        # Add weekday_weekend column
        if 'weekday_weekend' in self.settings['time_period']:
            df['weekday_weekend'] = df.index.dayofweek

            # Setting th ordering to weekday, weekend as 0 and 1 respectively
            df['weekday_weekend'] = df['weekday_weekend'].apply(lambda x: 0 if x < 5 else 1)  # 0 for weekday and 1 for weekend

        # Add season column
        if 'season' in self.settings['time_period']:

            # Setting the ordering of the seasons to summer, shoulder, winter as 0, 1 and 2 respectively
            season_values = {
                "winter": 2,
                "shoulder": 1,
                "summer": 0,
            }

            df['season'] = df.index.month.map(self.settings['seasons']).map(season_values)

        return df

    
    def _convert_timeseries_to_loadshape(self, time_series_df : pd.DataFrame):

        """
            Arguments:


            Returns :
        """

        # Check columns missing in time_series_df
        expected_columns = ["id", "datetime", self.settings["loadshape_type"]] # except error which requires both observed and modeled
        missing_columns = [c for c in expected_columns if c not in time_series_df.columns]
        
        if missing_columns:
            raise ValueError(f"Missing columns in time_series_df: {missing_columns}")
        
        # Ensure the loadshape type only uses observed, modeled or error
        df_type = self.settings["loadshape_type"]
        if df_type not in ["observed", "modeled", "error"]:
            raise ValueError(f"Invalid loadshape_type: {df_type}")
        
        # Check that the datetime column is actually of type datetime
        if time_series_df['datetime'].dtypes != 'datetime64[ns]':
            raise ValueError("The 'datetime' column must be of datetime type")
            

        if df_type == "error":
            pass # calculate error

        
        # Create a base df for adding all required columns
        base_df = time_series_df.set_index('datetime')
        base_df = self._add_index_columns_from_datetime(base_df)


        # Aggregate the input time_series based on time_period

        group_by_columns = self._find_groupby_columns()

        grouped_df = base_df.groupby(group_by_columns)[self.settings['loadshape_type']]

        agg_df = grouped_df.agg(agg_loadshape = self.settings['agg_type']).reset_index()
        
        # agg_df = self._sort_index_values(agg_df)
        agg_df = agg_df.sort_values(by = group_by_columns)


        # return agg_df

        # Create the count of the index per ID
        agg_df['hour'] = agg_df.groupby('id').cumcount() + 1

        # Pivot the rolled up column
        loadshape_df = agg_df.pivot(index='id', columns=['hour'], values='agg_loadshape')

        return loadshape_df
    
    
    def _validate(self, df):
        pass

    
    def set_data(self, loadshape_df=None, time_series_df=None):
        """

        Args:
            Loadshape_df: columns = [id, time, loadshape]

            Time_series_df: columns = [id, datetime, observed, observed_error, modeled, modeled_error]

        Output:
            loadshape: index = id, columns = time, values = loadshape

            
        """
        if loadshape_df is None and time_series_df is None:
            raise ValueError("Either loadshape dataframe or time series dataframe must be provided.")


        if loadshape_df is not None:
            # Check columns missing in loadshape_df
            expected_columns = ["id", self.settings["time_period"], "loadshape"]
            missing_columns = [c for c in expected_columns if c not in loadshape_df.columns]
            
            if missing_columns:
                raise ValueError(f"Missing columns in time_series_df: {missing_columns}")

            
            # Check if all values are present in the columns as required
            # Else update the values via interpolation if missing, also ignore duplicates if present


            #Aggregate the input loadshape based on time_period
            output_loadshape = loadshape_df.pivot(index='id', columns=[self.settings["time_period"]], values='loadshape') 
            
        elif time_series_df is not None:
            output_loadshape = self._convert_timeseries_to_loadshape(time_series_df)

        
        # Convert multi level index to single level
        self.loadshape = output_loadshape.rename_axis(None, axis=1).reset_index().drop(columns='index', axis=1, errors='ignore')


        

In [137]:
# Testing with input loadshapes

data = Data(None)
data.set_data(df_ls_t_fixed)
data.loadshape

NameError: name 'df_ls_t_fixed' is not defined

In [36]:
df_ls_t_mod.head()

hour,1,2,3,4,5,6,7,8,9,10,...,495,496,497,498,499,500,501,502,503,504
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
None-1094275585-1094275585,-0.004892,0.014424,0.024523,0.009783,0.002749,-0.018928,0.000177,-0.02238,-0.01527,-0.015309,...,-0.072218,-0.048825,-0.046101,-0.036118,-0.021544,0.04153,0.057548,0.083471,0.0633,0.07351
None-1397301805-1397301805,-0.040984,-0.017669,-0.00032,0.023877,0.026313,-0.035445,0.017402,0.097946,0.067614,0.13995,...,0.065461,0.161034,0.069464,-0.014588,0.022082,0.08821,-0.039004,0.088822,-0.021783,0.020953
None-1432022910-1432022910,0.005879,0.00789,-0.005875,0.002565,0.007195,-0.019074,-0.007493,-0.007334,-0.01834,-0.040356,...,0.103594,0.077896,0.055153,0.013053,-0.002923,0.015454,0.037913,0.038603,0.019493,0.046673
None-1469355610-1469355610,-0.005751,-0.002495,0.000473,0.001568,-0.008138,-0.027553,-0.007281,0.051084,-0.008178,0.020144,...,0.001396,-0.059681,0.053881,0.055815,0.067706,0.049554,0.029438,0.03197,0.144222,0.123939
None-1504812305-1504812305,0.042894,0.010071,-0.02805,-0.014719,-0.021681,-0.016993,-0.045222,-0.004814,-0.008289,-0.00294,...,0.161389,0.164025,0.106125,0.026877,0.010182,0.03187,-0.010856,0.013376,0.037637,0.020543


In [24]:
df_ls_cp.head()

Unnamed: 0,id,hour,ls
0,None-0344817417-0344817417,1,0.048667
1,None-0344817417-0344817417,2,0.038093
2,None-0344817417-0344817417,3,0.029257
3,None-0344817417-0344817417,4,0.0384
4,None-0344817417-0344817417,5,0.031701


In [106]:
# Create a testing dataframe having an id, datetime of 15 min intervals, observed and modeled values 
num_intervals = 4 * 24 * 365  # 4 intervals/hour * 24 hours/day * 365 days

# Create a DataFrame with 'id', 'datetime', 'observed', and 'modeled' columns
df = pd.DataFrame({
    'id': np.repeat(['id1', 'id2', 'id3'], num_intervals),  # only 3 ids for easier comparison
    'datetime': pd.date_range(start='2023-01-01', periods=num_intervals, freq='15T').tolist() * 3, 
    'observed': np.random.rand(num_intervals * 3),  # randomized
    'modeled': np.random.rand(num_intervals * 3)  # randomized
})

# Convert 'datetime' column to datetime type
df['datetime'] = pd.to_datetime(df['datetime'])
df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.124148,0.664864
1,id1,2023-01-01 00:15:00,0.600414,0.362168
2,id1,2023-01-01 00:30:00,0.076517,0.124665
3,id1,2023-01-01 00:45:00,0.817475,0.911011
4,id1,2023-01-01 01:00:00,0.408777,0.705123
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.633569,0.606711
105116,id3,2023-12-31 23:00:00,0.254145,0.696529
105117,id3,2023-12-31 23:15:00,0.001792,0.187593
105118,id3,2023-12-31 23:30:00,0.813865,0.512033


In [165]:
data = Data(None)
data.set_data(time_series_df = df)
data.loadshape

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,...,495,496,497,498,499,500,501,502,503,504
0,id1,0.501719,0.488966,0.447439,0.522465,0.500911,0.56745,0.495952,0.480871,0.465315,...,0.551877,0.547703,0.492746,0.445849,0.487883,0.50316,0.451972,0.472149,0.532671,0.599067
