In [13]:
# %matplotlib
# %matplotlib inline
# %matplotlib notebook

import pandas as pd
import numpy as np
import random
import os
import warnings
from datetime import datetime, timedelta, timezone
warnings.simplefilter("ignore")
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.ticker import AutoMinorLocator
from matplotlib.ticker import FormatStrFormatter
import re
import math
from copy import deepcopy
from numba_stats import t
import scipy.stats as stats


from IPython.display import Image, Markdown, display
# Interactive mode on, and large high-resolution figures by default
# (figsize is width x height in inches).
plt.ion()
plt.rcParams.update({
    'figure.figsize': [24, 16],
    'figure.dpi': 300,
})

# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

%load_ext autoreload
%autoreload 2

In [4]:
# Example loadshape frames cached on disk: treatment group and comparison pool.
# NOTE(review): absolute, environment-specific location - adjust when running elsewhere.
EXAMPLE_DF_DIR = "/app/.recurve_cache/example_dfs"

df_ls_t = pd.read_csv(EXAMPLE_DF_DIR + "/df_ls_t.csv")
df_ls_cp = pd.read_csv(EXAMPLE_DF_DIR + "/df_ls_cp.csv")

In [2]:
from gridmeter.utils.calculate_distances import calculate_distances

def TestDistanceMatching(
    df_ls_t,
    df_ls_c,
    n_matches_per_treatment=4,
    distance_metric="euclidean",
    allow_duplicate_match=False,
    replace_duplicate_method=None,  # currently unused [None, "closest_to_meter", "closest_global"]
    max_distance_threshold=None,
    n_match_multiplier=None,
    n_meters_per_chunk=10000,
):
    """Match every treatment loadshape to its closest comparison-pool loadshapes.

    Args:
        df_ls_t: Treatment loadshapes, one row per meter; the index holds the ids.
        df_ls_c: Comparison-pool loadshapes in the same layout.
        n_matches_per_treatment: Matches requested per treatment meter. Only
            honoured when ``n_match_multiplier`` is given (see the note below).
        distance_metric: Metric name forwarded to ``calculate_distances``.
        allow_duplicate_match: If False, a comparison id is kept only for the
            first treatment row it appears under.
        replace_duplicate_method: Replacement strategy for dropped duplicates;
            not implemented.
        max_distance_threshold: When set (together with ``n_match_multiplier``)
            the number of requested matches is widened by the multiplier.
            NOTE(review): the threshold itself is never applied to the
            distances in this function.
        n_match_multiplier: Factor used to widen the number of requested
            matches; ``None`` disables the match-count logic entirely.
        n_meters_per_chunk: Chunk size forwarded to ``calculate_distances``.

    Returns:
        DataFrame indexed by comparison ``id`` with columns ``treatment``,
        ``distance``, ``duplicated`` and ``cluster`` (always 1).

    Raises:
        NotImplementedError: If duplicates are disallowed and a
            ``replace_duplicate_method`` is requested.
    """
    # Fail fast: previously this was only detected after the (potentially
    # expensive) distance computation had already run.
    if not allow_duplicate_match and replace_duplicate_method is not None:
        raise NotImplementedError(
            "'replace_duplicate_meters': True not implemented"
        )

    ls_t = df_ls_t.to_numpy()
    ls_cp = df_ls_c.to_numpy()

    if n_match_multiplier is None:
        # NOTE(review): this discards the caller's n_matches_per_treatment and
        # hands None to calculate_distances; presumably that means "no limit",
        # but the helper is not visible from here - confirm this is intended.
        n_matches_per_treatment = None
    else:
        # Widen the search when a distance threshold may later discard matches.
        # (The duplicate-replacement case would widen it too, but it is
        # rejected above until it is implemented.)
        if max_distance_threshold is not None:
            n_matches_per_treatment *= n_match_multiplier

        # Never ask for more matches than there are comparison meters.
        n_matches_per_treatment = min(n_matches_per_treatment, ls_cp.shape[0])

    cp_id_idx, dist = calculate_distances(
        ls_t, ls_cp, distance_metric, n_matches_per_treatment, n_meters_per_chunk
    )

    # One output row per (treatment, matched comparison) pair, in the row-major
    # order of the returned match/distance matrices.
    id_t = df_ls_t.index.values
    id_c = df_ls_c.index.values

    clusters = pd.DataFrame(
        {
            "treatment": np.repeat(id_t, dist.shape[1]),
            "id": id_c[cp_id_idx.ravel()],
            "distance": dist.ravel(),
        }
    )
    # True for a comparison id that already appeared in an earlier row.
    clusters["duplicated"] = clusters.duplicated(subset=["id"])
    clusters["cluster"] = 1
    clusters = clusters.set_index("id")

    if not allow_duplicate_match:
        # Keep each comparison meter only for the first treatment that claimed it.
        clusters = clusters[~clusters.index.duplicated(keep='first')]

    return clusters


def get_comparison_group(df_ls_t, df_ls_cp, weights=None):
    """Return the matched comparison group plus a per-treatment coefficient table.

    Args:
        df_ls_t: Treatment loadshapes; the index holds the treatment ids.
        df_ls_cp: Comparison-pool loadshapes in the same layout.
        weights: Accepted for interface compatibility; not used here.

    Returns:
        Tuple ``(df_cg, df_t_coeffs)`` where ``df_cg`` is whatever
        ``TestDistanceMatching`` returns with its default settings and
        ``df_t_coeffs`` has one row per distinct treatment id (index named
        ``id``) with a single column ``pct_cluster_1`` filled with 1.0.
    """
    df_cg = TestDistanceMatching(df_ls_t, df_ls_cp)

    # Every distinct treatment meter gets a coefficient of 1.0.
    treatment_ids = df_ls_t.index.unique()
    df_t_coeffs = pd.DataFrame(
        {"pct_cluster_1": np.ones(len(treatment_ids))},
        index=treatment_ids,
    ).rename_axis("id")

    return df_cg, df_t_coeffs

In [4]:
# Testing with input loadshapes
# Requires the cell defining `Data` (further down in this notebook) to have been
# run first - the recorded NameError comes from running this cell before it.
# `df_ls_t_fixed` is not created in any of the cells shown here.

data = Data(None)
data.set_data(loadshape_df=df_ls_t_fixed)
data.loadshape

NameError: name 'Data' is not defined

In [36]:
df_ls_t_mod.head()

hour,1,2,3,4,5,6,7,8,9,10,...,495,496,497,498,499,500,501,502,503,504
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
None-1094275585-1094275585,-0.004892,0.014424,0.024523,0.009783,0.002749,-0.018928,0.000177,-0.02238,-0.01527,-0.015309,...,-0.072218,-0.048825,-0.046101,-0.036118,-0.021544,0.04153,0.057548,0.083471,0.0633,0.07351
None-1397301805-1397301805,-0.040984,-0.017669,-0.00032,0.023877,0.026313,-0.035445,0.017402,0.097946,0.067614,0.13995,...,0.065461,0.161034,0.069464,-0.014588,0.022082,0.08821,-0.039004,0.088822,-0.021783,0.020953
None-1432022910-1432022910,0.005879,0.00789,-0.005875,0.002565,0.007195,-0.019074,-0.007493,-0.007334,-0.01834,-0.040356,...,0.103594,0.077896,0.055153,0.013053,-0.002923,0.015454,0.037913,0.038603,0.019493,0.046673
None-1469355610-1469355610,-0.005751,-0.002495,0.000473,0.001568,-0.008138,-0.027553,-0.007281,0.051084,-0.008178,0.020144,...,0.001396,-0.059681,0.053881,0.055815,0.067706,0.049554,0.029438,0.03197,0.144222,0.123939
None-1504812305-1504812305,0.042894,0.010071,-0.02805,-0.014719,-0.021681,-0.016993,-0.045222,-0.004814,-0.008289,-0.00294,...,0.161389,0.164025,0.106125,0.026877,0.010182,0.03187,-0.010856,0.013376,0.037637,0.020543


In [24]:
df_ls_cp.head()

Unnamed: 0,id,hour,ls
0,None-0344817417-0344817417,1,0.048667
1,None-0344817417-0344817417,2,0.038093
2,None-0344817417-0344817417,3,0.029257
3,None-0344817417-0344817417,4,0.0384
4,None-0344817417-0344817417,5,0.031701


In [126]:
from _utils.data_settings import Settings
import pandas as pd


class DataConstants:
    """Static lookup tables used when building loadshapes."""

    # Number of rows a complete loadshape has per id for each time period
    # (24 hours, 12 months, 7 days of the week, 2 weekday/weekend buckets,
    # 3 seasons).
    time_period_row_counts = dict(
        hourly=24,
        month=12,
        hourly_month=24 * 12,
        day_of_week=7,
        hourly_day_of_week=24 * 7,
        weekday_weekend=2,
        hourly_weekday_weekend=24 * 2,
        season_day_of_week=3 * 7,
        season_hourly_day_of_week=3 * 24 * 7,
        season_weekday_weekend=3 * 2,
        season_hourly_weekday_weekend=3 * 24 * 2,
    )

    # Supported time-period names, in the same order as the table above.
    time_periods = list(time_period_row_counts)

    # Integer codes that fix the sort order of the categorical buckets.
    season_order = dict(summer=0, shoulder=1, winter=2)
    weekday_weekend_order = dict(weekday=0, weekend=1)

    # This list ordering is important for the groupby columns
    unique_time_periods = ["season", "month", "day_of_week", "weekday_weekend", "hour"]

    # 80% of data required for a meter to be included in the analysis
    min_data_pct_required = 0.8


class Data:
    """Builds a wide loadshape table (one row per id) from long-format input.

    Input is either an already aggregated loadshape frame or a raw time series
    that is aggregated according to ``settings["time_period"]``. The result is
    stored in ``self.loadshape`` by ``set_data``.
    """

    def __init__(self, settings: "Settings | dict | None" = None):
        """Initialise the settings used by the conversion methods.

        Args:
            settings: Optional overrides. A ``dict`` is overlaid on the
                defaults below. NOTE(review): a ``Settings`` instance is
                accepted but not consumed yet (as before) - its interface is
                not visible here, and the methods read settings by key.
        """
        self.settings = {   "agg_type": "mean",
                            "loadshape_type": "observed", # ["observed", "modeled", "error"]
                            # Must be a key of DataConstants.time_period_row_counts when interpolate_missing is True
                            "time_period": "season_hourly_day_of_week",
                            "interpolate_missing" : True, # False should throw error if missing values
                            "seasons" : {   # 0 = summer, 1 = shoulder, 2 = winter, conversion done later on
                                        1: "winter",
                                        2: "winter",
                                        3: "shoulder",
                                        4: "shoulder",
                                        5: "shoulder",
                                        6: "summer",
                                        7: "summer",
                                        8: "summer",
                                        9: "summer",
                                        10: "shoulder",
                                        11: "winter",
                                        12: "winter"
                                    },
                            "weekday_weekend" : {   # keyed by pandas dayofweek
                                        0: "weekday",
                                        1: "weekday",
                                        2: "weekday",
                                        3: "weekday",
                                        4: "weekday",
                                        5: "weekend",
                                        6: "weekend"
                            }
                        }

        if isinstance(settings, dict):
            self.settings.update(settings)

        # Populated by set_data(); defined here so the attribute always exists.
        self.loadshape = None

    def _find_groupby_columns(self) -> list:
        """
            Create the list of columns to be grouped by based on the time_period selected in Settings.

            A column is included when its name is a substring of the time_period, e.g.

            Time_period : hourly => group by (id, hour)
            Time_period : month => group by (id, month)
            Time_period : hourly_day_of_week => group by (id, day_of_week, hour)
            Time_period : weekday_weekend => group by (id, weekday_weekend)
            Time_period : season_day_of_week => group by (id, season, day_of_week)
            Time_period : season_hourly_weekday_weekend => group by (id, season, weekday_weekend, hour)

        """
        time_period = self.settings['time_period']
        return ['id'] + [
            period
            for period in DataConstants.unique_time_periods
            if period in time_period
        ]

    def _add_index_columns_from_datetime(self, df : pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` with the time-bucket columns the time_period needs.

        ``df`` must be indexed by a DatetimeIndex. The caller's frame is left
        untouched.
        """
        time_period = self.settings['time_period']
        df = df.copy()

        # Add hour column
        if 'hour' in time_period:
            df['hour'] = df.index.hour

        # Add month column
        if 'month' in time_period:
            df['month'] = df.index.month

        # Add day_of_week column
        if 'day_of_week' in time_period:
            df['day_of_week'] = df.index.dayofweek

        # Add weekday_weekend column, encoded so that weekday sorts before weekend
        if 'weekday_weekend' in time_period:
            df['weekday_weekend'] = (
                df.index.dayofweek
                .map(self.settings['weekday_weekend'])
                .map(DataConstants.weekday_weekend_order)
            )

        # Add season column, encoded through DataConstants.season_order
        if 'season' in time_period:
            df['season'] = (
                df.index.month
                .map(self.settings['seasons'])
                .map(DataConstants.season_order)
            )

        return df

    def _validate(self, df : pd.DataFrame) -> pd.DataFrame:
        """De-duplicate ``df`` and either interpolate or reject missing values.

        Args:
            df: Long-format frame with an ``id`` column. A loadshape frame
                carries a column named after the time_period; an aggregated
                time series carries the group-by columns instead.

        Returns:
            The cleaned frame.

        Raises:
            ValueError: If an id has fewer non-null values than the minimum
                share of a complete loadshape (interpolation enabled), or if
                any value is missing (interpolation disabled).
        """
        time_period = self.settings["time_period"]

        # Duplicates are identified by the time slot, never by the value:
        # two different slots with the same (or both missing) value are distinct rows.
        if time_period in df.columns:
            key_columns = ['id', time_period]
        else:
            key_columns = [c for c in self._find_groupby_columns() if c in df.columns]

        if len(key_columns) > 1:
            df = df.drop_duplicates(subset=key_columns, keep='first')

        if self.settings["interpolate_missing"]:
            min_rows = (
                DataConstants.min_data_pct_required
                * DataConstants.time_period_row_counts[time_period]
            )

            # Check that the number of missing values is less than the threshold
            for meter_id, group in df.groupby('id'):
                if group.count().min() < min_rows:
                    raise ValueError(f"Missing minimum threshold number of values in dataframe for id: {meter_id}")

            # Fill NaN values by interpolating each id separately. Only numeric,
            # non-key columns are touched so the id and time-slot columns are
            # carried through unchanged.
            df = df.reset_index(drop=True)
            value_columns = [
                c for c in df.select_dtypes(include="number").columns
                if c not in key_columns
            ]
            if value_columns:
                df[value_columns] = df.groupby('id')[value_columns].transform(
                    lambda s: s.interpolate(method='linear', limit_direction='both')
                )

            # TODO : Interpolation should only occur on within seasons, not across seasons
            # NOTE(review): time slots that are absent as rows (rather than present
            # with NaN) are not re-created here.

        else:
            n_missing = int(df.isnull().any(axis=1).sum())
            if n_missing > 0:
                raise ValueError(f"Missing values in loadshape_df: {n_missing}")

        return df

    def _convert_timeseries_to_loadshape(self, time_series_df : pd.DataFrame) -> pd.DataFrame:

        """
            Arguments:
                Time series dataframe with columns = [id, datetime, observed, observed_error, modeled, modeled_error]
                Only id, datetime and the column named by settings["loadshape_type"] are required.

            Returns :
                Wide loadshape dataframe: index = id, columns = 1..N (position of the
                time slot within the id), values = aggregated loadshape

            Raises :
                ValueError for an invalid loadshape_type, missing columns or a
                non-datetime 'datetime' column.
        """

        # Ensure the loadshape type only uses observed, modeled or error
        df_type = self.settings["loadshape_type"]
        if df_type not in ["observed", "modeled", "error"]:
            raise ValueError(f"Invalid loadshape_type: {df_type}")

        # Check columns missing in time_series_df
        # NOTE(review): "error" is not derived from observed/modeled yet, so an
        # "error" column must already be present in the input.
        expected_columns = ["id", "datetime", df_type]
        missing_columns = [c for c in expected_columns if c not in time_series_df.columns]

        if missing_columns:
            raise ValueError(f"Missing columns in time_series_df: {missing_columns}")

        # Check that the datetime column is actually of type datetime
        # (any resolution, timezone-aware or naive)
        if not pd.api.types.is_datetime64_any_dtype(time_series_df['datetime']):
            raise ValueError("The 'datetime' column must be of datetime type")

        # Create a base df for adding all required columns
        base_df = self._add_index_columns_from_datetime(time_series_df.set_index('datetime'))

        # Aggregate the input time_series based on time_period and sort so that
        # the slot ordering is the same for every id
        group_by_columns = self._find_groupby_columns()
        agg_df = (
            base_df.groupby(group_by_columns)[df_type]
            .agg(agg_loadshape=self.settings['agg_type'])
            .reset_index()
            .sort_values(by=group_by_columns)
        )

        # Validate that all the values are correct
        agg_df = self._validate(agg_df)

        # Number the time slots 1..N within each id.
        # NOTE(review): this assumes every id has a row for every slot; an id
        # with a slot missing entirely would be shifted.
        agg_df = agg_df.assign(time=agg_df.groupby('id').cumcount() + 1)

        # Pivot the rolled up column
        return agg_df.pivot(index='id', columns=['time'], values='agg_loadshape')

    def set_data(self, loadshape_df=None, time_series_df=None) -> None:
        """
        Build ``self.loadshape`` from exactly one of the two inputs.

        Args:
            Loadshape_df: columns = [id, <settings["time_period"]>, loadshape]

            Time_series_df: columns = [id, datetime, observed, observed_error, modeled, modeled_error]

        Output:
            self.loadshape: one row per id with an 'id' column followed by one
            column per time slot, values = loadshape

        Raises:
            ValueError: if neither or both inputs are given, or if required
            columns are missing.
        """
        if loadshape_df is None and time_series_df is None:
            raise ValueError("Either loadshape dataframe or time series dataframe must be provided.")

        if loadshape_df is not None and time_series_df is not None:
            raise ValueError("Both loadshape dataframe and time series dataframe are provided. Please provide only one.")

        if loadshape_df is not None:
            time_period = self.settings["time_period"]

            # Check columns missing in loadshape_df
            expected_columns = ["id", time_period, "loadshape"]
            missing_columns = [c for c in expected_columns if c not in loadshape_df.columns]

            if missing_columns:
                raise ValueError(f"Missing columns in loadshape_df: {missing_columns}")

            loadshape_df = self._validate(loadshape_df)

            # Spread the time slots into columns
            output_loadshape = loadshape_df.pivot(index='id', columns=[time_period], values='loadshape')

        else:
            output_loadshape = self._convert_timeseries_to_loadshape(time_series_df)

        # Flatten: clear the name of the column axis and turn the id index into a column
        self.loadshape = (
            output_loadshape
            .rename_axis(None, axis=1)
            .reset_index()
            .drop(columns='index', errors='ignore')
        )


        

In [127]:
# Create a testing dataframe having an id, datetime of 15 min intervals, observed and modeled values
# The values are random but seeded, so re-running the notebook reproduces the same frame.
SEED = 42
rng = np.random.default_rng(SEED)

num_intervals = 4 * 24 * 365  # 4 intervals/hour * 24 hours/day * 365 days
meter_ids = ['id1', 'id2', 'id3']  # only 3 ids for easier comparison
timestamps = pd.date_range(start='2023-01-01', periods=num_intervals, freq='15min')

# Create a DataFrame with 'id', 'datetime', 'observed', and 'modeled' columns;
# every id covers the same timestamps, so 'datetime' is already of datetime type.
df = pd.DataFrame({
    'id': np.repeat(meter_ids, num_intervals),
    'datetime': np.tile(timestamps, len(meter_ids)),
    'observed': rng.random(num_intervals * len(meter_ids)),  # randomized
    'modeled': rng.random(num_intervals * len(meter_ids)),  # randomized
})

# # Create a boolean mask for Mondays and Wednesdays , will give ValueError at 80% threshold
# day_mask = df['datetime'].dt.dayofweek.isin([0,2])

# # Set 'observed' and 'modeled' values to NaN for all Mondays and Wednesdays
# df.loc[day_mask, ['observed', 'modeled']] = np.nan

df

Unnamed: 0,id,datetime,observed,modeled
0,id1,2023-01-01 00:00:00,0.036023,0.060180
1,id1,2023-01-01 00:15:00,0.220069,0.536827
2,id1,2023-01-01 00:30:00,0.218821,0.394087
3,id1,2023-01-01 00:45:00,0.809175,0.476992
4,id1,2023-01-01 01:00:00,0.249092,0.857870
...,...,...,...,...
105115,id3,2023-12-31 22:45:00,0.820274,0.969752
105116,id3,2023-12-31 23:00:00,0.933048,0.742738
105117,id3,2023-12-31 23:15:00,0.782695,0.940302
105118,id3,2023-12-31 23:30:00,0.288678,0.246360


In [128]:
# Aggregate the synthetic time series into a wide loadshape table using the default settings.
data = Data(settings=None)
data.set_data(time_series_df=df)
data.loadshape

Unnamed: 0,id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,465,466,467,468,469,470,471,472,473,474,475,476,477,478,479,480,481,482,483,484,485,486,487,488,489,490,491,492,493,494,495,496,497,498,499,500,501,502,503,504
0,id1,0.497513,0.50308,0.585096,0.501427,0.519372,0.519026,0.521536,0.49809,0.548002,0.462479,0.458657,0.458846,0.520614,0.530701,0.547971,0.481091,0.462663,0.553896,0.514236,0.476655,0.525762,0.485854,0.424401,0.54995,0.50986,0.527067,0.529336,0.472988,0.592509,0.495912,0.560203,0.540261,0.531196,0.512268,0.432088,0.475354,0.460178,0.527132,0.487451,...,0.510121,0.461013,0.45804,0.465397,0.509333,0.455215,0.500319,0.52803,0.535635,0.469574,0.540474,0.503659,0.527446,0.545216,0.469338,0.474134,0.46951,0.51937,0.514227,0.452956,0.506415,0.445379,0.493674,0.468681,0.493651,0.459008,0.432278,0.515239,0.509032,0.533815,0.522705,0.532362,0.534769,0.477726,0.496121,0.525629,0.506505,0.495232,0.465889,0.461063
1,id2,0.511356,0.505004,0.432078,0.529338,0.469377,0.524019,0.462755,0.557061,0.462384,0.552961,0.490747,0.518907,0.526815,0.52158,0.500749,0.511481,0.492782,0.469096,0.51342,0.440006,0.422768,0.540626,0.420681,0.42379,0.474846,0.543837,0.468018,0.504725,0.511063,0.490607,0.521121,0.504424,0.502705,0.508105,0.491907,0.42478,0.519335,0.500179,0.467873,...,0.474225,0.486638,0.535211,0.448515,0.497444,0.517116,0.488796,0.547084,0.452915,0.523403,0.471872,0.511867,0.532988,0.480859,0.492341,0.531833,0.481691,0.49591,0.484596,0.520474,0.500027,0.535606,0.437683,0.447808,0.541978,0.448347,0.514447,0.450794,0.532999,0.553716,0.468665,0.563674,0.515951,0.497905,0.40482,0.498062,0.520308,0.444535,0.504788,0.590322
2,id3,0.504112,0.483284,0.452797,0.414495,0.501335,0.454209,0.45377,0.472631,0.529832,0.519036,0.488815,0.496542,0.451364,0.524489,0.557621,0.469173,0.517633,0.431323,0.460994,0.49179,0.504099,0.488622,0.529865,0.440646,0.558205,0.496382,0.433261,0.565615,0.511294,0.552707,0.435067,0.499088,0.445643,0.461256,0.494659,0.493058,0.524613,0.438435,0.507443,...,0.57053,0.492983,0.488145,0.487788,0.517869,0.491561,0.532872,0.528683,0.535443,0.519557,0.515458,0.522002,0.534914,0.550744,0.570809,0.507922,0.550052,0.459606,0.430617,0.469266,0.505965,0.458786,0.452004,0.484494,0.515372,0.476923,0.432848,0.471361,0.506274,0.554754,0.501322,0.465567,0.458048,0.526822,0.59124,0.466543,0.495892,0.436973,0.547513,0.549112
