In [1]:
# import os
from pathlib import Path

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from eemeter.development.data import HourlyData
from itertools import combinations
# import time

# from hourly_test_utils import *
from applied_data_science.bigquery.data import Meter_Data
from eemeter import eemeter as em
# from eemeter.common.metrics import BaselineTestingMetrics as Metrics
from scipy.interpolate import RBFInterpolator

# import multiprocessing as mp

import warnings
warnings.filterwarnings("ignore")
%load_ext autoreload
%autoreload 2

In [2]:
# Load one subsample of the main MCE dataset (metadata + hourly meter readings).
dataset, subsample, has_solar = 'mce', 1, True
cache_dir = Path("/app/.recurve_cache/data").resolve()

# Discard the loader left behind by a previous run of this cell before a new
# one is built (presumably to release its memory first).
globals().pop('data', None)

data = Meter_Data(dataset, subsample, "hourly", solar=has_solar, cache_dir=cache_dir)
frames = data.df
meta, subsample_df = frames['meta'], frames['meter']

# Distinct index labels of the meter table; later cells use them as meter ids.
ids = subsample_df.index.unique()

Loading: /app/.recurve_cache/data/MCE/MCE_covid_solar_meta_1.pkl
Done in 0.005 s
Loading: /app/.recurve_cache/data/MCE/MCE_covid_solar_hourly_meter_1.pkl
Done in 4.756 s


In [3]:
# Regularisation values handed to the hourly model settings below.
alpha_opt, l1_ratio_opt = 0.15, 0.05

# Collected in a dict first so individual options are easy to toggle.
settings_kwargs = dict(
    TRAIN_FEATURES=['ghi'],
    LAGGED_FEATURES=None,  # 'ghi'
    WINDOW=None,
    SUPPLEMENTAL_DATA=None,  # alternative: ['solar_supp']
    ALPHA=alpha_opt,
    L1_RATIO=l1_ratio_opt,
    SELECTION="cyclic",
    MAX_ITER=1000,
    SEED=42,
)
settings = em.HourlySettings(**settings_kwargs)

In [4]:
# Pick a single meter to work with.
meter_id = ids[758]
meter = subsample_df.loc[meter_id].copy()

# Date of the PV intervention (an earlier candidate was "2019-11-09").
PV_interventation = pd.to_datetime("2021-01-21")

# 0/1 flag: 1 for every row dated on or after the intervention.
meter['solar_supp'] = 0
after_intervention = meter['date'] >= PV_interventation.date()
meter.loc[after_intervention, 'solar_supp'] = 1

# Keep only the baseline period.
is_baseline = meter['period'] == 'baseline'
meter_baseline = meter.loc[is_baseline].copy()

# Station coordinates: first distinct (latitude, longitude) pair of this meter.
meta_meter = meta.loc[meter_id]
unique_pairs = meta_meter[['station_latitude', 'station_longitude']].drop_duplicates()
METADATA = {
    coordinate: unique_pairs[coordinate].values[0]
    for coordinate in ('station_latitude', 'station_longitude')
}

#TODO: keep it simple and let the input force the output
# Solar columns are only requested when 'ghi' is one of the training features.
CONSIDER_SOLAR = 'ghi' in settings.TRAIN_FEATURES
OUTPUT_FEATURES = ['temperature', 'observed', 'model']
if 'solar_supp' in settings.TRAIN_FEATURES:
    OUTPUT_FEATURES.append('solar_supp')
if CONSIDER_SOLAR:
    OUTPUT_FEATURES.extend(['ghi', 'clearsky_ghi'])

kwargs = {
    'solar': CONSIDER_SOLAR,
    'metadata': METADATA,
    'outputs': OUTPUT_FEATURES,
}
data_baseline = HourlyData(meter_baseline, **kwargs)

In [5]:
meter_baseline[['temperature', 'observed']].isna().sum()

temperature     128
observed       1862
dtype: int64

In [6]:
#TODO: keep it simple and let the input force the output
# Output columns requested from HourlyData; the solar-related ones are only
# added when the matching feature is listed in settings.TRAIN_FEATURES.
OUTPUT_FEATURES=['temperature', 'observed', 'model']
if 'solar_supp' in settings.TRAIN_FEATURES:
    OUTPUT_FEATURES.append('solar_supp')
if 'ghi' in settings.TRAIN_FEATURES:
    CONSIDER_SOLAR = True
    OUTPUT_FEATURES.append('ghi')
    OUTPUT_FEATURES.append('clearsky_ghi')
else:
    CONSIDER_SOLAR = False


def _first_station_coordinates(meta_rows):
    """Return the first station latitude/longitude found in ``meta_rows``.

    ``meta.loc[meter_id]`` gives a DataFrame when the id matches several rows
    and a Series when it matches exactly one.  ``np.atleast_1d`` turns either
    shape into an array, so the first value is read the same way in both
    cases and the result is always a scalar pair.

    Raises:
        KeyError: if a coordinate column is missing.  The error is raised
            instead of silently reusing the previous meter's coordinates.
    """
    return {
        'station_latitude': np.atleast_1d(meta_rows['station_latitude'])[0],
        'station_longitude': np.atleast_1d(meta_rows['station_longitude'])[0],
    }


# One (position, meter id, HourlyData keyword arguments) tuple per meter.
arglist = []
for i, meter_id in enumerate(ids):
    meta_meter = meta.loc[meter_id]
    # get unique pair of station_latitude and station_longitude
    # (kept as a variable because a later cell displays `unique_pairs`)
    unique_pairs = meta_meter[['station_latitude', 'station_longitude']].drop_duplicates()
    # The first row of the de-duplicated pairs is the first row of meta_meter,
    # so the coordinates are read from meta_meter directly.
    METADATA = _first_station_coordinates(meta_meter)

    kwargs = {
    'solar': CONSIDER_SOLAR,
    'metadata': METADATA,
    'outputs': OUTPUT_FEATURES,
    }
    arglist.append((i, meter_id, kwargs))


def check_df_decorator(arglist):
    """Classify the baseline data quality of one meter.

    Args:
        arglist: ``(i, meter_id, kwargs)`` tuple; ``kwargs`` is forwarded to
            ``HourlyData``.

    Returns:
        ``[i, meter_id, status]`` where ``status`` is one of ``'None'`` (no
        problem detected), ``'empty_baseline'``, ``'too_many_missing_data'``,
        ``'interpolation_failed'`` or ``'unknown_error'``.  Both data checks
        run, so ``'interpolation_failed'`` replaces
        ``'too_many_missing_data'`` when both apply.

    Ordinary exceptions are reported as ``'unknown_error'``.  Interpreter-level
    signals such as ``KeyboardInterrupt`` are deliberately not caught, so a
    long run can still be aborted.
    """
    i, meter_id, kwargs = arglist
    details = [i, meter_id, 'None']

    try:
        meter = subsample_df.loc[meter_id].copy()
        meter_baseline = meter.loc[meter['period'] == 'baseline'].copy()

        if meter_baseline.empty:
            print(f"Skipping {i},   {meter_id} due to empty baseline data")
            details[-1] = 'empty_baseline'
            return details

        data_baseline = HourlyData(meter_baseline, **kwargs)

        if data_baseline.too_many_missing_data:
            print(f" {i},   {meter_id} has {data_baseline.missing_values_amount} missing data")
            details[-1] = 'too_many_missing_data'

        # Any NaN left in the columns HourlyData was supposed to interpolate
        # means the interpolation did not fully succeed.
        if data_baseline.df[data_baseline.to_be_interpolated_columns].isnull().values.any():
            print(f"Skipping {i},   {meter_id} due to missing data, because of interpolaiton failed")
            details[-1] = 'interpolation_failed'

        return details
    except Exception as exc:
        # Report what actually went wrong instead of hiding it completely.
        print(f"Skipping {i},   {meter_id} due to unknown error ({type(exc).__name__}: {exc})")
        details[-1] = 'unknown_error'
        return details

# Serial alternative, kept for debugging a single meter:
# for i, meter_id, kwargs in arglist:
#     status = check_df_decorator((i, meter_id, kwargs))
#     if status:
#         print(f"Skipping {i},   {meter_id} due to {status}")
#         continue
import multiprocessing as mp

# Leave one core free, but never request fewer than one worker:
# on a single-core machine cpu_count() - 1 is 0 and Pool(0) raises ValueError.
n_workers = max(1, mp.cpu_count() - 1)
with mp.Pool(n_workers) as pool:
    # One [i, meter_id, status] list per meter, in the order of `arglist`.
    results = pool.map(check_df_decorator, arglist)

Skipping 16,   None-0880849278_1-0880849278_1 due to empty baseline dataSkipping 32,   None-1396433705_1-1396433705_1 due to empty baseline data
Skipping 232,   None-1771940710_1-1771940710_1 due to empty baseline dataSkipping 240,   None-1782589105_1-1782589105_1 due to empty baseline data
Skipping 88,   None-1545680582_1-1545680582_1 due to empty baseline dataSkipping 224,   None-1761946610_1-1761946610_1 due to empty baseline data



Skipping 241,   None-1783689710_1-1783689710_1 due to empty baseline data
Skipping 57,   None-1474037010_1-1474037010_1 due to empty baseline data
Skipping 193,   None-1718143205_1-1718143205_1 due to empty baseline dataSkipping 73,   None-1509752510_1-1509752510_1 due to empty baseline data

Skipping 74,   None-1510867905_1-1510867905_1 due to empty baseline data
Skipping 82,   None-1530517010_1-1530517010_1 due to empty baseline data
Skipping 83,   None-1530649410_1-1530649410_1 due to empty baseline data
Skipping 84,   None-1535257901_1-1535257901_1 

In [7]:
#value counts of results
results = pd.DataFrame(results, columns=['i', 'meter_id', 'status'])
results['status'].value_counts()

status
None                     785
empty_baseline           149
too_many_missing_data     52
Name: count, dtype: int64

In [8]:
results.loc[results['status'] == 'unknown_error']

Unnamed: 0,i,meter_id,status


In [None]:
# Serial version of the data-quality check: build HourlyData for every meter
# and report the ones with an empty baseline or with missing-data problems.
# NOTE(review): this loop rebinds `meter_baseline` and `kwargs`; after it
# finishes they describe the last meter in `ids` - confirm that later cells
# that read them expect that.
for i, meter_id in enumerate(ids):
    print(i, meter_id)
    meter = subsample_df.loc[meter_id].copy()

    # select baseline period
    meter_baseline = meter.loc[meter['period'] == 'baseline'].copy()

    meta_meter = meta.loc[meter_id]

    # get unique pair of station_latitude and station_longitude
    unique_pairs = meta_meter[['station_latitude', 'station_longitude']].drop_duplicates()

    # meta.loc[meter_id] is a DataFrame for ids matching several rows and a
    # Series for ids matching one row; np.atleast_1d lets the first value be
    # read the same way in both cases (`.values[0]` fails on the scalar that a
    # Series lookup returns).
    METADATA = {
        'station_latitude': np.atleast_1d(meta_meter['station_latitude'])[0],
        'station_longitude': np.atleast_1d(meta_meter['station_longitude'])[0],
    }

    #TODO: keep it simple and let the input force the output
    OUTPUT_FEATURES=['temperature', 'observed', 'model']
    if 'solar_supp' in settings.TRAIN_FEATURES:
        OUTPUT_FEATURES.append('solar_supp')
    if 'ghi' in settings.TRAIN_FEATURES:
        CONSIDER_SOLAR = True
        OUTPUT_FEATURES.append('ghi')
        OUTPUT_FEATURES.append('clearsky_ghi')
    else:
        CONSIDER_SOLAR = False
    kwargs = {
        'solar': CONSIDER_SOLAR,
        'metadata': METADATA,
        'outputs': OUTPUT_FEATURES,
    }
    if meter_baseline.empty:
        print(f"Skipping {i},   {meter_id} due to empty baseline data")
        continue
    data_baseline = HourlyData(meter_baseline, **kwargs)

    # The two checks below only report; the meter is not actually skipped.
    if data_baseline.too_many_missing_data:
        print(f"Skipping {i},   {meter_id} due to {data_baseline.missing_values_amount} missing data")
    if data_baseline.df[data_baseline.to_be_interpolated_columns].isnull().values.any():
        print(f"Skipping {i},   {meter_id} due to missing data, because of interpolaiton failed")

In [None]:
unique_pairs

## Interpolation analysis

In [None]:
def check_datetime(df):
    """Return a copy of ``df`` with a ``datetime`` column and a fresh RangeIndex.

    The ``datetime`` column is taken from the first source that exists:

    1. an existing ``datetime`` column,
    2. an index (or index level) named ``datetime``,
    3. a ``start_local`` column (which is then dropped),
    4. the first datetime-typed column, tz-naive or tz-aware (then dropped).

    Args:
        df: input frame; it is not modified.

    Returns:
        A new DataFrame whose index is reset to 0..n-1.

    Raises:
        ValueError: if none of the sources above is available.
    """
    # Work on a copy: the column assignments below would otherwise add a
    # 'datetime' column to the caller's frame as a side effect.
    df = df.copy()

    if "datetime" in df.columns:
        pass
    elif "datetime" in df.index.names:
        # get_level_values also copes with a MultiIndex that has a 'datetime' level
        df['datetime'] = df.index.get_level_values("datetime")
    elif "start_local" in df.columns:
        df['datetime'] = df['start_local']
        df = df.drop(columns=["start_local"])
    else:
        #TODO: check if this is the best way to do this
        # 'datetimetz' is needed as well: np.datetime64 alone only selects
        # tz-naive columns.
        datetime_columns = df.select_dtypes(include=[np.datetime64, "datetimetz"]).columns
        if len(datetime_columns) == 0:
            raise ValueError("datetime column not found")
        df['datetime'] = df[datetime_columns[0]]
        df = df.drop(columns=[datetime_columns[0]])

    #reset index to ensure datetime is not the index
    return df.reset_index(drop=True)

def get_contiguous_datetime(df):
    """Re-index ``df`` onto a gap-free hourly ``datetime`` grid.

    The grid starts at 00:00 of the earliest day and ends at 23:00 of the
    latest day found in ``df['datetime']``, so only whole days are produced.
    Hours missing from ``df`` appear as rows whose other columns are NaN.

    Args:
        df: frame with a ``datetime`` column; it is not modified.

    Returns:
        A new DataFrame (left merge of the hourly grid with ``df``) with the
        ``date`` and ``hour_of_day`` columns (re)computed from ``datetime``.
    """
    # make earliest start at 0 and latest end at 23, this ensures full days
    first_hour = df["datetime"].min().replace(hour=0, minute=0, second=0, microsecond=0)
    last_hour = df["datetime"].max().replace(hour=23, minute=0, second=0, microsecond=0)

    # A Timedelta step is used instead of the 'H' alias, which newer pandas
    # releases deprecate.
    complete_dt = pd.date_range(
        start=first_hour, end=last_hour, freq=pd.Timedelta(hours=1)
    ).to_frame(index=False, name="datetime")

    # merge meter data with complete_dt
    df = complete_dt.merge(df, on="datetime", how="left")
    df['date'] = df['datetime'].dt.date
    df['hour_of_day'] = df['datetime'].dt.hour

    return df

def remove_duplicate_datetime(df):
    """Keep exactly one row per ``datetime`` value.

    When an ``observed`` column exists, the row kept for a duplicated
    timestamp is the one with the largest ``abs(observed)``; rows whose
    ``observed`` is NaN are only kept when every duplicate is NaN.  Keeping one
    row in that case preserves the timestamp instead of opening a gap in the
    series.  Without an ``observed`` column the first occurrence is kept.

    Args:
        df: frame with a ``datetime`` column; it is not modified.

    Returns:
        A new DataFrame; in the ``observed`` case it is sorted by ``datetime``.
    """
    if "observed" not in df.columns:
        # TODO what if there is no observed column? Could have dup datetime with different temperatures
        return df.drop_duplicates(subset="datetime", keep="first")

    # assign() builds a new frame, so the helper column never touches the
    # caller's data (and no chained-assignment warning is raised).
    ranked = df.assign(abs_observed=df["observed"].abs()).sort_values(
        by=["datetime", "abs_observed"],
        ascending=[True, False],
        na_position="last",  # NaN readings lose against any real reading
    )
    deduplicated = ranked.drop_duplicates(subset="datetime", keep="first")
    return deduplicated.drop(columns=["abs_observed"])

In [None]:
# Hide 1000 randomly chosen time slots per column in `interpolation_columns`
# (set to NaN) and remember the hidden rows, so the interpolation can be
# scored against them later.
meter_baseline = check_datetime(meter_baseline)
print(meter_baseline.isna().sum())

meter_baseline_nans = meter_baseline.copy()
interpolation_columns = ['observed']
random_seeds = [42, 43]  # one seed per column, in order

save_nan_vals = {}
for col, seed in zip(interpolation_columns, random_seeds):
    np.random.seed(seed)
    random_slots = np.random.choice(meter_baseline['datetime'], 1000, replace=False)
    hidden_rows = meter_baseline_nans['datetime'].isin(random_slots)
    # keep the true values before they are wiped
    save_nan_vals[col] = meter_baseline_nans.loc[hidden_rows, ['datetime', col]]
    meter_baseline_nans.loc[hidden_rows, col] = np.nan

for checked_col in ('temperature', 'observed'):
    print(meter_baseline_nans[checked_col].isna().sum())

In [None]:
print(meter_baseline_nans.shape)
meter_baseline_nans = check_datetime(meter_baseline_nans)
meter_baseline_nans = get_contiguous_datetime(meter_baseline_nans)
meter_baseline_nans = remove_duplicate_datetime(meter_baseline_nans)
print(meter_baseline_nans.shape)
print(meter_baseline_nans['temperature'].isna().sum())
print(meter_baseline_nans['observed'].isna().sum())
meter_baseline_nans.set_index('datetime', inplace=True)

In [None]:
data_baseline = HourlyData(meter_baseline_nans, **kwargs)

In [None]:
data_baseline.outputs

In [None]:
col = 'observed'
plt.plot(data_baseline.df[col])
# plot the interpolated values on red
plt.plot(data_baseline.df.loc[data_baseline.df[f'interpolated_{col}']==True, col], 'ro')
plt.show()
print(data_baseline.df.shape)

In [None]:
na_dt = save_nan_vals['observed'].dropna()
interp_vals = data_baseline.df.loc[data_baseline.df.index.isin(na_dt['datetime']), 'observed']

In [None]:
na_dt.isna().sum()

In [None]:
interp_vals.isna().sum()

In [None]:
# Interpolated values (x) against the true values that were hidden (y).
# NOTE(review): both arrays are paired by position - confirm that `interp_vals`
# and `na_dt` are in the same datetime order.
plt.scatter(interp_vals.values, na_dt['observed'].values, color='red')
plt.show()

#get the rmse error
from sklearn.metrics import mean_squared_error
# Take the square root explicitly: the `squared=False` argument is deprecated
# in recent scikit-learn releases and no longer accepted by the newest ones.
rmse = np.sqrt(mean_squared_error(na_dt['observed'].values, interp_vals.values))
print(f"RMSE: {rmse}")

2.5 vs 2.7

2.2 vs 2.3


2.2   vs 2.35


In [None]:
        # combinations_list = []
        # for r in range(0, len(columns) + 1):
        #     combinations_list.extend(combinations(columns, r))
        # combinations_list = sorted(combinations_list, key=len, reverse=True)


In [None]:
# class Interpolator:
#     def __init__(self, grid_lag_lead_days = [-7, -1, 1, 7]):  
#         self.grid_lag_lead_days = grid_lag_lead_days
#         self.interp = RBFInterpolator
#         self.interpolated_values = {}

#     def interpolate(self, df, columns=['temperature', 'observed']):
#         self.df = df
#         self.columns = columns
#         for col in columns:
#             self.df[f'interpolated_{col}'] = False
#         # Main method to perform the interpolation
#         for col in self.columns:
#             self._col_interpolation(col)
        
#         self.df.set_index('datetime', inplace=True)
#         #for those datetime that we still haven't interpolated (for the columns), we will interpolate them with pd.interpolate
#         for col in self.columns:
#             still_na_datetime = self.df.loc[self.df[col].isna()].index
#             print(f"Interpolating {len(still_na_datetime)} nans for {col}")
#             self.df[col] = self.df[col].interpolate(method='time')
#             self.df.loc[self.df.index.isin(still_na_datetime), f'interpolated_{col}'] = True
        
#         return self.df
        
    
#     def _col_interpolation(self, col):
#         # Method to interpolate a single column
#         column_df = self.df[['datetime', 'date', col]]
#         # add lag and lead columns
#         interp_helper_cols = []
#         for shift in self.grid_lag_lead_days:
#             column_df[f'{col}_day_({shift})'] = column_df[col].shift(shift*24)
#             interp_helper_cols.append(f'{col}_day_({shift})')

#         column_df['hour_of_day'] = column_df['datetime'].dt.hour
#         #group by date
#         column_df = column_df.groupby('date')
#         # interpolate each day
#         interpolated_datetime = pd.DataFrame([], columns=['datetime', col])
#         abnormal_date = []
#         for date, group in column_df:
#             nans = group[group[col].isna()]

#             #TODO: update the values based on the previous interpolated points

#             if nans.empty:
#                 continue

#             group, interpolated_datetime = self._scenario_selector_interpolator(group, col, interp_helper_cols, interpolated_datetime)

#         interpolated_datetime_local = interpolated_datetime.set_index('datetime')
#         self.interpolated_values[col] = interpolated_datetime_local

#         #replace nans with the interpolated values for the column
#         self.df.loc[self.df['datetime'].isin(interpolated_datetime_local.index), col] = self.df['datetime'].map(interpolated_datetime_local[col])
#         self.df[f'interpolated_{col}'].loc[self.df['datetime'].isin(interpolated_datetime_local.index)] = True

#     def _scenario_selector_interpolator(self, group, col, interp_helper_cols, interpolated_datetime):
        
#         def joint_value_options(input_string):
#             from itertools import product
#             str_lst = [int(i) for i in input_string.split(" ")]
#             all_possible_options = [list(option) for option in product([0, 1], repeat=len(str_lst))]
#             all_possible_options = sorted([option for option in all_possible_options if all(i <= j for i, j in zip(option, str_lst))], reverse=True)
#             all_possible_options.pop()#drop all zeros option
#             return all_possible_options
        
#         normal_datapoints = group[~group[col].isna()]
#         nans = group[group[col].isna()]

#         nans_dt = nans['datetime'].values
#         normal_dt = normal_datapoints['datetime'].values

#         helper_matrix = group.loc[group['datetime'].isin(nans_dt)]
#         helper_matrix_mask = 1 - helper_matrix[interp_helper_cols].isna().astype(int) #0,1 mask for hashing the data

#         columns = helper_matrix_mask.columns.tolist()

#         helper_matrix_mask['pointer'] = helper_matrix_mask.apply(lambda row: ' '.join(row.astype(str)), axis=1) #string of columns that have value or not
#         helper_matrix_mask['class'] = helper_matrix_mask[columns].sum(axis=1)# this will be used for sorting the best options
#         helper_matrix_mask['datetime'] = helper_matrix['datetime']
 
#         datetime_lists = helper_matrix_mask.groupby('pointer')['datetime'].apply(list).to_dict()
#         nan_groups = helper_matrix_mask.groupby(['class', 'pointer']).size().reset_index(name='counts')
#         nan_groups = nan_groups.sort_values(by=['class', 'counts'], ascending=[False, False])
#         nan_groups[f'datetime_lists_nans'] = nan_groups['pointer'].map(datetime_lists)
        
#         has_value = group.loc[group['datetime'].isin(normal_dt)].reset_index()
#         has_value_mask = np.array(1 - has_value[interp_helper_cols].isna().astype(int).values)

#         #search through option of each nan group
#         for p in nan_groups['pointer'].unique():
#             row = nan_groups.loc[nan_groups['pointer']==p]
#             if not row['class'].values==0:
#                 nan_pointer_options = np.array(joint_value_options(row['pointer'].values[0]))
#                 filter = nan_pointer_options.sum(axis=1)
#                 lookup_matrix = np.dot(has_value_mask, nan_pointer_options.transpose())
#                 eligibles = np.where(lookup_matrix < filter, 0, 1).sum(axis=0)
#                 count = np.where(eligibles > filter, 1, 0) # check if we have enough points for RBFinterpolate
#                 if 1 in count:# give us the most promissing
#                     idx = list(count).index(1)
#                     best_scenario = nan_pointer_options[idx]
#                     #select the columns based on best scenario
#                     selected_cols = [c for i, c in enumerate(interp_helper_cols) if best_scenario[i]==1]
#                     selected_has_value_index = np.where(np.where(lookup_matrix < filter, 0, 1)[:,idx]==1)
#                     #get the point and value to feed interpolator
#                     interp_points = has_value.loc[has_value.index.isin(selected_has_value_index[0])][selected_cols]
#                     # add small random noise to remove singular matrix for RBF
#                     noise = np.random.standard_normal(size=interp_points.shape)
#                     interp_points += noise
#                     interp_values = has_value.loc[has_value.index.isin(selected_has_value_index[0])][col]

#                     nan_points_datetime =  nan_groups.loc[nan_groups['pointer']==p][f'datetime_lists_nans'].values[0]
#                     nan_points = nans.loc[nans['datetime'].isin(nan_points_datetime)][selected_cols]

#                     pred = self._interpolate_day(interp_points, interp_values, nan_points)

#                     interpolated_rows = list(zip(nan_points_datetime, pred))
#                     interpolated_rows = pd.DataFrame(interpolated_rows, columns=['datetime', col])
#                     interpolated_datetime = pd.concat([interpolated_datetime, interpolated_rows], ignore_index=True)

#                     interpolated_rows_local = interpolated_rows.set_index('datetime')
#                     #replace nans with the interpolated values for the group
#                     group.loc[group['datetime'].isin(nan_points_datetime), col] = group['datetime'].map(interpolated_rows_local[col])
        
#         return group, interpolated_datetime


#     def _interpolate_day(self, points, values, nans):
#         # Method to interpolate a single day
#         if len(values)==0:
#             return np.nan
        
#         interp = self.interp(points, values)
#         pred = interp(nans)
#         return pred


In [None]:
from scipy.interpolate import RBFInterpolator, griddata, LinearNDInterpolator, NearestNDInterpolator, CloughTocher2DInterpolator

"""Notes on the interpolators that were tried:
   griddata: needs explicit limits (a grid) for inputs such as temperature, which we do not want to impose
   LinearNDInterpolator: if a query point is out of the bounds of the training data it returns NaN - it does not extrapolate at all
   NearestNDInterpolator: predicted values are discrete and sometimes way off (we get the mean value of a couple of classes)
   CloughTocher2DInterpolator: same limitation as LinearNDInterpolator


   Some interpolation has apparently already been applied to the MCE temperature data: the temperatures of last week and of the day before were exactly the same, which is the reason we were getting singular matrices in the interpolation.
"""
def remove_duplicate_datetime(df):
    """Keep exactly one row per ``datetime`` value.

    When an ``observed`` column exists, the row kept for a duplicated
    timestamp is the one with the largest ``abs(observed)``; rows whose
    ``observed`` is NaN are only kept when every duplicate is NaN.  Keeping one
    row in that case preserves the timestamp instead of opening a gap in the
    series.  Without an ``observed`` column the first occurrence is kept.

    Args:
        df: frame with a ``datetime`` column; it is not modified.

    Returns:
        A new DataFrame; in the ``observed`` case it is sorted by ``datetime``.
    """
    if "observed" not in df.columns:
        # TODO what if there is no observed column? Could have dup datetime with different temperatures
        return df.drop_duplicates(subset="datetime", keep="first")

    # assign() builds a new frame, so the helper column never touches the
    # caller's data (and no chained-assignment warning is raised).
    ranked = df.assign(abs_observed=df["observed"].abs()).sort_values(
        by=["datetime", "abs_observed"],
        ascending=[True, False],
        na_position="last",  # NaN readings lose against any real reading
    )
    deduplicated = ranked.drop_duplicates(subset="datetime", keep="first")
    return deduplicated.drop(columns=["abs_observed"])

# Day-by-day interpolation experiment: for every day with missing temperature
# or observed values, the same hours of the previous day and of the same day
# one week earlier are used as inputs of an RBF interpolator.
# NOTE(review): `group` is the per-day frame handed out by groupby; values
# written into it are only used for the plots below and are not written back
# to `meter_baseline_nans`.
grouped_meter_baseline = meter_baseline_nans.copy()
if (
    'datetime' not in grouped_meter_baseline.columns
    and 'datetime' in grouped_meter_baseline.index.names
):
    # An earlier cell moves 'datetime' into the index, but
    # remove_duplicate_datetime needs it as a column.
    grouped_meter_baseline = grouped_meter_baseline.reset_index()
grouped_meter_baseline['interpolated'] = False
grouped_meter_baseline = grouped_meter_baseline.groupby('date')


none_regular_dates = []   # days for which no valid hour was left to fit on
daily_thr_nans = 0        # a day is processed when it has more NaNs than this
seeds = [0, 1]            # one noise seed per helper day (by position)
for date, group in grouped_meter_baseline:
    #remove any duplicate
    group = remove_duplicate_datetime(group)
    if (
        group['temperature'].isna().sum() <= daily_thr_nans
        and group['observed'].isna().sum() <= daily_thr_nans
    ):
        continue

    interpolation_columns = ['temperature', 'observed']

    #get previous week date and yesterday
    # `days=1` shifts by one day; the singular `day=1` would instead replace
    # the day-of-month with 1.
    today = pd.Timestamp(date)
    yesterday = today - pd.DateOffset(days=1)
    yesterday_df = meter_baseline.loc[meter_baseline['date']==yesterday.date()]
    yesterday_df = remove_duplicate_datetime(yesterday_df)

    last_week = today - pd.DateOffset(weeks=1)
    last_week_df = meter_baseline.loc[meter_baseline['date']==last_week.date()]
    last_week_df = remove_duplicate_datetime(last_week_df)

    if yesterday_df.empty and last_week_df.empty:
        #make a linear interpolation from what we have
        group[interpolation_columns] = group[interpolation_columns].interpolate()
        continue

    # Helper days that actually have data, with the label used in the plots.
    intepolation_helper_days = []
    helper_day_labels = []
    if not yesterday_df.empty:
        intepolation_helper_days.append(yesterday_df)
        helper_day_labels.append('yesterday')
    if not last_week_df.empty:
        intepolation_helper_days.append(last_week_df)
        helper_day_labels.append('last week')

    for inter_col in interpolation_columns:
        if group[inter_col].isna().sum() == 0:
            continue

        # Start from empty lists for every column; otherwise the vectors of
        # the previous column would be the ones read at positions [0] and [1].
        helper_vectors_nans = []
        helper_vectors_normal = []

        # hours of this day without / with a value for the column
        nan_hours = group.loc[group[inter_col].isna()]['hour_of_day'].values
        normal_hours = group.loc[~group[inter_col].isna()]['hour_of_day'].values

        for i, interp_df in enumerate(intepolation_helper_days):
            interp_df_nans = interp_df.loc[interp_df['hour_of_day'].isin(nan_hours)][inter_col].values
            interp_df_normal = interp_df.loc[interp_df['hour_of_day'].isin(normal_hours)][inter_col].values

            #singularity prevention
            # np.random.seed must be *called*; assigning to it would replace
            # the function and break every later seeding call.
            np.random.seed(seeds[i])
            noise = np.random.normal(0, 1, len(interp_df_normal))
            # out-of-place add: never writes into the helper day's data
            interp_df_normal = interp_df_normal + noise

            helper_vectors_normal.append(interp_df_normal)
            helper_vectors_nans.append(interp_df_nans)

        if len(helper_vectors_normal) == 1:
            temp_vector_normal = helper_vectors_normal[0].reshape(-1,1)
            temp_vector_nans = helper_vectors_nans[0].reshape(-1,1)
        else:
            temp_vector_normal = list(zip(helper_vectors_normal[0], helper_vectors_normal[1]))
            temp_vector_nans = list(zip(helper_vectors_nans[0], helper_vectors_nans[1]))

        # target_vector = group.loc[~group[inter_col].isna()][inter_col].values
        target_vector = group.loc[group['hour_of_day'].isin(normal_hours)][inter_col].values
        if len(target_vector)==0:
            none_regular_dates.append(date)
            continue

        interp = RBFInterpolator(temp_vector_normal, target_vector)
        preds = interp(temp_vector_nans)
        group.loc[group['hour_of_day'].isin(nan_hours), inter_col] = preds

        x = group['hour_of_day'].values
        for lbl, interp_df in zip(helper_day_labels, intepolation_helper_days):
            plt.plot(x,interp_df[inter_col], label = lbl)

        plt.plot(x,group[inter_col], label = 'today')
        #plot temp values for interpolated hours as red
        xp = group.loc[group['hour_of_day'].isin(nan_hours)]['hour_of_day'].values
        plt.scatter(xp, group.loc[group['hour_of_day'].isin(nan_hours), inter_col], marker='^', color='red',label='interpolated')
        print(date)
        print(inter_col)
        plt.legend()
        plt.show()

print(none_regular_dates)

In [None]:
meter_baseline_nans
print(meter_baseline_nans['temperature'].isna().sum())
print(meter_baseline_nans['observed'].isna().sum())

In [None]:
meter_baseline_nans

In [None]:
temp_vector_normal

In [None]:
nan_hours

In [None]:
len(target_vector)

In [None]:
# Heatmap of the fitted model coefficients (model._model.coef_).
# NOTE(review): `model` is not created in any cell shown before this one -
# confirm which cell fits it before running.
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# get the model coefficients
model_coef = model._model.coef_

#fig size
plt.rcParams['figure.figsize'] = [12, 4]
# sns.heatmap needs 2-D input; np.atleast_2d turns a 1-D coefficient vector
# into a single row and leaves a 2-D matrix unchanged.
sns.heatmap(np.atleast_2d(model_coef))
plt.show()

In [None]:
pred_baseline

In [None]:
# Rows drawn: start `s`, width `w`; with w = -1 the slice s:e is every row
# except the last one.
s, w = 0, -1
e = s + w
for column, label in (('observed', 'observed'), ('model', 'CT'), ('predicted', 'new model')):
    plt.plot(pred_baseline[column].iloc[s:e].values, label=label)
# plt.plot(-pred_baseline['ghi'].iloc[s:e].values*3, label='ghi')
plt.legend()
plt.show()

In [None]:
meter_reporting = meter.loc[meter['period'] == 'reporting'].copy()
data_reporting = HourlyData(meter_reporting, **kwargs)
reporting_pred = model.predict(data_reporting)
reporting_pred

In [None]:
# Rows drawn: start `s`, width `w`; with w = -1 the slice s:e is every row
# except the last one.
s, w = 0, -1
e = s + w
for column, label in (('observed', 'observed'), ('model', 'CT'), ('predicted', 'new model')):
    plt.plot(reporting_pred[column].iloc[s:e].values, label=label)
# plt.plot(-pred_baseline['ghi'].iloc[s:e].values*3, label='ghi')
plt.legend()
plt.show()

In [None]:
# Hours whose temperature exceeds the 0.995 quantile of the reporting period.
temperature = reporting_pred['temperature']
hot_hours = reporting_pred.loc[temperature > temperature.quantile(0.995)]
unique_days = np.unique(hot_hours.index.date)
print(unique_days.shape)

# Keep every hour of each day that contains at least one of those hot hours.
reporting_pred['date'] = reporting_pred.index.date
hot_days = reporting_pred.loc[reporting_pred['date'].isin(unique_days)]
hot_days

In [None]:
plt.rcParams['figure.figsize'] = [15, 5]
# Rows drawn: start `s`, width `w`; with w = -1 the slice s:e is every row
# except the last one.
s, w = 24*0, -1
e = s + w
for column, label in (('observed', 'observed'), ('model', 'CT'), ('predicted', 'predicted')):
    plt.plot(hot_days[column].iloc[s:e].values, label=label)
plt.legend()

# plt.plot(-hot_days['ghi'].iloc[s:e].values*7, label='ghi')

plt.show()

In [None]:
# First week (24*7 hourly rows) of the reporting-period predictions.
s, w = 24*0, 24*7
e = s + w
for column, label in (('observed', 'observed'), ('model', 'CT'), ('predicted', 'new model')):
    plt.plot(reporting_pred[column].iloc[s:e].values, label=label)
# plt.plot(-pred_baseline['ghi'].iloc[s:e].values*3, label='ghi')
plt.legend()
plt.show()

In [None]:
def PNRMSE(y_true, y_pred, iqr):
    """Root-mean-square error of ``y_pred`` against ``y_true``, divided by ``iqr``."""
    residual = y_true - y_pred
    rmse = np.sqrt(np.mean(residual ** 2))
    return rmse / iqr

def CVRMSE(y_true, y_pred, mn):
    """Root-mean-square error of ``y_pred`` against ``y_true``, divided by ``mn``."""
    residual = y_true - y_pred
    rmse = np.sqrt(np.mean(residual ** 2))
    return rmse / mn

def MBE(y_true, y_pred):
    """Mean of ``y_true - y_pred`` (positive when the prediction is too low on average)."""
    bias = y_true - y_pred
    return np.mean(bias)



In [None]:
# Average profiles of the reporting period per season, first by hour of week
# and then by hour of day.
seasons = {'winter': [11, 12, 1, 2], 'shoulder': [3, 4, 5, 10], 'summer': [6, 7, 8, 9]}


def _season_of(month):
    """Season name of a month number; anything not winter/shoulder is summer."""
    if month in seasons['winter']:
        return 'winter'
    if month in seasons['shoulder']:
        return 'shoulder'
    return 'summer'


# Grouping keys, stored as categoricals.
reporting_pred['season'] = reporting_pred.index.month
reporting_pred['season'] = reporting_pred['season'].apply(_season_of)
reporting_pred['season'] = reporting_pred['season'].astype('category')

reporting_pred['hour_of_week'] = reporting_pred.index.dayofweek*24 + reporting_pred.index.hour
reporting_pred['hour_of_week'] = reporting_pred['hour_of_week'].astype('category')

reporting_pred['hour'] = reporting_pred.index.hour
reporting_pred['hour'] = reporting_pred['hour'].astype('category')


# One figure per time key, each with the three series of interest.
plt.rcParams['figure.figsize'] = [15, 5]

profile_series = (('observed', 'observed'), ('model', 'CT'), ('predicted', 'predicted'))
for time_key in ('hour_of_week', 'hour'):
    for column, label in profile_series:
        plt.plot(reporting_pred.groupby(['season', time_key])[column].mean().values, label=label)
    plt.legend()
    plt.show()

In [None]:
# Work on a copy of the reporting-period frame so the grouping columns added
# below do not touch `reporting_pred` itself.
dft = reporting_pred.copy()

# Month -> season lookup built from the season definition; any value that is not
# listed falls back to 'summer'.
seasons = {'winter': [11, 12, 1, 2], 'shoulder': [3, 4, 5, 10], 'summer': [6, 7, 8, 9]}
_month_to_season = {
    month: season_name
    for season_name, season_months in seasons.items()
    for month in season_months
}
dft['season'] = dft.index.month
dft['season'] = dft['season'].map(lambda month: _month_to_season.get(month, 'summer'))

# Hour-of-week (0..167) and hour-of-day keys derived from the datetime index.
dft['hour_of_week'] = dft.index.dayofweek*24 + dft.index.hour
dft['hour'] = dft.index.hour

# Store every grouping key as a categorical.
for _key_column in ('season', 'hour_of_week', 'hour'):
    dft[_key_column] = dft[_key_column].astype('category')

# Window settings; the active plotting code in this cell does not read them.
start = 0
pltwindow = 24*7
end = start + pltwindow

# Wide, high-resolution figure. rcParams are global, so later cells inherit
# these values until they set their own.
plt.rcParams.update({"figure.figsize": (21, 6), "figure.dpi": 500})

# Mean observed vs. predicted load for every (season, hour_of_week) group.
_weekly_profile = dft.groupby(['season', 'hour_of_week'])
plt.plot(_weekly_profile['observed'].mean().values, label='observed', linewidth=2.5)
plt.plot(_weekly_profile['predicted'].mean().values, label='predicted', color='red', linewidth=4.5)

# Dashed separators at x=168 and x=336.
# NOTE(review): these read as season boundaries, assuming each season contributes
# 168 hour-of-week points — confirm against the groupby output length.
for _boundary in (168, 336):
    plt.axvline(_boundary, color='black', linewidth=1, linestyle='--' , alpha=0.5)

# Larger tick labels on both axes.
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)

# Faint horizontal reference line just below zero (y = -0.02).
plt.axhline(-0.02, color='black', linewidth=1, linestyle='--' , alpha=0.5)

# Fixed y-range so that figures from different runs share the same scale.
plt.ylim(-2.7, 2)
plt.show()


# Normalisation constants come from the baseline observations: the interquartile
# range (passed to PNRMSE) and the mean (passed to CVRMSE).
_baseline_observed = pred_baseline['observed']
iqr = np.percentile(_baseline_observed, 75) - np.percentile(_baseline_observed, 25)
mn = np.mean(_baseline_observed)

# Score both the `model` and the `predicted` columns against `observed`.
# PNRMSE / CVRMSE / MBE are helpers defined outside this cell.
metrics = {
    'PNRMSE': PNRMSE(dft['observed'], dft['model'], iqr),
    'PNRMSE_predicted': PNRMSE(dft['observed'], dft['predicted'], iqr),
    'CVRMSE': CVRMSE(dft['observed'], dft['model'], mn),
    'CVRMSE_predicted': CVRMSE(dft['observed'], dft['predicted'], mn),
    'MBE': MBE(dft['observed'], dft['model']),
    'MBE_predicted': MBE(dft['observed'], dft['predicted']),
}

metrics

In [None]:
# Observed load across the baseline frame (`pred_baseline`), drawn against the
# row position, with the first and last calendar dates as the only x labels.
start = 0
pltwindow = -1
end = start + pltwindow

# NOTE(review): with start=0 and pltwindow=-1 this is iloc[0:-1], so the final
# row of the frame is left out — confirm that is intended.
dft = pred_baseline.copy().iloc[start:end]

# Figure size and resolution (global rcParams; they persist for later cells).
plt.rcParams.update({"figure.figsize": (12, 6), "figure.dpi": 300})

_observed = dft['observed']
plt.plot(_observed.values, label='observed', linewidth=2.5)

# Label only the two ends of the x axis with the dates of the first and last rows.
first_date, last_date = (dft.index[_pos].date() for _pos in (0, -1))
plt.xticks([0, len(_observed)], [first_date, last_date])

# Dashed reference line at the mean of the plotted observations.
plt.axhline(_observed.mean(), color='black', linewidth=1, linestyle='--' , alpha=0.5)

plt.show()

In [None]:
# dft = pred_baseline.copy()
# dft['observed'].describe()
# idx = dft['observed'].argmin() -24*2
# print(idx)

# selected_date = dft.index[idx].date()
# # get the data for the selected date
# selected_data = dft[dft.index.date == selected_date]


In [None]:
# Plot the observed load for the hand-picked slice `selected_data`.
#
# NOTE(review): `selected_data` is not created in this cell. The only recipe
# visible nearby is the commented-out cell directly above (the day two days
# before the baseline minimum) — confirm it is defined before running this cell.
start = 24*0
pltwindow = -1
end = start + pltwindow

# The original cell followed this line with `dft = repo`, an unfinished
# statement that referenced an undefined name (NameError) and discarded the
# copy made here. It has been removed so the cell plots `selected_data`.
dft = selected_data.copy()

# Keep the rows between start and end.
# NOTE(review): with start=0 and pltwindow=-1 this is iloc[0:-1], which leaves
# out the final row — confirm that is intended.
dft = dft.iloc[start:end]

plt.rcParams["figure.figsize"] = (12,6)
# higher dpi for better quality
plt.rcParams['figure.dpi'] = 300

# x axis is the row position within the slice
plt.plot(dft['observed'].values, label='observed', linewidth=2.5)

# Label only the two ends of the x axis with the dates of the first and last rows.
first_date = dft.index[0].date()
last_date = dft.index[-1].date()
plt.xticks([0, len(dft['observed'])], [first_date, last_date])

# Dashed reference line at the mean of the plotted observations.
plt.axhline(dft['observed'].mean(), color='black', linewidth=1, linestyle='--' , alpha=0.5)

plt.show()

In [None]:
# Scratch calculation: displays the ratio 634.4 / 208 as the cell output.
# NOTE(review): neither constant is explained anywhere in this part of the
# notebook — name them or remove the cell.
634.4/208

In [None]:
# Display the metadata for the selected meter (`meta_meter = meta.loc[meter_id]`,
# assigned in an earlier cell).
meta_meter

In [None]:
# Inspect `reporting_pred` (the frame used by the plots above) as the cell output.
reporting_pred

In [None]:
# Observed load across the reporting frame (`reporting_pred`), drawn against the
# row position, with the first and last calendar dates as the only x labels.
start = 0
pltwindow = -1
end = start + pltwindow

# NOTE(review): with start=0 and pltwindow=-1 this is iloc[0:-1], so the final
# row of the frame is left out — confirm that is intended.
dft = reporting_pred.copy().iloc[start:end]

# Figure size and resolution (global rcParams; they persist for later cells).
plt.rcParams.update({"figure.figsize": (12, 6), "figure.dpi": 300})

_observed = dft['observed']
plt.plot(_observed.values, label='observed', linewidth=2.5)

# Label only the two ends of the x axis with the dates of the first and last rows.
first_date, last_date = (dft.index[_pos].date() for _pos in (0, -1))
plt.xticks([0, len(_observed)], [first_date, last_date])

# Dashed reference line at y = 0.
plt.axhline(0, color='black', linewidth=1, linestyle='--' , alpha=0.5)

plt.show()

In [None]:
# Summary statistics (DataFrame.describe) for the whole of `reporting_pred`.
reporting_pred.describe()

In [None]:
# Keep only the rows whose timestamp is at or after the PV intervention
# (`PV_interventation`, a pandas Timestamp set in an earlier cell), then
# summarise that subset.
_on_or_after_intervention = reporting_pred.index >= PV_interventation
selected = reporting_pred.loc[_on_or_after_intervention]
selected.describe()