# Setup & Import


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pvlib
import json
import os
from pvlib.pvsystem import PVSystem, Array, FixedMount
from pvlib.location import Location
from pvlib.modelchain import ModelChain
from pvlib.temperature import TEMPERATURE_MODEL_PARAMETERS
import plotly.graph_objects as go
import plotly.io as pio
from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import forestci as fci
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import threading
from sklearn.metrics import make_scorer
import dash
from dash import dcc, html
import plotly.graph_objects as go
from dash.dependencies import Input, Output
import webbrowser
from threading import Timer
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle
import time

from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import (
    check_is_fitted,
)
from sklearn.ensemble._base import _partition_estimators

pio.renderers.default = "browser"  # render plotly figures in browser

# Root folder holding the exported data; it must be provided through the environment.
PARENT_DATA_DIR = os.getenv('PARENT_DATA_DIR')
if not PARENT_DATA_DIR:  # unset or empty: both would produce a meaningless data path
    raise ValueError("PARENT_DATA_DIR environment variable is not set")

# Build the path with os.path.join so that it is valid on every OS (a hard-coded "\" separator only works on Windows).
dataDirpath = os.path.join(PARENT_DATA_DIR, "PRiOT", "dataExport_2")  # "/Applications/Documents/TM Maxime/dataExport_3400_daily"#
dataCacheDirpath = os.path.join(dataDirpath, "cache")
logsDirpath = "../logs"
useCached = True  # reuse the pickled intermediate results found in dataCacheDirpath
forceTrain = False
tuneMaxProductionEstimators = True  # fit the 'loss' of each clear-sky estimator on the measured data
random_state = 42


testingDays = 100  # number of most recent days held out as test set
minTestingDays = 30  # minimum number of non-null days a system needs in the test set
minTrainingDays = 7  # minimum number of non-null days a system needs in the training set

# exist_ok=True makes the cell idempotent and avoids the check-then-create race
os.makedirs(logsDirpath, exist_ok=True)
os.makedirs(dataCacheDirpath, exist_ok=True)

# Serializer


In [2]:
# https://scikit-learn.org/stable/model_persistence.html


class ModelSerializer:
    """Base serializer that delegates to any module exposing ``dump`` and ``load``.

    Subclasses choose the backend (e.g. ``pickle``) and the target (a path or an
    open file object) and forward both to the two helpers below.
    """

    def _save_model(self, model, serial_type, save_params):
        """Persist ``model`` by calling ``serial_type.dump(model, save_params)``."""
        dump = serial_type.dump
        dump(model, save_params)

    def _retrieve_model(self, serial_type, retrieve_params):
        """Return the object produced by ``serial_type.load(retrieve_params)``."""
        load = serial_type.load
        restored = load(retrieve_params)
        return restored


# save_model_path = "Serialized_models\\"


class JoblibSerializer(ModelSerializer):
    """Serializer storing models as ``<filename>.joblib`` files through joblib."""

    def save_model(self, model, save_model_path, filename):
        """Dump ``model`` to ``<save_model_path>/<filename>.joblib``.

        The target directory is created when missing, like ``PickleSerializer`` does.
        """
        # joblib is not imported at the top of the notebook: import it here so the
        # method does not fail with a NameError on a fresh kernel.
        import joblib

        if save_model_path:
            os.makedirs(save_model_path, exist_ok=True)
        super()._save_model(model, joblib, os.path.join(save_model_path, filename + ".joblib"))

    def retrieve_model(self, save_model_path, filename):
        """Load and return the model stored in ``<save_model_path>/<filename>.joblib``.

        NOTE: joblib files are pickle-based; only load files from a trusted source.
        """
        import joblib

        return super()._retrieve_model(joblib, os.path.join(save_model_path, filename + '.joblib'))


class PickleSerializer(ModelSerializer):
    """Serializer storing models as ``<filename>.pkl`` files through pickle."""

    def save_model(self, model, save_model_path, filename):
        """Pickle ``model`` to ``<save_model_path>/<filename>.pkl``.

        The target directory is created when missing. An empty ``save_model_path``
        means the current working directory and needs no creation.
        """
        if save_model_path:
            # exist_ok=True avoids the race between "check" and "create"
            os.makedirs(save_model_path, exist_ok=True)
        with open(os.path.join(save_model_path, filename + ".pkl"), 'wb') as f:
            super()._save_model(model, pickle, f)

    def retrieve_model(self, save_model_path, filename):
        """Load and return the model stored in ``<save_model_path>/<filename>.pkl``.

        NOTE: unpickling executes arbitrary code; only load files from a trusted source.
        """
        with open(os.path.join(save_model_path, filename + ".pkl"), 'rb') as f:
            return super()._retrieve_model(pickle, f)

# Function


In [3]:
def get_altitude_from_wgs84(longitude, latitude, timeout=10):
    """Return the altitude of a WGS84 point using the geo.admin.ch web services.

    The point is first converted from WGS84 to LV95 with the "reframe" service,
    then the height is requested for the resulting LV95 coordinates.

    Parameters
    ----------
    longitude, latitude : WGS84 coordinates, sent as "easting" / "northing".
    timeout : number of seconds to wait for each HTTP request. Without a timeout a
        stalled connection would block the notebook forever.

    Returns
    -------
    float : the "height" value returned by the height service.

    Raises
    ------
    Exception : when one of the two services answers with a status other than 200.
    requests.exceptions.RequestException : on network errors or timeouts.
    """
    def _get_json(url, params, error_prefix):
        # Perform one GET request and return the decoded JSON body of a successful answer
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            raise Exception(error_prefix + response.text)
        return response.json()

    # Convert WGS84 to LV95
    lv95_data = _get_json(
        "https://geodesy.geo.admin.ch/reframe/wgs84tolv95",
        {"easting": longitude, "northing": latitude, "format": "json"},
        "Error converting WGS84 to LV95: ",
    )

    # Get altitude from LV95 coordinates
    altitude_data = _get_json(
        "https://api3.geo.admin.ch/rest/services/height",
        {"easting": lv95_data["easting"], "northing": lv95_data["northing"]},
        "Error retrieving altitude: ",
    )

    return float(altitude_data["height"])


def remove_system(systemName, message):
    """Remove a system from every notebook-level collection that references it.

    Only the globals that already exist are touched, so the function can be called
    at any stage of the notebook. ``message`` is printed once the removal is done.

    Parameters
    ----------
    systemName : name of the system (list item / DataFrame column) to remove.
    message : text printed to explain why the system is removed.
    """
    scope = globals()

    # List of valid systems: remove every occurrence, in place, so that other
    # references to the list see the change.
    valid_names = scope.get('systemsName_Valid')
    if valid_names is not None and systemName in valid_names:
        valid_names[:] = [name for name in valid_names if name != systemName]

    # DataFrames holding one column per system
    for frame_name in (
        'systemsData_EstimatedMaxDailyEnergy',
        'systemsData_MeasuredDailyEnergy_train',
        'systemsData_MeasuredDailyEnergy_test',
        'systemsData_MeasuredDailyEnergy',
    ):
        frame = scope.get(frame_name)
        if frame is not None and systemName in frame.columns:
            frame.drop(columns=systemName, inplace=True)

    # Estimated maximum daily energy used as normalizer
    normalizer = scope.get('normalizer_estimated_max_daily_energy')
    if normalizer is not None and systemName in normalizer:
        normalizer.pop(systemName)

    print(message)

## Import metadata


In [None]:
metadataFilepath = os.path.join(dataDirpath, "metadata.json")

with open(metadataFilepath, 'r') as f:
    systemsMetadata = json.load(f)

# Add altitude to metadata, if not already present (TODO: improve with multi-threading).
# Systems without longitude/latitude are left untouched; they are filtered out later.
for systemId, systemMetadata in tqdm(systemsMetadata.items()):
    systemFields = systemMetadata['metadata']
    if "loc_altitude" not in systemFields and "loc_longitude" in systemFields and "loc_latitude" in systemFields:
        systemFields["loc_altitude"] = get_altitude_from_wgs84(systemFields["loc_longitude"], systemFields["loc_latitude"])

# Split arrays in dictionaries by module number:
# every metadata key containing 'mod' is moved to systemMetadata['arrays'][<module number>].
for systemId, systemMetadata in systemsMetadata.items():
    systemFields = systemMetadata['metadata']
    # Snapshot of the keys to move, as the dictionary is modified inside the loop
    for key in [k for k in systemFields if 'mod' in k]:
        key_parts = key.split('_')
        # Extract the module number: all the trailing digits of the second key segment,
        # so that two-digit module numbers are not merged with single-digit ones.
        # Fall back to the last character when the segment does not end with a digit.
        segment = key_parts[1]
        trailing_digits = segment[len(segment.rstrip('0123456789')):]
        array_num = trailing_digits or segment[-1]
        # Remove the module segment from the key
        new_key = '_'.join(key_parts[:1] + key_parts[2:])
        # Move the key-value pair to the appropriate module dictionary
        systemMetadata.setdefault('arrays', {}).setdefault(array_num, {})[new_key] = systemFields.pop(key)

# Save metadata with altitude
with open(metadataFilepath, 'w') as f:
    json.dump(systemsMetadata, f, indent=4)

print("Number of systems in metadata: ", len(systemsMetadata))

## Import measures


In [None]:
cacheFilename_systemsData_MeasuredDailyEnergy = os.path.join(dataCacheDirpath, 'systemsData_MeasuredDailyEnergy.pkl')


def _coerce_metadata_number(container, key):
    """Convert ``container[key]`` in place to an int (preferred) or a float.

    Returns True when the value already is, or could be converted to, a number and
    False otherwise (the value is then left untouched). TypeError is caught as well
    as ValueError so that values such as None are reported instead of crashing.
    """
    if isinstance(container[key], (int, float)):
        return True
    for caster in (int, float):
        try:
            container[key] = caster(container[key])
            return True
        except (ValueError, TypeError):
            continue
    return False


# NOTE: when the cached measures are loaded, the metadata checks of the "else" branch
# (number conversion, default 'loss') are not applied to systemsMetadata.
if useCached and os.path.exists(cacheFilename_systemsData_MeasuredDailyEnergy):
    print(f"Loading cached data in {cacheFilename_systemsData_MeasuredDailyEnergy}")
    systemsData_MeasuredDailyEnergy = pd.read_pickle(cacheFilename_systemsData_MeasuredDailyEnergy)
    systemsName_Valid = list(systemsData_MeasuredDailyEnergy.columns)
else:
    # Load all csv files from the data directory
    systemsData = {}
    for file in os.listdir(dataDirpath):
        if file.endswith(".csv"):
            systemName = file.split("_")[0]
            systemData = pd.read_csv(os.path.join(dataDirpath, file))
            systemData['Datetime'] = pd.to_datetime(systemData['Timestamp'], unit='ms', utc=True).dt.tz_convert('Europe/Zurich')
            # Keep only the date, as the production is the daily production. The +1h handles the daylight
            # saving time: PRiOT normally exports the energy of a day at midnight local time (e.g. the energy
            # of July 1st is saved at July 1st 00:00 Europe/Zurich), but the saving time is not always handled
            # correctly and the export is sometimes done at 23:00 the day before (e.g. June 30th 23:00).
            systemData['Date'] = (systemData['Datetime'] + pd.Timedelta(hours=1)).dt.date
            systemsData[systemName] = systemData

    systemsName = list(systemsData.keys())

    df_duplicate_list = list()
    for systemName, systemData in systemsData.items():
        # Save duplicate dates to the log list, written to a log file below
        duplicates = systemData[systemData['Date'].duplicated(keep=False)]
        if len(duplicates) > 0:
            df_duplicate_list.append(duplicates)

            # For duplicated dates, keep the row with the biggest tt_forward_active_energy_total_toDay.
            # TODO maybe we should sum the energy of the duplicates instead. However, in the PRiOT Portal it seems that only the biggest value is represented in the daily energy. We do the same here.
            # na_position='first' so that a missing value never wins over a real measure with keep='last'.
            systemData.sort_values('tt_forward_active_energy_total_toDay', ascending=True, na_position='first', inplace=True)
            systemData.drop_duplicates(subset='Date', keep='last', inplace=True)

        # Set date as the index and sort the data by date
        systemData.set_index('Date', inplace=True)
        systemData.sort_index(ascending=True, inplace=True)

    # Save duplicate dates to log file (pd.concat raises on an empty list, hence the guard)
    df_duplicate = pd.concat(df_duplicate_list) if df_duplicate_list else pd.DataFrame()
    print(f"Number of duplicate dates found: {len(df_duplicate)} (see log file for more details)")
    df_duplicate.to_csv(os.path.join(logsDirpath, 'duplicateDates.csv'), index=True)

    ## ----------------------------------------------- ##
    ## Convert data & Filter out invalid PRiOT systems ##
    ## ----------------------------------------------- ##

    systemsName_Valid = systemsName.copy()
    for systemName in systemsName:
        missingData = False
        # Check if the system has measures
        if len(systemsData[systemName]) == 0:
            missingData = True
            print(f"System {systemName} : No measures found")
        # Check if the system has metadata
        if systemName not in systemsMetadata:
            missingData = True
            print(f"System {systemName} : No metadata found")
        else:
            systemFields = systemsMetadata[systemName]['metadata']
            # Check metadata for the system
            for key in ['loc_latitude', 'loc_longitude', 'loc_altitude', 'pv_kwp']:
                # test that the key is present
                if key not in systemFields:
                    missingData = True
                    print(f"System {systemName} : No '{key}' found")
                # if present, convert the value to a number, if possible
                elif not _coerce_metadata_number(systemFields, key):
                    missingData = True
                    print(f"System {systemName} : The key-value '{key}:{systemFields[key]}' is not a number")

            # Check metadata for the arrays
            if 'arrays' not in systemsMetadata[systemName] or len(systemsMetadata[systemName]['arrays']) == 0:
                print(f"System {systemName} : No PV arrays found")
                missingData = True
            else:
                for array_num, arrayData in systemsMetadata[systemName]['arrays'].items():
                    for key in ['pv_tilt', 'pv_azimut', 'pv_wp', 'pv_number']:
                        if key not in arrayData:
                            missingData = True
                            print(f"System {systemName} : No '{key}' found for array {array_num}")
                        # test that the value is a number
                        elif not _coerce_metadata_number(arrayData, key):
                            missingData = True
                            print(f"System {systemName} : The key-value '{key}:{arrayData[key]}' is not a number for array {array_num}")

            # add the loss metadata if not present
            systemFields.setdefault('loss', 0)

        if missingData:
            systemsName_Valid.remove(systemName)
            print(f"-> Removing system {systemName} from the list of systems")

    print(f"\nNumber of systems with all the necessary data: {len(systemsName_Valid)}/{len(systemsName)}")

    # # Filter out systems with less than X days of data
    # for systemName in systemsName_Valid[:]:  # Create a copy of the list using slicing [:] to avoid removing elements while iterating over the list itself
    #     if len(systemsData[systemName]) < minMeasurements:
    #         systemsName_Valid.remove(systemName)
    #         print(f"-> Removing system {systemName} from the list of systems because it has less than {minMeasurements} days of data")

    # print(f"Number of systems with at least {minMeasurements} days of data: {len(systemsName_Valid)}/{len(systemsName)}")

    ## ---------------------------------------------------------------------------- ##
    ## Create one 2D DataFrame with the daily production of every remaining systems ##
    ## ---------------------------------------------------------------------------- ##

    # One series per valid system: its 'tt_forward_active_energy_total_toDay' column, named after the system
    systemsData_MeasuredDailyEnergy_List = []
    for systemName in systemsName_Valid:
        measuredDailyEnergy = systemsData[systemName]['tt_forward_active_energy_total_toDay'].rename(systemName)
        systemsData_MeasuredDailyEnergy_List.append(measuredDailyEnergy)

    # Concatenate all the series to create one dataframe (one column per system, one row per date)
    systemsData_MeasuredDailyEnergy = pd.concat(systemsData_MeasuredDailyEnergy_List, axis=1)
    systemsData_MeasuredDailyEnergy.index = pd.to_datetime(systemsData_MeasuredDailyEnergy.index)
    systemsData_MeasuredDailyEnergy.sort_index(inplace=True)

    ## ------------------ ##
    ## Save the dataframe ##
    ## ------------------ ##
    # Save the dataframe for later use
    systemsData_MeasuredDailyEnergy.to_pickle(cacheFilename_systemsData_MeasuredDailyEnergy)

# Print the dataframe
systemsData_MeasuredDailyEnergy

## Create train & test set


In [6]:
# Create a test set with the last `testingDays` days (chronological split: shuffle=False).
# The training set must keep at least one day, so testingDays has to be strictly smaller
# than the number of days in the dataset. testingDays == 0 (no test set) is not supported.
if testingDays >= len(systemsData_MeasuredDailyEnergy):
    raise ValueError(f"testingDays ({testingDays}) must be smaller than the number of days in the dataset ({len(systemsData_MeasuredDailyEnergy)})")
# random_state has no effect while shuffle=False; the notebook-level seed is passed for consistency.
systemsData_MeasuredDailyEnergy_train, systemsData_MeasuredDailyEnergy_test = train_test_split(systemsData_MeasuredDailyEnergy, test_size=testingDays, random_state=random_state, shuffle=False)

In [None]:
# Drop the systems that do not have enough non-null days in the training set, then in the test set.
nbr_valid_systems = len(systemsName_Valid)

# Training set: number of non-null days per system
trainingDaysCount = systemsData_MeasuredDailyEnergy_train.count()
under_min_training_days_systems = trainingDaysCount[trainingDaysCount < minTrainingDays].index
for systemName in under_min_training_days_systems:
    remove_system(systemName, f"System {systemName} : Not enough days for training (min {minTrainingDays} days required)")

# Test set: computed after the removals above, which also drop the columns of the test set
testingDaysCount = systemsData_MeasuredDailyEnergy_test.count()
under_min_testing_days_systems = testingDaysCount[testingDaysCount < minTestingDays].index
for systemName in under_min_testing_days_systems:
    remove_system(systemName, f"System {systemName} : Not enough days for testing (min {minTestingDays} days required)")

print(f"Number of valid systems: {len(systemsName_Valid)}/{nbr_valid_systems}")

## Max production estimator


### Functions


In [8]:
# Convert the power production with a given frequency to the total daily energy
def daily_energy(df_power):
    """Convert a power time series with a fixed sampling frequency to daily energy.

    Parameters
    ----------
    df_power : Series or DataFrame of power values (kW) whose DatetimeIndex has a
        fixed frequency (``index.freq`` must be set).

    Returns
    -------
    The energy (kWh) summed per calendar day, with the same type as ``df_power``.

    Raises
    ------
    ValueError : when the index has no frequency.
    """
    freq = df_power.index.freq
    if freq is None:
        raise ValueError("df_power must have an index with a fixed frequency (index.freq is None)")
    # Duration of one sample in hours. total_seconds() is required: the `.seconds`
    # attribute only holds the sub-day part and is 0 for a step of 24 hours.
    step_in_hours = pd.Timedelta(freq).total_seconds() / 3600
    # Convert power from kW to kWh
    df_energy = df_power * step_in_hours
    # Resample to daily frequency and sum the values
    return df_energy.resample('D').sum()

# Simulate the daily production of a system from a start date to an end date using the given PVLib ModelChain


def generate_max_production_estimate(startDate, endDate, estimator: ModelChain, samplingFreq='1h'):
    """Simulate the clear-sky daily energy of a system between two dates (both included).

    Parameters
    ----------
    startDate, endDate : first and last day to simulate.
    estimator : ModelChain run on the clear-sky irradiance of its own location.
    samplingFreq : time step of the simulation.

    Returns
    -------
    Daily energy returned by ``daily_energy`` for the simulated AC power, indexed by
    timezone-naive dates.
    """
    site = estimator.location
    # "endDate" is considered as 00:00 by pd.date_range(), but the end date must be simulated
    # completely (until 23:59). The range therefore goes up to the following day, whose 00:00
    # value is excluded with inclusive='left'.
    rangeEnd = endDate + pd.Timedelta(days=1)
    times = pd.date_range(start=startDate, end=rangeEnd, freq=samplingFreq, tz=site.tz, inclusive='left')

    # Clear-sky irradiance, in W/m2
    # TODO adjust the clear sky model to take into account the horizon https://pvlib-python.readthedocs.io/en/stable/gallery/shading/plot_simple_irradiance_adjustment_for_horizon_shading.html
    clearSkyWeather = site.get_clearsky(times)
    estimator.run_model(clearSkyWeather)

    # Convert W to kW
    productionKw = estimator.results.ac / 1000
    dailyProduction = daily_energy(productionKw)
    # Replace the timezone-aware daily timestamps by plain dates
    dailyProduction.index = pd.to_datetime(dailyProduction.index.date)
    return dailyProduction


def generate_max_production_estimator(systemMetadata):
    """Build the PVLib ModelChain used to estimate the maximum production of a system.

    Parameters
    ----------
    systemMetadata : dict with
        - 'metadata': 'loc_latitude', 'loc_longitude', 'loc_altitude', 'pv_kwp' and,
          optionally, 'loss' (fraction, 0 when missing);
        - 'arrays': one dict per PV array with 'pv_tilt', 'pv_azimut', 'pv_wp' and 'pv_number'.

    Returns
    -------
    ModelChain configured with the 'ineichen' clear-sky model and PVWatts losses.
    """
    systemFields = systemMetadata['metadata']
    Wp_Tot = systemFields['pv_kwp'] * 1000  # kWp -> Wp
    # 'loss' is a fraction in the metadata and a percentage for the PVWatts losses model.
    # Default to 0 (no loss) when the key is missing, e.g. for metadata that never went
    # through the validation of the measures import.
    loss = systemFields.get('loss', 0) * 100

    # One pvlib Array per PV array of the system
    arrays = [
        Array(
            mount=FixedMount(surface_tilt=arrayData['pv_tilt'], surface_azimuth=arrayData['pv_azimut'], racking_model='open_rack'),
            module_parameters={'pdc0': arrayData['pv_wp'], 'gamma_pdc': -0.004},
            module_type='glass_polymer',
            modules_per_string=arrayData['pv_number'],
            strings=1,
            temperature_model_parameters=TEMPERATURE_MODEL_PARAMETERS['sapm']['open_rack_glass_polymer'],
        )
        for arrayData in systemMetadata['arrays'].values()
    ]

    location = Location(
        latitude=systemFields['loc_latitude'],
        longitude=systemFields['loc_longitude'],
        altitude=systemFields['loc_altitude'],
        tz='Europe/Zurich',
    )
    # The whole loss is given through 'nameplate_rating'; every other PVWatts loss is set to 0
    system = PVSystem(arrays=arrays,
                      inverter_parameters={'pdc0': Wp_Tot, 'eta_inv_nom': 0.96},
                      losses_parameters={'nameplate_rating': loss, 'soiling': 0, 'shading': 0, 'snow': 0, 'mismatch': 0, 'wiring': 0, 'connections': 0, 'lid': 0, 'age': 0, 'availability': 0})
    modelChain = ModelChain(system, location, clearsky_model='ineichen', aoi_model='no_loss', spectral_model="no_loss", losses_model='pvwatts')

    return modelChain


def tune_max_production_estimator(measured_series, max_estimated_series, window=7, *,
                                  outlier_factor=2, max_outlier_ratio=0.1, max_z_score=1):
    """Estimate the loss to apply to the maximum production estimate of a system.

    Parameters
    ----------
    measured_series : Series of measured daily energy, indexed by date.
    max_estimated_series : Series of estimated maximum daily energy, same index.
    window : number of consecutive measures from which only the maximum is kept.
    outlier_factor : a measure greater than ``outlier_factor`` times the estimate is
        an obvious outlier.
    max_outlier_ratio : maximum share of obvious outliers accepted for a system.
    max_z_score : measured/estimated ratios with a z-score above this value are
        flagged as outliers.

    Returns
    -------
    (loss, std, max_measured_series, outliers_mask), or four ``None`` when the share
    of obvious outliers exceeds ``max_outlier_ratio``.
        - loss: 1 - the biggest measured/estimated ratio among the non-outliers
        - std: standard deviation of the measured/estimated ratios
        - max_measured_series: maximum measure of each window (NaN elsewhere)
        - outliers_mask: boolean Series, True for the obvious and z-score outliers
    """
    # Remove the obvious outliers. It's important before calculating the std, which can be strongly impacted by the strong outliers.
    obvious_outliers = measured_series > outlier_factor * max_estimated_series
    # If too many measures are removed as outliers, we consider that the system is not valid
    if obvious_outliers.size and obvious_outliers.sum() / obvious_outliers.size > max_outlier_ratio:
        return None, None, None, None
    measured_clean = measured_series[~obvious_outliers]

    # Iterate over windows of a given size, and keep only the maximum value in each window
    max_measured_series = pd.Series(index=measured_series.index, dtype=float)
    for start in range(0, len(measured_clean), window):
        window_data = measured_clean.iloc[start:start + window]
        if window_data.isna().all():  # nothing measured in this window
            continue
        max_measured_series[window_data.idxmax(skipna=True)] = window_data.max()

    # Ratio between the maximum measured and the maximum estimated value
    relative_difference = max_measured_series / max_estimated_series

    # Compute statistics
    std = relative_difference.std()
    mean = relative_difference.mean()
    z_scores = (relative_difference - mean).abs() / std

    # Add the measures with a too high z-score to the obvious outliers (OR operation)
    outliers_mask = obvious_outliers | (z_scores > max_z_score)

    # Loss for which the estimate stays above every remaining (non-outlier) measure
    loss = 1 - relative_difference[~outliers_mask].max()

    return loss, std, max_measured_series, outliers_mask

### Create estimator


In [None]:
cacheFilename_normalizer_estimated_max_daily_energy = os.path.join(dataCacheDirpath, 'normalizer_estimated_max_daily_energy.pkl')
cacheFilename_normalizer_tuning_estimated_max_daily_energy_untuned = os.path.join(dataCacheDirpath, 'normalizer_tuning_estimated_max_daily_energy_untuned.pkl')
cacheFilename_normalizer_tuning_measure_max = os.path.join(dataCacheDirpath, 'normalizer_tuning_measure_max.pkl')
cacheFilename_normalizer_tuning_outliers = os.path.join(dataCacheDirpath, 'normalizer_tuning_outliers.pkl')
cacheFilename_normalizer_tuning_unfitted_systems = os.path.join(dataCacheDirpath, 'normalizer_tuning_unfitted_systems.pkl')

# Every file read in the cached branch must exist, including the list of unfitted systems
normalizerCacheFilenames = [
    cacheFilename_normalizer_estimated_max_daily_energy,
    cacheFilename_normalizer_tuning_estimated_max_daily_energy_untuned,
    cacheFilename_normalizer_tuning_measure_max,
    cacheFilename_normalizer_tuning_outliers,
    cacheFilename_normalizer_tuning_unfitted_systems,
]


def _concat_system_series(series_by_system):
    """Gather one series per system in a DataFrame indexed and sorted by date.

    Returns an empty DataFrame when there is no series (pd.concat raises in that
    case), which happens e.g. when the estimators are not tuned.
    """
    if not series_by_system:
        return pd.DataFrame()
    frame = pd.concat(series_by_system, axis=1)
    frame.index = pd.to_datetime(frame.index)
    return frame.sort_index()


if useCached and all(os.path.exists(cacheFilename) for cacheFilename in normalizerCacheFilenames):
    # TODO how to deal if the cached data is not up to date and some systems have been added or removed?
    print(f"Loading cached data in {cacheFilename_normalizer_estimated_max_daily_energy}")
    normalizer_estimated_max_daily_energy = pd.read_pickle(cacheFilename_normalizer_estimated_max_daily_energy)
    normalizer_tuning_estimated_max_daily_energy_untuned = pd.read_pickle(cacheFilename_normalizer_tuning_estimated_max_daily_energy_untuned)
    normalizer_tuning_measure_max = pd.read_pickle(cacheFilename_normalizer_tuning_measure_max)
    normalizer_tuning_outliers = pd.read_pickle(cacheFilename_normalizer_tuning_outliers)
    normalizer_tuning_unfitted_systems = pd.read_pickle(cacheFilename_normalizer_tuning_unfitted_systems)

else:
    normalizer_estimated_max_daily_energy_dic = {}
    normalizer_tuning_estimated_max_daily_energy_untuned_dic = {}
    normalizer_tuning_measure_max_dic = {}
    normalizer_tuning_outliers_dic = {}
    normalizer_tuning_unfitted_systems_list = []

    iteration_times_normalizer = []
    for systemName in tqdm(systemsName_Valid):
        start_time = time.time()  # Start timing

        tuned = not tuneMaxProductionEstimators  # If we don't want to tune the estimators, we say that the estimator is already tuned
        # reset the loss in the metadata if we want to tune the estimators
        if tuneMaxProductionEstimators:
            systemsMetadata[systemName]['metadata']['loss'] = 0

        # Emulate a do-while loop: the first pass simulates the production without loss and tunes the loss,
        # the second pass simulates the production again with the tuned loss and stops.
        while True:

            ## ------------------ ##
            ## Create ModelChains ##
            ## ------------------ ##
            estimator = generate_max_production_estimator(systemsMetadata[systemName])

            ## ------------------- ##
            ## Simulate production ##
            ## ------------------- ##
            measured_series = systemsData_MeasuredDailyEnergy[systemName]
            measured_dates = measured_series[~measured_series.isna()].index
            startDate = measured_dates.min()
            endDate = measured_dates.max()
            estimatedMaxDailyEnergy = generate_max_production_estimate(startDate, endDate, estimator, samplingFreq='1h')

            # fill remaining days with NaN
            estimatedMaxDailyEnergy = estimatedMaxDailyEnergy.reindex(measured_series.index, fill_value=np.nan)

            # add the series to the dictionary
            normalizer_estimated_max_daily_energy_dic[systemName] = estimatedMaxDailyEnergy

            ## --------------- ##
            ## Tune estimators ##
            ## --------------- ##
            if tuned:
                break

            loss, std, measuredMax, outliersMask = tune_max_production_estimator(measured_series, estimatedMaxDailyEnergy)

            # Too many obvious outliers: the system can't be tuned
            if loss is None:
                normalizer_tuning_unfitted_systems_list.append(systemName)
                break

            normalizer_tuning_estimated_max_daily_energy_untuned_dic[systemName] = estimatedMaxDailyEnergy
            normalizer_tuning_measure_max_dic[systemName] = measuredMax
            normalizer_tuning_outliers_dic[systemName] = measured_series[outliersMask]

            # If the std is greater than 1, we remove the system from the list of systems to be processed.
            # This is to avoid to have a system that is not well fitted by the maximum energy estimator model, and that could impact the training of the RF model.
            if std is None or std > 1 or measured_series.count() == 0:
                normalizer_tuning_unfitted_systems_list.append(systemName)
                break

            # write the loss in systemsMetadata
            systemsMetadata[systemName]['metadata']['loss'] = loss

            tuned = True

            # Duration of the tuning pass (only recorded for the systems that could be tuned)
            iteration_duration = time.time() - start_time
            iteration_times_normalizer.append(iteration_duration)

    normalizer_estimated_max_daily_energy = _concat_system_series(normalizer_estimated_max_daily_energy_dic)
    normalizer_tuning_estimated_max_daily_energy_untuned = _concat_system_series(normalizer_tuning_estimated_max_daily_energy_untuned_dic)
    normalizer_tuning_measure_max = _concat_system_series(normalizer_tuning_measure_max_dic)
    normalizer_tuning_outliers = _concat_system_series(normalizer_tuning_outliers_dic)
    normalizer_tuning_unfitted_systems = pd.Series(normalizer_tuning_unfitted_systems_list)

    # Save metadata with tuned parameters
    if tuneMaxProductionEstimators:
        with open(metadataFilepath, 'w') as f:
            json.dump(systemsMetadata, f, indent=4)

    # Save the results in the cache files
    normalizer_estimated_max_daily_energy.to_pickle(cacheFilename_normalizer_estimated_max_daily_energy)
    normalizer_tuning_estimated_max_daily_energy_untuned.to_pickle(cacheFilename_normalizer_tuning_estimated_max_daily_energy_untuned)
    normalizer_tuning_measure_max.to_pickle(cacheFilename_normalizer_tuning_measure_max)
    normalizer_tuning_outliers.to_pickle(cacheFilename_normalizer_tuning_outliers)
    normalizer_tuning_unfitted_systems.to_pickle(cacheFilename_normalizer_tuning_unfitted_systems)

# Remove the unfitted systems from systemsName_Valid and from the measured / estimated dataframes
nbr_valid_systems = len(systemsName_Valid)
for systemName in normalizer_tuning_unfitted_systems:
    remove_system(systemName, f"System {systemName} : We can't find the model corresponding to the measured data. This system is removed from the list of systems to be processed.")
print(f"Number of valid systems: {len(systemsName_Valid)}/{nbr_valid_systems}")
# Print the dataframe
normalizer_estimated_max_daily_energy

### Remove outliers


In [None]:
# Measured training energy relative to the estimated maximum daily energy
rel_mesasured_series = systemsData_MeasuredDailyEnergy_train.div(normalizer_estimated_max_daily_energy)

# A measure is kept when it lies strictly between 1% and 110% (+10%) of the maximum estimated value.
# Everything else (including the days without measure or estimate) is masked with NaN.
inliers = rel_mesasured_series.gt(0.01) & rel_mesasured_series.lt(1.1)
systemsData_MeasuredDailyEnergy_train_outliers = systemsData_MeasuredDailyEnergy_train.where(~inliers)
systemsData_MeasuredDailyEnergy_train = systemsData_MeasuredDailyEnergy_train.where(inliers)

# Remove the systems left with less than minTrainingDays days of training data
nbr_valid_systems = len(systemsName_Valid)
remainingTrainingDays = systemsData_MeasuredDailyEnergy_train.count()
for systemName in remainingTrainingDays[remainingTrainingDays < minTrainingDays].index:
    remove_system(systemName, f"System {systemName} : The system has less than {minTrainingDays} days of data. This system is removed from the list of systems to be processed.")
print(f"Number of valid systems: {len(systemsName_Valid)}/{nbr_valid_systems}")


### Relative production

True production scaled by the maximum production from the simulator


In [11]:
# Relative energy of each system: measured daily energy divided by the estimated maximum daily energy
# (training set only, then complete dataset)
systemsData_RelativeMeasuredDailyEnergy_train = systemsData_MeasuredDailyEnergy_train.div(normalizer_estimated_max_daily_energy)
systemsData_RelativeMeasuredDailyEnergy = systemsData_MeasuredDailyEnergy.div(normalizer_estimated_max_daily_energy)

### Compare the difference between simulation with hourly and 10min sampling rate


## Correlation between Systems


In [12]:
# Pearson correlation between every pair of systems (NaN when they share less than minTrainingDays days)
pearson_correlation = systemsData_MeasuredDailyEnergy.corr(method='pearson', min_periods=minTrainingDays)
# Negative correlations (when the value of one system increases, the other system decreases) are set to 0
correlation_matrix = pearson_correlation.mask(pearson_correlation < 0, 0)

### Plot linear regression between 2 systems


## Half-Sibling Regression


### Functions


In [13]:
def get_system_data(targetName, set='train', relative=True, max_neighbors=None, max_days=None):
    """
    Build the feature matrix (neighbouring systems) and target vector for one system.

    Parameters
    ----------
    targetName : str
        Name of the system to predict. It is never part of the returned features.
    set : {'train', 'test'}
        Dataset to read from. The name shadows the builtin ``set`` inside this
        function; it is kept unchanged so existing keyword callers keep working.
    relative : bool
        If truthy, use the normalized (relative) energy frames, otherwise the
        absolute measured energy frames.
    max_neighbors : int or None
        Keep only the ``max_neighbors`` systems with the highest correlation to
        the target. ``None`` (or a value larger than the number of candidates)
        keeps every other valid system.
    max_days : int or None
        If positive and smaller than the number of available observations, keep
        only the last ``max_days`` observations.

    Returns
    -------
    X : pandas.DataFrame
        Observations of the selected neighbours (may contain NaN).
    y : pandas.Series
        Observations of the target, restricted to rows where it is not NaN.

    Raises
    ------
    ValueError
        If ``set`` is neither ``'train'`` nor ``'test'``.
    """
    if set not in ('train', 'test'):
        raise ValueError(f"Invalid set value: {set}")

    # Exclude the target explicitly instead of assuming it sorts first: with a NaN
    # self-correlation (sorted last) or another system tied at 1.0, slicing from
    # position 1 could leave the target among its own features.
    candidates = [name for name in systemsName_Valid if name != targetName]
    if max_neighbors is None or max_neighbors > len(candidates):
        max_neighbors = len(candidates)
    best_neighbours = correlation_matrix.loc[targetName, candidates].sort_values(ascending=False).index[:max_neighbors]

    # The frames are looked up lazily: the test frames are created by a later cell
    # and must not be touched when only the training data is requested.
    if set == 'train':
        data = systemsData_RelativeMeasuredDailyEnergy_train if relative else systemsData_MeasuredDailyEnergy_train
    else:
        data = systemsData_RelativeMeasuredDailyEnergy_test if relative else systemsData_MeasuredDailyEnergy_test
    X = data[best_neighbours]
    y = data[targetName]

    # remove the observations where there is no target value
    has_target = y.notna()
    X = X[has_target]
    y = y[has_target]

    if max_days and max_days > 0 and max_days < len(y):
        X = X.iloc[-max_days:]
        y = y.iloc[-max_days:]
    return X, y


def _validate_mape_inputs(y_true, y_pred, sample_weight, multioutput):
    """
    Validate the inputs of the MAPE variants and return them as float64 arrays.

    ``y_true`` and ``y_pred`` are returned with shape (n_samples, n_outputs).
    Raises ValueError on inconsistent shapes, empty input, NaN/infinite values or
    an unknown ``multioutput`` option. Implemented with numpy only so that the
    metrics do not rely on scikit-learn's private ``_check_reg_targets`` helper,
    which is not part of the public API.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
    if y_true.ndim != 2 or y_pred.ndim != 2:
        raise ValueError("y_true and y_pred must be 1D or 2D array-likes")
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(f"Found input variables with inconsistent numbers of samples: {[y_true.shape[0], y_pred.shape[0]]}")
    if y_true.shape[1] != y_pred.shape[1]:
        raise ValueError(f"y_true and y_pred have different number of output ({y_true.shape[1]}!={y_pred.shape[1]})")
    if y_true.shape[0] == 0:
        raise ValueError("y_true and y_pred must contain at least one sample")
    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
        raise ValueError("Input contains NaN or infinity.")

    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight, dtype=np.float64)
        if sample_weight.shape != (y_true.shape[0],):
            raise ValueError("sample_weight must be 1D with one weight per sample")

    if isinstance(multioutput, str):
        if multioutput not in ("raw_values", "uniform_average"):
            raise ValueError(f"Invalid multioutput value: {multioutput!r}")
    elif multioutput is not None:
        multioutput = np.asarray(multioutput, dtype=np.float64)
        if multioutput.shape != (y_true.shape[1],):
            raise ValueError("multioutput weights must contain one weight per output")
    return y_true, y_pred, sample_weight, multioutput


def _aggregate_output_errors(output_errors, multioutput):
    """Combine the per-output errors according to the ``multioutput`` option."""
    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        # "uniform_average": pass None as weights to np.average (uniform mean)
        multioutput = None
    return np.average(output_errors, weights=multioutput)


def mean_absolute_percentage_error_mean_denominator(
        y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"):
    """
    MAPE variant whose denominator is the mean absolute true value.

    Each absolute error is divided by ``mean(|y_true|)`` of the corresponding
    output (floored at machine epsilon) instead of by the individual ``|y_true|``.
    This keeps the metric finite when some true values are zero.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
    sample_weight : array-like of shape (n_samples,), optional
        Weights used when averaging the scaled errors over the samples.
    multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,)
        How to combine the per-output errors.

    Returns
    -------
    float, or ndarray of shape (n_outputs,) when ``multioutput='raw_values'``.

    Raises
    ------
    ValueError
        If the inputs are inconsistent, empty or contain NaN/infinite values.
    """
    y_true, y_pred, sample_weight, multioutput = _validate_mape_inputs(
        y_true, y_pred, sample_weight, multioutput)
    epsilon = np.finfo(np.float64).eps
    # axis=0: one denominator per output, so that each output is scaled by its
    # own mean rather than by the mean over all outputs.
    denominator = np.maximum(np.mean(np.abs(y_true), axis=0), epsilon)
    mape = np.abs(y_pred - y_true) / denominator
    output_errors = np.average(mape, weights=sample_weight, axis=0)
    return _aggregate_output_errors(output_errors, multioutput)


def mean_absolute_percentage_error_epsilon(y_true, y_pred, epsilon=np.finfo(np.float64).eps, *, sample_weight=None, multioutput="uniform_average"):
    """
    MAPE with a configurable floor ``epsilon`` on the denominator.

    Each absolute error is divided by ``max(|y_true|, epsilon)``. With the default
    ``epsilon`` (float64 machine epsilon) the formula is the same as scikit-learn's
    ``mean_absolute_percentage_error``; a larger value limits the weight of
    observations whose true value is close to zero.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
    epsilon : float
        Lower bound applied to ``|y_true|`` in the denominator.
    sample_weight : array-like of shape (n_samples,), optional
        Weights used when averaging the scaled errors over the samples.
    multioutput : {'raw_values', 'uniform_average'} or array-like of shape (n_outputs,)
        How to combine the per-output errors.

    Returns
    -------
    float, or ndarray of shape (n_outputs,) when ``multioutput='raw_values'``.

    Raises
    ------
    ValueError
        If the inputs are inconsistent, empty or contain NaN/infinite values.
    """
    y_true, y_pred, sample_weight, multioutput = _validate_mape_inputs(
        y_true, y_pred, sample_weight, multioutput)
    mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon)
    output_errors = np.average(mape, weights=sample_weight, axis=0)
    return _aggregate_output_errors(output_errors, multioutput)


def mad(arr):
    """Return the median absolute deviation of `arr` around its own median.

    `arr` must expose pandas-style `.median()` and `.abs()` (e.g. a pandas Series).
    """
    center = arr.median()
    absolute_deviations = (arr - center).abs()
    return absolute_deviations.median()


def modified_z_score(arr):
    """
    Modified z-score of every value of a pandas Series.

    Based on https://www.ibm.com/docs/en/cognos-analytics/11.1.0?topic=terms-modified-z-score :
    the deviation from the median is divided by ``1.486 * MAD`` or, when the MAD
    is zero, by ``1.253314 * MeanAD`` (mean absolute deviation around the mean).

    The median, mean and MAD are all computed with the pandas reductions, which
    skip missing values. ``np.median`` on a Series does not skip them, so a single
    NaN used to turn every score into NaN while ``mad`` ignored it. Missing values
    now only produce NaN at their own position.

    When both the MAD and the MeanAD are zero (all values identical) the scores
    are returned as zeros instead of the 0/0 = NaN of a plain division.
    """
    deviation = arr - arr.median()
    mad_value = mad(arr)
    if mad_value == 0:
        mean_abs_deviation = (arr - arr.mean()).abs().mean()
        denominator = 1.253314 * mean_abs_deviation
    else:
        denominator = 1.486 * mad_value
    if denominator == 0:
        # Every non-missing value equals the median: the deviations are already all zero.
        return deviation
    return deviation / denominator


def metrics(y_true, y_pred):
    """Return a dict with the MAPE, MAPE-MD, MAE, RMSE and R2 of `y_pred` against `y_true`."""
    named_metrics = (
        ('MAPE', mean_absolute_percentage_error),
        ('MAPE-MD', mean_absolute_percentage_error_mean_denominator),
        ('MAE', mean_absolute_error),
        ('RMSE', root_mean_squared_error),
        ('R2', r2_score),
    )
    return {label: metric_fct(y_true, y_pred) for label, metric_fct in named_metrics}


# Scorer objects wrapping the error metrics for scikit-learn utilities that take a `scoring` argument.
# `greater_is_better=False` marks them as losses (lower is better).
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
mape_eps_scorer = make_scorer(mean_absolute_percentage_error_epsilon, greater_is_better=False)
# Return the metrics for a RFR model trained on the given data.
# The entire dataset is used for training, and the OOB prediction is used to compute the metrics.


def oob_metrics(X, y, metricFct, rf_parames={}):
    """
    Fit a random forest on the whole dataset and score its out-of-bag predictions.

    `rf_parames` is forwarded to `RandomForestRegressor` (with `oob_score=True`);
    the result of `metricFct(y, oob_prediction)` is returned.
    """
    forest = RandomForestRegressor(oob_score=True, **rf_parames)
    forest.fit(X, y)
    return metricFct(y, forest.oob_prediction_)
# Return the metrics for a RFR model trained on the given data.
# KFold cross-validation is used to train the model and to compute the metrics.


def kfold_metrics(X, y, metricFct, rf_parames={}, n_folds=5):
    """
    Cross-validate a random forest with `n_folds` (unshuffled) KFold splits.

    For every fold the forest is refitted on the training part and `metricFct` is
    evaluated on the held-out part. The fold results are averaged: key by key when
    `metricFct` returns a dict, as a plain mean otherwise.
    """
    forest = RandomForestRegressor(**rf_parames)
    splitter = KFold(n_splits=n_folds)
    fold_scores = []
    for fit_idx, eval_idx in splitter.split(X):
        forest.fit(X.iloc[fit_idx], y.iloc[fit_idx])
        fold_prediction = forest.predict(X.iloc[eval_idx])
        fold_scores.append(metricFct(y.iloc[eval_idx], fold_prediction))

    if not isinstance(fold_scores[0], dict):
        # Scalar metric: plain mean over the folds
        return np.mean(fold_scores)
    # Dict metric: one column per key, averaged over the folds
    return pd.DataFrame(fold_scores).mean().to_dict()


def _accumulate_prediction(predict, X, out, lock):
    """    This is a utility function for joblib's Parallel.    It can't go locally in ForestClassifier or ForestRegressor, because joblib    complains that it cannot pickle it when placed there.    """
    prediction = predict(X, check_input=False)
    with lock:
        out.append(prediction)


def predict_w_std(self, X):
    """
    Predict regression target and standard deviation for X.

    The predicted regression target of an input sample is computed as the mean
    predicted regression targets of the trees in the forest. The standard
    deviation of the predicted regression targets of the trees in the forest is
    also computed to provide an estimate of the prediction uncertainty.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples. Internally, its dtype will be converted to
        ``dtype=np.float32``. If a sparse matrix is provided, it will be
        converted into a sparse ``csr_matrix``.

    Returns
    -------
    mean_predictions : ndarray of shape (n_samples,)
        The predicted values (mean of the predictions from all estimators).
    std_predictions : ndarray of shape (n_samples,)
        The standard deviation of the predicted values (population standard
        deviation, ``ddof=0``, of the predictions from all estimators).

    Raises
    ------
    sklearn.exceptions.NotFittedError
        If the forest has not been fitted yet.
    NotImplementedError
        If the model was trained for multi-output regression.

    Notes
    -----
    This function does not support multi-output regression. If the model was
    trained for multi-output regression, an exception will be raised.
    """
    # Check the fitted state first: `n_outputs_` only exists after `fit`, so reading
    # it on an unfitted forest would fail with an unhelpful AttributeError.
    check_is_fitted(self)
    if self.n_outputs_ > 1:
        raise NotImplementedError("Variance for multi-output regression is not supported now")
    # Check data (private scikit-learn helper, the same one used by `predict`)
    X = self._validate_X_predict(X)
    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
    # One prediction vector per tree. Parallel returns the results in task order,
    # so the reduction below is deterministic and no shared list or lock is needed.
    per_tree_predictions = Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
        delayed(tree.predict)(X, check_input=False) for tree in self.estimators_
    )
    # Shape (n_estimators, n_samples)
    all_predictions = np.asarray(per_tree_predictions)
    # Mean and standard deviation across the trees
    mean_predictions = np.mean(all_predictions, axis=0)
    std_predictions = np.std(all_predictions, axis=0)
    return mean_predictions, std_predictions


# Attach predict_w_std as a method of RandomForestRegressor so it can be called as `model.predict_w_std(X)`.
RandomForestRegressor.predict_w_std = predict_w_std

### Hyperparameters tuning


#### Parameters


#### Grid search


#### Compute hyperparameter tuning stats


#### Plot hyperparameter tuning


### Train regressors


In [None]:
# Helper used to store/retrieve the trained models on disk.
# NOTE(review): presumably pickle-based (see the class name) - only load cache files from a trusted location.
serializer = PickleSerializer()

# One trained regressor per system, keyed by system name
rf_regressors = {}

# Random Forest Regressor hyperparameters
n_estimators = 50  # Number of trees in random forest
max_features = None  # Number of features to consider at every split
max_depth = None  # Maximum number of levels in tree
min_samples_split = 2  # Minimum number of samples required to split a node
min_samples_leaf = 1  # Minimum number of samples required at each leaf node
max_neighbors = 50  # Maximum number of neighbouring systems used as features

# load from cache the already trained models
if useCached and not forceTrain:
    # load the models in dataCacheDirpath/rf_regressors. The file name is the system name.
    for systemName in systemsName_Valid:
        try:
            rf_regressors[systemName] = serializer.retrieve_model(os.path.join(dataCacheDirpath, 'rf_regressors'), systemName)
        except FileNotFoundError:
            # no cached model for this system: it is trained below
            continue
    print(f"Loaded {len(rf_regressors)}/{len(systemsName_Valid)} models. {len(systemsName_Valid) - len(rf_regressors)} models to train.")

# train the models for the systems that are not already trained
# (the duration of every iteration is recorded in iteration_times_rf_training)
iteration_times_rf_training = []
for targetName in tqdm(set(systemsName_Valid) - set(rf_regressors), desc='Training regressors'):
    start_time = time.time()

    rf_regressor = RandomForestRegressor(oob_score=True, random_state=random_state, n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    X_train, y_train = get_system_data(targetName, set='train', relative=True, max_neighbors=max_neighbors)

    # fit on the whole training set; oob_score=True keeps the out-of-bag predictions on the fitted model
    rf_regressor.fit(X_train, y_train)
    rf_regressors[targetName] = rf_regressor
    # save the model in dataCacheDirpath/rf_regressors. The file name is the system name.
    serializer.save_model(rf_regressor, os.path.join(dataCacheDirpath, 'rf_regressors'), targetName)

    iteration_duration = time.time() - start_time
    iteration_times_rf_training.append(iteration_duration)


### OOB train prediction and metrics


In [15]:
systemsData_RelativeExpectedDailyEnergy_train_List = []
# Per-system training error. It is computed with mean_absolute_error on the
# normalized (relative) energies; the historical "MAPE" naming is kept because
# other cells refer to it.
# dtype=float: without an explicit dtype an empty Series is created with object
# dtype on recent pandas versions, which makes the metric a non-numeric column.
regressorsMetrics_mape_scaled_train = pd.Series(index=systemsName_Valid, name='Train MAPE Normalized', dtype=float)
# extract all oob predictions for each system and add them to the list
for targetName, rf_regressor in rf_regressors.items():
    y_train = systemsData_RelativeMeasuredDailyEnergy_train[targetName].dropna()
    y_pred = pd.Series(rf_regressor.oob_prediction_, index=y_train.index, name=targetName)
    # Drop missing predictions from BOTH series so that they stay aligned
    has_prediction = y_pred.notna()
    y_train = y_train[has_prediction]
    y_pred = y_pred[has_prediction]
    regressorsMetrics_mape_scaled_train.loc[targetName] = mean_absolute_error(y_train, y_pred)
    systemsData_RelativeExpectedDailyEnergy_train_List.append(y_pred)
# Concatenate all the columns in the list to create one dataframe (one column per system)
systemsData_RelativeExpectedDailyEnergy_train = pd.concat(systemsData_RelativeExpectedDailyEnergy_train_List, axis=1)
systemsData_RelativeExpectedDailyEnergy_train.index = pd.to_datetime(systemsData_RelativeExpectedDailyEnergy_train.index)
systemsData_RelativeExpectedDailyEnergy_train = systemsData_RelativeExpectedDailyEnergy_train.sort_index()

### Compute absolute expected daily energy


In [16]:
# Convert the relative OOB predictions back to absolute energy by multiplying with the normalizer
systemsData_ExpectedDailyEnergy_train = systemsData_RelativeExpectedDailyEnergy_train * normalizer_estimated_max_daily_energy

### Compute feature importance


In [17]:
# Permutation importance is expensive: it is only computed (or loaded from the cache) when this flag is set
compute_permutation_importance = False

cacheFilename_permutation_importance_mean = os.path.join(dataCacheDirpath, 'permutation_importance_mean.csv')
cacheFilename_permutation_importance_std = os.path.join(dataCacheDirpath, 'permutation_importance_std.csv')

# start = time.time()

# Impurity-based importance of every neighbour (columns) for every target system (rows).
# Neighbours that are not used by a model stay NaN. dtype=float keeps the frame numeric
# (an empty DataFrame created without dtype has object columns).
features_importance_df = pd.DataFrame(index=systemsName_Valid, columns=systemsName_Valid, dtype=float)
for targetName in systemsName_Valid:
    rf_regressor = rf_regressors[targetName]
    features_importance_df.loc[targetName, rf_regressor.feature_names_in_] = rf_regressor.feature_importances_


if compute_permutation_importance:
    if useCached and os.path.exists(cacheFilename_permutation_importance_mean) and os.path.exists(cacheFilename_permutation_importance_std):
        print(f"Loading cached data in {cacheFilename_permutation_importance_mean}")
        permutation_importance_mean_df = pd.read_csv(cacheFilename_permutation_importance_mean, index_col=0)
        print(f"Loading cached data in {cacheFilename_permutation_importance_std}")
        permutation_importance_std_df = pd.read_csv(cacheFilename_permutation_importance_std, index_col=0)
    else:
        permutation_importance_mean_df = pd.DataFrame(index=systemsName_Valid, columns=systemsName_Valid, dtype=float)
        permutation_importance_std_df = pd.DataFrame(index=systemsName_Valid, columns=systemsName_Valid, dtype=float)
        for targetName in tqdm(systemsName_Valid):
            X, y = get_system_data(targetName)
            rf_regressor = rf_regressors[targetName]
            # get_system_data returns every neighbour by default, while the model was fitted on a
            # subset of them: restrict X to the fitted features, in the fitted order, otherwise
            # scikit-learn rejects the input because of mismatching feature names.
            X = X.reindex(columns=rf_regressor.feature_names_in_)
            permutation_importance_results = permutation_importance(rf_regressor, X, y, n_repeats=5, random_state=random_state, n_jobs=-1, scoring=mae_scorer)
            permutation_importance_mean_df.loc[targetName, X.columns] = permutation_importance_results.importances_mean
            permutation_importance_std_df.loc[targetName, X.columns] = permutation_importance_results.importances_std
        # save the permutation importances
        permutation_importance_mean_df.to_csv(cacheFilename_permutation_importance_mean)
        permutation_importance_std_df.to_csv(cacheFilename_permutation_importance_std)


# print(f"Time elapsed: {time.time() - start} - Time per system: {(time.time() - start) / len(systemsName_Valid)}")

## Expected daily energy on TEST set


In [None]:
# valid systems for which a trained regressor is available
systems_trained = [systemName for systemName in systemsName_Valid if systemName in rf_regressors]

print("Systems not trained:", [systemName for systemName in systemsName_Valid if systemName not in rf_regressors])


# check that every date of the test set is covered by the normalizer, otherwise the
# relative test values cannot be computed for these dates
test_index_is_known = systemsData_MeasuredDailyEnergy_test.index.isin(normalizer_estimated_max_daily_energy.index)
if not test_index_is_known.all():
    missing_index = systemsData_MeasuredDailyEnergy_test.index[~test_index_is_known]
    # The message names the variable that is actually checked and shows the first offending labels
    raise ValueError(
        f"{len(missing_index)} index value(s) of systemsData_MeasuredDailyEnergy_test are not in "
        f"normalizer_estimated_max_daily_energy, e.g. {list(missing_index[:5])}"
    )

In [19]:
# compute relative value: measured test energy divided by the estimated maximum daily energy
systemsData_RelativeMeasuredDailyEnergy_test = systemsData_MeasuredDailyEnergy_test / normalizer_estimated_max_daily_energy

### Compute expected daily energy


In [None]:
# Per-system Series of the forest mean prediction and of the spread between its trees
systemsData_RelativeExpectedDailyEnergy_test_mean_List = []
systemsData_RelativeExpectedDailyEnergy_test_std_List = []

# Duration of every prediction iteration
iteration_times_rf_prediction = []
for targetName in tqdm(systems_trained):
    start_time = time.time()

    X_test, y_test = get_system_data(targetName, set='test', relative=True)
    # check that there is at least one observation
    if y_test.count() == 0:
        continue
    regressor = rf_regressors[targetName]
    fitted_features = regressor.feature_names_in_

    # Match the features of the test set to the ones seen during fit: extra columns
    # are dropped, missing ones are added as NaN, and the columns are put in the
    # fitted order. scikit-learn validates both the names and their order, so
    # appending the missing columns at the end is not sufficient.
    X_test = X_test.reindex(columns=fitted_features)

    y_mean, y_std = regressor.predict_w_std(X_test)
    y_mean = pd.Series(y_mean, index=X_test.index, name=targetName)
    y_std = pd.Series(y_std, index=X_test.index, name=targetName)
    systemsData_RelativeExpectedDailyEnergy_test_mean_List.append(y_mean)
    systemsData_RelativeExpectedDailyEnergy_test_std_List.append(y_std)

    iteration_duration = time.time() - start_time
    iteration_times_rf_prediction.append(iteration_duration)


# One column per system, indexed by date and sorted chronologically
systemsData_RelativeExpectedDailyEnergy_test_mean = pd.concat(systemsData_RelativeExpectedDailyEnergy_test_mean_List, axis=1)
systemsData_RelativeExpectedDailyEnergy_test_mean.index = pd.to_datetime(systemsData_RelativeExpectedDailyEnergy_test_mean.index)
systemsData_RelativeExpectedDailyEnergy_test_mean = systemsData_RelativeExpectedDailyEnergy_test_mean.sort_index()

systemsData_RelativeExpectedDailyEnergy_test_std = pd.concat(systemsData_RelativeExpectedDailyEnergy_test_std_List, axis=1)
systemsData_RelativeExpectedDailyEnergy_test_std.index = pd.to_datetime(systemsData_RelativeExpectedDailyEnergy_test_std.index)
systemsData_RelativeExpectedDailyEnergy_test_std = systemsData_RelativeExpectedDailyEnergy_test_std.sort_index()

### Compute metrics


In [21]:
# Per-system test error, computed with mean_absolute_error on the normalized energies
# (the historical "MAPE" naming is kept because other cells refer to it).
# dtype=float keeps the Series numeric; systems without prediction stay NaN.
regressorsMetrics_mape_scaled_test = pd.Series(index=systems_trained, name='Test MAPE Normalized', dtype=float)
for targetName in systems_trained:
    # Systems without any test observation were skipped by the prediction step and have no column
    if targetName not in systemsData_RelativeExpectedDailyEnergy_test_mean.columns:
        continue
    X_test, y_test = get_system_data(targetName, set='test', relative=True)
    # The predictions were re-indexed with pd.to_datetime and sorted: apply the same
    # transformation to the targets so that both series are paired day by day.
    y_test = y_test.set_axis(pd.to_datetime(y_test.index)).sort_index()
    y_pred = systemsData_RelativeExpectedDailyEnergy_test_mean[targetName].dropna()
    regressorsMetrics_mape_scaled_test.loc[targetName] = mean_absolute_error(y_test, y_pred)

### Compute absolute expected daily energy


In [22]:
# Convert the relative test predictions (mean and spread between trees) back to absolute energy
# by multiplying with the normalizer
systemsData_ExpectedDailyEnergy_test_mean = systemsData_RelativeExpectedDailyEnergy_test_mean * normalizer_estimated_max_daily_energy
systemsData_ExpectedDailyEnergy_test_std = systemsData_RelativeExpectedDailyEnergy_test_std * normalizer_estimated_max_daily_energy

## Statistics on the models metrics


In [23]:

# Box plot of the per-system error on the test set, expressed in percent.
# Systems whose error is 0.5 (50%) or more are left out of the figure.
regressorsMetrics_mape_scaled_train_percent = 100 * regressorsMetrics_mape_scaled_train.loc[regressorsMetrics_mape_scaled_train < 0.5]
regressorsMetrics_mape_scaled_test_percent = 100 * regressorsMetrics_mape_scaled_test.loc[regressorsMetrics_mape_scaled_test < 0.5]

# fig.add_trace(go.Box(y=regressorsMetrics_mape_scaled_train_percent, name='Train Set', boxmean=True))
test_set_box = go.Box(
    y=regressorsMetrics_mape_scaled_test_percent,
    name='Test Set',
    boxmean=True,
    boxpoints='all',
    jitter=0.5,
    pointpos=-1.8,
)
fig = go.Figure(data=[test_set_box])

# no legend, square figure, y axis named "MAPE (%)"
fig.update_layout(showlegend=False, width=666, height=666)
fig.update_yaxes(title_text='MAPE (%)')
fig.show()

## Comparator


In [24]:
# Relative delta = expected - measured (normalized energies): positive when a system
# produced less than expected.
systemsData_RelativeDelta_test = systemsData_RelativeExpectedDailyEnergy_test_mean - systemsData_RelativeMeasuredDailyEnergy_test

# Detector


In [25]:
# Erroneous data are the observations whose relative delta (expected - measured) is GREATER than
# max_z_score times the test error of the system; every other value is replaced by NaN.
# NOTE(review): the threshold scales the per-system test error (computed with mean_absolute_error),
# not a standard deviation - confirm that the z-score interpretation is intended.
max_z_score = 2.33  # 98% confidence interval
systemsData_RelativeDelta_test_detected = systemsData_RelativeDelta_test[systemsData_RelativeDelta_test.loc[:, regressorsMetrics_mape_scaled_test.index] > max_z_score * regressorsMetrics_mape_scaled_test]

# Scaling techniques & outlier removal

Compute the:

- Global mean
- Global median
- Global standard deviation
- Rolling mean
- Rolling median
- Rolling standard deviation
- Simulated max production without info
- Simulated max production with info


# Plot results


In [None]:
# Initialize the Dash app
app = dash.Dash(__name__)

tab_height = '2em'
# Style shared by every tab (compact height); the selected tab is additionally shown in bold
_tab_style = {'padding': '0px', 'lineHeight': tab_height}
_tab_selected_style = {'padding': '0px', 'lineHeight': tab_height, 'fontWeight': 'bold'}
# (label, value) of the tabs, in display order
_tab_specs = [
    ('Absolute Energy', 'tab-energy'),
    ('Normalized Energy', 'tab-rel-energy'),
    ('Normalizer Tuning', 'tab-norm-tuning'),
    ('All Normalized Energy', 'tab-rel-energy-all'),
    ('Under Production', 'tab-delta-rel-energy'),
    ('All Missing Value', 'tab-miss-val-all'),
    ('Selected Neighboring Systems', 'tab-neighbors'),
]

# Header row: system selector followed by the container for the metric text
_header_row = html.Div(
    [
        dcc.Dropdown(
            id='system-dropdown',
            options=[{'label': name, 'value': name} for name in systemsName_Valid],
            value=systemsName_Valid[0],
            style={'width': '50%'},
        ),
        html.Div(id='metric-text-container', style={'display': 'inline-block', 'margin-left': '20px'}),
    ],
    style={'display': 'flex', 'align-items': 'center'},  # align items horizontally
)

# One tab per plot; every tab gets its own copy of the style dictionaries
_plot_tabs = dcc.Tabs(
    id='plot-tabs',
    value='tab-energy',
    children=[
        dcc.Tab(label=tab_label, value=tab_value, style=dict(_tab_style), selected_style=dict(_tab_selected_style))
        for tab_label, tab_value in _tab_specs
    ],
)

# Outer container fills the screen height; the tabs-content div is allowed to grow
app.layout = html.Div(
    [
        _header_row,
        _plot_tabs,
        html.Div(id='tabs-content', style={'flex': '1 1 auto'}),
    ],
    style={'display': 'flex', 'flexDirection': 'column', 'height': '100vh'},
)


@app.callback(
    [Output('tabs-content', 'children'),
     Output('metric-text-container', 'children')],
    [Input('plot-tabs', 'value'),
     Input('system-dropdown', 'value')]
)
def render_content(tab, selected_system):
    # Statistic text
    try:
        mae_train = regressorsMetrics_mape_scaled_train.loc[selected_system]
    except:
        mae_train = np.nan
    try:
        mae_test = regressorsMetrics_mape_scaled_test.loc[selected_system]
    except:
        mae_test = np.nan
    try:
        loss = systemsMetadata[selected_system]['metadata']['loss']
    except:
        loss = np.nan

    mae_train_text = f"Half Sibling Regressor - Train set MAPE: {mae_train * 100:.2f}%"
    mae_test_text = f"Half Sibling Regressor - Test set MAPE: {mae_test * 100:.2f}%"
    loss_text = f"Normalizer Tuning - Static System Loss : {loss * 100:.2f}%"

    metric_text_div = html.Div([
        html.Div(mae_train_text),
        html.Div(mae_test_text),
        html.Div(loss_text)
    ], style={'fontSize': 16})

    if tab == 'tab-energy':
        fig1 = go.Figure(layout_yaxis_title="Daily Energy (kWh)")

        # remove nan from systemsData_EstimatedMaxDailyEnergy[selected_system]

        # first_trace = True

        # for start_date, value in pv_gis_pred_a001267_april_july.items():
        #     end_date = (pd.to_datetime(start_date) + pd.DateOffset(months=1) - pd.DateOffset(days=1)).strftime('%Y-%m-%d')
        #     fig1.add_trace(go.Scatter(
        #         x=[start_date, end_date],
        #         y=[value, value],
        #         mode='lines',
        #         name='Actual Prediction from PV GIS' if first_trace else '',
        #         line=dict(color='yellow'),
        #         showlegend=first_trace  # Only show legend for the first trace
        #     ))
        #     first_trace = False

        try:
            estimatedMaxDailyEnergy = normalizer_estimated_max_daily_energy[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=estimatedMaxDailyEnergy.index,
                y=estimatedMaxDailyEnergy,
                mode='lines',
                name='Estimated Max Daily Energy',
                marker_color='LightSeaGreen'
            ))
        except:
            pass

        try:
            measuredDailyEnergy = systemsData_MeasuredDailyEnergy[selected_system].dropna()
            fig1.add_trace(go.Bar(
                x=measuredDailyEnergy.index,
                y=measuredDailyEnergy,
                # mode='markers',
                name='Measured Daily Energy',
                marker_color='blue'
            ))
        except:
            pass

        # try:
        # measuredDailyEnergy_train_outliers = systemsData_MeasuredDailyEnergy_train_outliers[selected_system].dropna()
        #     fig1.add_trace(go.Scatter(
        #         x=measuredDailyEnergy_train_outliers.index,
        #         y=measuredDailyEnergy_train_outliers,
        #         mode='markers',
        #         name='Outliers',
        #         marker_color='yellow'
        #     ))
        # except:
        #     pass
        # try:
        #     expectedDailyEnergy_train = systemsData_ExpectedDailyEnergy_train[selected_system].dropna()
        #     fig1.add_trace(go.Scatter(
        #         x=expectedDailyEnergy_train.index,
        #         y=expectedDailyEnergy_train,
        #         mode='markers',
        #         name='Expected Daily Energy',
        #         marker_color='red'
        #     ))
        # except:
        #     pass

        try:
            expectedDailyEnergy_test_mean = systemsData_ExpectedDailyEnergy_test_mean[selected_system].dropna()
            expectedDailyEnergy_test_std = systemsData_ExpectedDailyEnergy_test_std[selected_system].dropna()
            fig1.add_trace(go.Bar(
                x=expectedDailyEnergy_test_mean.index,
                y=expectedDailyEnergy_test_mean,
                # mode='markers',
                name='Expected Daily Energy',
                marker_color='red'
                # error_y=dict(
                #     type='data',
                #     array=expectedDailyEnergy_test_std,
                #     visible=True
                # )
            ))
        except:
            pass

        # Update layout for legend position
        fig1.update_layout(
            legend=dict(
                x=0.99,
                y=0.99,
                xanchor='right',
                yanchor='top',
                orientation='h'
            )
        )

        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure

    elif tab == 'tab-norm-tuning':
        fig1 = go.Figure(layout_yaxis_title="Daily Energy (kWh)")
        try:
            measuredDailyEnergy = systemsData_MeasuredDailyEnergy[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=measuredDailyEnergy.index,
                y=measuredDailyEnergy,
                mode='markers',
                name='Measured Daily Energy',
                marker_color='blue'
            ))
        except:
            pass
        try:
            measuredMax = normalizer_tuning_measure_max[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=measuredMax.index,
                y=measuredMax,
                mode='markers',
                name='Max Measured Daily Energy (7 days window)',
                marker_color='red'
            ))
        except:
            pass
        try:
            outliers = normalizer_tuning_outliers[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=outliers.index,
                y=outliers,
                mode='markers',
                name='Tuning Outliers',
                marker_color='yellow'
            ))
        except:
            pass
        try:
            estimatedMaxDailyEnergy_untuned = normalizer_tuning_estimated_max_daily_energy_untuned[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=estimatedMaxDailyEnergy_untuned.index,
                y=estimatedMaxDailyEnergy_untuned,
                mode='lines',
                name='Estimated Max Daily Energy (Untuned)',
                marker_color='violet'
            ))
        except:
            pass
        try:
            estimatedMaxDailyEnergy = normalizer_estimated_max_daily_energy[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=estimatedMaxDailyEnergy.index,
                y=estimatedMaxDailyEnergy,
                mode='lines',
                name='Estimated Max Daily Energy (Tuned)',
                marker_color='LightSeaGreen'
            ))
        except:
            pass

        fig1.update_layout(
            legend=dict(
                x=0.99,
                y=0.99,
                xanchor='right',
                yanchor='top',
                orientation='h'
            )
        )

        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div
    elif tab == 'tab-rel-energy':
        fig1 = go.Figure(layout_yaxis_title="Normalized Daily Energy (%)")
        # add a line at 100% for the Estimated Max Daily Energy
        estimatedMaxDailyEnergy = normalizer_estimated_max_daily_energy[selected_system].dropna()
        fig1.add_shape(
            type="line",
            x0=estimatedMaxDailyEnergy.index.min(),
            y0=100,
            x1=estimatedMaxDailyEnergy.index.max(),
            y1=100,
            name='Estimated Max Daily Energy',
            line_color='LightSeaGreen'
            # line=dict(
            #     color="LightSeaGreen",
            #     width=2,
            #     dash="dashdot",
            # ),
        )
        try:
            relativeMeasuredDailyEnergy = systemsData_RelativeMeasuredDailyEnergy[selected_system].dropna()
            fig1.add_trace(go.Bar(
                x=relativeMeasuredDailyEnergy.index,
                y=relativeMeasuredDailyEnergy * 100,
                # mode='markers',
                name='Measured Daily Energy',
                marker_color='blue'
            ))
        except:
            pass
        try:
            relativeExpectedDailyEnergy_test_mean = systemsData_RelativeExpectedDailyEnergy_test_mean[selected_system].dropna()
            fig1.add_trace(go.Bar(
                x=relativeExpectedDailyEnergy_test_mean.index,
                y=relativeExpectedDailyEnergy_test_mean * 100,
                # mode='markers',
                name='Expected Daily Energy',
                marker_color='red'
                # error_y=dict(
                #     type='data',
                #     array=systemsData_RelativeExpectedDailyEnergy_test_std[selected_system] * 100,
                #     visible=True
                # )
            ))
        except:
            pass

        fig1.update_layout(
            legend=dict(
                x=0.99,
                y=0.99,
                xanchor='right',
                yanchor='top',
                orientation='h'
            )
        )
        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure
    elif tab == 'tab-delta-rel-energy':
        fig1 = go.Figure(layout_yaxis_title="Normalized Daily Energy Loss (%)")
        try:
            relativeDelta_test = systemsData_RelativeDelta_test[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=relativeDelta_test.index,
                y=relativeDelta_test * 100,
                mode='markers',
                name='Normalized Delta Energy',
            ))
        except:
            pass
        try:
            relativeDelta_test_detected = systemsData_RelativeDelta_test_detected[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=relativeDelta_test_detected.index,
                y=relativeDelta_test_detected * 100,
                mode='markers',
                name='Detected Anomaly',
                marker_color='red'
            ))
        except:
            pass

        fig1.update_layout(
            legend=dict(
                x=0.99,
                y=0.99,
                xanchor='right',
                yanchor='top',
                orientation='h'
            )
        )
        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure
    elif tab == 'tab-rel-energy-all':
        fig1 = go.Figure(layout_yaxis_title="Normalized Daily Energy (%)")
        # try:
        features_importance_norm = features_importance_df.loc[selected_system] / features_importance_df.loc[selected_system].max()

        for systemName in systemsName_Valid:
            if systemName != selected_system:
                if features_importance_norm[systemName] > 0.05:
                    fig1.add_trace(go.Scatter(
                        x=systemsData_RelativeMeasuredDailyEnergy[systemName].index,
                        y=systemsData_RelativeMeasuredDailyEnergy[systemName] * 100,
                        mode='markers',
                        name=f'{systemName}',
                        marker_color='blue',
                        marker_opacity=features_importance_norm[systemName]
                    ))
        fig1.add_trace(go.Scatter(
            x=systemsData_RelativeMeasuredDailyEnergy[selected_system].index,
            y=systemsData_RelativeMeasuredDailyEnergy[selected_system] * 100,
            mode='markers',
            name=f'{selected_system}',
            marker_color='red'
        ))
        fig1.update_layout(yaxis=dict(range=[-5, 120]))

        # except:
        #     pass

        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure
    elif tab == 'tab-miss-val-all':
        measures = systemsData_MeasuredDailyEnergy

        # Sort columns by the number of missing values
        sorted_columns = measures.isnull().sum().sort_values().index
        sorted_measures = measures[sorted_columns]

        # Create a boolean DataFrame where True indicates missing values
        missing_values = (~sorted_measures.isnull()).astype(int)

        # Plot heatmap
        fig = go.Figure(data=go.Heatmap(
            z=missing_values,
            x=missing_values.columns,
            y=missing_values.index,
            showscale=False,
            colorscale='Greys'  # Set colorscale to black and white
        ))
        fig.update_layout(
            yaxis=dict(
                showticklabels=True,  # Show y-axis tick labels
                autorange='reversed'  # Invert the y-axis
            ),
            yaxis_tickmode='array',
            yaxis_tickvals=pd.date_range(start=missing_values.index.min(), end=missing_values.index.max(), freq='ME'),
            yaxis_ticktext=pd.date_range(start=missing_values.index.min(), end=missing_values.index.max(), freq='ME').strftime('%b %Y')
        )

        return dcc.Graph(figure=fig, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure

    elif tab == 'tab-neighbors':
        fig2 = go.Figure()

        # Add initial traces with secondary y-axis
        try:
            selected_features_importance_df = features_importance_df.loc[selected_system,].dropna()
            fig2.add_trace(go.Bar(
                x=selected_features_importance_df.index,
                y=selected_features_importance_df*100,
                name='Impurity-based Importance',
                yaxis='y1',
                offsetgroup=1,
                texttemplate='%{y:.1f}%',  # Display the percentage value on top of each bar

            ))
            fig2.update_layout(
                yaxis1=dict(
                    title='Impurity-based Importance'
                    # range=[0, selected_features_importance_df.max()],
                )
            )
        except:
            pass
        try:
            fig2.add_trace(go.Bar(
                x=permutation_importance_mean_df.columns,
                y=permutation_importance_mean_df.loc[selected_system],
                name='Permutation Importance',
                yaxis='y2',
                offsetgroup=2
            ))
            fig2.update_layout(
                yaxis2=dict(
                    title='Permutation Importance',
                    overlaying='y',
                    side='right',
                    range=[0, permutation_importance_mean_df.loc[selected_system].max()],
                )
            )
        except:
            pass

        return dcc.Graph(figure=fig2, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure


def open_browser():
    """Open the Dash app's local address in the default web browser."""
    dash_url = "http://127.0.0.1:8050/"
    webbrowser.open(dash_url)


if __name__ == '__main__':
    # Schedule the browser tab to open one second from now, so the call does
    # not block the server start-up below.
    browser_timer = Timer(1, open_browser)
    browser_timer.start()
    # NOTE(review): newer Dash releases favour `app.run(...)` over
    # `app.run_server(...)` - confirm against the installed Dash version.
    app.run_server(debug=True, use_reloader=False)

# Comparisons between regressors


In [None]:
import plotly.graph_objects as go

# Hard-coded box-plot statistics for each regressor, one entry per category
# (values plotted on the 'MAPE [%]' axis below).
# NOTE(review): provenance of these numbers is not visible here - presumably
# copied from earlier runs; confirm.
categories = ["PLR", "RF", "KNN", "SVR", "LR"]
q1 = [3.5, 2.8, 3.7, 5.8, 5.4]
median = [5.9, 3.9, 4.5, 9.8, 11.7]
q3 = [10.8, 5.8, 6.5, 14.9, 19.5]
lowerfence = [1.4, 0.6, 1.3, 1.5, 1.6]
upperfence = [14.8, 10.3, 9.6, 20.1, 35.3]
mean = [8.2, 4.7, 5.0, 12.2, 15.8]

# "PLR" is highlighted in orange, every other regressor is drawn in blue
colors = ['orange', 'blue', 'blue', 'blue', 'blue']

fig = go.Figure()

# One pre-computed Box trace per category; walking the lists in lockstep
# keeps each category paired with its own statistics and color.
per_category_stats = zip(categories, q1, median, q3, lowerfence, upperfence, mean, colors)
for category, q1_value, median_value, q3_value, low_value, high_value, mean_value, color in per_category_stats:
    box_trace = go.Box(
        name=category,
        x=[category],
        q1=[q1_value],
        median=[median_value],
        q3=[q3_value],
        lowerfence=[low_value],
        upperfence=[high_value],
        mean=[mean_value],
        marker_color=color,
        showlegend=False,  # the legend entry is hidden for every trace
    )
    fig.add_trace(box_trace)

# Axis title and figure size
fig.update_layout(yaxis_title='MAPE [%]', width=1000, height=666)

fig.show()


# Impact of number of neighbors


In [None]:
# Random Forest Regressor hyperparameters
n_estimators = 100  # Number of trees in random forest
max_features = None  # Number of features to consider at every split
max_depth = None  # Maximum number of levels in tree
min_samples_split = 2  # Minimum number of samples required to split a node
min_samples_leaf = 1  # Minimum number of samples required at each leaf node


# Test error of every system (rows) for each `max_neighbors` setting (columns).
# NOTE: despite the "mape" in the name, the stored value is the
# mean_absolute_error computed on the relative (relative=True) data.
# Systems without any test target keep NaN.
df_mape_n_neighbors_test = pd.DataFrame(index=systemsName_Valid, columns=[1, 2, 5, 10, 25, 50, 150, 300])
for max_neighbors in df_mape_n_neighbors_test.columns:
    for targetName in tqdm(systemsName_Valid):
        # train on the training set restricted to `max_neighbors` neighbors
        rf_regressor = RandomForestRegressor(oob_score=True, random_state=random_state, n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_train, y_train = get_system_data(targetName, set='train', relative=True, max_neighbors=max_neighbors)
        rf_regressor.fit(X_train, y_train)

        # predict
        X_test, y_test = get_system_data(targetName, set='test', relative=True)
        if y_test.count() == 0:
            # nothing to evaluate against for this system
            continue

        # Align the test features with the ones seen during fit: reindex drops
        # the extra columns, adds the missing ones filled with NaN, and - unlike
        # adding columns one by one from a set - restores the fitted column
        # order, which scikit-learn requires when feature names are checked.
        X_test = X_test.reindex(columns=rf_regressor.feature_names_in_)

        y_mean_array = rf_regressor.predict(X_test)
        y_pred = pd.Series(y_mean_array, index=X_test.index, name=targetName)
        df_mape_n_neighbors_test.loc[targetName, max_neighbors] = mean_absolute_error(y_test, y_pred)

In [None]:
# One box per column of df_mape_n_neighbors_test (i.e. per neighbor-count setting).
# Values that are not below 0.5 are masked to NaN; the rest are scaled to percent.
df_mape_n_neighbors_test_filtered = df_mape_n_neighbors_test.where(df_mape_n_neighbors_test < 0.5) * 100

fig = go.Figure()
fig.add_traces([
    go.Box(y=df_mape_n_neighbors_test_filtered[column], name=column, boxmean=True)
    for column in df_mape_n_neighbors_test_filtered.columns
])

# hide the legend and set the figure size to 1000 x 666
fig.update_layout(showlegend=False, width=1000, height=666)
# axis titles
fig.update_xaxes(title_text='Neighbors systems')
fig.update_yaxes(title_text='MAPE (%)')
fig.show()

In [None]:
# Recorded duration of each neighbor-count run, one entry per column of
# df_mape_n_neighbors_test.
# NOTE(review): the strings are read as minutes:seconds, given the name of the
# derived column - confirm.
data = {'systems': df_mape_n_neighbors_test.columns,
        'training_time': ['01:04','01:07','01:25','02:06','03:49','05:49','15:38','29:21']}

df = pd.DataFrame(data)

# first field * 60 + second field, then divided by 326
# NOTE(review): 326 is presumably the number of systems trained per run,
# making this a per-system time - confirm.
time_fields = df['training_time'].str.split(':', expand=True).astype(int)
df['training_time_seconds'] = (time_fields[0] * 60 + time_fields[1]) / 326

df

In [None]:
# Summary statistics of the test error, in percent, per neighbor-count setting
df_mape_n_neighbors_test.astype(float).mul(100).describe()

# Impact of the amount of training data


In [None]:
# Random Forest Regressor hyperparameters
n_estimators = 100  # Number of trees in random forest
max_features = 'log2'  # Number of features to consider at every split
max_depth = None  # Maximum number of levels in tree
min_samples_split = 2  # Minimum number of samples required to split a node
min_samples_leaf = 1  # Minimum number of samples required at each leaf node


# Test error of every system (rows) for each `max_days` training-history
# setting (columns).
# NOTE: despite the "mape" in the name, the stored value is the
# mean_absolute_error computed on the relative (relative=True) data.
# Systems without any test target keep NaN.
df_mape_n_history_test = pd.DataFrame(index=systemsName_Valid, columns=[2])
for max_training_days in df_mape_n_history_test.columns:
    for targetName in tqdm(systemsName_Valid):
        # train on the training set restricted to `max_training_days` days
        rf_regressor = RandomForestRegressor(oob_score=False, random_state=random_state, n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_train, y_train = get_system_data(targetName, set='train', relative=True, max_days=max_training_days)
        rf_regressor.fit(X_train, y_train)

        # predict
        X_test, y_test = get_system_data(targetName, set='test', relative=True)
        if y_test.count() == 0:
            # nothing to evaluate against for this system
            continue

        # Align the test features with the ones seen during fit: reindex drops
        # the extra columns, adds the missing ones filled with NaN, and - unlike
        # adding columns one by one from a set - restores the fitted column
        # order, which scikit-learn requires when feature names are checked.
        X_test = X_test.reindex(columns=rf_regressor.feature_names_in_)

        y_mean_array = rf_regressor.predict(X_test)
        y_pred = pd.Series(y_mean_array, index=X_test.index, name=targetName)
        df_mape_n_history_test.loc[targetName, max_training_days] = mean_absolute_error(y_test, y_pred)

## Mean Distance to the important features

In [None]:
from haversine import haversine, Unit

# For every target system: the mean haversine distance (km) to its neighbors,
# weighted by each neighbor's entry in features_importance_df.
weighted_mean_distances = pd.Series(index=features_importance_df.index, name='distances')
for targetName in features_importance_df.index:
    # importances of this target's neighbors, largest first, NaN entries removed
    neighbor_importances = features_importance_df.loc[targetName].sort_values(ascending=False).dropna()
    neighbor_names = neighbor_importances.index

    target_metadata = systemsMetadata[targetName]['metadata']
    target_coords = (target_metadata['loc_latitude'], target_metadata['loc_longitude'])

    distances = pd.Series(index=neighbor_names, name='distance')
    for systemName in neighbor_names:
        neighbor_metadata = systemsMetadata[systemName]['metadata']
        neighbor_coords = (neighbor_metadata['loc_latitude'], neighbor_metadata['loc_longitude'])
        # haversine expects (latitude, longitude) pairs
        distances[systemName] = haversine(target_coords, neighbor_coords, Unit.KILOMETERS)

    # importance-weighted average of the distances
    weighted_mean_distances[targetName] = (distances * neighbor_importances).sum() / neighbor_importances.sum()

In [None]:
# Box plot of weighted_mean_distances, with every individual point drawn beside the box
distance_box = go.Box(
    y=weighted_mean_distances,
    name='Weighted Mean Distances',
    boxmean=True,
    boxpoints='all',  # other options: 'outliers', 'suspectedoutliers', False
    jitter=0.3,       # spread the points horizontally so they overlap less
    pointpos=-1.8,    # offset of the points relative to the box
)
fig = go.Figure()
fig.add_trace(distance_box)
fig.update_layout(showlegend=False, width=1000, height=666)
fig.update_yaxes(title_text='Distance (km)')
fig.show()

In [None]:
# Scatter plot: x = weighted_mean_distances, y = regressorsMetrics_mape_scaled_test (in percent);
# each marker carries the index label of regressorsMetrics_mape_scaled_test as its text.
distance_vs_error = go.Scatter(
    x=weighted_mean_distances,
    y=regressorsMetrics_mape_scaled_test*100,
    mode='markers',
    text=regressorsMetrics_mape_scaled_test.index,
)
fig = go.Figure()
fig.add_trace(distance_vs_error)
fig.update_xaxes(title_text='Distance (km)')
fig.update_yaxes(title_text='MAPE (%)')
fig.update_layout(width=1000, height=666)
fig.show()

## Mean Number of important features

In [None]:
# Number of neighbors per system whose importance exceeds 0.1.
# Computed here so this cell does not rely on a variable that is otherwise
# only assigned in a later cell (which fails on a fresh top-to-bottom run).
nbr_significant_neighbors = (features_importance_df > 0.1).sum(axis=1)
nbr_significant_neighbors

In [None]:
# Per system (row), count the neighbors whose importance is above 0.1
nbr_significant_neighbors = features_importance_df.gt(0.1).sum(axis=1)

# Histogram of those counts, normalized to percentages
significant_neighbors_hist = go.Histogram(
    x=nbr_significant_neighbors,
    nbinsx=10,
    histnorm='percent',
    texttemplate='%{y:.1f}%',  # label each bar with its percentage
    textposition='auto',       # let plotly choose where the label goes
    marker_color='#1f77b4',    # bar color
)
fig = go.Figure(data=[significant_neighbors_hist])

# Axis titles and figure size
fig.update_layout(
    xaxis_title='Number of selected neighbors (more than 10% importance)',
    yaxis_title='Percentage of PV Systems (%)',
    width=1000,
    height=666,
)

fig.show()

## Number of days

In [None]:
import plotly.graph_objects as go

# Number of non-missing daily values in each column of systemsData_MeasuredDailyEnergy
non_na_counts = systemsData_MeasuredDailyEnergy.count(axis=0)

# Histogram of those per-column counts
fig = go.Figure()
fig.add_trace(go.Histogram(
    x=non_na_counts.values,
    nbinsx=100,  # upper bound on the number of bins (adjust as needed)
))

# Axis titles and figure size
fig.update_layout(
    xaxis_title='Number of Days',
    yaxis_title='Number of Systems',
    width=1000,
    height=666,
)

fig.show()

In [None]:
# Summary statistics of the per-column non-missing value counts
non_na_counts.describe()

# Timing Stats

In [None]:
# Box plots of the iteration times collected in iteration_times_normalizer,
# iteration_times_rf_training and iteration_times_rf_prediction
timing_series = [
    ('Normalizer', iteration_times_normalizer),
    ('HSR Training', iteration_times_rf_training),
    ('Prediction', iteration_times_rf_prediction),
]

fig = go.Figure()
for label, iteration_times in timing_series:
    fig.add_trace(go.Box(y=iteration_times, name=label, boxmean=True))

# hide the legend and set the figure size
fig.update_layout(showlegend=False, width=1000, height=666)
# y axis title
fig.update_yaxes(title_text='Time (s)')
fig.show()
