# Setup & Import


In [109]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pvlib
import json
import os
from pvlib.pvsystem import PVSystem, Array, FixedMount
from pvlib.location import Location
from pvlib.modelchain import ModelChain
from pvlib.temperature import TEMPERATURE_MODEL_PARAMETERS
import plotly.graph_objects as go
import plotly.io as pio
from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import forestci as fci
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import threading
from sklearn.metrics import make_scorer
import dash
from dash import dcc, html
import plotly.graph_objects as go
from dash.dependencies import Input, Output
import webbrowser
from threading import Timer
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pickle

from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import (
    check_is_fitted,
)
from sklearn.ensemble._base import _partition_estimators

pio.renderers.default = "browser"  # render plotly figures in browser

PARENT_DATA_DIR = os.getenv('PARENT_DATA_DIR')
if PARENT_DATA_DIR is None:
    raise ValueError("PARENT_DATA_DIR environment variable is not set")


dataDirpath = PARENT_DATA_DIR + r"\PRiOT\dataExport_2"  # "/Applications/Documents/TM Maxime/dataExport_3400_daily"#
dataCacheDirpath = os.path.join(dataDirpath, "cache")
logsDirpath = "../logs"
useCached = False
forceTrain = True
tuneMaxProductionEstimators = True
random_state = 42


testingDays = 100
minTestingDays = 30
minTrainingDays = 7

if not os.path.exists(logsDirpath):
    os.makedirs(logsDirpath)

if not os.path.exists(dataCacheDirpath):
    os.makedirs(dataCacheDirpath)

# Serializer


In [110]:
# https://scikit-learn.org/stable/model_persistence.html


class ModelSerializer:
    def _save_model(self, model, serial_type, save_params):
        serial_type.dump(model, save_params)

    def _retrieve_model(self, serial_type, retrieve_params):
        return serial_type.load(retrieve_params)


# save_model_path = "Serialized_models\\"


class JoblibSerializer(ModelSerializer):
    def save_model(self, model, save_model_path, filename):
        super()._save_model(model, joblib, os.path.join(save_model_path, filename + ".joblib"))

    def retrieve_model(self, save_model_path, filename):
        return super()._retrieve_model(joblib, os.path.join(save_model_path, filename + '.joblib'))


class PickleSerializer(ModelSerializer):
    def save_model(self, model, save_model_path, filename):
        # create folder if not exists
        if not os.path.exists(save_model_path):
            os.makedirs(save_model_path)
        with open(os.path.join(save_model_path, filename + ".pkl"), 'wb') as f:
            super()._save_model(model, pickle, f)

    def retrieve_model(self, save_model_path, filename):
        with open(os.path.join(save_model_path, filename + ".pkl"), 'rb') as f:
            return super()._retrieve_model(pickle, f)

# Function


In [111]:
def get_altitude_from_wgs84(longitude, latitude):
    # Convert WGS84 to LV95
    lv95_url = "https://geodesy.geo.admin.ch/reframe/wgs84tolv95"
    params_lv95 = {
        "easting": longitude,
        "northing": latitude,
        "format": "json"
    }

    response_lv95 = requests.get(lv95_url, params=params_lv95)
    if response_lv95.status_code != 200:
        raise Exception("Error converting WGS84 to LV95: " + response_lv95.text)

    lv95_data = response_lv95.json()
    lv95_easting = lv95_data["easting"]
    lv95_northing = lv95_data["northing"]

    # Get altitude from LV95 coordinates
    altitude_url = "https://api3.geo.admin.ch/rest/services/height"
    params_altitude = {
        "easting": lv95_easting,
        "northing": lv95_northing
    }

    response_altitude = requests.get(altitude_url, params=params_altitude)
    if response_altitude.status_code != 200:
        raise Exception("Error retrieving altitude: " + response_altitude.text)

    altitude_data = response_altitude.json()
    altitude = altitude_data["height"]

    return float(altitude)


def remove_system(systemName, message):
    if 'systemsName_Valid' in globals() and systemName in systemsName_Valid:
        systemsName_Valid.remove(systemName)
    if 'systemsName_Valid' in globals() and systemName in systemsName_Valid:
        systemsName_Valid.remove(systemName)
    if 'systemsData_EstimatedMaxDailyEnergy' in globals() and systemName in systemsData_EstimatedMaxDailyEnergy.columns:
        systemsData_EstimatedMaxDailyEnergy.drop(columns=systemName, inplace=True)
    if 'systemsData_MeasuredDailyEnergy_train' in globals() and systemName in systemsData_MeasuredDailyEnergy_train.columns:
        systemsData_MeasuredDailyEnergy_train.drop(columns=systemName, inplace=True)
    if 'systemsData_MeasuredDailyEnergy_test' in globals() and systemName in systemsData_MeasuredDailyEnergy_test.columns:
        systemsData_MeasuredDailyEnergy_test.drop(columns=systemName, inplace=True)
    print(message)

## Import metadata


In [112]:
metadataFilepath = os.path.join(dataDirpath, "metadata.json")

with open(metadataFilepath, 'r') as f:
    systemsMetadata = json.load(f)

# Add altitude to metadata, if not already present (TODO : imporove with multi threading)

for systemId, systemMetadata in tqdm(systemsMetadata.items()):
    if "loc_altitude" not in systemMetadata['metadata']:
        if "loc_longitude" in systemMetadata['metadata'] and "loc_latitude" in systemMetadata['metadata']:
            systemMetadata['metadata']["loc_altitude"] = get_altitude_from_wgs84(systemMetadata['metadata']["loc_longitude"], systemMetadata['metadata']["loc_latitude"])

# Split arrays in dictionaries by module number
for systemId, systemMetadata in systemsMetadata.items():
    arrays = {}
    keys_to_delete = []
    for key, value in systemMetadata['metadata'].items():
        if 'mod' in key:
            # Extract the module number
            array_num = key.split('_')[1][-1]
            # Remove the module number from the key
            new_key = '_'.join(key.split('_')[:1] + key.split('_')[2:])
            # Add the key-value pair to the appropriate module dictionary
            if 'arrays' not in systemMetadata:
                systemMetadata['arrays'] = {}
            if array_num not in systemMetadata['arrays']:
                systemMetadata['arrays'][array_num] = {}
            systemMetadata['arrays'][array_num][new_key] = value
            keys_to_delete.append(key)
    for key in keys_to_delete:
        del systemMetadata['metadata'][key]

# Save metadata with altitude
with open(metadataFilepath, 'w') as f:
    json.dump(systemsMetadata, f, indent=4)

100%|██████████| 481/481 [00:00<00:00, 477392.39it/s]




## Import measures


In [113]:
cacheFilename_systemsData_MeasuredDailyEnergy = os.path.join(dataCacheDirpath, 'systemsData_MeasuredDailyEnergy.pkl')
if useCached and os.path.exists(cacheFilename_systemsData_MeasuredDailyEnergy):
    print(f"Loading cached data in {cacheFilename_systemsData_MeasuredDailyEnergy}")
    systemsData_MeasuredDailyEnergy = pd.read_pickle(cacheFilename_systemsData_MeasuredDailyEnergy)
    systemsName_Valid = systemsData_MeasuredDailyEnergy.columns
else:
    # Load all csv files from the data directory
    systemsData = {}
    for file in os.listdir(dataDirpath):
        if file.endswith(".csv"):
            systemName = file.split("_")[0]
            systemsData[systemName] = pd.read_csv(os.path.join(dataDirpath, file))
            systemsData[systemName]['Datetime'] = pd.to_datetime(systemsData[systemName]['Timestamp'], unit='ms', utc=True).dt.tz_convert('Europe/Zurich')
            systemsData[systemName]['Date'] = (systemsData[systemName]['Datetime'] + pd.Timedelta(hours=1)).dt.date  # Convert the datetime to only the date, as the production is the daily production. The +1h is to manage the saving time. Normally PRiOT exports the data at midnight (local time) for the day after (e.g. the energy for the July 1st is saved at July 1st 00:00 Europe/Zurich). However it seams that the saving time is not always correctly handled, and sometime the export is done at 23:00 the day before (e.g. the energy for the July 1st is saved at June 30th 23:00 Europe/Zurich). This is why we add 1h to the datetime to be sure to have the correct date.

    systemsName = list(systemsData.keys())

    df_duplicate_list = list()
    for systemName, systemData in systemsData.items():
        # Save duplicate dates to log list, and the in a log file
        duplicates = systemData[systemData['Date'].duplicated(keep=False)]
        if len(duplicates) > 0:
            df_duplicate_list.append(duplicates)

            # Remove duplicate date where tt_forward_active_energy_total_toDay is the smallest
            # TODO maybe we should sum the energy of the duplicates instead of removing the smallest one. However, when looking in PRiOT Portal, it seams that in the daily energy, only the biggest value is represented. We do the same here.
            systemData.sort_values('tt_forward_active_energy_total_toDay', ascending=True, inplace=True)
            systemsData[systemName].drop_duplicates(subset='Date', keep='last', inplace=True)

        # Set date as the index and sort the data by date
        systemsData[systemName].set_index('Date', inplace=True)
        systemData.sort_index(ascending=True, inplace=True)

    # Save duplicate dates to log file
    df_duplicate = pd.concat(df_duplicate_list)
    print(f"Number of duplicate dates found: {len(df_duplicate)} (see log file for more details)")
    df_duplicate.to_csv(os.path.join(logsDirpath, 'duplicateDates.csv'), index=True)

    ## ----------------------------------------------- ##
    ## Convert data & Filter out invalid PRiOT systems ##
    ## ----------------------------------------------- ##

    systemsName_Valid = systemsName.copy()
    for systemName in systemsName:
        missingData = False
        # Check if the system has measures
        if len(systemsData[systemName]) == 0:
            missingData = True
            print(f"System {systemName} : No measures found")
        # Check if the system has metadata
        if systemName not in systemsMetadata:
            missingData = True
            print(f"System {systemName} : No metadata found")

        else:
            # Check metadata for the system
            for key in ['loc_latitude', 'loc_longitude', 'loc_altitude', 'pv_kwp']:
                # test that the key is present
                if key not in systemsMetadata[systemName]['metadata']:
                    missingData = True
                    print(f"System {systemName} : No '{key}' found")
                # if present, convert the value to a number, if possible
                elif not isinstance(systemsMetadata[systemName]['metadata'][key], (int, float)):
                    try:
                        systemsMetadata[systemName]['metadata'][key] = int(systemsMetadata[systemName]['metadata'][key])
                    except ValueError:
                        try:
                            systemsMetadata[systemName]['metadata'][key] = float(systemsMetadata[systemName]['metadata'][key])
                        except ValueError:
                            missingData = True
                            print(f"System {systemName} : The key-value '{key}:{systemsMetadata[systemName]['metadata'][key]}' is not a number")

            # Check metadata for the arrays
            if 'arrays' not in systemsMetadata[systemName] or len(systemsMetadata[systemName]['arrays']) == 0:
                print(f"System {systemName} : No PV arrays found")
                missingData = True
            else:
                for array_num, arrayData in systemsMetadata[systemName]['arrays'].items():
                    for key in ['pv_tilt', 'pv_azimut', 'pv_wp', 'pv_number']:
                        if key not in arrayData:
                            missingData = True
                            print(f"System {systemName} : No '{key}' found for array {array_num}")
                        # test that the value is a number
                        elif not isinstance(arrayData[key], (int, float)):
                            try:
                                arrayData[key] = int(arrayData[key])
                            except ValueError:
                                try:
                                    arrayData[key] = float(arrayData[key])
                                except ValueError:
                                    missingData = True
                                    print(f"System {systemName} : The key-value '{key}:{arrayData[key]}' is not a number for array {array_num}")

            # add the loss metadata if not present
            if 'loss' not in systemsMetadata[systemName]['metadata']:
                systemsMetadata[systemName]['metadata']['loss'] = 0

        if missingData:
            systemsName_Valid.remove(systemName)
            print(f"-> Removing system {systemName} from the list of systems")

    print(f"Number of systems with all the necessary data: {len(systemsName_Valid)}/{len(systemsName)}")

    # # Filter out systems with less than X days of data
    # for systemName in systemsName_Valid[:]:  # Create a copy of the list using slicing [:] to avoid removing elements while iterating over the list itself
    #     if len(systemsData[systemName]) < minMeasurements:
    #         systemsName_Valid.remove(systemName)
    #         print(f"-> Removing system {systemName} from the list of systems because it has less than {minMeasurements} days of data")

    # print(f"Number of systems with at least {minMeasurements} days of data: {len(systemsName_Valid)}/{len(systemsName)}")

    ## ---------------------------------------------------------------------------- ##
    ## Create one 2D DataFrame with the daily production of every remaining systems ##
    ## ---------------------------------------------------------------------------- ##

    # Create an empty list to store all measured data for each systems
    systemsData_MeasuredDailyEnergy_List = []

    # Iterate over each key-value pair in the systemsData dictionary
    for systemName in systemsName_Valid:
        # Extract the 'tt_forward_active_energy_total_toDay' column from the current dataframe
        measuredDailyEnergy = systemsData[systemName]['tt_forward_active_energy_total_toDay']

        # Rename the column with the system name
        measuredDailyEnergy.rename(systemName, inplace=True)

        systemsData_MeasuredDailyEnergy_List.append(measuredDailyEnergy)
        # Concatenate the column to the new_dataframe

    # Concatenate all the columns in the list to create one dataframe
    systemsData_MeasuredDailyEnergy = pd.concat(systemsData_MeasuredDailyEnergy_List, axis=1)
    systemsData_MeasuredDailyEnergy.index = pd.to_datetime(systemsData_MeasuredDailyEnergy.index)
    systemsData_MeasuredDailyEnergy.sort_index(inplace=True)

    ## ------------------ ##
    ## Save the dataframe ##
    ## ------------------ ##
    # Save the dataframe for later use
    # create cache directory if it does not exist

    systemsData_MeasuredDailyEnergy.to_pickle(cacheFilename_systemsData_MeasuredDailyEnergy)

# Print the dataframe
systemsData_MeasuredDailyEnergy

Number of duplicate dates found: 1004 (see log file for more details)
System 2026239 : No measures found
System 2026239 : No 'pv_kwp' found
-> Removing system 2026239 from the list of systems
System a001001 : No 'pv_wp' found for array 1
System a001001 : No 'pv_number' found for array 1
-> Removing system a001001 from the list of systems
System a001028 : The key-value 'pv_azimut:39-129-219' is not a number for array 1
-> Removing system a001028 from the list of systems
System a001038 : The key-value 'pv_azimut:58-138-238' is not a number for array 1
-> Removing system a001038 from the list of systems
System a001103 : No 'pv_tilt' found for array 1
-> Removing system a001103 from the list of systems
System a001116 : No 'pv_wp' found for array 2
-> Removing system a001116 from the list of systems
System a001118 : The key-value 'pv_azimut:55-235' is not a number for array 1
-> Removing system a001118 from the list of systems
System a001122 : No 'pv_tilt' found for array 2
System a001122 :

Unnamed: 0_level_0,2026250,2026251,2026258,2026269,2026271,a001017,a001018,a001020,a001021,a001022,...,a001633,a001634,a001637,a001638,a001661,g001002,g001003,g001004,g001005,g001006
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-20,47.1,76.5,123.3,51.70,20.5,,43.7,46.4,25.1,32.40,...,,,,,,29.7,65.2,81.7,84.7,58.6
2023-03-21,53.4,77.7,121.2,60.20,24.1,,40.5,54.4,25.4,40.50,...,,,,,,30.3,78.9,71.1,103.3,68.4
2023-03-22,70.5,97.0,158.9,75.10,29.8,,57.4,68.5,32.6,52.30,...,,,,,,39.4,97.5,83.7,127.5,84.4
2023-03-23,25.8,47.2,55.5,21.80,9.2,,19.6,24.2,12.1,18.30,...,,,,,,13.9,30.6,28.0,50.6,39.8
2023-03-24,9.7,19.1,24.5,9.60,4.6,,6.8,13.3,5.4,8.60,...,,,,,,6.8,14.6,21.9,16.1,15.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-06,26.7,55.5,69.7,32.75,13.0,84.8,19.8,28.1,12.7,18.32,...,35.9,50.95,38.6,29.6,43.74,,38.1,42.9,49.2,44.0
2024-07-07,38.5,76.7,89.1,33.24,24.7,109.9,36.4,38.6,20.5,30.53,...,60.9,60.53,52.6,41.3,47.46,,47.4,61.8,101.0,62.2
2024-07-08,73.6,133.0,190.5,85.23,40.1,200.9,61.8,78.3,35.4,54.48,...,102.0,124.97,105.7,73.5,113.71,23.0,102.4,105.9,188.6,116.3
2024-07-09,72.9,129.1,178.4,82.22,40.3,216.5,58.8,77.8,33.5,51.86,...,100.2,121.30,101.8,71.5,103.27,42.3,100.2,102.5,186.1,110.5


# Simulate Anomalies


In [114]:
# Define the percentage of True values
percentage_true = 0.05

# Calculate the number of True values per column
num_rows = testingDays
num_true_per_column = int(num_rows * percentage_true)

# Initialize the mask DataFrame with all False values
anomalies_mask = pd.DataFrame(False, index=systemsData_MeasuredDailyEnergy[-testingDays:].index, columns=systemsData_MeasuredDailyEnergy.columns)

# Randomly assign True values in each column
for col in anomalies_mask.columns:
    true_indices = np.random.choice(anomalies_mask.index, num_true_per_column, replace=False, )
    anomalies_mask.loc[true_indices, col] = True


anomalies_losses = anomalies_mask * 0.95

In [118]:
# apply anomalies to the test set
systemsData_MeasuredDailyEnergy[-testingDays:] = systemsData_MeasuredDailyEnergy[-testingDays:] * (1 - anomalies_losses)

## Create train & test set


In [120]:
# create a validation set with the last 100 days
# if testingDays == 0:
#     systemsData_MeasuredDailyEnergy_train = systemsData_MeasuredDailyEnergy
#     systemsData_MeasuredDailyEnergy_test = pd.DataFrame()
# else:
if testingDays > len(systemsData_MeasuredDailyEnergy):
    raise ValueError(f"testingDays ({testingDays}) is greater than the number of days in the dataset ({len(systemsData_MeasuredDailyEnergy)})")
systemsData_MeasuredDailyEnergy_train, systemsData_MeasuredDailyEnergy_test = train_test_split(systemsData_MeasuredDailyEnergy, test_size=testingDays, random_state=42, shuffle=False)

In [121]:
# remove systems with not enough days that are not null for training or testing
for systemName in systemsData_MeasuredDailyEnergy_train.loc[:, systemsData_MeasuredDailyEnergy_train.notnull().sum() < minTrainingDays].columns:

    remove_system(systemName, f"System {systemName} : Not enough days for training (min {minTrainingDays} days required)")

for systemName in systemsData_MeasuredDailyEnergy_test.loc[:, systemsData_MeasuredDailyEnergy_test.notnull().sum() < minTestingDays].columns:
    remove_system(systemName, f"System {systemName} : Not enough days for testing (min {minTestingDays} days required)")

System a001037 : Not enough days for training (min 7 days required)
System a001472 : Not enough days for training (min 7 days required)
System a001508 : Not enough days for training (min 7 days required)
System a001543 : Not enough days for training (min 7 days required)
System a001546 : Not enough days for training (min 7 days required)
System a001548 : Not enough days for training (min 7 days required)
System a001553 : Not enough days for training (min 7 days required)
System a001557 : Not enough days for training (min 7 days required)
System a001560 : Not enough days for training (min 7 days required)
System a001569 : Not enough days for training (min 7 days required)
System a001570 : Not enough days for training (min 7 days required)
System a001581 : Not enough days for training (min 7 days required)
System a001589 : Not enough days for training (min 7 days required)
System a001593 : Not enough days for training (min 7 days required)
System a001596 : Not enough days for training (m

## Max production estimator


### Functions


In [124]:
# Convert the power production with a given frequency to the total daily energy
def daily_energy(df_power):
    # Get the frequency in minutes
    freq_in_minutes = pd.Timedelta(df_power.index.freq).seconds / 60
    # Convert power from kW to kWh
    df_energy = df_power * (freq_in_minutes / 60)
    # Resample to daily frequency and sum the values
    daily_energy = df_energy.resample('D').sum()
    # daily_energy.index = daily_energy.index.date

    return daily_energy

# Simulate the daily production of a system from a start date to an end date using the given PVLib ModelChain


def generate_max_production_estimate(startDate, endDate, estimator: ModelChain, samplingFreq='1h'):
    # The end date needs to be estimated completly(end date at 23:59). But "endDate" is considered as 00:00 by pd.date_range().
    # So we add 1 day to the end date to include the entire end date in the date_range(), and then we exclude the last value with the inclusive='left' proprety, to remove "endDate+1" at 00:00) in the date_range().
    endDate = endDate + pd.Timedelta(days=1)

    times = pd.date_range(start=startDate, end=endDate, freq=samplingFreq, tz=estimator.location.tz, inclusive='left')
    weatherClearSky = estimator.location.get_clearsky(times)  # In W/m2
    # TODO adjust the clear sky model to take into account the horizon https://pvlib-python.readthedocs.io/en/stable/gallery/shading/plot_simple_irradiance_adjustment_for_horizon_shading.html
    estimator.run_model(weatherClearSky)
    production = estimator.results.ac / 1000  # Convert W to kW
    dailyProduction = daily_energy(production)
    dailyProduction.index = pd.to_datetime(dailyProduction.index.date)
    return dailyProduction


def generate_max_production_estimator(systemMetadata):
    latitude = systemMetadata['metadata']['loc_latitude']
    longitude = systemMetadata['metadata']['loc_longitude']
    altitude = systemMetadata['metadata']['loc_altitude']
    Wp_Tot = systemMetadata['metadata']['pv_kwp'] * 1000
    loss = systemMetadata['metadata']['loss'] * 100

    arrays = []
    for array_num, arrayData in systemMetadata['arrays'].items():
        array = Array(
            mount=FixedMount(surface_tilt=arrayData['pv_tilt'], surface_azimuth=arrayData['pv_azimut'], racking_model='open_rack'),
            module_parameters={'pdc0': arrayData['pv_wp'], 'gamma_pdc': -0.004},
            module_type='glass_polymer',
            modules_per_string=arrayData['pv_number'],
            strings=1,
            temperature_model_parameters=TEMPERATURE_MODEL_PARAMETERS['sapm']['open_rack_glass_polymer'],
        )
        arrays.append(array)

    location = Location(latitude=latitude, longitude=longitude, altitude=altitude, tz='Europe/Zurich')
    system = PVSystem(arrays=arrays,
                      inverter_parameters={'pdc0': Wp_Tot, 'eta_inv_nom': 0.96},
                      losses_parameters={'nameplate_rating': loss, 'soiling': 0, 'shading': 0, 'snow': 0, 'mismatch': 0, 'wiring': 0, 'connections': 0, 'lid': 0, 'age': 0, 'availability': 0})
    modelChain = ModelChain(system, location, clearsky_model='ineichen', aoi_model='no_loss', spectral_model="no_loss", losses_model='pvwatts')

    return modelChain


def tune_max_production_estimator(measured_series, max_estimated_series, window=7):
    # Remove the obvious outliers. It's important before calculating the std, which can be strongly impacted by the strong outliers.
    outliers_mask = measured_series > 2 * max_estimated_series
    measured_no_outliers_series = measured_series[~outliers_mask]
    # if 10% of the data is removed as outliers, we consider that the system is not valid
    if outliers_mask.sum().sum() / outliers_mask.size > 0.1:
        return None, None, None, None
    # Keep only the max measured value
    max_measured_series = pd.Series(index=measured_series.index, dtype=float)
    # Iterate over windows of a given size, and keep only the maximum value in each window
    for i in range(0, len(measured_series), window):
        window_data = measured_no_outliers_series.iloc[i:i + window]
        if not window_data.empty and not window_data.isna().all():
            max_value = window_data.max()
            max_index = window_data.idxmax(skipna=True)
            max_measured_series[max_index] = max_value

    # Calculate the relative difference between the maximum measured and maximum estimated value
    realtive_difference = max_measured_series / max_estimated_series

    # Compute statistics
    std = realtive_difference.std()
    mean = realtive_difference.mean()

    # Remove the outilers that have a z-score greater than 1
    z_scores = np.abs(realtive_difference - mean) / std

    # Add the measure with a z-score greater than 1 to the previous outliers (AND operation)
    outliers_mask = outliers_mask | (z_scores > 1)

    # Get the loss that overestimate the estimate maximum daily energy
    loss = 1 - realtive_difference[~outliers_mask].max()

    return loss, std, max_measured_series, outliers_mask

### Create estimator


In [125]:
cacheFilename_systemsData_EstimatedMaxDailyEnergy = os.path.join(dataCacheDirpath, 'systemsData_EstimatedMaxDailyEnergy.pkl')

# if useCached and os.path.exists(cacheFilename_systemsData_EstimatedMaxDailyEnergy):
if True and os.path.exists(cacheFilename_systemsData_EstimatedMaxDailyEnergy):
    # TODO how to deal if the cached data is not up to date and some systems have been added or removed?
    print(f"Loading cached data in {cacheFilename_systemsData_EstimatedMaxDailyEnergy}")
    systemsData_EstimatedMaxDailyEnergy = pd.read_pickle(cacheFilename_systemsData_EstimatedMaxDailyEnergy)
else:
    systemsData_EstimatedMaxDailyEnergy_dic = {}
    systemsData_Tuning_EstimatedMaxDailyEnergy_untuned_dic = {}
    systemsData_Tuning_MeasureMax_dic = {}
    systemsData_Tuning_Outliers_dic = {}

    unfitted_systems = []
    for systemName in tqdm(systemsName_Valid):
        tuned = not tuneMaxProductionEstimators  # If we don't want to tune the estimators, we say that the estimator is already tuned
        # reset the loss in the metadata if we want to tune the estimators
        if tuneMaxProductionEstimators:
            systemsMetadata[systemName]['metadata']['loss'] = 0

        while True:  # emulate do while loop

            ## ------------------ ##
            ## Create ModelChains ##
            ## ------------------ ##
            estimator = generate_max_production_estimator(systemsMetadata[systemName])

            ## ------------------- ##
            ## Simulate production ##
            ## ------------------- ##
            measured_series = systemsData_MeasuredDailyEnergy[systemName]
            startDate = measured_series[~measured_series.isna()].index.min()
            endDate = measured_series[~measured_series.isna()].index.max()
            estimatedMaxDailyEnergy = generate_max_production_estimate(startDate, endDate, estimator, samplingFreq='1h')

            # fill remaining days with NaN
            estimatedMaxDailyEnergy = estimatedMaxDailyEnergy.reindex(measured_series.index, fill_value=np.nan)

            # add the series to the dictionary
            systemsData_EstimatedMaxDailyEnergy_dic[systemName] = estimatedMaxDailyEnergy

            ## --------------- ##
            ## Tune estimators ##
            ## --------------- ##
            if tuned:
                break

            loss, std, measuredMax, outliersMask = tune_max_production_estimator(measured_series, estimatedMaxDailyEnergy)

            if loss is None:
                unfitted_systems.append(systemName)
                break

            systemsData_Tuning_EstimatedMaxDailyEnergy_untuned_dic[systemName] = estimatedMaxDailyEnergy
            systemsData_Tuning_MeasureMax_dic[systemName] = measuredMax
            systemsData_Tuning_Outliers_dic[systemName] = measured_series[outliersMask]

            # If the std is greater than 1, we remove the system from the list of systems to be processed.
            # This is to avoid to have a system that is not well fitted by the maximum energy estimator model, and that could impact the training of the RF model.
            if std is None or std > 1 or measured_series.count() == 0:
                unfitted_systems.append(systemName)
                break

            # write the loss in systemsMetadata
            systemsMetadata[systemName]['metadata']['loss'] = loss

            tuned = True

    systemsData_EstimatedMaxDailyEnergy = pd.concat(systemsData_EstimatedMaxDailyEnergy_dic, axis=1)
    systemsData_EstimatedMaxDailyEnergy.index = pd.to_datetime(systemsData_EstimatedMaxDailyEnergy.index)
    systemsData_EstimatedMaxDailyEnergy.sort_index(inplace=True)

    systemsData_Tuning_EstimatedMaxDailyEnergy_untuned = pd.concat(systemsData_Tuning_EstimatedMaxDailyEnergy_untuned_dic, axis=1)
    systemsData_Tuning_EstimatedMaxDailyEnergy_untuned.index = pd.to_datetime(systemsData_Tuning_EstimatedMaxDailyEnergy_untuned.index)
    systemsData_Tuning_EstimatedMaxDailyEnergy_untuned.sort_index(inplace=True)

    systemsData_Tuning_MeasureMax = pd.concat(systemsData_Tuning_MeasureMax_dic, axis=1)
    systemsData_Tuning_MeasureMax.index = pd.to_datetime(systemsData_Tuning_MeasureMax.index)
    systemsData_Tuning_MeasureMax.sort_index(inplace=True)

    systemsData_Tuning_Outliers = pd.concat(systemsData_Tuning_Outliers_dic, axis=1)
    systemsData_Tuning_Outliers.index = pd.to_datetime(systemsData_Tuning_Outliers.index)
    systemsData_Tuning_Outliers.sort_index(inplace=True)

    # Remove unfitted systems from systemsName_Valid, systemsName_Valid, systemsData_EstimatedMaxDailyEnergy, systemsData_MeasuredDailyEnergy
    for systemName in unfitted_systems:
        remove_system(systemName, f"System {systemName} : We can't find the model corresponding to the measured data. This system is removed from the list of systems to be processed.")

    # Save the dataframe to a CSV file
    systemsData_EstimatedMaxDailyEnergy.to_pickle(cacheFilename_systemsData_EstimatedMaxDailyEnergy)

    # Save metadata with tuned parameters
    if tuneMaxProductionEstimators:
        with open(metadataFilepath, 'w') as f:
            json.dump(systemsMetadata, f, indent=4)

    # save systemsData_EstimatedMaxDailyEnergy in cacheFilename_systemsData_EstimatedMaxDailyEnergy
    systemsData_EstimatedMaxDailyEnergy.to_pickle(cacheFilename_systemsData_EstimatedMaxDailyEnergy)


# Print the dataframe
systemsData_EstimatedMaxDailyEnergy

Loading cached data in C:\Users\conta\Berner Fachhochschule\TI IEM PVLab PA-BA-MA - 2024_FS_CIS-MA-PV-Meter_Data_Analysis - 2024_FS_CIS-MA-PV-Meter_Data_Analysis\07_Data\databases\PRiOT\dataExport_2\cache\systemsData_EstimatedMaxDailyEnergy.pkl


Unnamed: 0_level_0,2026250,2026251,2026258,2026269,2026271,a001017,a001018,a001020,a001021,a001022,...,a001623,a001624,a001625,a001633,a001638,g001002,g001003,g001004,g001005,g001006
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-03-20,70.888989,95.336762,164.184834,78.452140,30.376840,,60.615294,73.873859,34.826828,58.214068,...,,,,,,41.749462,100.159754,82.698311,124.438180,80.333151
2023-03-21,71.264898,96.521976,165.472712,78.911319,30.698535,,60.969196,74.338863,34.992887,58.406191,...,,,,,,41.966318,100.552129,83.536288,126.037822,81.302952
2023-03-22,71.635194,97.703614,166.742508,79.363039,31.019467,,61.315983,74.799466,35.155068,58.592192,...,,,,,,42.180589,100.936241,84.372736,127.634776,82.268455
2023-03-23,71.999616,98.885005,167.993447,79.806344,31.339167,,61.655443,75.255347,35.313119,58.772028,...,,,,,,42.391936,101.311083,85.206992,129.227605,83.228758
2023-03-24,72.357927,100.065710,169.224871,80.240469,31.657152,,61.987403,75.706208,35.466839,58.945679,...,,,,,,42.600060,101.675845,86.038310,130.814868,84.183033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-06,83.185355,156.286307,216.079947,93.654375,43.976330,247.806622,70.785363,91.611577,38.679248,60.498934,...,103.585458,114.677673,76.194899,105.923430,75.765813,49.036988,107.727989,127.335693,206.992937,129.228550
2024-07-07,83.127291,155.993437,215.772194,93.552374,43.939366,247.875290,70.759369,91.490535,38.647102,60.474055,...,103.498158,114.612248,76.043658,105.761748,75.661299,48.997777,107.648891,127.197147,206.633137,128.959225
2024-07-08,83.066360,155.682826,215.449770,93.445723,43.899008,247.946040,70.731730,91.364191,38.613683,60.448477,...,103.406466,114.543562,75.883166,105.591870,75.550270,48.956724,107.567283,127.047433,206.249675,128.675676
2024-07-09,83.002497,155.360955,215.117931,93.334334,43.855235,248.018628,70.702395,91.232514,38.578960,60.422127,...,103.310337,114.471552,75.713471,105.413825,75.432732,48.913794,107.483042,126.886526,205.842658,128.377928


### Remove outliers


In [126]:
rel_mesasured_series = systemsData_MeasuredDailyEnergy_train / systemsData_EstimatedMaxDailyEnergy

# remove the outliers in measured data that are greater than 1.1 times (+10%) the maximum estimated value, or less than 1% of the maximum estimated value
inliers = (rel_mesasured_series < 1.1) & (rel_mesasured_series > 0.01)
systemsData_MeasuredDailyEnergy_train_outliers = systemsData_MeasuredDailyEnergy_train[~inliers]
systemsData_MeasuredDailyEnergy_train = systemsData_MeasuredDailyEnergy_train[inliers]

# remove the systems that have less than 7 days
for systemName in systemsData_MeasuredDailyEnergy_train.loc[:, systemsData_MeasuredDailyEnergy_train.count() < minTrainingDays].columns:
    remove_system(systemName, f"System {systemName} : The system has less than {minTrainingDays} days of data. This system is removed from the list of systems to be processed.")

System a001087 : The system has less than 7 days of data. This system is removed from the list of systems to be processed.
System a001223 : The system has less than 7 days of data. This system is removed from the list of systems to be processed.
System a001433 : The system has less than 7 days of data. This system is removed from the list of systems to be processed.
System a001456 : The system has less than 7 days of data. This system is removed from the list of systems to be processed.
System a001486 : The system has less than 7 days of data. This system is removed from the list of systems to be processed.
System a001489 : The system has less than 7 days of data. This system is removed from the list of systems to be processed.


### Relative production

True production scaled by the maximum production from the simulator


In [127]:
# Calculate the relative energy for each system
systemsData_RelativeMeasuredDailyEnergy_train = systemsData_MeasuredDailyEnergy_train / systemsData_EstimatedMaxDailyEnergy
systemsData_RelativeMeasuredDailyEnergy = systemsData_MeasuredDailyEnergy / systemsData_EstimatedMaxDailyEnergy

### Compare the difference between simulation with hourly and 10min sampling rate


## Correlation between Systems


In [128]:
correlation_matrix = systemsData_MeasuredDailyEnergy.corr(method='pearson', min_periods=minTrainingDays)
# set all negative value (therefore when the value of one system increasse, the other systme decrease) to 0
correlation_matrix[correlation_matrix < 0] = 0

### Plot linear regression between 2 systems


## Half-Sibling Regression


### Functions


In [129]:
def get_system_data(targetName, set='train', relative=True, max_neighbors=None):
    # take the max_neighbors best neighbours from the correlation matrix
    # if none, take all the neighbours
    if max_neighbors == None or max_neighbors > len(systemsName_Valid) - 1:
        max_neighbors = len(systemsName_Valid) - 1
    best_neighbours = correlation_matrix.loc[targetName, systemsName_Valid].sort_values(ascending=False).index[1:max_neighbors + 1]
    # Create the feature matrix X and the target vector y
    if set == 'train' and relative:
        X = systemsData_RelativeMeasuredDailyEnergy_train[best_neighbours]
        y = systemsData_RelativeMeasuredDailyEnergy_train[targetName]
    elif set == 'train' and not relative:
        X = systemsData_MeasuredDailyEnergy_train[best_neighbours]
        y = systemsData_MeasuredDailyEnergy_train[targetName]
    elif set == 'test' and relative:
        X = systemsData_RelativeMeasuredDailyEnergy_test[best_neighbours]
        y = systemsData_RelativeMeasuredDailyEnergy_test[targetName]
    elif set == 'test' and not relative:
        X = systemsData_MeasuredDailyEnergy_test[best_neighbours]
        y = systemsData_MeasuredDailyEnergy_test[targetName]
    else:
        raise ValueError(f"Invalid set value: {set}")
    # remove the observations where their is no target value
    X = X[~y.isna()]
    y = y[~y.isna()]
    return X, y


def mean_absolute_percentage_error_mean_denominator(
        y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"):
    # Copy of the function mean_absolute_percentage_error from sklearn.metrics._regression, with the denominator of the MAPE changed to the mean of the true values
    import sklearn
    y_type, y_true, y_pred, multioutput = sklearn.metrics._regression._check_reg_targets(
        y_true, y_pred, multioutput)
    sklearn.utils.validation.check_consistent_length(y_true, y_pred, sample_weight)
    epsilon = np.finfo(np.float64).eps
    mape = np.abs(y_pred - y_true) / np.maximum(np.mean(np.abs(y_true)), epsilon)
    output_errors = np.average(mape, weights=sample_weight, axis=0)
    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None
    return np.average(output_errors, weights=multioutput)


def mean_absolute_percentage_error_epsilon(y_true, y_pred, epsilon=np.finfo(np.float64).eps, *, sample_weight=None, multioutput="uniform_average"):
    # Copy of the function mean_absolute_percentage_error from sklearn.metrics._regression, with epsilon as a parameter
    import sklearn
    y_type, y_true, y_pred, multioutput = sklearn.metrics._regression._check_reg_targets(
        y_true, y_pred, multioutput)
    sklearn.utils.validation.check_consistent_length(y_true, y_pred, sample_weight)
    mape = np.abs(y_pred - y_true) / np.maximum(np.abs(y_true), epsilon)
    output_errors = np.average(mape, weights=sample_weight, axis=0)
    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        elif multioutput == "uniform_average":
            # pass None as weights to np.average: uniform mean
            multioutput = None
    return np.average(output_errors, weights=multioutput)


def mad(arr):
    return abs(arr - arr.median()).median()


def modified_z_score(arr):
    # based on https://www.ibm.com/docs/en/cognos-analytics/11.1.0?topic=terms-modified-z-score
    mad_value = mad(arr)
    if mad_value == 0:
        MeanAD = np.mean(np.abs(arr - np.mean(arr)))
        denominator = 1.253314 * MeanAD
    else:
        denominator = 1.486 * mad_value
    return (arr - np.median(arr)) / denominator


def metrics(y_true, y_pred):
    return {'MAPE': mean_absolute_percentage_error(y_true, y_pred), 'MAPE-MD': mean_absolute_percentage_error_mean_denominator(y_true, y_pred), 'MAE': mean_absolute_error(y_true, y_pred), 'RMSE': root_mean_squared_error(y_true, y_pred), 'R2': r2_score(y_true, y_pred)}


mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)
mape_eps_scorer = make_scorer(mean_absolute_percentage_error_epsilon, greater_is_better=False)
# Return the metrics for a RFR model trained on the given data.
# The entire dataset is used for training, and the OOB prediction is used to compute the metrics.


def oob_metrics(X, y, metricFct, rf_parames={}):
    model = RandomForestRegressor(oob_score=True, **rf_parames)
    y_pred = model.fit(X, y).oob_prediction_
    return metricFct(y, y_pred)
# Return the metrics for a RFR model trained on the given data.
# KFold cross-validation is used train the model and to compute the metrics.


def kfold_metrics(X, y, metricFct, rf_parames={}, n_folds=5):
    model = RandomForestRegressor(**rf_parames)
    metrics_list = []
    for train_index, test_index in KFold(n_splits=n_folds).split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        y_pred = model.fit(X_train, y_train).predict(X_test)
        metrics_list.append(metricFct(y_test, y_pred))
    if isinstance(metrics_list[0], dict):
        # Convert list of dictionaries to a DataFrame
        metrics_df = pd.DataFrame(metrics_list)
        # Compute mean for each column
        aggregated_metrics = metrics_df.mean().to_dict()
        return aggregated_metrics
    else:
        # Compute mean of the list for numerical metrics
        return np.mean(metrics_list)


def _accumulate_prediction(predict, X, out, lock):
    """    This is a utility function for joblib's Parallel.    It can't go locally in ForestClassifier or ForestRegressor, because joblib    complains that it cannot pickle it when placed there.    """
    prediction = predict(X, check_input=False)
    with lock:
        out.append(prediction)


def predict_w_std(self, X):
    """    Predict regression target and standard deviation for X.    The predicted regression target of an input sample is computed as the    mean predicted regression targets of the trees in the forest. The standard    deviation of the predicted regression targets of the trees in the forest    is also computed to provide an estimate of the prediction uncertainty.    Parameters    ----------    X : {array-like, sparse matrix} of shape (n_samples, n_features)        The input samples. Internally, its dtype will be converted to        ``dtype=np.float32``. If a sparse matrix is provided, it will be        converted into a sparse ``csr_matrix``.    Returns    -------    mean_predictions : ndarray of shape (n_samples,)        The predicted values (mean of the predictions from all estimators).    std_predictions : ndarray of shape (n_samples,)        The standard deviation of the predicted values (standard deviation of the        predictions from all estimators).    Raises    ------    NotImplementedError        If the model was trained for multi-output regression.    Notes    -----    This function does not support multi-output regression. If the model was    trained for multi-output regression, an exception will be raised.    """
    if self.n_outputs_ > 1:
        raise NotImplementedError("Variance for multi-output regression is not supported now")
    check_is_fitted(self)
    # Check data
    X = self._validate_X_predict(X)
    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)
    # avoid storing the output of every estimator by summing them here
    # Initialize a list to collect predictions from each estimator
    all_predictions = []
    # Parallel loop
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(delayed(_accumulate_prediction)(e.predict, X, all_predictions, lock)
                                                                       for e in self.estimators_)
    # Convert list to numpy array for easier manipulation
    all_predictions = np.array(all_predictions)
    # Compute mean and variance across predictions from all estimators
    mean_predictions = np.mean(all_predictions, axis=0)
    std_predictions = np.std(all_predictions, axis=0)
    return mean_predictions, std_predictions


RandomForestRegressor.predict_w_std = predict_w_std

### Hyperparameters tuning


#### Hyperparameters tuning


### Train regressors


In [130]:
serializer = PickleSerializer()

rf_regressors = {}

# Random Forest Regressor hyperparameters
n_estimators = 100  # Number of trees in random forest
max_features = 'log2'  # Number of features to consider at every split
max_depth = None  # Maximum number of levels in tree
min_samples_split = 2  # Minimum number of samples required to split a node
min_samples_leaf = 1  # Minimum number of samples required at each leaf node
max_neighbors = 20  # Maximum number of best neighbors to consider

if useCached and not forceTrain:
    # load the models in dataCacheDirpath/rf_regressors. The file name is the system name.
    for systemName in systemsName_Valid:
        try:
            rf_regressors[systemName] = serializer.retrieve_model(os.path.join(dataCacheDirpath, 'rf_regressors'), systemName)
        except FileNotFoundError:
            continue
    print(f"Loaded {len(rf_regressors)}/{len(systemsName_Valid)} models. {len(systemsName_Valid) - len(rf_regressors)} models to train.")

systemsData_RelativeExpectedDailyEnergy_train_List = []
for targetName in tqdm(set(systemsName_Valid) - set(rf_regressors), desc='Training regressors'):
    rf_regressor = RandomForestRegressor(oob_score=True, random_state=random_state, n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
    X_train, y_train = get_system_data(targetName, set='train', relative=True, max_neighbors=max_neighbors)

    # split the data into training and testing sets
    rf_regressor.fit(X_train, y_train)
    rf_regressors[targetName] = rf_regressor
    # save the rf_regressor.oob_prediction_ in systemsData_RelativeExpectedDailyEnergy_train_List
    systemsData_RelativeExpectedDailyEnergy_train_List.append(pd.Series(rf_regressor.oob_prediction_, index=X_train.index, name=targetName))
    # save the model in dataCacheDirpath/rf_regressors. The file name is the system name.
    serializer.save_model(rf_regressor, os.path.join(dataCacheDirpath, 'rf_regressors'), targetName)

# Concatenate all the columns in the list to create one dataframe
systemsData_RelativeExpectedDailyEnergy_train = pd.concat(systemsData_RelativeExpectedDailyEnergy_train_List, axis=1)
systemsData_RelativeExpectedDailyEnergy_train.index = pd.to_datetime(systemsData_RelativeExpectedDailyEnergy_train.index)
systemsData_RelativeExpectedDailyEnergy_train.sort_index(inplace=True)

Training regressors: 100%|██████████| 326/326 [02:12<00:00,  2.46it/s]


In [131]:
# Compute absolute expected daily energy
systemsData_ExpectedDailyEnergy_train = systemsData_RelativeExpectedDailyEnergy_train * systemsData_EstimatedMaxDailyEnergy

## Split between two dates

TODO


### Compupte training metrics of the model


In [132]:
cacheFilename_regressorsMetrics_train = os.path.join(dataCacheDirpath, 'metrics_train.csv')

if useCached and os.path.exists(cacheFilename_regressorsMetrics_train):
    # TODO how to deal if the cached data is not up to date and some systems have been added or removed?
    print(f"Loading cached data in {cacheFilename_regressorsMetrics_train}")
    regressorsMetrics = pd.read_csv(cacheFilename_regressorsMetrics_train, index_col=0).squeeze()
else:

    # regressorsMetrics_mape_train = pd.Series(index=systemsName_Valid, name='Train MAE')
    regressorsMetrics_mape_scaled_train = pd.Series(index=systemsName_Valid, name='Train MAPE Normalized')

    for targetName in systemsName_Valid:

        X_train, y_train = get_system_data(targetName, set='train', relative=True)

        rf_regressor = rf_regressors[targetName]
        y_pred = pd.Series(rf_regressor.oob_prediction_, index=X_train.index, name=targetName)
        y_train_scaled = y_train  # (y_train / systemsData_EstimatedMaxDailyEnergy[targetName]).dropna()
        y_pred_scaled = y_pred  # (y_pred / systemsData_EstimatedMaxDailyEnergy[targetName]).dropna()

        # regressorsMetrics_mae_train.loc[targetName] = mean_absolute_error(y_train, y_pred)
        # regressorsMetrics_mape_train.loc[targetName] = mean_absolute_percentage_error_epsilon(y_train, y_pred, epsilon=1)

        regressorsMetrics_mape_scaled_train.loc[targetName] = mean_absolute_error(y_train_scaled, y_pred_scaled)
        # regressorsMetrics_mape_scaled_train.loc[targetName] = mean_absolute_percentage_error_epsilon(y_train_scaled, y_pred_scaled, epsilon=0.01)

    # save the metrics

    # regressorsMetrics_mae_train.to_csv(cacheFilename_regressorsMetrics_train)

### Compute feature importance


In [133]:
compute_permutation_importance = False

cacheFilename_features_importance = os.path.join(dataCacheDirpath, 'features_importance.csv')
cacheFilename_permutation_importance_mean = os.path.join(dataCacheDirpath, 'permutation_importance_mean.csv')
cacheFilename_permutation_importance_std = os.path.join(dataCacheDirpath, 'permutation_importance_std.csv')

# start = time.time()

if useCached and os.path.exists(cacheFilename_features_importance):
    print(f"Loading cached data in {cacheFilename_features_importance}")
    features_importance_df = pd.read_csv(cacheFilename_features_importance, index_col=0)
else:
    features_importance_df = pd.DataFrame(index=systemsName_Valid, columns=systemsName_Valid)
    for targetName in systemsName_Valid:
        rf_regressor = rf_regressors[targetName]
        features_importance_df.loc[targetName, rf_regressor.feature_names_in_] = rf_regressor.feature_importances_
    # save the feature importances
    features_importance_df.to_csv(cacheFilename_features_importance)


if compute_permutation_importance:
    if useCached and os.path.exists(cacheFilename_permutation_importance_mean) and os.path.exists(cacheFilename_permutation_importance_std):
        print(f"Loading cached data in {cacheFilename_permutation_importance_mean}")
        permutation_importance_mean_df = pd.read_csv(cacheFilename_permutation_importance_mean, index_col=0)
        print(f"Loading cached data in {cacheFilename_permutation_importance_std}")
        permutation_importance_std_df = pd.read_csv(cacheFilename_permutation_importance_std, index_col=0)
    else:
        permutation_importance_mean_df = pd.DataFrame(index=systemsName_Valid, columns=systemsName_Valid)
        permutation_importance_std_df = pd.DataFrame(index=systemsName_Valid, columns=systemsName_Valid)
        for targetName in tqdm(systemsName_Valid):
            X, y = get_system_data(targetName)
            rf_regressor = rf_regressors[targetName]
            permutation_importance_results = permutation_importance(rf_regressor, X, y, n_repeats=5, random_state=random_state, n_jobs=-1, scoring=mae_scorer)
            permutation_importance_mean_df.loc[targetName, X.columns] = permutation_importance_results.importances_mean
            permutation_importance_std_df.loc[targetName, X.columns] = permutation_importance_results.importances_std
        # save the permutation importances
        permutation_importance_mean_df.to_csv(cacheFilename_permutation_importance_mean)
        permutation_importance_std_df.to_csv(cacheFilename_permutation_importance_std)


# print(f"Time elapsed: {time.time() - start} - Time per system: {(time.time() - start) / len(systemsName_Valid)}")

## Generate expected value for each systems


In [134]:
# valid system that have been trained
systems_trained = [systemName for systemName in systemsName_Valid if systemName in rf_regressors]

print("Systems not trained:", [systemName for systemName in systemsName_Valid if systemName not in rf_regressors])

Systems not trained: []


In [135]:
# check that all the index in systemsData_MeasuredDailyEnergy_test are in systemsData_EstimatedMaxDailyEnergy
if not systemsData_MeasuredDailyEnergy_test.index.isin(systemsData_EstimatedMaxDailyEnergy.index).all():
    raise ValueError("Some index in systemsData_MeasuredDailyEnergy_test are not in systemsData_EstimatedMaxDailyEnergy")

In [136]:
# compute relative value
systemsData_RelativeMeasuredDailyEnergy_test = systemsData_MeasuredDailyEnergy_test / systemsData_EstimatedMaxDailyEnergy

In [137]:
# compute estimate and metrics

systemsData_RelativeExpectedDailyEnergy_test_mean_List = []
systemsData_RelativeExpectedDailyEnergy_test_std_List = []

# regressorsMetrics_mae_test = pd.Series(index=systems_trained, name='Test MAE')
# regressorsMetrics_mape_test = pd.Series(index=systems_trained, name='Test MAPE')
regressorsMetrics_mape_scaled_test = pd.Series(index=systems_trained, name='Test MAPE Normalized')
# regressorsMetrics_mape_scaled_test = pd.Series(index=systems_trained, name='Test MAPE Scaled')

for targetName in tqdm(systems_trained):
    X_test, y_test = get_system_data(targetName, set='test', relative=True)
    # check that there is at least one observation
    if y_test.count() == 0:
        continue
    regressor = rf_regressors[targetName]
    fitted_features = regressor.feature_names_in_

    # adjust the feature in the validation set to match the feature in the training set
    # Identify extra columns in X_test that are not used by the regressor
    extra_features = set(X_test.columns) - set(fitted_features)
    # Drop extra columns from X_val
    X_test = X_test.drop(columns=list(extra_features), errors='ignore')
    # Identify missing columns in X_test and add them as empty columns
    missing_features = set(fitted_features) - set(X_test.columns)
    for feature in missing_features:
        X_test[feature] = np.nan

    y_mean, y_std = regressor.predict_w_std(X_test)
    y_mean = pd.Series(y_mean, index=X_test.index, name=targetName)
    y_std = pd.Series(y_std, index=X_test.index, name=targetName)
    systemsData_RelativeExpectedDailyEnergy_test_mean_List.append(y_mean)
    systemsData_RelativeExpectedDailyEnergy_test_std_List.append(y_std)

    # metrics
    y_test_scaled = y_test  # (y_test / systemsData_EstimatedMaxDailyEnergy[targetName]).dropna()
    y_mean_scaled = y_mean  # (y_mean / systemsData_EstimatedMaxDailyEnergy[targetName]).dropna()

    # regressorsMetrics_mae_test.loc[targetName] = mean_absolute_error(y_test, y_mean)
    # regressorsMetrics_mape_test.loc[targetName] = mean_absolute_percentage_error_epsilon(y_test, y_mean, epsilon=1)

    regressorsMetrics_mape_scaled_test.loc[targetName] = mean_absolute_error(y_test_scaled, y_mean_scaled)
    # regressorsMetrics_mape_scaled_test.loc[targetName] = mean_absolute_percentage_error_epsilon(y_test_scaled, y_mean_scaled, epsilon=0.01)


systemsData_RelativeExpectedDailyEnergy_test_mean = pd.concat(systemsData_RelativeExpectedDailyEnergy_test_mean_List, axis=1)
systemsData_RelativeExpectedDailyEnergy_test_mean.index = pd.to_datetime(systemsData_RelativeExpectedDailyEnergy_test_mean.index)
systemsData_RelativeExpectedDailyEnergy_test_mean.sort_index(inplace=True)

systemsData_RelativeExpectedDailyEnergy_test_std = pd.concat(systemsData_RelativeExpectedDailyEnergy_test_std_List, axis=1)
systemsData_RelativeExpectedDailyEnergy_test_std.index = pd.to_datetime(systemsData_RelativeExpectedDailyEnergy_test_std.index)
systemsData_RelativeExpectedDailyEnergy_test_std.sort_index(inplace=True)

100%|██████████| 326/326 [00:07<00:00, 42.16it/s]


In [138]:
# Compute absolute expected daily energy
systemsData_ExpectedDailyEnergy_test_mean = systemsData_RelativeExpectedDailyEnergy_test_mean * systemsData_EstimatedMaxDailyEnergy
systemsData_ExpectedDailyEnergy_test_std = systemsData_RelativeExpectedDailyEnergy_test_std * systemsData_EstimatedMaxDailyEnergy

## Statics on the models metrics


## Comparator


In [139]:
systemsData_RelativeDelta_test = systemsData_RelativeExpectedDailyEnergy_test_mean - systemsData_RelativeMeasuredDailyEnergy_test

# Detector


In [227]:
# eronate data are the data where the relative delta is lower than regressorsMetrics_test
max_z_score = 1.65  # 90% confidence interval
systemsData_RelativeDelta_test_detected_anomalies_mask = systemsData_RelativeDelta_test.loc[:, regressorsMetrics_mape_scaled_test.index] > max_z_score * regressorsMetrics_mape_scaled_test

## Detector metrics - confusion matrix


In [228]:
# Calculate TP, TN, FP, FN
aligned_simulated_anomalies_mask, aligned_detected_anomalies_mask = anomalies_mask.align(systemsData_RelativeDelta_test_detected_anomalies_mask, join='inner', axis=None)

TP = ((aligned_detected_anomalies_mask == True) & (aligned_simulated_anomalies_mask == True)).sum().sum()   
TN = ((aligned_detected_anomalies_mask == False) & (aligned_simulated_anomalies_mask == False)).sum().sum()   
FP = ((aligned_detected_anomalies_mask == False) & (aligned_simulated_anomalies_mask == True)).sum().sum()  
FN = ((aligned_detected_anomalies_mask == True) & (aligned_simulated_anomalies_mask == False)).sum().sum()

# Display the results
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")

True Positives (TP): 1587
True Negatives (TN): 30602
False Positives (FP): 43
False Negatives (FN): 368


In [180]:
import pandas as pd
import numpy as np

# Example boolean masks as DataFrames (replace these with your actual data)
simulated_anomalies_mask = pd.DataFrame(np.random.choice([True, False], size=(100, 10)))
detected_anomalies_mask = pd.DataFrame(np.random.choice([True, False], size=(50, 20)))

# Align the two DataFrames to have the same rows and columns
aligned_simulated, aligned_detected = simulated_anomalies_mask.align(detected_anomalies_mask, join='inner', axis=None)

# Calculate TP, TN, FP, FN on the aligned DataFrames
TP = ((aligned_simulated == True) & (aligned_detected == True)).sum().sum()
TN = ((aligned_simulated == False) & (aligned_detected == False)).sum().sum()
FP = ((aligned_simulated == False) & (aligned_detected == True)).sum().sum()
FN = ((aligned_simulated == True) & (aligned_detected == False)).sum().sum()

# Display the results
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")


True Positives (TP): 122
True Negatives (TN): 133
False Positives (FP): 132
False Negatives (FN): 113


# Scaling technics & Outliers removal

Compute the:

- Global mean
- Global median
- Global standard deviation
- Rolling mean
- Rolling median
- Rolling standard deviation
- Simulate max production without info
- SImulated max production with info


# Plot results


In [144]:
# Initialize the Dash app
app = dash.Dash(__name__)

tab_height = '2em'
app.layout = html.Div([
    html.Div([
        dcc.Dropdown(
            id='system-dropdown',
            options=[{'label': name, 'value': name} for name in systemsName_Valid],
            value=systemsName_Valid[0],
            style={'width': '50%'}  # Adjust width and font size
        ),
        html.Div(id='metric-text-container', style={'display': 'inline-block', 'margin-left': '20px'})  # Container for the metric text
    ], style={'display': 'flex', 'align-items': 'center'}),  # Align items horizontally
    dcc.Tabs(id='plot-tabs', value='tab-energy', children=[
        dcc.Tab(label='Absolute Energy', value='tab-energy', style={'padding': '0px', 'lineHeight': tab_height}, selected_style={'padding': '0px', 'lineHeight': tab_height, 'fontWeight': 'bold'}),  # Adjust height and line height
        dcc.Tab(label='Normalized Energy', value='tab-rel-energy', style={'padding': '0px', 'lineHeight': tab_height}, selected_style={'padding': '0px', 'lineHeight': tab_height, 'fontWeight': 'bold'}),  # Adjust height and line height
        dcc.Tab(label='Normalizer Tuning', value='tab-norm-tuning', style={'padding': '0px', 'lineHeight': tab_height}, selected_style={'padding': '0px', 'lineHeight': tab_height, 'fontWeight': 'bold'}),  # Adjust height and line height
        # plot systemsData_RelativeDelta_test
        dcc.Tab(label='All Normalized Energy', value='tab-rel-energy-all', style={'padding': '0px', 'lineHeight': tab_height}, selected_style={'padding': '0px', 'lineHeight': tab_height, 'fontWeight': 'bold'}),  # Adjust height and line height
        dcc.Tab(label='Dynamic Losses', value='tab-delta-rel-energy', style={'padding': '0px', 'lineHeight': tab_height}, selected_style={'padding': '0px', 'lineHeight': tab_height, 'fontWeight': 'bold'}),  # Adjust height and line height
        dcc.Tab(label='All Missing Value', value='tab-miss-val-all', style={'padding': '0px', 'lineHeight': tab_height}, selected_style={'padding': '0px', 'lineHeight': tab_height, 'fontWeight': 'bold'}),  # Adjust height and line height
        dcc.Tab(label='Similar neighboring systems', value='tab-neighbors', style={'padding': '0px', 'lineHeight': tab_height}, selected_style={'padding': '0px', 'lineHeight': tab_height, 'fontWeight': 'bold'}),  # Adjust height and line height

    ]),  # Adjust height for tabs
    html.Div(id='tabs-content', style={'flex': '1 1 auto'})  # Allow the tabs-content div to grow
], style={'display': 'flex', 'flexDirection': 'column', 'height': '100vh'})  # Make the outer container fill the screen height


@app.callback(
    [Output('tabs-content', 'children'),
     Output('metric-text-container', 'children')],
    [Input('plot-tabs', 'value'),
     Input('system-dropdown', 'value')]
)
def render_content(tab, selected_system):
    # Statistic text
    try:
        mae_train = regressorsMetrics_mape_scaled_train.loc[selected_system]
    except:
        mae_train = np.nan
    try:
        mae_test = regressorsMetrics_mape_scaled_test.loc[selected_system]
    except:
        mae_test = np.nan
    try:
        loss = systemsMetadata[selected_system]['metadata']['loss']
    except:
        loss = np.nan

    mae_train_text = f"Half Sibling Regressor - Train set MAPE: {mae_train * 100:.2f}%"
    mae_test_text = f"Half Sibling Regressor - Test set MAPE: {mae_test * 100:.2f}%"
    loss_text = f"Normalizer Tuning - Static System Loss : {loss * 100:.2f}%"

    metric_text_div = html.Div([
        html.Div(mae_train_text),
        html.Div(mae_test_text),
        html.Div(loss_text)
    ], style={'fontSize': 16})

    if tab == 'tab-energy':
        fig1 = go.Figure(layout_yaxis_title="Daily Energy (kWh)")

        # remove nan from systemsData_EstimatedMaxDailyEnergy[selected_system]

        try:
            estimatedMaxDailyEnergy = systemsData_EstimatedMaxDailyEnergy[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=estimatedMaxDailyEnergy.index,
                y=estimatedMaxDailyEnergy,
                mode='lines',
                name='Estimated Max Daily Energy',
                marker_color='LightSeaGreen'
            ))
        except:
            pass

        try:
            measuredDailyEnergy = systemsData_MeasuredDailyEnergy[selected_system].dropna()
            anomalous_days = anomalies_mask[anomalies_mask[selected_system] == True].index
            fig1.add_trace(go.Bar(
                x=measuredDailyEnergy.index,
                y=measuredDailyEnergy,
                # mode='markers',
                name='Measured Daily Energy',
                marker_color=np.where(measuredDailyEnergy.index.isin(anomalous_days), 'green', 'blue')
            ))
        except:
            pass

        # try:
        # measuredDailyEnergy_train_outliers = systemsData_MeasuredDailyEnergy_train_outliers[selected_system].dropna()
        #     fig1.add_trace(go.Scatter(
        #         x=measuredDailyEnergy_train_outliers.index,
        #         y=measuredDailyEnergy_train_outliers,
        #         mode='markers',
        #         name='Outliers',
        #         marker_color='yellow'
        #     ))
        # except:
        #     pass
        # try:
        #     expectedDailyEnergy_train = systemsData_ExpectedDailyEnergy_train[selected_system].dropna()
        #     fig1.add_trace(go.Scatter(
        #         x=expectedDailyEnergy_train.index,
        #         y=expectedDailyEnergy_train,
        #         mode='markers',
        #         name='Expected Daily Energy',
        #         marker_color='red'
        #     ))
        # except:
        #     pass

        try:
            expectedDailyEnergy_test_mean = systemsData_ExpectedDailyEnergy_test_mean[selected_system].dropna()
            expectedDailyEnergy_test_std = systemsData_ExpectedDailyEnergy_test_std[selected_system].dropna()
            fig1.add_trace(go.Bar(
                x=expectedDailyEnergy_test_mean.index,
                y=expectedDailyEnergy_test_mean,
                # mode='markers',
                name='Expected Daily Energy',
                marker_color='red'
                # error_y=dict(
                #     type='data',
                #     array=expectedDailyEnergy_test_std,
                #     visible=True
                # )
            ))
        except:
            pass

        # Update layout for legend position
        fig1.update_layout(
            legend=dict(
                x=0.99,
                y=0.99,
                xanchor='right',
                yanchor='top',
                orientation='h'
            )
        )

        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure

    elif tab == 'tab-norm-tuning':
        fig1 = go.Figure(layout_yaxis_title="Daily Energy (kWh)")
        try:
            measuredDailyEnergy = systemsData_MeasuredDailyEnergy[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=measuredDailyEnergy.index,
                y=measuredDailyEnergy,
                mode='markers',
                name='Measured Daily Energy',
                marker_color='blue'
            ))
        except:
            pass
        try:
            measuredMax = systemsData_Tuning_MeasureMax[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=measuredMax.index,
                y=measuredMax,
                mode='markers',
                name='Max Measured Daily Energy (7 days window)',
                marker_color='red'
            ))
        except:
            pass
        try:
            outliers = systemsData_Tuning_Outliers[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=outliers.index,
                y=outliers,
                mode='markers',
                name='Tuning Outliers',
                marker_color='yellow'
            ))
        except:
            pass
        try:
            estimatedMaxDailyEnergy_untuned = systemsData_Tuning_EstimatedMaxDailyEnergy_untuned[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=estimatedMaxDailyEnergy_untuned.index,
                y=estimatedMaxDailyEnergy_untuned,
                mode='lines',
                name='Estimated Max Daily Energy (Untuned)',
                marker_color='violet'
            ))
        except:
            pass
        try:
            estimatedMaxDailyEnergy = systemsData_EstimatedMaxDailyEnergy[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=estimatedMaxDailyEnergy.index,
                y=estimatedMaxDailyEnergy,
                mode='lines',
                name='Estimated Max Daily Energy (Tuned)',
                marker_color='LightSeaGreen'
            ))
        except:
            pass

        fig1.update_layout(
            legend=dict(
                x=0.99,
                y=0.99,
                xanchor='right',
                yanchor='top',
                orientation='h'
            )
        )

        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div
    elif tab == 'tab-rel-energy':
        fig1 = go.Figure(layout_yaxis_title="Normalized Daily Energy (%)")
        # add a line at 100% for the Estimated Max Daily Energy
        estimatedMaxDailyEnergy = systemsData_EstimatedMaxDailyEnergy[selected_system].dropna()
        fig1.add_shape(
            type="line",
            x0=estimatedMaxDailyEnergy.index.min(),
            y0=100,
            x1=estimatedMaxDailyEnergy.index.max(),
            y1=100,
            name='Estimated Max Daily Energy',
            line_color='LightSeaGreen'
            # line=dict(
            #     color="LightSeaGreen",
            #     width=2,
            #     dash="dashdot",
            # ),
        )
        try:
            relativeMeasuredDailyEnergy = systemsData_RelativeMeasuredDailyEnergy[selected_system].dropna()
            fig1.add_trace(go.Bar(
                x=relativeMeasuredDailyEnergy.index,
                y=relativeMeasuredDailyEnergy * 100,
                # mode='markers',
                name='Measured Daily Energy',
                marker_color='blue'
            ))
        except:
            pass
        try:
            relativeExpectedDailyEnergy_test_mean = systemsData_RelativeExpectedDailyEnergy_test_mean[selected_system].dropna()
            fig1.add_trace(go.Bar(
                x=relativeExpectedDailyEnergy_test_mean.index,
                y=relativeExpectedDailyEnergy_test_mean * 100,
                # mode='markers',
                name='Expected Daily Energy',
                marker_color='red'
                # error_y=dict(
                #     type='data',
                #     array=systemsData_RelativeExpectedDailyEnergy_test_std[selected_system] * 100,
                #     visible=True
                # )
            ))
        except:
            pass

        fig1.update_layout(
            legend=dict(
                x=0.99,
                y=0.99,
                xanchor='right',
                yanchor='top',
                orientation='h'
            )
        )
        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure
    elif tab == 'tab-delta-rel-energy':
        fig1 = go.Figure(layout_yaxis_title="Normalized Daily Energy Loss (%)")
        try:
            relativeDelta_test = systemsData_RelativeDelta_test[selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=relativeDelta_test.index,
                y=relativeDelta_test * 100,
                mode='markers',
                name='Normalized Delta Energy',
            ))
        except:
            pass
        try:
            relativeDelta_test_detected = systemsData_RelativeDelta_test[systemsData_RelativeDelta_test_detected_anomalies_mask][selected_system].dropna()
            fig1.add_trace(go.Scatter(
                x=relativeDelta_test_detected.index,
                y=relativeDelta_test_detected * 100,
                mode='markers',
                name='Detected Anomaly',
                marker_color='red'
            ))
        except:
            pass

        fig1.update_layout(
            legend=dict(
                x=0.99,
                y=0.99,
                xanchor='right',
                yanchor='top',
                orientation='h'
            )
        )
        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure
    elif tab == 'tab-rel-energy-all':
        fig1 = go.Figure(layout_yaxis_title="Normalized Daily Energy (%)")
        # try:
        features_importance_norm = features_importance_df.loc[selected_system] / features_importance_df.loc[selected_system].max()

        for systemName in systemsName_Valid:
            if systemName != selected_system:
                if features_importance_norm[systemName] > 0.1:
                    fig1.add_trace(go.Scatter(
                        x=systemsData_RelativeMeasuredDailyEnergy[systemName].index,
                        y=systemsData_RelativeMeasuredDailyEnergy[systemName] * 100,
                        mode='markers',
                        name=f'{systemName}',
                        marker_color='blue',
                        marker_opacity=features_importance_norm[systemName]
                    ))
        fig1.add_trace(go.Scatter(
            x=systemsData_RelativeMeasuredDailyEnergy[selected_system].index,
            y=systemsData_RelativeMeasuredDailyEnergy[selected_system] * 100,
            mode='markers',
            name=f'{selected_system}',
            marker_color='red'
        ))
        fig1.update_layout(yaxis=dict(range=[-5, 120]))

        # except:
        #     pass

        return dcc.Graph(figure=fig1, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure
    elif tab == 'tab-miss-val-all':
        measures = systemsData_MeasuredDailyEnergy

        # Sort columns by the number of missing values
        sorted_columns = measures.isnull().sum().sort_values().index
        sorted_measures = measures[sorted_columns]

        # Create a boolean DataFrame where True indicates missing values
        missing_values = (~sorted_measures.isnull()).astype(int)

        # Plot heatmap
        fig = go.Figure(data=go.Heatmap(
            z=missing_values,
            x=missing_values.columns,
            y=missing_values.index,
            showscale=False,
            colorscale='Greys'  # Set colorscale to black and white
        ))
        fig.update_layout(
            yaxis=dict(
                showticklabels=True,  # Show y-axis tick labels
                autorange='reversed'  # Invert the y-axis
            ),
            yaxis_tickmode='array',
            yaxis_tickvals=pd.date_range(start=missing_values.index.min(), end=missing_values.index.max(), freq='ME'),
            yaxis_ticktext=pd.date_range(start=missing_values.index.min(), end=missing_values.index.max(), freq='ME').strftime('%b %Y')
        )

        return dcc.Graph(figure=fig, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure

    elif tab == 'tab-neighbors':
        fig2 = go.Figure()

        # Add initial traces with secondary y-axis
        try:
            fig2.add_trace(go.Bar(
                x=features_importance_df.columns,
                y=features_importance_df.loc[selected_system],
                name='Impurity-based Importance',
                yaxis='y1',
                offsetgroup=1
            ))
            fig2.update_layout(
                yaxis1=dict(
                    title='Impurity-based Importance',
                    range=[0, features_importance_df.loc[selected_system].max()],
                )
            )
        except:
            pass
        try:
            fig2.add_trace(go.Bar(
                x=permutation_importance_mean_df.columns,
                y=permutation_importance_mean_df.loc[selected_system],
                name='Permutation Importance',
                yaxis='y2',
                offsetgroup=2
            ))
            fig2.update_layout(
                yaxis2=dict(
                    title='Permutation Importance',
                    overlaying='y',
                    side='right',
                    range=[0, permutation_importance_mean_df.loc[selected_system].max()],
                )
            )
        except:
            pass

        return dcc.Graph(figure=fig2, style={'height': '100%', 'width': '100%'}), metric_text_div  # Adjust height and width of the figure


def open_browser():
    webbrowser.open("http://127.0.0.1:8050/")


if __name__ == '__main__':
    # Open the Dash app in a new browser window
    Timer(1, open_browser).start()
    app.run_server(debug=True, use_reloader=False)

## Interpreting result from a prediction

https://towardsdatascience.com/interpreting-random-forests-638bca8b49ea


# Comparisons between regressors


In [None]:
# show stats for the series regressorsMetrics_mape_scaled_test
test_mape = regressorsMetrics_mape_scaled_test[regressorsMetrics_mape_scaled_test < 1] * 100
print(test_mape.describe())
# show a box plot
# test_mape.plot.box()

# show histogram
test_mape.plot.hist(bins=50)

# Impact of number of neighbors


In [None]:
# Random Forest Regressor hyperparameters
n_estimators = 100  # Number of trees in random forest
max_features = 'log2'  # Number of features to consider at every split
max_depth = None  # Maximum number of levels in tree
min_samples_split = 2  # Minimum number of samples required to split a node
min_samples_leaf = 1  # Minimum number of samples required at each leaf node


df_mape_n_neighbors_test = pd.DataFrame(index=systemsName_Valid, columns=[1, 2, 5, 10, 50, 150, 300])
for max_neighbors in df_mape_n_neighbors_test.columns:
    for targetName in tqdm(systemsName_Valid):
        # train
        rf_regressor = RandomForestRegressor(oob_score=True, random_state=random_state, n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_train, y_train = get_system_data(targetName, set='train', relative=True, max_neighbors=max_neighbors)
        # split the data into training and testing sets
        rf_regressor.fit(X_train, y_train)

        # predict
        X_test, y_test = get_system_data(targetName, set='test', relative=True)
        if y_test.count() == 0:
            continue

        # adjust the feature in the validation set to match the feature in the training set
        fitted_features = rf_regressor.feature_names_in_
        # Identify extra columns in X_test that are not used by the regressor
        extra_features = set(X_test.columns) - set(fitted_features)
        # Drop extra columns from X_val
        X_test = X_test.drop(columns=list(extra_features), errors='ignore')
        # Identify missing columns in X_test and add them as empty columns
        missing_features = set(fitted_features) - set(X_test.columns)
        for feature in missing_features:
            X_test[feature] = np.nan

        y_mean_array = rf_regressor.predict(X_test)
        y_pred = pd.Series(y_mean_array, index=X_test.index, name=targetName)
        df_mape_n_neighbors_test.loc[targetName, max_neighbors] = mean_absolute_error(y_test, y_pred)

In [None]:
# do a figure with 3 box plot, one for each column of the dataframe df_mape_train_save.
df_mape_n_neighbors_test_filtered = df_mape_n_neighbors_test[df_mape_n_neighbors_test < 0.5] * 100
fig = go.Figure()
for column in df_mape_n_neighbors_test_filtered.columns:
    fig.add_trace(go.Box(y=df_mape_n_neighbors_test_filtered[column], name=column, boxmean=True))

# remove legend
fig.update_layout(showlegend=False)
# set the x axis name to "Training Days"
fig.update_xaxes(title_text='Neighbors systems')
# set y axis name to "MAPE (%)"
fig.update_yaxes(title_text='MAPE (%)')
# set the fig size to 1000 x 666
fig.update_layout(width=1000, height=666)
fig.show()

# Impact of the number of training data


In [None]:
# Random Forest Regressor hyperparameters
n_estimators = 100  # Number of trees in random forest
max_features = 'log2'  # Number of features to consider at every split
max_depth = None  # Maximum number of levels in tree
min_samples_split = 2  # Minimum number of samples required to split a node
min_samples_leaf = 1  # Minimum number of samples required at each leaf node


df_mape_n_history_test = pd.DataFrame(index=systemsName_Valid, columns=[2])
for max_training_days in df_mape_n_history_test.columns:
    for targetName in tqdm(systemsName_Valid):
        # train
        rf_regressor = RandomForestRegressor(oob_score=False, random_state=random_state, n_estimators=n_estimators, max_features=max_features, max_depth=max_depth, min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf)
        X_train, y_train = get_system_data(targetName, set='train', relative=True)

        # keep only the last max_training_days days
        X_train = X_train.iloc[-max_training_days:]
        y_train = y_train.iloc[-max_training_days:]

        # split the data into training and testing sets
        rf_regressor.fit(X_train, y_train)

        # predict
        X_test, y_test = get_system_data(targetName, set='test', relative=True)
        if y_test.count() == 0:
            continue

        # adjust the feature in the validation set to match the feature in the training set
        fitted_features = rf_regressor.feature_names_in_
        # Identify extra columns in X_test that are not used by the regressor
        extra_features = set(X_test.columns) - set(fitted_features)
        # Drop extra columns from X_val
        X_test = X_test.drop(columns=list(extra_features), errors='ignore')
        # Identify missing columns in X_test and add them as empty columns
        missing_features = set(fitted_features) - set(X_test.columns)
        for feature in missing_features:
            X_test[feature] = np.nan

        y_mean_array = rf_regressor.predict(X_test)
        y_pred = pd.Series(y_mean_array, index=X_test.index, name=targetName)
        df_mape_n_history_test.loc[targetName, max_training_days] = mean_absolute_error(y_test, y_pred)

In [None]:
# do a figure with 3 box plot, one for each column of the dataframe df_mape_train_save.
df_mape_n_history_test_filtered = df_mape_n_history_test_save[df_mape_n_history_test_save < 0.5] * 100
fig = go.Figure()
for column in df_mape_n_history_test_filtered.columns:
    fig.add_trace(go.Box(y=df_mape_n_history_test_filtered[column], name=column, boxmean=True))

# remove legend
fig.update_layout(showlegend=False)
# set the x axis name to "Training Days"
fig.update_xaxes(title_text='Training Days')
# set y axis name to "MAPE (%)"
fig.update_yaxes(title_text='MAPE (%)')
fig.update_layout(width=1000, height=666)
fig.show()

In [None]:
df_mape_n_history_test_filtered.astype(float).describe()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

# Step 1: Convert 'training_time' from 'HH:MM' format to minutes
data = {'days': [2, 7, 14, 30, 180, 365],
        'training_time': ['02:36', '02:27', '02:29', '02:47', '03:30', '04:10']}

df = pd.DataFrame(data)

# Convert training time to minutes
df['training_time_seconds'] = df['training_time'].apply(lambda x: int(x.split(':')[0]) * 60 + int(x.split(':')[1])) / 326

df

# Simulate anomalies
