# Import libs


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pvlib
import json
import os
from pvlib.pvsystem import PVSystem, Array, FixedMount
from pvlib.location import Location
from pvlib.modelchain import ModelChain
from pvlib.temperature import TEMPERATURE_MODEL_PARAMETERS
import plotly.graph_objects as go
import plotly.io as pio
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import forestci as fci
from sklearn.model_selection import KFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import threading
from sklearn.metrics import make_scorer
import dash
from dash import dcc, html
import plotly.graph_objects as go
from dash.dependencies import Input, Output
import webbrowser
from threading import Timer
import requests
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import datetime
from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import (
    check_is_fitted,
)
from sklearn.ensemble._base import _partition_estimators

import pickle
import joblib
import os

# Setup


## Setup Directories


In [2]:
pio.renderers.default = "browser"  # render plotly figures in browser

# The root data folder comes from the environment so that no machine-specific
# absolute path has to be hard-coded in the notebook.
PARENT_DATA_DIR = os.getenv('PARENT_DATA_DIR')
if PARENT_DATA_DIR is None:
    raise ValueError("PARENT_DATA_DIR environment variable is not set")

# os.path.join uses the separator of the running platform; the previous
# hand-written "\PRiOT\dataExport_2" suffix only produced a valid path on Windows.
data_dirpath = os.path.join(PARENT_DATA_DIR, "PRiOT", "dataExport_2")
cache_dirpath = os.path.join(data_dirpath, "cache")
logs_dirpath = "../logs"

# exist_ok=True makes the cell idempotent without a separate existence check.
for _dirpath in (logs_dirpath, cache_dirpath):
    os.makedirs(_dirpath, exist_ok=True)

## Setup Parameters


In [3]:
# Run-time switches of the notebook.
# NOTE(review): `use_cache` and `force_train_hsr` are not referenced in the cells
# visible here - confirm how they are consumed further down.
use_cache = False
force_train_hsr = True
# Passed as `tune` to MaxProductionNormalizers.run().
force_tune_MaxProductionNormalizer = True
# Seed forwarded to DataHandler.create_train_test_set().
random_state = 42

# NOTE(review): not referenced in the visible cells (create_train_test_set is
# called with a literal max_train_size=50) - confirm the intended wiring.
max_training_days = None # None = Maximum possible
# Minimum number of non-null training values per system, passed to
# DataHandler.check_training_size().
min_training_days = 14
# Passed as `test_size` to DataHandler.create_train_test_set().
testing_days = 14


# Functions


## Serializer


In [4]:
# https://scikit-learn.org/stable/model_persistence.html
class ModelSerializer:
    """Base class that delegates persistence to a pickle-like backend.

    `serial_type` is any object exposing `dump(obj, target)` and
    `load(source)`; the subclasses in this file pass `joblib` or `pickle`.
    """

    def _save_model(self, model, serial_type, save_params):
        """Write `model` through `serial_type.dump`, using `save_params` as the target."""
        dump = serial_type.dump
        dump(model, save_params)

    def _retrieve_model(self, serial_type, retrieve_params):
        """Return the object `serial_type.load` reads from `retrieve_params`."""
        load = serial_type.load
        restored = load(retrieve_params)
        return restored


class JoblibSerializer(ModelSerializer):
    """Serializer storing models as `<filename>.joblib` files through joblib."""

    def save_model(self, model, save_model_path, filename):
        """Dump `model` to `<save_model_path>/<filename>.joblib`."""
        target_path = os.path.join(save_model_path, filename + ".joblib")
        super()._save_model(model, joblib, target_path)

    def retrieve_model(self, save_model_path, filename):
        """Load and return the model stored in `<save_model_path>/<filename>.joblib`."""
        source_path = os.path.join(save_model_path, filename + '.joblib')
        restored = super()._retrieve_model(joblib, source_path)
        return restored


class PickleSerializer(ModelSerializer):
    """Serializer storing models as `<filename>.pkl` files through pickle.

    NOTE(review): unpickling executes arbitrary code; only load files that
    were produced by a trusted source.
    """

    def save_model(self, model, save_model_path, filename):
        """Pickle `model` into `<save_model_path>/<filename>.pkl` (binary mode)."""
        target_path = os.path.join(save_model_path, filename + ".pkl")
        with open(target_path, 'wb') as pickle_file:
            super()._save_model(model, pickle, pickle_file)

    def retrieve_model(self, save_model_path, filename):
        """Unpickle and return the model stored in `<save_model_path>/<filename>.pkl`."""
        source_path = os.path.join(save_model_path, filename + ".pkl")
        with open(source_path, 'rb') as pickle_file:
            restored = super()._retrieve_model(pickle, pickle_file)
        return restored

## Utils


In [5]:
def get_altitude_from_wgs84(longitude, latitude, timeout=10):
    """Return the altitude of a WGS84 point using the geo.admin.ch web services.

    Two HTTP requests are made: the first converts the WGS84 coordinates to
    LV95, the second looks up the height for the LV95 coordinates.

    Parameters
    ----------
    longitude, latitude :
        WGS84 coordinates, sent as `easting` / `northing` to the conversion service.
    timeout : float, optional
        Timeout in seconds applied to each HTTP request. Without a timeout a
        stalled connection would block the notebook forever.

    Returns
    -------
    float
        The `height` field of the altitude service response.
        NOTE(review): presumably metres - confirm against the service documentation.

    Raises
    ------
    Exception
        If one of the services answers with a status code other than 200.
    """

    def _get_json(url, params, error_prefix):
        # Shared request/validation logic of the two service calls.
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            raise Exception(error_prefix + response.text)
        return response.json()

    # Convert WGS84 to LV95
    lv95_data = _get_json(
        "https://geodesy.geo.admin.ch/reframe/wgs84tolv95",
        {"easting": longitude, "northing": latitude, "format": "json"},
        "Error converting WGS84 to LV95: ",
    )

    # Get altitude from LV95 coordinates
    altitude_data = _get_json(
        "https://api3.geo.admin.ch/rest/services/height",
        {"easting": lv95_data["easting"], "northing": lv95_data["northing"]},
        "Error retrieving altitude: ",
    )

    return float(altitude_data["height"])


def daily_energy(df_power):
    """Convert a regularly sampled power series/frame into daily energy.

    Each power sample is multiplied by the sampling period expressed in hours
    (so kW become kWh), then the samples are summed per calendar day.

    Parameters
    ----------
    df_power : pandas.Series or pandas.DataFrame
        Power values indexed by a DatetimeIndex with a fixed frequency
        (`index.freq` must be set).

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Energy summed per day (`resample('D').sum()`).

    Raises
    ------
    ValueError
        If the index has no frequency; the sampling period would otherwise be
        undefined and the result silently meaningless.
    """
    freq = df_power.index.freq
    if freq is None:
        raise ValueError("The index of df_power must have a fixed frequency (index.freq is None).")

    # total_seconds() covers the whole period; `.seconds` is only the seconds
    # component (< 1 day) and would e.g. be 0 for a 24 h sampling period.
    hours_per_sample = pd.Timedelta(freq).total_seconds() / 3600

    # Power (kW) * duration (h) = energy (kWh) of each sample
    energy_per_sample = df_power * hours_per_sample

    # Resample to daily frequency and sum the values
    return energy_per_sample.resample('D').sum()

# Import


## Import metadata


## Import data


In [6]:
class DataHandler:

    def __init__(self, data_dirpath, cache_dirpath):
        self.data_dirpath = data_dirpath
        self.cache_dirpath = cache_dirpath
        self.metadata_filepath = os.path.join(self.data_dirpath, "metadata.json")

        self._measures = None
        self._train_index = None
        self._test_index = None
        self._metadata = None
        
        self.valid_systems = None
        
        self.estimated_max_production = None

        self.tuner_estimated_max_productions_untuned = None
        self.tuner_measures_max_mask = None
        self.tuner_measures_outliers_mask = None

        self.hsr_outliers_mask = None

    def get_metadata(self, system_name=None):
        if self._metadata is None:
            raise ValueError("Metadata not loaded. Please load the metadata first.")
        if system_name is not None:
            return self._metadata[system_name]
        return self._metadata
    
    def get_measures(self, set='all', systems_name=None):
        if self._measures is None:
            raise ValueError("Measures not loaded. Please load the measures first.")
        if self.valid_systems is None:
            raise ValueError("Valid systems not set. Please check the data integrity first.")
        
        systems_name = systems_name if systems_name is not None else self.valid_systems

        if set == 'all':
            return self._measures.loc[:, systems_name].dropna(axis='index', how='all').copy()
        elif set == 'train':
            if self._train_index is None:
                raise ValueError("Train index not set. Please create a train-test set first.")
            if self.hsr_outliers_mask is None:
                raise ValueError("Outliers not checked. Please check the outliers first.")
            # take the observation of the training set, for the desired systems, without the outliers values
            return self._measures.loc[self._train_index, systems_name][~self.hsr_outliers_mask].dropna(axis='index', how='all').copy()
        elif set == 'test':
            if self._test_index is None:
                raise ValueError("Test index not set. Please create a train-test set first.")
            return self._measures.loc[self._test_index, systems_name].dropna(axis='index', how='all').copy()
        else:
            raise ValueError("Invalid set value. Please use 'all', 'train' or 'test'.")
            
    def get_missing_value(self, sorted=True):
        if sorted:
            # Sort columns by number of missing values
            sorted_columns = self._measures.isnull().sum().sort_values().index
            sorted_measures = self._measures[sorted_columns]

            # Create a boolean DataFrame where True indicates missing values
            missing_values = sorted_measures.isnull()
        else:
            missing_values = self._measures.isnull()
        return missing_values
    
    def normalize(self, data):
        if self.estimated_max_production is None:
            raise ValueError("Estimated max production not set. Please estimate the max production first.")
        if isinstance(data, pd.DataFrame):
            # Normalize DataFrame
            data_norm = data / self.estimated_max_production.loc[data.index, data.columns]
        elif isinstance(data, pd.Series):
            # Normalize Series
            data_norm = data / self.estimated_max_production.loc[data.index, data.name]
        else:
            raise ValueError("Data must be a DataFrame or Series")
        
        return data_norm


    def load_metadata(self):
        """Load the metadata JSON file, post-process it and write it back.

        For every system:
        - `loc_altitude` is fetched with get_altitude_from_wgs84() when it is
          missing and both coordinates are present (one web lookup per system);
        - `loss` defaults to 0 when missing;
        - keys shaped like `<prefix>_mod<N>_<field>` are moved from the
          `metadata` mapping to `arrays[<N>][<prefix>_<field>]`.

        The post-processed metadata is saved back to the same file.
        """
        with open(self.metadata_filepath, 'r') as f:
            self._metadata = json.load(f)

        for _, system_metadata in tqdm(self._metadata.items(), desc="Post-processing metadata"):
            system_fields = system_metadata['metadata']

            # Add altitude to metadata, if not already present (TODO: improve with multi-threading)
            if "loc_altitude" not in system_fields:
                if "loc_longitude" in system_fields and "loc_latitude" in system_fields:
                    system_fields["loc_altitude"] = get_altitude_from_wgs84(system_fields["loc_longitude"], system_fields["loc_latitude"])

            # Add the default loss to metadata if not already present
            if 'loss' not in system_fields:
                system_fields['loss'] = 0

            # Move the per-array keys ("modN" in the second "_"-separated segment)
            # into a dictionary keyed by the array number.
            keys_to_delete = []
            for key, value in system_fields.items():
                if 'mod' not in key:
                    continue
                parts = key.split('_')
                if len(parts) < 2 or not parts[1]:
                    # No "<prefix>_modN..." structure (e.g. a key such as "model"):
                    # leave the key untouched instead of failing with an IndexError.
                    continue
                segment = parts[1]
                # Use every trailing digit so that "mod10" maps to array "10" and
                # not to "0"; fall back to the last character when there is no digit.
                trailing_digits = segment[len(segment.rstrip('0123456789')):]
                array_num = str(int(trailing_digits)) if trailing_digits else segment[-1]
                # Remove the module segment from the key
                new_key = '_'.join(parts[:1] + parts[2:])
                system_metadata.setdefault('arrays', {}).setdefault(array_num, {})[new_key] = value
                keys_to_delete.append(key)
            for key in keys_to_delete:
                del system_fields[key]

        # Save metadata with new format and value
        self.save_metadata()

    def save_metadata(self):
        with open(self.metadata_filepath, 'w') as f:
            json.dump(self._metadata, f, indent=4)

    def load_csv(self):
        """Load the daily energy of every `*.csv` file found in `data_dirpath`.

        The system name is the part of the file name before the first "_".
        For each file the `Timestamp` column (epoch milliseconds) is converted
        to a calendar date and the `tt_forward_active_energy_total_toDay`
        column is summed per date. The result is stored in `self._measures`
        (one column per system). Rows sharing a date with another row of the
        same file are written to `measureDuplicates.csv` in the module-level
        `logs_dirpath` folder.
        """
        measures_dic = {}
        duplicates_list = []
        for filename in tqdm(os.listdir(self.data_dirpath), desc="Loading CSV files"):
            if not filename.endswith(".csv"):
                continue
            system_name = filename.split('_')[0]
            system_measures = pd.read_csv(os.path.join(self.data_dirpath, filename))
            # Convert the timestamp to a datetime in the local timezone
            system_measures['Datetime'] = pd.to_datetime(system_measures['Timestamp'], unit='ms', utc=True).dt.tz_convert('Europe/Zurich')
            # Keep only the date, as the value is a daily production. PRiOT normally
            # exports the energy of a day at 00:00 local time of that day, but the
            # daylight saving time is apparently not always handled and the export
            # sometimes happens at 23:00 the day before. Adding 1 h before taking the
            # date maps both cases to the correct day.
            system_measures['Date'] = pd.to_datetime((system_measures['Datetime'] + pd.Timedelta(hours=1)).dt.date)
            # Set the date as index
            system_measures.set_index('Date', inplace=True)
            # Collect the rows whose date appears more than once, for logging purposes.
            # `.any()` is required here: the length of the boolean array is non-zero
            # for every non-empty file, whether or not it contains duplicates.
            duplicated_rows = system_measures.index.duplicated(keep=False)
            if duplicated_rows.any():
                duplicates_list.append(system_measures[duplicated_rows])
            # Keep only the daily energy as a Series and sum the values per date to
            # merge the duplicates.
            # NOTE(review): sum() turns a date whose values are all NaN into 0.0;
            # consider min_count=1 if such dates should stay missing.
            daily_values = system_measures['tt_forward_active_energy_total_toDay']
            measures_dic[system_name] = daily_values.groupby('Date').sum()

        # Convert the dictionary of series to a pandas dataframe
        self._measures = pd.DataFrame(measures_dic)

        # Log the duplicates (pd.concat raises on an empty list, so guard it)
        duplicates_df = pd.concat(duplicates_list) if duplicates_list else pd.DataFrame()
        log_filename = os.path.join(logs_dirpath, "measureDuplicates.csv")
        print(f"Number of duplicate dates found: {len(duplicates_df)} (see log file {log_filename} for more details)")
        duplicates_df.to_csv(log_filename, index=True)

    def check_integrity(self):
        """Determine which systems have all the data required by the pipeline.

        A system is valid when it has at least one measure, has metadata with
        numeric `loc_latitude`, `loc_longitude`, `loc_altitude` and `pv_kwp`,
        and has at least one array whose `pv_tilt`, `pv_azimut`, `pv_wp` and
        `pv_number` are numeric. Values that are not numbers but can be
        converted with int() or float() are converted in place in the metadata.
        Every problem is printed and the valid systems are stored in
        `self.valid_systems`.

        Raises
        ------
        ValueError
            If the metadata or the measures have not been loaded.
        """
        # Check if the metadata is loaded
        if self._metadata is None:
            raise ValueError("Metadata not loaded. Please load the metadata first.")

        # Check if the measures are loaded
        if self._measures is None:
            raise ValueError("Measures not loaded. Please load the measures first.")

        def _to_number(value):
            # int is tried first, then float; TypeError is caught as well so that
            # a null or a list in the JSON marks the system invalid instead of
            # aborting the whole check.
            for cast in (int, float):
                try:
                    return cast(value)
                except (TypeError, ValueError):
                    continue
            return None

        def _check_numeric_keys(system_name, container, keys, suffix):
            # Validate `keys` in `container`, converting values in place.
            # Returns True when every key is present and numeric.
            all_valid = True
            for key in keys:
                if key not in container:
                    all_valid = False
                    print(f"System {system_name} : No '{key}' found{suffix}")
                elif not isinstance(container[key], (int, float)):
                    number = _to_number(container[key])
                    if number is None:
                        all_valid = False
                        print(f"System {system_name} : The key-value '{key}:{container[key]}' is not a number{suffix}")
                    else:
                        container[key] = number
            return all_valid

        invalid_systems = []
        for system_name in tqdm(self._measures.columns, desc="Checking data integrity"):
            valid_system = True

            # Check if the system has measures
            if self._measures[system_name].count() == 0:
                valid_system = False
                print(f"System {system_name} : No measures found")

            # Check if the system has metadata
            if system_name not in self._metadata:
                valid_system = False
                print(f"System {system_name} : No metadata found")
            else:
                system_metadata = self._metadata[system_name]

                # Check metadata for the entire system
                if not _check_numeric_keys(system_name, system_metadata['metadata'],
                                           ['loc_latitude', 'loc_longitude', 'loc_altitude', 'pv_kwp'], ""):
                    valid_system = False

                # Check metadata for the arrays
                if 'arrays' not in system_metadata or len(system_metadata['arrays']) == 0:
                    print(f"System {system_name} : No PV arrays found")
                    valid_system = False
                else:
                    for array_num, array_data in system_metadata['arrays'].items():
                        if not _check_numeric_keys(system_name, array_data,
                                                   ['pv_tilt', 'pv_azimut', 'pv_wp', 'pv_number'],
                                                   f" for array {array_num}"):
                            valid_system = False

            if not valid_system:
                invalid_systems.append(system_name)

        self.valid_systems = self._measures.columns.drop(invalid_systems)

        print(f"Number of systems with all the necessary data: {len(self.valid_systems)}/{len(self._measures.columns)}")
    
    def create_train_test_set(self, test_size=None, max_train_size=None, random_state=None, shuffle=True):
        """Split the measure dates into a training and a testing index.

        The split is done with sklearn's train_test_split on the index of the
        measures. When `max_train_size` is smaller than the training index,
        the training index is reduced to `max_train_size` dates drawn at random
        among the dates with the fewest missing values.

        Parameters
        ----------
        test_size, random_state, shuffle :
            Forwarded to train_test_split.
        max_train_size : int, optional
            Maximum number of training dates; None keeps the full training index.

        Raises
        ------
        ValueError
            If check_outliers() has not been run before.
        """
        if self.hsr_outliers_mask is None:
            raise ValueError("Outliers not checked. Please check the outliers first.")

        self._train_index, self._test_index = train_test_split(self._measures.index, test_size=test_size, random_state=random_state, shuffle=shuffle)

        if max_train_size is None or max_train_size >= len(self._train_index):
            return

        # Training set as seen by the consumers: outliers removed and dates
        # without any value dropped, so it can be shorter than the training index.
        training_set = self.get_measures(set='train')
        if len(training_set) <= max_train_size:
            # Fewer usable dates than requested: keep them all. Sampling
            # `max_train_size` rows would fail on such a small population.
            self._train_index = training_set.index
            return

        # Select `max_train_size` dates with the fewest missing values: look at the
        # number of missing values of the `max_train_size`-th date once the dates are
        # sorted by missing values, then sample among all the dates that have at most
        # that many missing values.
        nbr_missing_values_per_day = training_set.isnull().sum(axis=1)
        max_missing_value = nbr_missing_values_per_day.sort_values().iloc[max_train_size - 1]
        valid_observations = training_set[nbr_missing_values_per_day <= max_missing_value]
        self._train_index = valid_observations.sample(n=max_train_size, random_state=random_state).index

    def check_outliers(self, max_threshold=1.1, min_threshold=0.01):
        norm_measures = self.normalize(self.get_measures('all'))
        self.hsr_outliers_mask = (norm_measures > max_threshold) | (norm_measures < min_threshold)
        # If 10% of the values are outliers, remove the system from the list of valid systems
        outliers_count = self.hsr_outliers_mask.sum(axis=0)
        invalid_systems = outliers_count[outliers_count > 0.1 * len(norm_measures)].index
        self.valid_systems = self.valid_systems.drop(invalid_systems)
        for system_name in invalid_systems:
            print(f"System {system_name} : More than 10% of the values are outliers. This system is removed from the list of systems to be trained.")

    def check_training_size(self, min_training_days=14):
        # Calculate the number of valid (non-null) values
        valid_values_count = self.get_measures(set='train').notnull().sum(axis=0)

        # Get the boolean Series where valid values count is less than 14
        invalid_systems = valid_values_count[valid_values_count < min_training_days].index

        # Remove invalid systems from the valid_system list
        self.valid_systems = self.valid_systems.drop(invalid_systems)

        for system_name in invalid_systems:
            print(f"System {system_name} : The system has less than {min_training_days} days of training data. This system is removed from the list of systems to be trained.")
        

        
        

# Max production Normalizer

In [7]:
class MaxProductionNormalizer:
    def __init__(self):
        self.estimated_max_production = None
        self.model = None

    def create_model(self, system_metadata):
        """Build the pvlib ModelChain of a system and store it in `self.model`.

        `system_metadata['metadata']` provides the location (`loc_latitude`,
        `loc_longitude`, `loc_altitude`), the total peak power `pv_kwp` and the
        `loss`; `system_metadata['arrays']` provides `pv_tilt`, `pv_azimut`,
        `pv_wp` and `pv_number` for each array.
        """
        site = system_metadata['metadata']
        latitude = site['loc_latitude']
        longitude = site['loc_longitude']
        altitude = site['loc_altitude']
        inverter_pdc0 = site['pv_kwp'] * 1000  # kWp -> Wp
        # presumably converts a fraction into the percentage expected by the
        # pvwatts losses model - TODO confirm
        nameplate_loss = site['loss'] * 100

        # One fixed-mount array per entry of the metadata
        arrays = [
            Array(
                mount=FixedMount(surface_tilt=array_data['pv_tilt'], surface_azimuth=array_data['pv_azimut'], racking_model='open_rack'),
                module_parameters={'pdc0': array_data['pv_wp'], 'gamma_pdc': -0.004},
                module_type='glass_polymer',
                modules_per_string=array_data['pv_number'],
                strings=1,
                temperature_model_parameters=TEMPERATURE_MODEL_PARAMETERS['sapm']['open_rack_glass_polymer'],
            )
            for array_data in system_metadata['arrays'].values()
        ]

        # Only the nameplate rating loss is used; every other loss is set to 0
        losses_parameters = {
            'nameplate_rating': nameplate_loss,
            **dict.fromkeys(['soiling', 'shading', 'snow', 'mismatch', 'wiring', 'connections', 'lid', 'age', 'availability'], 0),
        }

        location = Location(latitude=latitude, longitude=longitude, altitude=altitude, tz='Europe/Zurich')
        system = PVSystem(
            arrays=arrays,
            inverter_parameters={'pdc0': inverter_pdc0, 'eta_inv_nom': 0.96},
            losses_parameters=losses_parameters,
        )
        self.model = ModelChain(system, location, clearsky_model='ineichen', aoi_model='no_loss', spectral_model="no_loss", losses_model='pvwatts')

    def generate_estimation_data(self, dates, sampling_freq='1h'):
        """Estimate the clear-sky daily energy for every day spanned by `dates`.

        The model is run on clear-sky irradiance sampled at `sampling_freq`
        from the first to the last date (both days complete); the AC power is
        converted to daily energy and stored in `self.estimated_max_production`
        with a timezone-naive date index.

        Raises ValueError when no model has been created yet.
        """
        if self.model is None:
            raise ValueError("Model not set. Please create a model first.")

        # The last day must be covered entirely, but date_range() interprets a date as
        # 00:00. One day is therefore added to the end and inclusive='left' drops the
        # resulting extra timestamp at 00:00 of the following day.
        first_day = dates.min()
        day_after_last = dates.max() + pd.Timedelta(days=1)
        timestamps = pd.date_range(
            start=first_day,
            end=day_after_last,
            freq=sampling_freq,
            tz=self.model.location.tz,
            inclusive='left',
        )

        # Clear-sky irradiance for the requested timestamps
        # TODO adjust the clear sky model to take into account the horizon https://pvlib-python.readthedocs.io/en/stable/gallery/shading/plot_simple_irradiance_adjustment_for_horizon_shading.html
        clearsky_weather = self.model.location.get_clearsky(timestamps)

        # Run the model and integrate the AC power per day
        self.model.run_model(clearsky_weather)
        ac_power_kw = self.model.results.ac / 1000  # Convert W to kW
        self.estimated_max_production = daily_energy(ac_power_kw)
        naive_dates = pd.to_datetime(self.estimated_max_production.index.date)
        self.estimated_max_production.index = naive_dates

    def tune(self, system_measures, window=7):
        if self.estimated_max_production is None:
            raise ValueError("Estimated max production not set. Please estimate the max production first.")

        # Remove the obvious outliers. It's important before calculating the std, which can be strongly impacted by the strong outliers.
        outliers_mask = system_measures > 2 * self.estimated_max_production[system_measures.index]
        
        valid_system_measures = system_measures[~outliers_mask]
        # if 10% of the data is removed as outliers, we consider that the system is not valid
        if outliers_mask.sum() / system_measures.size > 0.1:
            return None, None, None, None
        # Keep only the max measured value
        # Iterate over windows of a given size, and keep only the maximum value in each window
        max_measured_mask = pd.Series(False, index=system_measures.index)
        for i in range(0, len(system_measures), window):
            window_data = valid_system_measures.iloc[i:i + window]
            if not window_data.empty and not window_data.isna().all():
                max_measured_mask[window_data.idxmax(skipna=True)] = True

        # Calculate the relative difference between the maximum measured and maximum estimated value
        realtive_difference = system_measures[max_measured_mask] / self.estimated_max_production

        # Compute statistics
        std = realtive_difference.std()
        mean = realtive_difference.mean()

        # Remove the outilers that have a z-score greater than 1
        z_scores = np.abs(realtive_difference - mean) / std

        # Add the measure with a z-score greater than 1 to the previous outliers (AND operation)
        outliers_mask = outliers_mask | (z_scores > 1)

        # Get the loss that overestimate the estimate maximum daily energy
        loss = 1 - realtive_difference[~outliers_mask].max()

        return loss, std, max_measured_mask, outliers_mask
        

In [8]:
class MaxProductionNormalizers:
    """Runs a MaxProductionNormalizer for every valid system of a DataHandler."""

    def run(self, data_handler, tune=True):
        """Estimate (and optionally tune) the maximum daily production of each system.

        For every system of `data_handler.valid_systems` a clear-sky model is
        built and run on the dates that have a measure. With `tune=True` the
        loss stored in the metadata is reset to 0, re-estimated from the
        measures and the model is run a second time with the new loss. A
        system is removed from `valid_systems` when its tuning fails or when
        the standard deviation returned by the tuning is greater than 1.

        The results are stored on `data_handler` (`estimated_max_production`
        and the three `tuner_*` frames) and the metadata is saved.
        """
        estimated_max_productions_dic = {}
        tuner_estimated_max_productions_untuned_dic = {}
        tuner_measures_max_mask_dic = {}
        tuner_measures_outliers_mask_dic = {}
        unfitted_systems = []

        def _concat_columns(series_by_system):
            # pd.concat raises on an empty mapping, which happens for the tuner
            # results when tune=False or when no system could be tuned.
            if not series_by_system:
                return pd.DataFrame()
            return pd.concat(series_by_system, axis=1)

        for system_name in tqdm(data_handler.valid_systems, desc="Max production normalizers"):
            system_metadata = data_handler.get_metadata(system_name)

            # Reset the loss in the metadata if we want to tune the estimators
            if tune:
                system_metadata['metadata']['loss'] = 0

            measures = data_handler.get_measures(systems_name=system_name).dropna()
            dates = measures.index

            # First pass: estimation with the loss currently in the metadata
            max_production_normalizer = MaxProductionNormalizer()
            max_production_normalizer.create_model(system_metadata)
            max_production_normalizer.generate_estimation_data(dates, sampling_freq='1h')
            estimated_max_productions_dic[system_name] = max_production_normalizer.estimated_max_production

            if not tune:
                continue

            loss, std, max_measured_mask, outliers_mask = max_production_normalizer.tune(measures, window=7)

            # The system is discarded when the tuning fails, or when the std is greater
            # than 1: such a system is not well fitted by the maximum energy estimator
            # and could impact the training of the RF model.
            tuning_failed = loss is None or std is None or max_measured_mask is None or outliers_mask is None
            if tuning_failed or std > 1:
                unfitted_systems.append(system_name)
                continue

            # Write the loss in the metadata
            system_metadata['metadata']['loss'] = loss

            # Keep the untuned estimation to plot the difference before/after tuning
            tuner_estimated_max_productions_untuned_dic[system_name] = max_production_normalizer.estimated_max_production
            tuner_measures_max_mask_dic[system_name] = max_measured_mask
            tuner_measures_outliers_mask_dic[system_name] = outliers_mask

            # Second pass: estimation with the tuned loss
            max_production_normalizer.create_model(system_metadata)
            max_production_normalizer.generate_estimation_data(dates, sampling_freq='1h')
            estimated_max_productions_dic[system_name] = max_production_normalizer.estimated_max_production

        # Concatenate all the dictionaries to create dataframes
        data_handler.estimated_max_production = _concat_columns(estimated_max_productions_dic)
        data_handler.tuner_estimated_max_productions_untuned = _concat_columns(tuner_estimated_max_productions_untuned_dic)
        data_handler.tuner_measures_max_mask = _concat_columns(tuner_measures_max_mask_dic)
        data_handler.tuner_measures_outliers_mask = _concat_columns(tuner_measures_outliers_mask_dic)

        # Remove the unfitted systems from the valid systems
        data_handler.valid_systems = data_handler.valid_systems.drop(unfitted_systems)
        for system_name in unfitted_systems:
            print(f"System {system_name} : We can't find the model corresponding to the measured data. This system is removed from the list of systems to be processed.")

        # Save the metadata with the new loss value
        data_handler.save_metadata()







In [14]:
# NOTE(review): this cell uses `data_handler`, which is created in a cell placed
# further down in the notebook; it fails on a fresh "Restart & Run All".
measures = data_handler.get_measures()

# Sort columns by the number of missing values
sorted_columns = measures.isnull().sum().sort_values().index
sorted_measures = measures[sorted_columns]

# Presence matrix: 1 where a measure is available, 0 where it is missing
# (the variable name is kept although the values are inverted).
missing_values = (~sorted_measures.isnull()).astype(int)

# One tick per month end over the plotted period, computed once for values and labels
month_ticks = pd.date_range(start=missing_values.index.min(), end=missing_values.index.max(), freq='ME')

# Plot heatmap
fig = go.Figure(data=go.Heatmap(
    z=missing_values,
    x=missing_values.columns,
    y=missing_values.index,
    showscale=False,
    colorscale='Greys'  # Set colorscale to black and white
))
fig.update_layout(
    yaxis=dict(
        showticklabels=True,  # Show y-axis tick labels
        autorange='reversed'  # Invert the y-axis
    ),
    yaxis_tickmode='array',
    yaxis_tickvals=month_ticks,
    yaxis_ticktext=month_ticks.strftime('%b %Y')
)
fig.show()

In [9]:
# Load the metadata and the measures, then keep only the systems whose data
# pass the integrity checks (the order of these calls matters).
data_handler = DataHandler(data_dirpath, cache_dirpath)
data_handler.load_metadata()
data_handler.load_csv()
data_handler.check_integrity()


# Estimate the clear-sky maximum production of every valid system; the loss is
# re-tuned from the measures when force_tune_MaxProductionNormalizer is True.
max_production_normalizers = MaxProductionNormalizers()
max_production_normalizers.run(data_handler, tune=force_tune_MaxProductionNormalizer)


Post-processing metadata: 100%|██████████| 481/481 [00:00<00:00, 237487.96it/s]


Loading CSV files: 100%|██████████| 454/454 [00:10<00:00, 41.89it/s]


Number of duplicate dates found: 1004 (see log file ../logs\measureDuplicates.csv for more details)


Checking data integrity: 100%|██████████| 451/451 [00:00<00:00, 6855.40it/s]


System 2026239 : No measures found
System 2026239 : No 'pv_kwp' found
System a001001 : No 'pv_wp' found for array 1
System a001001 : No 'pv_number' found for array 1
System a001028 : The key-value 'pv_azimut:39-129-219' is not a number for array 1
System a001038 : The key-value 'pv_azimut:58-138-238' is not a number for array 1
System a001103 : No 'pv_tilt' found for array 1
System a001116 : No 'pv_wp' found for array 2
System a001118 : The key-value 'pv_azimut:55-235' is not a number for array 1
System a001122 : No 'pv_tilt' found for array 2
System a001122 : No 'pv_tilt' found for array 1
System a001164 : No measures found
System a001165 : The key-value 'pv_azimut:90° / 270°' is not a number for array 1
System a001199 : No 'pv_azimut' found for array 1
System a001222 : The key-value 'pv_azimut:56-146-236' is not a number for array 1
System a001226 : No 'pv_tilt' found for array 1
System a001226 : No 'pv_azimut' found for array 1
System a001226 : No 'pv_number' found for array 1
Syste

Max production normalizers:  56%|█████▋    | 201/356 [01:55<01:29,  1.74it/s]


KeyboardInterrupt: 

In [None]:
# Flag the outliers first: create_train_test_set() raises if the outlier mask is missing.
data_handler.check_outliers(max_threshold=1.1, min_threshold=0.01)
# NOTE(review): max_train_size is the literal 50 here; the `max_training_days`
# parameter defined in the setup cell is not used - confirm which one is intended.
data_handler.create_train_test_set(test_size=testing_days, max_train_size=50, random_state=random_state, shuffle=False)
# Drop the systems that have fewer than `min_training_days` training values.
data_handler.check_training_size(min_training_days=min_training_days)

# App

In [None]:
# Initialize the Dash app
# no_raise_mode = True

app = dash.Dash(__name__)

tab_height = '2em'
# Every tab shares the same compact styling; the selected tab is additionally bold.
tab_style = {'padding': '0px', 'lineHeight': tab_height}
tab_selected_style = {**tab_style, 'fontWeight': 'bold'}
# (label, value) of each tab, in display order
tab_definitions = [
    ('Energy', 'tab-energy'),
    ('Normalizer Tuning', 'tab-norm-tuning'),
    ('Relative Energy', 'tab-rel-energy'),
    ('Delta Error', 'tab-delta-rel-energy'),  # plot systemsData_RelativeDelta_val
    ('All Relative Energy', 'tab-rel-energy-all'),
    ('All Missing Value', 'tab-miss-val-all'),
    ('Similar neighboring systems', 'tab-neighbors'),
]

system_names = list(data_handler.valid_systems)

app.layout = html.Div([
    html.Div([
        dcc.Dropdown(
            id='system-dropdown',
            options=[{'label': name, 'value': name} for name in system_names],
            # No preselection when no system is valid (indexing [0] would raise)
            value=system_names[0] if system_names else None,
            style={'width': '50%'}  # Adjust width and font size
        ),
        html.Div(id='metric-text-container', style={'display': 'inline-block', 'margin-left': '20px'})  # Container for the metric text
    ], style={'display': 'flex', 'align-items': 'center'}),  # Align items horizontally
    dcc.Tabs(id='plot-tabs', value='tab-energy', children=[
        dcc.Tab(label=tab_label, value=tab_value, style=dict(tab_style), selected_style=dict(tab_selected_style))
        for tab_label, tab_value in tab_definitions
    ]),
    html.Div(id='tabs-content', style={'flex': '1 1 auto'})  # Allow the tabs-content div to grow
], style={'display': 'flex', 'flexDirection': 'column', 'height': '100vh'})  # Make the outer container fill the screen height


def _add_marker_trace(fig, series, name, color=None):
    """Add `series` to `fig` as a markers-only scatter trace plotted against its index."""
    trace_kwargs = dict(x=series.index, y=series, mode='markers', name=name)
    if color is not None:
        # Only set the marker colour when one is requested, so the trace otherwise keeps plotly's default.
        trace_kwargs['marker_color'] = color
    fig.add_trace(go.Scatter(**trace_kwargs))


def _add_marker_trace_if_available(fig, get_series, name, color=None):
    """Best-effort variant of `_add_marker_trace`: the trace is skipped if `get_series()` or plotting raises."""
    try:
        _add_marker_trace(fig, get_series(), name, color)
    except Exception:
        # Deliberately best-effort: a failing lookup only leaves this one trace out of the figure.
        # `Exception` (not a bare `except`) so KeyboardInterrupt / SystemExit still propagate.
        pass


def _anchor_legend_top_right(fig):
    """Place a horizontal legend in the top-right corner of the plotting area."""
    fig.update_layout(
        legend=dict(
            x=0.99,
            y=0.99,
            xanchor='right',
            yanchor='top',
            orientation='h'
        )
    )


def _full_size_graph(fig):
    """Wrap `fig` in a `dcc.Graph` that fills its parent container."""
    return dcc.Graph(figure=fig, style={'height': '100%', 'width': '100%'})


@app.callback(
    [Output('tabs-content', 'children'),
     Output('metric-text-container', 'children')],
    [Input('plot-tabs', 'value'),
     Input('system-dropdown', 'value')]
)
def render_content(tab, selected_system):
    """Render the figure of the active tab and the metric summary of the selected system.

    Parameters
    ----------
    tab : str
        Value of the 'plot-tabs' component (one of the 'tab-...' identifiers).
    selected_system
        Value of the 'system-dropdown' component; used as the key/column name
        for all per-system lookups.

    Returns
    -------
    tuple
        ``(tabs_content, metric_text_div)`` matching the two declared outputs.
        For an unrecognised ``tab`` the first element is ``dash.no_update`` so
        the callback still returns one value per output.

    Notes
    -----
    Per-system lookups are best-effort: when one fails, the corresponding
    trace (or metric, shown as ``nan``) is simply left out.
    """
    # Statistic text: every metric falls back to NaN when it cannot be looked up.
    try:
        mae_train = regressorsMetrics_train.loc[selected_system]
    except Exception:
        mae_train = np.nan
    try:
        mae_val = regressorsMetrics_val.loc[selected_system]
    except Exception:
        mae_val = np.nan
    try:
        loss = data_handler.get_metadata(selected_system)['metadata']['loss']
    except Exception:
        loss = np.nan

    mae_train_text = f"Estimator Train Error : {mae_train * 100:.2f}%"
    mae_test_text = f"Estimator Test Error  : {mae_val * 100:.2f}%"
    loss_text = f"System Loss   : {loss * 100:.2f}%"

    metric_text_div = html.Div([
        html.Div(mae_train_text),
        html.Div(mae_test_text),
        html.Div(loss_text)
    ], style={'fontSize': 16})

    if tab == 'tab-energy':
        fig = go.Figure(layout_yaxis_title="Daily Energy (kWh)")
        _add_marker_trace_if_available(
            fig,
            lambda: data_handler.estimated_max_production[selected_system].dropna(),
            'Estimated Max Daily Energy',
            'LightSeaGreen'
        )
        _add_marker_trace_if_available(
            fig,
            lambda: data_handler.get_measures(set='all', systems_name=selected_system),
            'Measured Daily Energy',
            'blue'
        )
        # Only the mean of the expected energy is drawn; no error bars are plotted.
        _add_marker_trace_if_available(
            fig,
            lambda: systemsData_ExpectedDailyEnergy_val_mean[selected_system].dropna(),
            'Expected Daily Energy',
            'red'
        )
        _anchor_legend_top_right(fig)
        return _full_size_graph(fig), metric_text_div

    elif tab == 'tab-norm-tuning':
        fig = go.Figure(layout_yaxis_title="Daily Energy (kWh)")
        _add_marker_trace_if_available(
            fig,
            lambda: data_handler.get_measures(set='all', systems_name=selected_system),
            'Measured Daily Energy',
            'blue'
        )
        _add_marker_trace_if_available(
            fig,
            lambda: data_handler._measures[data_handler.tuner_measures_max_mask][selected_system].dropna(),
            'Max Measured Daily Energy (7 days)',
            'red'
        )
        _add_marker_trace_if_available(
            fig,
            lambda: data_handler._measures[data_handler.tuner_measures_outliers_mask][selected_system].dropna(),
            'Tuning Outliers',
            'yellow'
        )
        _add_marker_trace_if_available(
            fig,
            lambda: data_handler.tuner_estimated_max_productions_untuned[selected_system].dropna(),
            'Estimated Max Daily Energy (Untuned)',
            'violet'
        )
        _add_marker_trace_if_available(
            fig,
            lambda: data_handler.estimated_max_production[selected_system].dropna(),
            'Estimated Max Daily Energy',
            'LightSeaGreen'
        )
        _anchor_legend_top_right(fig)
        return _full_size_graph(fig), metric_text_div

    elif tab == 'tab-rel-energy':
        fig = go.Figure(layout_yaxis_title="Proportional Daily Energy (%)")
        # Horizontal reference line at 100 % spanning the dates of the estimated max production.
        # Guarded like the other tabs so a missing/unselected system does not fail the whole callback.
        try:
            estimated_max = data_handler.estimated_max_production[selected_system].dropna()
            fig.add_shape(
                type="line",
                x0=estimated_max.index.min(),
                y0=100,
                x1=estimated_max.index.max(),
                y1=100,
                name='Estimated Max Daily Energy',
                line_color='LightSeaGreen'
            )
        except Exception:
            pass
        _add_marker_trace_if_available(
            fig,
            lambda: data_handler.normalize(data_handler.get_measures(set='all', systems_name=selected_system)) * 100,
            'Measured Daily Energy',
            'blue'
        )
        # Only the mean of the relative expected energy is drawn; no error bars are plotted.
        _add_marker_trace_if_available(
            fig,
            lambda: systemsData_RelativeExpectedDailyEnergy_val_mean[selected_system].dropna() * 100,
            'Expected Daily Energy',
            'red'
        )
        return _full_size_graph(fig), metric_text_div

    elif tab == 'tab-delta-rel-energy':
        fig = go.Figure(layout_yaxis_title="Proportional Daily Energy Error(%)")
        _add_marker_trace_if_available(
            fig,
            lambda: systemsData_RelativeDelta_val[selected_system].dropna() * 100,
            'Relative Delta Energy'
        )
        _add_marker_trace_if_available(
            fig,
            lambda: systemsData_RelativeDelta_val_detected[selected_system].dropna() * 100,
            'Detected Errors',
            'red'
        )
        return _full_size_graph(fig), metric_text_div

    elif tab == 'tab-rel-energy-all':
        fig = go.Figure(layout_yaxis_title="Proportional Daily Energy (%)")
        # One guard around the whole sequence: the first failure stops adding traces,
        # but whatever was already added is still shown.
        try:
            all_data = data_handler.normalize(data_handler.get_measures(set='all'))
            for system_name in data_handler.valid_systems:
                if system_name != selected_system:
                    _add_marker_trace(fig, all_data[system_name] * 100, f'{system_name}', 'blue')
            # The selected system is added last, in red, after all the other (blue) systems.
            _add_marker_trace(fig, all_data[selected_system] * 100, f'{selected_system}', 'red')
            fig.update_layout(yaxis=dict(range=[-5, 120]))
        except Exception:
            pass
        return _full_size_graph(fig), metric_text_div

    elif tab == 'tab-miss-val-all':
        # Invert the missing-value table and cast to int (1 where get_missing_value is False, else 0).
        # Not guarded: this view does not depend on the selected system.
        data = (~data_handler.get_missing_value(sorted=True)).astype(int)

        fig = go.Figure(data=go.Heatmap(
            z=data,
            x=data.columns,
            y=data.index,
            showscale=False,
            colorscale='Greys'  # Set colorscale to black and white
        ))
        # Month-end tick positions, computed once and reused for the labels.
        month_ticks = pd.date_range(start=data.index.min(), end=data.index.max(), freq='ME')
        fig.update_layout(
            yaxis=dict(
                autorange='reversed',  # Invert the y-axis
                showticklabels=True,
                tickmode='array',
                tickvals=month_ticks,
                ticktext=month_ticks.strftime('%b %Y')
            )
        )
        return _full_size_graph(fig), metric_text_div

    elif tab == 'tab-neighbors':
        fig = go.Figure()

        # Impurity-based importance on the primary (left) y-axis.
        try:
            fig.add_trace(go.Bar(
                x=features_importance_df.columns,
                y=features_importance_df.loc[selected_system],
                name='Impurity-based Importance',
                yaxis='y1',
                offsetgroup=1
            ))
            fig.update_layout(
                yaxis1=dict(
                    title='Impurity-based Importance',
                    range=[0, features_importance_df.loc[selected_system].max()],
                )
            )
        except Exception:
            pass
        # Permutation importance on a secondary (right) y-axis overlaid on the first.
        try:
            fig.add_trace(go.Bar(
                x=permutation_importance_mean_df.columns,
                y=permutation_importance_mean_df.loc[selected_system],
                name='Permutation Importance',
                yaxis='y2',
                offsetgroup=2
            ))
            fig.update_layout(
                yaxis2=dict(
                    title='Permutation Importance',
                    overlaying='y',
                    side='right',
                    range=[0, permutation_importance_mean_df.loc[selected_system].max()],
                )
            )
        except Exception:
            pass

        return _full_size_graph(fig), metric_text_div

    # Unknown tab value: leave the tab content untouched but still return a value
    # for each of the two outputs (an implicit None is not a valid multi-output return).
    return dash.no_update, metric_text_div


def open_browser():
    """Open the local address of the Dash app (port 8060) with the `webbrowser` module."""
    dashboard_url = "http://127.0.0.1:8060/"
    webbrowser.open(dashboard_url)


if __name__ == '__main__':
    # Schedule the browser to open one second from now; the timer runs on its own
    # thread, so the server start below is not delayed by it.
    Timer(interval=1, function=open_browser).start()
    # Serve the Dash app on the same port that open_browser() points at.
    app.run_server(port=8060, debug=True, use_reloader=False)