# Setup & Import


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pvlib
import json
import os
from pvlib.pvsystem import PVSystem, Array, FixedMount
from pvlib.location import Location
from pvlib.modelchain import ModelChain
from pvlib.temperature import TEMPERATURE_MODEL_PARAMETERS
import plotly.graph_objects as go
import plotly.io as pio
from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error, root_mean_squared_error
from sklearn.inspection import permutation_importance
import forestci as fci

pio.renderers.default = "browser"  # render plotly figures in browser

# Root directory containing the PRiOT exports, provided through the environment
PARENT_DATA_DIR = os.getenv('PARENT_DATA_DIR')
if PARENT_DATA_DIR is None:
    raise ValueError("PARENT_DATA_DIR environment variable is not set")


# Build the path with os.path.join so that it is valid on Windows as well as on macOS/Linux
# (a hard-coded "\" separator only works on Windows).
dataDirpath = os.path.join(PARENT_DATA_DIR, "PRiOT", "dataExport_3400_daily")  # "/Applications/Documents/TM Maxime/dataExport_3400_daily"#
dataCacheDirpath = os.path.join(dataDirpath, "cache")
logsDirpath = "../logs"
forceImport = True  # when True, cached dataframes are ignored and recomputed
forceTrain = True  # when True, regressors are retrained instead of being loaded from the cache
random_state = 42

# exist_ok avoids the check-then-create race of `if not os.path.exists(...)`
os.makedirs(logsDirpath, exist_ok=True)

# Serializer


In [None]:
# https://scikit-learn.org/stable/model_persistence.html
import pickle
import joblib
import os


class ModelSerializer:
    """Base class delegating model persistence to a ``dump``/``load`` module.

    ``serial_type`` is any object exposing ``dump(obj, target)`` and
    ``load(source)`` (``pickle`` and ``joblib`` are used by the subclasses).
    """

    def _save_model(self, model, serial_type, save_params):
        """Write ``model`` to ``save_params`` using ``serial_type.dump``."""
        serial_type.dump(model, save_params)

    def _retrieve_model(self, serial_type, retrieve_params):
        """Return the object read from ``retrieve_params`` using ``serial_type.load``."""
        return serial_type.load(retrieve_params)

    @staticmethod
    def _ensure_directory(save_model_path):
        """Create ``save_model_path`` (and its parents) if it does not exist yet."""
        # An empty path means "current directory": nothing to create
        if save_model_path:
            os.makedirs(save_model_path, exist_ok=True)


# save_model_path = "Serialized_models\\"


class JoblibSerializer(ModelSerializer):
    """Serializer storing models as ``<save_model_path>/<filename>.joblib``."""

    def save_model(self, model, save_model_path, filename):
        """Save ``model`` with joblib, creating the target directory if needed."""
        self._ensure_directory(save_model_path)
        super()._save_model(model, joblib, os.path.join(save_model_path, filename + ".joblib"))

    def retrieve_model(self, save_model_path, filename):
        """Load and return the model stored in ``<filename>.joblib``.

        Raises ``FileNotFoundError`` when the file does not exist.
        """
        return super()._retrieve_model(joblib, os.path.join(save_model_path, filename + '.joblib'))


class PickleSerializer(ModelSerializer):
    """Serializer storing models as ``<save_model_path>/<filename>.pkl``."""

    def save_model(self, model, save_model_path, filename):
        """Save ``model`` with pickle, creating the target directory if needed."""
        self._ensure_directory(save_model_path)
        with open(os.path.join(save_model_path, filename + ".pkl"), 'wb') as f:
            super()._save_model(model, pickle, f)

    def retrieve_model(self, save_model_path, filename):
        """Load and return the model stored in ``<filename>.pkl``.

        Raises ``FileNotFoundError`` when the file does not exist.
        """
        # SECURITY: unpickling executes arbitrary code; only load files written by this project.
        with open(os.path.join(save_model_path, filename + ".pkl"), 'rb') as f:
            return super()._retrieve_model(pickle, f)

## Import metadata


In [None]:
# Metadata of every system, stored next to the CSV exports
metadataFilepath = os.path.join(dataDirpath, "metadata.json")

# JSON text is UTF-8: decode it explicitly instead of relying on the platform default encoding
# (e.g. cp1252 on Windows), which would corrupt non-ASCII characters.
with open(metadataFilepath, 'r', encoding='utf-8') as f:
    systemsMetadata = json.load(f)

## Import measures


In [None]:
# cacheFilename_systemsData_MeasuredDailyEnergy = os.path.join(dataCacheDirpath,'systemsData_MeasuredDailyEnergy.pkl')

# if not forceImport and os.path.exists(cacheFilename_systemsData_MeasuredDailyEnergy):
#     print(f"Loading cached data in {cacheFilename_systemsData_MeasuredDailyEnergy}")
#     systemsData_MeasuredDailyEnergy = pd.read_pickle(cacheFilename_systemsData_MeasuredDailyEnergy)
# else:

# Read every CSV export of the data directory into a dict keyed by system name
systemsData = {}
for file in os.listdir(dataDirpath):
    if not file.endswith(".csv"):
        continue

    # The system name is the part of the file name located before the first underscore
    systemName = file.split("_")[0]
    measures = pd.read_csv(os.path.join(dataDirpath, file))

    # 'Timestamp' is read as epoch milliseconds (UTC) and converted to Swiss local time
    localDatetime = pd.to_datetime(measures['Timestamp'], unit='ms', utc=True).dt.tz_convert('Europe/Zurich')
    measures['Datetime'] = localDatetime

    # Keep only the date, as the value is a daily production. One hour is added to cope with daylight saving time:
    # PRiOT normally exports the energy of a day at midnight local time of that day (e.g. the energy of July 1st is
    # stored at July 1st 00:00 Europe/Zurich), but the daylight saving time does not always seem to be handled
    # correctly and the export is sometimes stamped 23:00 the day before (e.g. June 30th 23:00 for July 1st).
    # Adding 1h makes both cases fall on the right date.
    measures['Date'] = (localDatetime + pd.Timedelta(hours=1)).dt.date
    # systemsData[systemName]['energy_daily_norm'] = systemsData[systemName]['tt_forward_active_energy_total_toDay'] / metadata[systemName]['metadata']['pv_kwp']

    systemsData[systemName] = measures

systemsName = list(systemsData)

df_duplicate_list = []
for systemName, systemData in systemsData.items():
    # Collect every row whose date appears more than once, to write them to a log file afterwards
    duplicates = systemData[systemData['Date'].duplicated(keep=False)]
    if len(duplicates) > 0:
        df_duplicate_list.append(duplicates)

        # For each duplicated date, keep only the row with the biggest tt_forward_active_energy_total_toDay.
        # TODO maybe we should sum the energy of the duplicates instead of keeping the biggest one. However, when looking in PRiOT Portal, it seems that only the biggest value is represented in the daily energy. We do the same here.
        # NaN values are sorted first: with the default (NaN last), keep='last' would keep a missing value
        # instead of an actual measure.
        systemData.sort_values('tt_forward_active_energy_total_toDay', ascending=True, na_position='first', inplace=True)
        systemData.drop_duplicates(subset='Date', keep='last', inplace=True)

    # Set the date as the index and sort the data by date
    systemData.set_index('Date', inplace=True)
    systemData.sort_index(ascending=True, inplace=True)

# Save the duplicated dates to a log file (pd.concat raises on an empty list, hence the fallback)
df_duplicate = pd.concat(df_duplicate_list) if df_duplicate_list else pd.DataFrame()
print(f"Number of duplicate dates found: {len(df_duplicate)} (see log file for more details)")
df_duplicate.to_csv(os.path.join(logsDirpath, 'duplicateDates.csv'), index=True)

## ----------------------------------------------- ##
## Convert data & Filter out invalid PRiOT systems ##
## ----------------------------------------------- ##

def _coerceToNumber(container, key):
    """Make sure ``container[key]`` is a number, converting it in place when possible.

    The value is left untouched when it is already an ``int``/``float``. Otherwise a
    conversion to ``int`` and then to ``float`` is attempted and stored back in ``container``.

    Returns ``True`` when the value is (now) a number, ``False`` when it cannot be converted.
    """
    value = container[key]
    if isinstance(value, (int, float)):
        return True
    for convert in (int, float):
        try:
            container[key] = convert(value)
        except (TypeError, ValueError):
            # TypeError covers non-string values such as None or lists, ValueError covers non-numeric strings
            continue
        return True
    return False


systemsName_Valid = systemsName.copy()
for systemName in systemsName:
    missingData = False
    if len(systemsData[systemName]) == 0:
        missingData = True
        print(f"No measures found for system {systemName}")

    # A CSV export can exist for a system that has no (or incomplete) entry in the metadata file
    systemMetadata = systemsMetadata.get(systemName) or {}
    generalMetadata = systemMetadata.get('metadata') or {}
    arraysMetadata = systemMetadata.get('arrays') or {}

    for key in ['loc_latitude', 'loc_longitude', 'pv_kwp']:
        if key not in generalMetadata:
            missingData = True
            print(f"No {key} found for {systemName}")
        # test that the value is a number (numeric strings are converted in place)
        elif not _coerceToNumber(generalMetadata, key):
            missingData = True
            print(f"The key-value '{key}:{generalMetadata[key]}' is not a number for system {systemName}")

    if len(arraysMetadata) == 0:
        print(f"No PV arrays found for system {systemName}")
        missingData = True
    for array_num, arrayData in arraysMetadata.items():
        for key in ['pv_tilt', 'pv_azimut', 'pv_wp', 'pv_number']:
            if key not in arrayData:
                missingData = True
                print(f"No {key} found for array {array_num} of system {systemName}")
            # test that the value is a number (numeric strings are converted in place)
            elif not _coerceToNumber(arrayData, key):
                missingData = True
                print(f"The key-value '{key}:{arrayData[key]}' is not a number for array {array_num} of system {systemName}")

    if missingData:
        systemsName_Valid.remove(systemName)
        print(f"-> Removing system {systemName} from the list of systems")

print(f"Number of systems with all the necessary data: {len(systemsName_Valid)}/{len(systemsName)}")

# Keep only the valid systems having at least `minimumDays` days of data
minimumDays = 200
systemsName_Remaining = []
for systemName in systemsName_Valid:
    if len(systemsData[systemName]) >= minimumDays:
        systemsName_Remaining.append(systemName)
        continue
    print(f"-> Removing system {systemName} from the list of systems because it has less than {minimumDays} days of data")

print(f"Number of systems with at least {minimumDays} days of data: {len(systemsName_Remaining)}/{len(systemsName)}")

## ---------------------------------------------------------------------------- ##
## Create one 2D DataFrame with the daily production of every remaining systems ##
## ---------------------------------------------------------------------------- ##

# One Series per remaining system, holding its 'tt_forward_active_energy_total_toDay' column named after the system.
# `rename` returns a renamed copy: the Series selected from the source dataframe is not modified in place.
systemsData_MeasuredDailyEnergy_List = [
    systemsData[systemName]['tt_forward_active_energy_total_toDay'].rename(systemName)
    for systemName in systemsName_Remaining
]
if not systemsData_MeasuredDailyEnergy_List:
    # pd.concat would fail with a generic "No objects to concatenate"
    raise ValueError("No system remains after filtering: cannot build the measured daily energy dataframe")

# Concatenate all the Series side by side: one row per date, one column per system
systemsData_MeasuredDailyEnergy = pd.concat(systemsData_MeasuredDailyEnergy_List, axis=1)
systemsData_MeasuredDailyEnergy.index = pd.to_datetime(systemsData_MeasuredDailyEnergy.index)
systemsData_MeasuredDailyEnergy.sort_index(inplace=True)

## ------------------ ##
## Save the dataframe ##
## ------------------ ##
# Save the dataframe for later use
# systemsData_MeasuredDailyEnergy.to_pickle(cacheFilename_systemsData_MeasuredDailyEnergy)

# Display the dataframe (last expression of the notebook cell)
systemsData_MeasuredDailyEnergy

### Show missing values


## Simulate max production


### Functions


In [None]:
# Convert the power production with a given frequency to the total daily energy
def daily_energy(df_power):
    """Convert a regularly sampled power time series into daily energy.

    Parameters
    ----------
    df_power : pandas.Series or pandas.DataFrame
        Power values (kW) indexed by a regularly spaced ``DatetimeIndex``.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        Energy (kWh) summed per calendar day.

    Raises
    ------
    ValueError
        If the index has no frequency and none can be inferred from its values.
    """
    freq = df_power.index.freq
    if freq is None:
        # The frequency attribute is lost by some operations: fall back to inferring it from the timestamps
        try:
            inferred = pd.infer_freq(df_power.index)
        except (TypeError, ValueError) as err:
            raise ValueError("Cannot determine the sampling frequency of the power time series") from err
        if inferred is None:
            raise ValueError("Cannot determine the sampling frequency of the power time series")
        freq = pd.tseries.frequencies.to_offset(inferred)

    # total_seconds() is the full duration of one sample. `.seconds` is only the seconds *component*
    # (the days are dropped), which is wrong for sampling periods of one day or more.
    hours_per_sample = pd.Timedelta(freq).total_seconds() / 3600

    # Energy (kWh) = power (kW) x duration (h), then summed per day
    energy = df_power * hours_per_sample
    return energy.resample('D').sum()

# Simulate the daily production of a system from a start date to an end date using the given PVLib ModelChain


def simulateMaxDailyEnergy(startDate, endDate, modelChain: ModelChain, samplingFreq='1h'):
    """Simulate the clear-sky daily energy of a system between two dates (both included).

    The clear-sky weather of ``modelChain.location`` is generated at ``samplingFreq``, the
    model chain is run on it, and the resulting AC output is divided by 1000 and aggregated
    per day with ``daily_energy``. The returned object is indexed by ``datetime.date``.
    """
    # TODO It is possible to take into account the horizon, using this method: https://pvlib-python.readthedocs.io/en/stable/gallery/shading/plot_simple_irradiance_adjustment_for_horizon_shading.html
    # The end date must be simulated entirely (until 23:59): the range goes up to the following
    # midnight, which is itself excluded by inclusive='left'.
    exclusiveEnd = endDate + pd.Timedelta(days=1)
    timestamps = pd.date_range(
        start=startDate,
        end=exclusiveEnd,
        freq=samplingFreq,
        tz=modelChain.location.tz,
        inclusive='left',
    )

    clearSkyWeather = modelChain.location.get_clearsky(timestamps)  # In W/m2
    modelChain.run_model(clearSkyWeather)

    acPower_kW = modelChain.results.ac / 1000  # Convert W to kW
    maxDailyEnergy = daily_energy(acPower_kW)
    maxDailyEnergy.index = maxDailyEnergy.index.date
    return maxDailyEnergy

In [None]:
cacheFilename_systemsData_SimulatedMaxDailyEnergy = os.path.join(dataCacheDirpath, 'systemsData_SimulatedMaxDailyEnergy.pkl')

if not forceImport and os.path.exists(cacheFilename_systemsData_SimulatedMaxDailyEnergy):
    # NOTE: the cache is a pickle file; only load cache files written by this notebook
    print(f"Loading cached data in {cacheFilename_systemsData_SimulatedMaxDailyEnergy}")
    systemsData_SimulatedMaxDailyEnergy = pd.read_pickle(cacheFilename_systemsData_SimulatedMaxDailyEnergy)
else:

    ## ------------------ ##
    ## Create ModelChains ##
    ## ------------------ ##
    # One PVLib ModelChain per system, built from the system metadata
    modelChains = {}
    for systemName in systemsData_MeasuredDailyEnergy.columns:
        latitude = systemsMetadata[systemName]['metadata']['loc_latitude']
        longitude = systemsMetadata[systemName]['metadata']['loc_longitude']
        altitude = 533  # TODO: Get the altitude from the metadata or an API
        Wp_Tot = systemsMetadata[systemName]['metadata']['pv_kwp'] * 1000  # kWp -> Wp

        # One pvlib Array per PV array declared in the metadata
        arrays = []
        for array_num, arrayData in systemsMetadata[systemName]['arrays'].items():
            array = Array(
                mount=FixedMount(surface_tilt=arrayData['pv_tilt'], surface_azimuth=arrayData['pv_azimut'], racking_model='open_rack'),
                module_parameters={'pdc0': arrayData['pv_wp'], 'gamma_pdc': -0.004},
                module_type='glass_polymer',
                modules_per_string=arrayData['pv_number'],
                strings=1,
                temperature_model_parameters=TEMPERATURE_MODEL_PARAMETERS['sapm']['open_rack_glass_polymer'],
            )
            arrays.append(array)

        location = Location(latitude=latitude, longitude=longitude, altitude=altitude, tz='Europe/Zurich', name=systemName)
        system = PVSystem(arrays=arrays, inverter_parameters={'pdc0': Wp_Tot, 'eta_inv_nom': 0.96})
        modelChain = ModelChain(system, location, clearsky_model='ineichen', aoi_model='no_loss', spectral_model="no_loss")

        modelChains[systemName] = modelChain

    ## ------------------- ##
    ## Simulate production ##
    ## ------------------- ##

    # List of the simulated daily energy Series, one per system
    systemsData_SimulatedMaxDailyEnergy_List = []

    for systemName, modelChain in modelChains.items():
        # Simulate only the period covered by the measures of the system (first to last non-missing value)
        measuredDays = systemsData_MeasuredDailyEnergy[systemName].dropna().index
        startDate = measuredDays.min()
        endDate = measuredDays.max()
        simulatedMaxDailyEnergy = simulateMaxDailyEnergy(startDate, endDate, modelChain, samplingFreq='1h')

        # Name the Series after the system, so that it becomes the column name
        simulatedMaxDailyEnergy.rename(systemName, inplace=True)

        systemsData_SimulatedMaxDailyEnergy_List.append(simulatedMaxDailyEnergy)

    # Concatenate all the Series side by side to create one dataframe
    systemsData_SimulatedMaxDailyEnergy = pd.concat(systemsData_SimulatedMaxDailyEnergy_List, axis=1)
    systemsData_SimulatedMaxDailyEnergy.sort_index(inplace=True)

    # Save the dataframe to the pickle cache (the cache directory is created on first use)
    os.makedirs(dataCacheDirpath, exist_ok=True)
    systemsData_SimulatedMaxDailyEnergy.to_pickle(cacheFilename_systemsData_SimulatedMaxDailyEnergy)

# Display the dataframe (last expression of the notebook cell)
systemsData_SimulatedMaxDailyEnergy

### Relative production

True production scaled by the maximum production from the simulator


In [None]:
# Relative energy: measured daily energy divided by the simulated maximum, per system and per day
systemsData_RelativeDailyEnergy = systemsData_MeasuredDailyEnergy.div(systemsData_SimulatedMaxDailyEnergy)

# One marker trace per system, all on the same figure
fig = go.Figure()
for systemName in systemsName_Remaining:
    relativeEnergy = systemsData_RelativeDailyEnergy[systemName]
    fig.add_trace(go.Scatter(x=relativeEnergy.index, y=relativeEnergy, mode='markers', name=systemName))

fig.show()

### Compare the difference between simulation with hourly and 10min sampling rate


## Half-Sibling Regression


### Functions


In [None]:


def mean_absolute_percentage_error_mean_denominator(
    y_true, y_pred, *, sample_weight=None, multioutput="uniform_average"
):
    """Mean absolute error divided by the mean absolute true value.

    Variant of the MAPE where every absolute error is divided by the mean of ``|y_true|``
    (computed per output) instead of the individual ``|y_true|`` value, so that true values
    close to zero do not blow the metric up. The signature mirrors
    ``sklearn.metrics.mean_absolute_percentage_error``.

    The implementation relies on NumPy only: it does not call private scikit-learn helpers,
    which are not part of the public API and may change between releases.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples,) or (n_samples, n_outputs)
        True and predicted values.
    sample_weight : array-like of shape (n_samples,), optional
        Weights of the samples.
    multioutput : {"raw_values", "uniform_average"} or array-like of shape (n_outputs,)
        How the errors of the different outputs are aggregated.

    Returns
    -------
    float, or ndarray of shape (n_outputs,) when ``multioutput == "raw_values"``.

    Raises
    ------
    ValueError
        On empty, non-finite or inconsistently shaped inputs, or on an invalid ``multioutput``.
    """
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    # Work on (n_samples, n_outputs) arrays
    if y_true.ndim == 1:
        y_true = y_true.reshape(-1, 1)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
    if y_true.ndim != 2 or y_pred.ndim != 2:
        raise ValueError("y_true and y_pred must be 1D or 2D array-likes")
    if y_true.shape[0] != y_pred.shape[0]:
        raise ValueError(
            f"Found input variables with inconsistent numbers of samples: [{y_true.shape[0]}, {y_pred.shape[0]}]"
        )
    if y_true.shape[1] != y_pred.shape[1]:
        raise ValueError(
            f"y_true and y_pred have different number of output ({y_true.shape[1]}!={y_pred.shape[1]})"
        )
    if y_true.shape[0] == 0:
        raise ValueError("y_true and y_pred must contain at least one sample")
    if not (np.isfinite(y_true).all() and np.isfinite(y_pred).all()):
        raise ValueError("y_true and y_pred must not contain NaN or infinite values")

    if sample_weight is not None:
        sample_weight = np.asarray(sample_weight, dtype=np.float64)
        if sample_weight.ndim != 1 or sample_weight.shape[0] != y_true.shape[0]:
            raise ValueError("sample_weight must be a 1D array-like with one weight per sample")

    if isinstance(multioutput, str):
        if multioutput not in ("raw_values", "uniform_average"):
            raise ValueError(f"Invalid multioutput value: {multioutput!r}")
    elif multioutput is not None:
        multioutput = np.asarray(multioutput, dtype=np.float64)
        if multioutput.ndim != 1 or multioutput.shape[0] != y_true.shape[1]:
            raise ValueError("There must be equally many custom weights as outputs")

    # The denominator is computed per output (axis=0) so that outputs of different scales do not mix;
    # epsilon avoids a division by zero when all the true values are 0.
    epsilon = np.finfo(np.float64).eps
    denominator = np.maximum(np.mean(np.abs(y_true), axis=0), epsilon)
    relative_errors = np.abs(y_pred - y_true) / denominator
    output_errors = np.average(relative_errors, weights=sample_weight, axis=0)

    if isinstance(multioutput, str):
        if multioutput == "raw_values":
            return output_errors
        # "uniform_average": pass None as weights to np.average (uniform mean)
        multioutput = None

    return np.average(output_errors, weights=multioutput)


def mad(arr):
    """Return the median absolute deviation of ``arr`` (median of ``|arr - median(arr)|``).

    ``arr`` must expose a ``median()`` method (e.g. a pandas Series).
    """
    center = arr.median()
    absoluteDeviations = abs(arr - center)
    return absoluteDeviations.median()


def modified_z_score(arr):
    """Return the modified z-score of every value of ``arr``.

    The deviation from the median is divided by ``1.486 * MAD``; when the median absolute
    deviation is 0, ``1.253314 * mean absolute deviation`` is used instead.
    Based on https://www.ibm.com/docs/en/cognos-analytics/11.1.0?topic=terms-modified-z-score
    """
    mad_value = mad(arr)
    if mad_value != 0:
        scale = 1.486 * mad_value
    else:
        # Fallback on the mean absolute deviation when the MAD is null
        meanAbsoluteDeviation = np.mean(np.abs(arr - np.mean(arr)))
        scale = 1.253314 * meanAbsoluteDeviation

    centered = arr - np.median(arr)
    return centered / scale


def metrics(y_true, y_pred):
    """Return a dict with the MAPE, MAPE-MD, MAE and RMSE of ``y_pred`` against ``y_true``."""
    scorers = (
        ('MAPE', mean_absolute_percentage_error),
        ('MAPE-MD', mean_absolute_percentage_error_mean_denominator),
        ('MAE', mean_absolute_error),
        ('RMSE', root_mean_squared_error),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

In [None]:
import threading


import numpy as np

from sklearn.utils.parallel import Parallel, delayed
from sklearn.utils.validation import (
    check_is_fitted,
)
from sklearn.ensemble._base import _partition_estimators

def _accumulate_prediction(predict, X, out, lock):
    """
    This is a utility function for joblib's Parallel.

    It can't go locally in ForestClassifier or ForestRegressor, because joblib
    complains that it cannot pickle it when placed there.
    """
    prediction = predict(X, check_input=False)
    with lock:
            out.append(prediction)

def predict_w_std(self, X):
    """
    Predict the regression target for X, with the spread between the trees.

    The prediction of every tree of the forest is collected; the mean over the trees is
    the forest prediction and the standard deviation over the trees is returned as an
    indication of the prediction spread.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape (n_samples, n_features)
        The input samples, validated with the forest's own ``_validate_X_predict``.

    Returns
    -------
    mean_predictions : ndarray of shape (n_samples,)
        Mean of the predictions of all the trees.
    std_predictions : ndarray of shape (n_samples,)
        Standard deviation (``np.std``, i.e. ddof=0) of the predictions of all the trees.

    Raises
    ------
    NotImplementedError
        If the forest was fitted on more than one output.
    """
    # Check the fitted state first, so that an unfitted forest raises sklearn's NotFittedError
    # instead of an AttributeError on `n_outputs_`
    check_is_fitted(self)
    if self.n_outputs_ > 1:
        raise NotImplementedError("Variance for multi-output regression is not supported")

    # Check data
    X = self._validate_X_predict(X)

    # Assign chunk of trees to jobs
    n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs)

    # The prediction of every tree is kept (not summed), as the spread between trees is needed
    all_predictions = []

    # Parallel loop: the workers share `all_predictions` (threads), appends are guarded by the lock.
    # The order of the collected predictions is not defined, which does not affect the mean/std.
    lock = threading.Lock()
    Parallel(n_jobs=n_jobs, verbose=self.verbose, require="sharedmem")(
        delayed(_accumulate_prediction)(e.predict, X, all_predictions, lock)
        for e in self.estimators_
    )

    # Stack the per-tree predictions: one row per tree
    all_predictions = np.array(all_predictions)

    # Mean and standard deviation across the trees
    mean_predictions = np.mean(all_predictions, axis=0)
    std_predictions = np.std(all_predictions, axis=0)

    return mean_predictions, std_predictions

# Attach the function to the RandomForestRegressor class, so that it can be called as a method: `forest.predict_w_std(X)`
RandomForestRegressor.predict_w_std = predict_w_std

### Train regressors & Compute metrics


In [None]:
systemsName_Target = ["a001159"]
serializer = PickleSerializer()

# Directory of the serialized regressors (one file per system, named after the system).
# It is created up front because saving a model requires the directory to exist.
regressorsDirpath = os.path.join(dataCacheDirpath, 'rf_regressors')
os.makedirs(regressorsDirpath, exist_ok=True)

rf_regressors = {}
forceTrain = True  # local override: set to False to reuse the regressors saved in the cache
if not forceTrain:
    # Load the models available in the cache; the missing ones are trained below
    for systemName in systemsName_Target:
        try:
            rf_regressors[systemName] = serializer.retrieve_model(regressorsDirpath, systemName)
        except FileNotFoundError:
            continue
    print(f"Loaded {len(rf_regressors)}/{len(systemsName_Target)} models. {len(systemsName_Target) - len(rf_regressors)} models to train.")

# Train a Random Forest Regressor model to predict the daily energy production of a system based on the daily energy production of the other systems
metrics_df = pd.DataFrame(index=systemsName_Target, columns=['MAPE', 'MAPE-MD', 'MAE', 'RMSE'])
features_importance_df = pd.DataFrame(index=systemsName_Target, columns=systemsName_Remaining)
permutation_importance_df = pd.DataFrame(index=systemsName_Target, columns=systemsName_Remaining)

# Targets without a loaded model, in the order of systemsName_Target (a set difference has no defined order)
systemsName_ToTrain = [name for name in systemsName_Target if name not in rf_regressors]

y_pred_List = {}
for targetName in tqdm(systemsName_ToTrain, desc='Training regressors'):
    # Seed the forest like the train/test split, so that a run is reproducible
    rf_regressor = RandomForestRegressor(random_state=random_state)

    # Keep only the observations having a target value; the features are all the other systems
    hasTarget = systemsData_MeasuredDailyEnergy[targetName].notna()
    X = systemsData_MeasuredDailyEnergy.loc[hasTarget].drop(columns=targetName)
    y = systemsData_MeasuredDailyEnergy.loc[hasTarget, targetName]

    # split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=random_state)
    # train the regressor
    rf_regressor.fit(X_train, y_train)
    # save the feature importances
    features_importance_df.loc[targetName, X.columns] = rf_regressor.feature_importances_
    # permutation_importance_df.loc[targetName, X.columns] = permutation_importance(rf_regressor, X_test, y_test, n_repeats=10, random_state=random_state, n_jobs=-1).importances_mean

    # test the regressor: mean and standard deviation of the predictions of the trees
    y_mean, y_std = rf_regressor.predict_w_std(X_test)

    # y_pred_V_IJ_unbiased = fci.random_forest_error(rf_regressor, X_train, X_test)

    # dataframe with the prediction ('pred') and its standard deviation between trees ('err') for every test date
    y_pred_df = pd.DataFrame(data={'pred': y_mean, 'err': y_std}, index=X_test.index)
    y_pred_List[targetName] = y_pred_df

    # compute the metrics
    metrics_df.loc[targetName] = metrics(y_test, y_mean)

    # save the model
    serializer.save_model(rf_regressor, regressorsDirpath, targetName)
    rf_regressors[targetName] = rf_regressor

In [None]:
# Predicted vs. true daily energy of the system a001159 on the test set.
# The predictions come from the y_pred_List dictionary; their standard deviation is drawn as error bars.
fig = go.Figure()
y_pred_df = y_pred_List["a001159"]

errorBars = dict(type='data', array=y_pred_df['err'], visible=True)
predictedTrace = go.Scatter(x=y_pred_df.index, y=y_pred_df['pred'], name='pred', error_y=errorBars, mode='markers')
trueTrace = go.Scatter(x=y_test.index, y=y_test, mode='markers', name='true')

fig.add_trace(predictedTrace)
fig.add_trace(trueTrace)

### Generate expected value for each systems


In [None]:
cacheFilename_systemsData_ExpectedDailyEnergy = os.path.join(dataCacheDirpath, 'systemsData_ExpectedDailyEnergy.pkl')

if not forceImport and os.path.exists(cacheFilename_systemsData_ExpectedDailyEnergy):
    # NOTE: the cache is a pickle file; only load cache files written by this notebook
    print(f"Loading cached data in {cacheFilename_systemsData_ExpectedDailyEnergy}")
    systemsData_ExpectedDailyEnergy = pd.read_pickle(cacheFilename_systemsData_ExpectedDailyEnergy)
else:
    # List of the expected daily energy Series, one per system having a regressor
    systemsData_ExpectedDailyEnergy_List = []

    for systemName in tqdm(systemsName_Remaining, desc='Generating expected daily energy'):
        # Regressors only exist for the trained target systems: the other systems get an empty column
        if systemName not in rf_regressors:
            print(f"No regressor available for system {systemName}: its expected daily energy is left empty")
            continue

        # Compute the expected daily energy of the target system for all the dates having a target value
        X = systemsData_MeasuredDailyEnergy.drop(columns=systemName)  # remove the target column from the features
        X = X[systemsData_MeasuredDailyEnergy[systemName].notna()]  # remove the observations where there is no target value
        expectedDailyEnergy = pd.Series(rf_regressors[systemName].predict(X), index=X.index, name=systemName)
        # TODO: estimate the prediction variance with fci.random_forest_error(regressor, X_train, X).
        # It needs the training features of the regressor, which are not kept per system
        # (the previous call referenced an undefined variable and its result was unused).
        systemsData_ExpectedDailyEnergy_List.append(expectedDailyEnergy)

    # Concatenate all the Series side by side to create one dataframe
    if systemsData_ExpectedDailyEnergy_List:
        systemsData_ExpectedDailyEnergy = pd.concat(systemsData_ExpectedDailyEnergy_List, axis=1)
    else:
        systemsData_ExpectedDailyEnergy = pd.DataFrame(index=systemsData_MeasuredDailyEnergy.index)
    # One column per remaining system, in the same order (NaN for the systems without a regressor)
    systemsData_ExpectedDailyEnergy = systemsData_ExpectedDailyEnergy.reindex(columns=systemsName_Remaining)
    systemsData_ExpectedDailyEnergy.sort_index(inplace=True)

    # Save the dataframe to the pickle cache (the cache directory is created on first use)
    os.makedirs(dataCacheDirpath, exist_ok=True)
    systemsData_ExpectedDailyEnergy.to_pickle(cacheFilename_systemsData_ExpectedDailyEnergy)

# Scaling techniques & Outliers removal

Compute the:

- Global mean
- Global median
- Global standard deviation
- Rolling mean
- Rolling median
- Rolling standard deviation
- Simulated max production without info
- Simulated max production with info


In [None]:
targetName = "a001395"

X = systemsData_MeasuredDailyEnergy.drop(columns=targetName)
y = systemsData_MeasuredDailyEnergy[targetName]
y.index = pd.to_datetime(y.index)

# Global mean, standard deviation and median of the daily energy production of the target system
globalMean, globalStd, globalMedian = y.mean(), y.std(), y.median()

# Rolling statistics, all computed on the same centered 30-day window
roll = y.rolling(window='30D', min_periods=1, center=True)
rollingMean = roll.mean()
rollingStd = roll.std()
rollingMedian = roll.median()
# Rolling median absolute deviation (function `mad`)
rollingMAD = roll.apply(mad)

# Rolling z score of the daily energy production of the target system. The function is modified_z_score
# modifiedZScore = 0.673 * (y - rollingMedian) / rollingMAD

# Plot the daily energy production of the target system together with its rolling statistics
fig = go.Figure()
fig.add_trace(go.Scatter(x=y.index, y=y, mode='markers', name='Daily energy production'))

# Optional traces, disabled:
# fig.add_trace(go.Scatter(x=y.index, y=[globalMean]*len(y), mode='lines', name='Global mean'))
# fig.add_trace(go.Scatter(x=y.index, y=[globalMean+globalStd]*len(y), mode='lines', name='Global mean + std'))
# fig.add_trace(go.Scatter(x=y.index, y=[globalMean-globalStd]*len(y), mode='lines', name='Global mean - std'))
# fig.add_trace(go.Scatter(x=y.index, y=[globalMedian]*len(y), mode='lines', name='Global median'))
# fig.add_trace(go.Scatter(x=rollingMean.index, y=rollingMean+rollingStd, mode='lines', name='Rolling mean + std'))
# fig.add_trace(go.Scatter(x=rollingMean.index, y=rollingMean-rollingStd, mode='lines', name='Rolling mean - std'))
# fig.add_trace(go.Scatter(x=modifiedZScore.index, y=rollingMedian+modifiedZScore, mode='lines', name='Rolling Median + Rolling Z score'))

rollingCurves = [
    (rollingMean, 'Rolling mean'),
    (rollingMedian, 'Rolling median'),
    (rollingMedian + 4 * rollingMAD, 'Rolling Median + 4*Rolling MAD'),
    (rollingMAD, 'Rolling MAD'),
]
for curve, label in rollingCurves:
    fig.add_trace(go.Scatter(x=curve.index, y=curve, mode='lines', name=label))

fig.update_layout(
    title=f'Global and rolling mean, std, and median of the daily energy production of system {targetName}',
    yaxis_title='Daily energy production (kWh)',
)
# fig.update_layout(width=1000, height=666)
fig.show()

In [None]:
def zscore(s, window, thresh=3, return_all=False):
    """Flag and replace the outliers of ``s`` using a rolling z-score.

    The z-score of every value is computed against the mean and standard deviation
    (ddof=0) of a centered rolling window. Values whose z-score is outside
    ``[-thresh, thresh]`` are considered outliers.

    Parameters
    ----------
    s : pandas.Series
        Values to analyse.
    window : int or offset
        Size of the centered rolling window (passed to ``Series.rolling``).
    thresh : float, default 3
        Maximum absolute z-score of a valid value.
    return_all : bool, default False
        When True, return the intermediate results instead of the cleaned series.

    Returns
    -------
    pandas.Series
        ``s`` with the outliers replaced by the rolling mean (``return_all=False``).
    tuple of pandas.Series
        ``(z, avg, std, m)`` with ``m`` the mask of the valid values (``return_all=True``).
    """
    roll = s.rolling(window=window, min_periods=1, center=True)
    avg = roll.mean()
    std = roll.std(ddof=0)

    # In a window without any spread (std == 0) every value equals the mean: the z-score would be
    # 0/0 = NaN and the value would wrongly be flagged as an outlier. Its z-score is set to 0.
    noSpread = std.eq(0) & s.notna()
    z = s.sub(avg).div(std).mask(noSpread, 0.0)
    m = z.between(-thresh, thresh)

    if return_all:
        return z, avg, std, m
    return s.where(m, avg)


# Rolling z-score of the target system on a centered window of 50 samples
z, avg, std, m = zscore(y, window=50, return_all=True)

ax = plt.subplot()

# Measures, rolling mean, and the values flagged as outliers (outside the mask m)
y.plot(ax=ax, label='data')
avg.plot(ax=ax, label='mean')
outliers = y.loc[~m]
outliers.plot(ax=ax, label='outliers', marker='o', ls='')
# avg[~m].plot(label='replacement', marker='o', ls='')
ax.legend()

# Plot results


In [None]:
fig = go.Figure(layout_yaxis_title="Daily Energy (kWh)")

# The three dataframes to display, with the name of their trace (the order defines the trace order)
energyFrames = [
    (systemsData_SimulatedMaxDailyEnergy, 'Simulated Max Daily Energy'),
    (systemsData_MeasuredDailyEnergy, 'Measured Daily Energy'),
    (systemsData_ExpectedDailyEnergy, 'Expected Daily Energy'),
]

# Initial traces: the first remaining system
systemName = systemsName_Remaining[0]
for frame, traceName in energyFrames:
    fig.add_trace(go.Scatter(x=frame[systemName].index, y=frame[systemName], mode='markers', name=traceName))

# Dropdown menu: one button per system, replacing the x/y data of the three traces
buttons = []
for systemName in systemsName_Remaining:
    curves = [frame[systemName] for frame, _ in energyFrames]
    buttons.append(dict(
        label=systemName,
        method="update",
        args=[{"x": [curve.index for curve in curves], "y": curves}],
    ))

dropdown = dict(
    buttons=buttons,
    direction="down",
    showactive=True,
    x=0.05,
    xanchor="left",
    y=1.15,
    yanchor="top",
)
fig.update_layout(updatemenus=[dropdown])

# add axis labels
# fig.update_yaxes(title_text="Daily Energy (kWh)")

# Set size
# fig.update_layout(
#     autosize=False,
#     width=1000,
#     height=666,
# )
fig.show()

In [None]:
fig = go.Figure(layout_yaxis_title="Features Importance", layout_legend_x=0.7, layout_legend_y=0.9)

# The importance dataframes have one row per target system (not one per remaining system):
# only these systems can be displayed, any other name would raise a KeyError in `.loc`.
systemsName_WithImportance = list(features_importance_df.index)

# Add initial traces (first target system)
systemName = systemsName_WithImportance[0]
fig.add_trace(go.Bar(x=features_importance_df.columns, y=features_importance_df.loc[systemName], name='Impurity-based Importance'))
fig.add_trace(go.Bar(x=permutation_importance_df.columns, y=permutation_importance_df.loc[systemName], name='Permutation Importance'))

# Create dropdown menu: one button per target system, replacing the x/y data of the two bar traces
buttons = []
for systemName in systemsName_WithImportance:
    button = dict(
        label=systemName,
        method="update",
        args=[{"x": [features_importance_df.columns, permutation_importance_df.columns],
               "y": [features_importance_df.loc[systemName], permutation_importance_df.loc[systemName]]}]
    )
    buttons.append(button)

fig.update_layout(
    updatemenus=[
        dict(
            buttons=buttons,
            direction="down",
            showactive=True,
            x=0.05,
            xanchor="left",
            y=1.15,
            yanchor="top"
        ),
    ]
)

# add axis labels
# fig.update_yaxes(title_text="Daily Energy (kWh)")

# Set size
fig.update_layout(
    autosize=False,
    width=1000,
    height=666,
)
fig.show()

## Optimise regressor parameters

Find the appropriate parameters with https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html


In [None]:
# TODO

## Find the importance of each feature (PV System) in the regression

https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html


In [None]:
# TODO

# rf_regressors is a dict keyed by system name (not a list): display its first regressor, or None if it is empty
next(iter(rf_regressors.values()), None)