## Setup & Import

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pvlib
import json
import os
from pvlib.pvsystem import PVSystem, Array, FixedMount
from pvlib.location import Location
from pvlib.modelchain import ModelChain
from pvlib.temperature import TEMPERATURE_MODEL_PARAMETERS
import plotly.graph_objects as go
import plotly.io as pio

# Render plotly figures in the web browser.
pio.renderers.default = "browser"

# Parent directory of the data folders. It is read from the environment so that
# no machine-specific path is hard-coded in the notebook.
PARENT_DATA_DIR = os.getenv('PARENT_DATA_DIR')
if PARENT_DATA_DIR is None:
    raise ValueError("PARENT_DATA_DIR environment variable is not set")


# Build the paths with os.path.join so they use the separator of the host OS
# (hard-coded backslashes only resolve on Windows).
dataDirpath = os.path.join(PARENT_DATA_DIR, "PRiOT", "dataExport_3400_daily")
logsDirpath = os.path.join("..", "logs")


## Import PRiOT data

In [None]:
# Load the metadata JSON file (explicit UTF-8 so the result does not depend on
# the platform's default locale encoding).
metadataFilepath = os.path.join(dataDirpath, "metadata.json")

with open(metadataFilepath, 'r', encoding='utf-8') as f:
    metadata = json.load(f)

# Load all csv files from the data directory. The listing is sorted because
# os.listdir returns entries in arbitrary order; sorting makes the order of the
# systems reproducible from one run/machine to the next.
systemsData = {}
for file in sorted(os.listdir(dataDirpath)):
    if not file.endswith(".csv"):
        continue
    # The system name is the part of the file name before the first underscore.
    systemName = file.split("_")[0]
    systemData = pd.read_csv(os.path.join(dataDirpath, file))
    # 'Timestamp' is interpreted as epoch milliseconds (UTC) and converted to local time.
    systemData['Datetime'] = pd.to_datetime(systemData['Timestamp'], unit='ms', utc=True).dt.tz_convert('Europe/Zurich')
    # Keep only the date, as the production is the daily production.
    # The +1h works around daylight-saving-time issues: normally PRiOT exports
    # the data at midnight (local time) of the day concerned (e.g. the energy
    # for July 1st is saved at July 1st 00:00 Europe/Zurich). However, it seems
    # that daylight saving time is not always handled correctly, and sometimes
    # the export is done at 23:00 the day before (e.g. the energy for July 1st
    # is saved at June 30th 23:00 Europe/Zurich). Adding 1h before truncating
    # ensures that both cases map to the correct date.
    systemData['Date'] = (systemData['Datetime'] + pd.Timedelta(hours=1)).dt.date
    # systemData['energy_daily_norm'] = systemData['tt_forward_active_energy_total_toDay'] / metadata[systemName]['metadata']['pv_kwp']
    systemsData[systemName] = systemData

systemsName = list(systemsData.keys())

df_duplicate_list = []
for systemName, systemData in systemsData.items():
    # Collect every row whose date appears more than once; they are written to a log file below.
    # NOTE(review): no column identifying the system is added here, so the rows of the
    # log are only attributable to a system if the CSV itself carries such a column.
    df_duplicate_list.append(systemData[systemData['Date'].duplicated(keep=False)])

    # For duplicated dates, keep only the row with the biggest tt_forward_active_energy_total_toDay.
    # TODO maybe we should sum the energy of the duplicates instead of keeping only the biggest one. However, when looking in PRiOT Portal, it seems that in the daily energy, only the biggest value is represented. We do the same here.
    # na_position='first' matters: by default NaN values are sorted last, and
    # keep='last' would then retain a NaN row instead of the biggest measured value.
    systemData.sort_values('tt_forward_active_energy_total_toDay', ascending=True, na_position='first', inplace=True)
    systemData.drop_duplicates(subset='Date', keep='last', inplace=True)

    # Set date as the index and sort the data by date
    systemData.set_index('Date', inplace=True)
    systemData.sort_index(ascending=True, inplace=True)

# Save duplicate dates to log file.
# pd.concat raises on an empty list (no csv file found), so fall back to an empty frame.
df_duplicate = pd.concat(df_duplicate_list) if df_duplicate_list else pd.DataFrame()
print(f"Number of duplicate dates found: {len(df_duplicate)}")
# Make sure the log directory exists before writing into it.
os.makedirs(logsDirpath, exist_ok=True)
df_duplicate.to_csv(os.path.join(logsDirpath,'duplicateDates.csv'), index=True)



### Convert data & Filter out invalid PRiOT systems

In [None]:
def _to_number(value):
    """Return *value* as an int or a float, or None when it cannot be converted.

    Values that already are int/float are returned unchanged. Otherwise int is
    tried before float, so that integral strings such as "12" stay integers.
    TypeError is caught in addition to ValueError because int()/float() raise
    TypeError for values such as None (JSON null) or a list.
    """
    if isinstance(value, (int, float)):
        return value
    for cast in (int, float):
        try:
            return cast(value)
        except (TypeError, ValueError):
            continue
    return None


def _check_numeric_keys(container, keys, missingOwner, invalidOwner):
    """Check that every key of *keys* exists in *container* and holds a number.

    Values that can be converted to a number are converted in place in
    *container*. A message is printed for each missing or non-numeric key;
    *missingOwner* and *invalidOwner* are the descriptions inserted in the
    respective messages.

    Returns True when all keys are present and numeric, False otherwise.
    """
    valid = True
    for key in keys:
        if key not in container:
            valid = False
            print(f"No {key} found for {missingOwner}")
            continue
        number = _to_number(container[key])
        if number is None:
            valid = False
            print(f"The key-value '{key}:{container[key]}' is not a number for {invalidOwner}")
        else:
            container[key] = number
    return valid


# Start from all the systems and remove those lacking measures or required metadata.
systemsNameRemaining = systemsName.copy()
for systemName in systemsName:
    missingData = False
    if len(systemsData[systemName]) == 0:
        missingData = True
        print(f"No measures found for system {systemName}")

    # A csv file may exist for a system that has no entry in the metadata file.
    systemEntry = metadata.get(systemName)
    if systemEntry is None:
        missingData = True
        print(f"No metadata found for system {systemName}")
    else:
        # System-level values (location and installed peak power).
        if not _check_numeric_keys(
            systemEntry.get('metadata') or {},
            ['loc_latitude', 'loc_longitude', 'pv_kwp'],
            systemName,
            f"system {systemName}",
        ):
            missingData = True

        # Array-level values: every array of the system must be fully described.
        systemArrays = systemEntry.get('arrays') or {}
        if not systemArrays:
            print(f"No PV arrays found for system {systemName}")
            missingData = True
        for array_num, arrayData in systemArrays.items():
            arrayOwner = f"array {array_num} of system {systemName}"
            if not _check_numeric_keys(
                arrayData,
                ['pv_tilt', 'pv_azimut', 'pv_wp', 'pv_number'],
                arrayOwner,
                arrayOwner,
            ):
                missingData = True

    if missingData:
        systemsNameRemaining.remove(systemName)
        print(f"-> Removing system {systemName} from the list of systems")

print(f"Number of systems with all the necessary data: {len(systemsNameRemaining)}/{len(systemsName)}")


In [None]:
import pandas as pd

# One series per system: its 'tt_forward_active_energy_total_toDay' column,
# renamed after the system so that it becomes that system's column name.
columns = [
    frame['tt_forward_active_energy_total_toDay'].rename(name)
    for name, frame in systemsData.items()
]

# Put the series side by side (one column per system, aligned on the index)
# and order the rows by index.
new_dataframe = pd.concat(columns, axis=1).sort_index()

# Last expression of the cell: displays the resulting dataframe in the notebook.
new_dataframe