In [1]:
import os
import numpy as np
import pandas as pd
import glob
import xml.etree.ElementTree as et

# Move the working directory three levels up so that the package import below and
# the relative "data/..." paths used in this notebook resolve from there
# (presumably the repository root - TODO confirm).
# NOTE: the path is relative, so re-running this cell without restarting the
# kernel moves the working directory up by another three levels.
os.chdir("../../..")
# Star import: presumably the source of the constants used further down that are
# not defined in this notebook (col_* column names, list_relevant_features, ...).
from xai_green_tech_adoption.utils.utils import *

# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

In [2]:
# Get list with names of all xml-files.
# glob.glob returns its matches in arbitrary (filesystem-dependent) order; sorting
# the paths keeps the row order of the concatenated dataframe reproducible between
# runs and machines.
xml_list = sorted(
    glob.glob("data/raw_data/Target_Feature/xml_data/EinheitenSolar_*.xml")
)

In [3]:
def get_df_from_xml(xml_path, list_relevant_features, elem_name):
    """
    Reads one xml-file of the Marktstammdatenregister and turns its PV installations into a dataframe.
    @param xml_path: path of the xml-file to parse.
    @param list_relevant_features: tags of the child elements (features of a PV installation) that become the columns of the dataframe.
    @param elem_name: tag of the elements below the root element that each describe one PV installation.
    @return: Dataframe with one row per PV installation and one column per entry of list_relevant_features. A cell holds the text of the corresponding child element, or NaN if the installation has no such child.
    """

    def _text_or_nan(installation, tag):
        # The explicit None check matters: an Element without children is falsy.
        node = installation.find(tag)
        return np.nan if node is None else node.text

    document_root = et.parse(xml_path).getroot()
    # One record (dict feature -> value) per PV installation found below the root.
    records = [
        {tag: _text_or_nan(installation, tag) for tag in list_relevant_features}
        for installation in document_root.findall(elem_name)
    ]
    return pd.DataFrame(records)

In [4]:
def all_xml_to_df(xml_list, list_relevant_features, elem_name):
    """
    Creates a dataframe containing data on PV installations contained in multiple xml-files.
    Every file is parsed on its own and the resulting frames are concatenated in one single step
    (instead of re-copying the growing result once per file).
    @param xml_list: List of xml-files that give data for dataframe.
    @param list_relevant_features: list of features of PV installations to be included in the dataframe.
    @param elem_name: name of elements (children of root element).
    @return: Dataframe with PV data given by xml-files of xml_list. The row index of each file is kept as is (it is not reset), so index labels repeat across files. If no file contributes any row, an empty dataframe with the columns list_relevant_features is returned.
    """
    frames = []
    for xml_file in xml_list:
        df_file = get_df_from_xml(xml_file, list_relevant_features, elem_name)
        # Files without matching elements contribute no rows; leaving them out keeps
        # zero-row frames from taking part in the dtype resolution of pd.concat.
        if len(df_file.index) > 0:
            frames.append(df_file)
    if not frames:
        return pd.DataFrame(columns=list_relevant_features)
    return pd.concat(frames, axis=0)

In [5]:
# Parse all xml-files listed in xml_list into one raw dataframe (one row per PV installation).
# list_relevant_features and elem_name_xml_pv are not defined in this notebook;
# presumably they are provided by the star import of the utils module.
df_complete_raw = all_xml_to_df(xml_list, list_relevant_features, elem_name_xml_pv)
# Rich display of the raw frame for a visual sanity check.
display(df_complete_raw)

Unnamed: 0,EinheitMastrNummer,Gemeinde,Gemeindeschluessel,Registrierungsdatum,Inbetriebnahmedatum,EinheitSystemstatus,EinheitBetriebsstatus,Energietraeger,Nettonennleistung,AnzahlModule
0,SEE906512237036,Frielendorf,06634004,2019-01-31,1997-07-01,472,35,2495,1.500,8
1,SEE913696975296,Blaubeuren,08425020,2019-01-31,2007-03-08,472,35,2495,16.114,92
2,SEE954246547006,Wesel,05170048,2019-01-31,2010-09-30,472,35,2495,10.550,57
3,SEE924152369610,Peine,03157006,2019-01-31,2012-10-01,472,35,2495,5.197,25
4,SEE997129649626,Hütschenhausen,07335016,2019-01-31,2018-02-06,472,35,2495,8.000,35
...,...,...,...,...,...,...,...,...,...,...
99995,SEE969238275898,Neuhardenberg,12064340,2020-05-08,2005-07-19,472,35,2495,3.200,20
99996,SEE979445195230,Hamburg,02000000,2020-05-08,2010-04-22,472,35,2495,4.000,22
99997,SEE991266469536,Aiterhofen,09278113,2020-05-08,2007-11-26,472,35,2495,13.800,145
99998,SEE963380719293,Lügde,05766052,2020-05-08,2010-09-25,472,35,2495,8.000,45


In [6]:
# Replace the original Marktstammdatenregister column labels by the column-name
# constants used in the rest of the notebook; all other columns keep their label.
column_renaming = {
    "EinheitMastrNummer": col_installation_id_pv,
    "Gemeinde": col_name_m,
    "Gemeindeschluessel": col_id_m,
    "Registrierungsdatum": col_date_registration_pv,
    "Inbetriebnahmedatum": col_com_date_pv,
    "Nettonennleistung": col_power_pv,
}
df_complete_raw.rename(columns=column_renaming, inplace=True)

In [7]:
# fix data types of columns of raw dataframe
# The xml values arrive as text: parse both date columns into datetimes and the
# capacity column into numbers.
df_complete_raw[col_com_date_pv] = pd.to_datetime(
    df_complete_raw[col_com_date_pv], yearfirst=True
)
df_complete_raw[col_date_registration_pv] = pd.to_datetime(
    df_complete_raw[col_date_registration_pv], yearfirst=True
)
df_complete_raw[col_power_pv] = pd.to_numeric(df_complete_raw[col_power_pv])
print(
    f"There are {df_complete_raw[col_id_m].isna().sum()} instances without values for the AGS. I drop these instances."
)
# .copy() turns the filtered selection into an independent dataframe, so the column
# assignment below is a plain assignment and not a write onto a slice of the
# unfiltered frame (chained assignment).
df_complete_raw = df_complete_raw.loc[df_complete_raw[col_id_m].notna()].copy()
# The AGS is stored as an integer from here on; leading zeros of the text
# representation (e.g. "06634004") are therefore not preserved.
df_complete_raw[col_id_m] = df_complete_raw[col_id_m].astype(int)

There are 13 instances without values for the AGS. I drop these instances.


In [None]:
# Keep only installations registered up to and including the cut-off month
# (month t_reg_pv_latest_month of year t_reg_pv_latest_year). Rows without a
# registration date fail both comparisons and are dropped as well.
registration_year = df_complete_raw[col_date_registration_pv].dt.year
registration_month = df_complete_raw[col_date_registration_pv].dt.month

registered_in_earlier_year = registration_year < t_reg_pv_latest_year
registered_in_cutoff_year_in_time = (registration_year == t_reg_pv_latest_year) & (
    registration_month <= t_reg_pv_latest_month
)

df_reg_filtered = df_complete_raw[
    registered_in_earlier_year | registered_in_cutoff_year_in_time
]


In [9]:
def aggregate_pv_m(
    df_raw,
    col_id_m,
    col_name_m,
    col_installation_id,
    col_count_pv,
    col_power,
    col_power_accum,
    col_t,
    pow_min,
    pow_max,
    t,
):
    """
    Aggregates the pv systems commissioned in year "t" per municipality. Only pv systems whose commissioning date lies in year t and whose net nominal capacity lies in the closed interval [pow_min, pow_max] are considered.
    @param df_raw: Dataframe listing the pv systems registered at the Marktstammdatenregister (one row per system)
    @param col_id_m: name of the column holding the ags of the municipalities (grouping key)
    @param col_name_m: name of the column holding the names of the municipalities
    @param col_installation_id: name of the column holding the unique id/code of a pv system
    @param col_count_pv: name of the output column holding the number of pv installations
    @param col_power: name of the column holding the net nominal capacity
    @param col_power_accum: name of the output column holding the accumulated net nominal capacity
    @param col_t: name of the (datetime) column holding the commissioning date
    @param pow_min: minimum net nominal capacity of pv systems considered (inclusive)
    @param pow_max: maximum net nominal capacity of pv systems considered (inclusive)
    @param t: year of interest (only installations with a commissioning date within this year are aggregated)
    @return: Dataframe with one row per municipality (identified by its ags) and the columns ags, name, number of pv installations and accumulated net nominal capacity. Municipalities without a matching installation do not appear.
    """
    commissioned_in_t = df_raw[df_raw[col_t].dt.year == t]
    # between() is inclusive on both ends, i.e. pow_min <= capacity <= pow_max.
    within_power_range = commissioned_in_t[col_power].between(pow_min, pow_max)
    selected = commissioned_in_t[within_power_range]

    relevant_columns = [col_id_m, col_name_m, col_installation_id, col_power]
    aggregations = {
        col_name_m: "unique",
        col_installation_id: "count",
        col_power: "sum",
    }
    per_municipality = (
        selected[relevant_columns]
        .groupby(by=col_id_m, as_index=False)
        .agg(aggregations)
    )
    # "unique" yields an array of names per ags; keep the first name of each array.
    per_municipality[col_name_m] = per_municipality[col_name_m].apply(
        lambda names: names[0]
    )
    return per_municipality.rename(
        columns={col_installation_id: col_count_pv, col_power: col_power_accum}
    )

In [10]:
# Lower and upper bounds of the net nominal capacity classes to aggregate. The two
# lists are paired element-wise (min_power[i], max_power[i]) by the export loop below.
# Units are presumably kW - TODO confirm against the Marktstammdatenregister data.
min_power = [0]
max_power = [10]

In [None]:
# Target directory: one csv-file per commissioning year and capacity class.
output_dir = "data/intermediate_data/PV_in_municipalities"
# DataFrame.to_csv does not create missing directories, so make sure it exists.
os.makedirs(output_dir, exist_ok=True)

# Installations without a commissioning date have no year and are skipped; sorting
# the years makes the processing order reproducible.
commissioning_years = sorted(
    df_reg_filtered[col_com_date_pv].dt.year.dropna().unique()
)
for year in commissioning_years:
    for min_p, max_p in zip(min_power, max_power):
        df_m_acc = aggregate_pv_m(
            df_reg_filtered,
            col_id_m=col_id_m,
            col_name_m=col_name_m,
            col_installation_id=col_installation_id_pv,
            col_count_pv=col_count_pv,
            col_power=col_power_pv,
            col_power_accum=col_power_accum_pv,
            col_t=col_com_date_pv,
            t=year,
            pow_min=min_p,
            pow_max=max_p,
        )
        # int(year): the year may be a float here; the file name carries it as an
        # integer. Only max_p is part of the file name.
        df_m_acc.to_csv(
            f"{output_dir}/pv_m_{int(year)}_max_{max_p}.csv",
            index=False,
            sep=";",
        )