In [1]:
import os
import pandas as pd

# Move the working directory three levels up. The path is relative, so running
# this cell a second time in the same kernel moves up three more levels.
# NOTE(review): presumably this lands in the repository root so that the
# package import below and the relative "data/..." paths resolve - confirm.
os.chdir("../../..")
# Star import: the names used below that are not defined in this notebook
# (col_id_m, col_id_ma, col_name_m, col_name_ma, col_destatis_type,
# col_destatis_old_ags, col_destatis_new_ags, type_m) presumably come from here.
from xai_green_tech_adoption.utils.utils import *

# Switch off pandas' chained-assignment (SettingWithCopy) warning for this session.
pd.options.mode.chained_assignment = None

# Use the recorded AGS changes of earlier years to extend the mapping from municipality keys (AGS of m) to municipality-association keys (RS of ma).


In [2]:
# Load the raw mapping of municipalities (m) to municipality associations (ma)
# and replace the four German column headers by the project-wide column names.
df_map_m_ma = pd.read_csv(
    "data/intermediate_data/mapping_municipalities_raw.csv", sep=";"
).rename(
    columns={
        "Gemeindename": col_name_m,
        "Amtlicher Gemeindesschlüssel (AGS)": col_id_m,
        "Gemeindeverbandsname": col_name_ma,
        "Regionalschlüssel (RS) Gemeindeverband": col_id_ma,
    }
)

In [3]:
def get_changes_type_dependent(
    df_complete, col_type, m_ma_type, col_old_id, col_new_id
):
    """
    Extracts the id changes of one type of spatial entity from a dataframe of changes (changes_ags_XXX.csv).
    A single step of transitivity is resolved: id1 -> id2 together with id2 -> id3 yields id1 -> id3 (and id2 -> id3).
    Rows whose resulting new id equals the old id (e.g. id1 -> id2 together with id2 -> id1) are removed.
    @param df_complete: dataframe given by 'changes_ags_XXX.csv', containing old and new ids per entity
    @param col_type: column of df_complete holding the type of spatial entity
    @param m_ma_type: value of col_type to keep (e.g. 'Gemeinde' or 'Gemeindeverband')
    @param col_old_id: column of df_complete holding the old id
    @param col_new_id: column of df_complete holding the new id
    @return: Dataframe with the columns col_old_id and col_new_id, one row per retained change.
    """
    # rows of the requested entity type, reduced to the two id columns
    id_pairs = df_complete.loc[
        df_complete[col_type] == m_ma_type, [col_old_id, col_new_id]
    ]
    # a change only counts if both ids are present and differ from each other
    id_pairs = id_pairs.dropna(how="any")
    real_changes = id_pairs[id_pairs[col_old_id] != id_pairs[col_new_id]]

    # join every change (old -> link) with a possible follow-up change (link -> new)
    link = "transitivity"
    first_step = real_changes.rename(columns={col_new_id: link})
    follow_up = real_changes.rename(columns={col_old_id: link})
    chained = first_step.merge(follow_up, on=link, how="left")

    # without a follow-up change the direct target remains the new id
    no_follow_up = chained[col_new_id].isna()
    chained.loc[no_follow_up, col_new_id] = chained[link]

    # a chain that leads back to its starting id is not a change
    chained = chained[chained[col_old_id] != chained[col_new_id]]
    return chained.drop(columns=[link])

In [4]:
def merge_new_ids(df_existing_ids, df_new_ids, col_old_ids, col_new_ids):
    """
    Combines the id changes collected so far (df_existing_ids) with the changes of a further time step (df_new_ids).
    Transitivity is resolved: if df_existing_ids contains m1 -> m2 and df_new_ids contains m2 -> m3,
    the result contains m1 -> m3 and m2 -> m3. A chain that leads back to its starting id (m1 -> m2 -> m1)
    is removed from the existing changes.
    @param df_existing_ids: Dataframe with the id changes of previous time steps
    @param df_new_ids: Dataframe with the id changes of the new time step
    @param col_old_ids: name (in both dataframes) of the column holding the old id
    @param col_new_ids: name (in both dataframes) of the column holding the new id
    @return: Dataframe with the updated existing changes followed by the retained new changes
    """
    known_old_ids = list(df_existing_ids[col_old_ids])

    # an old id that already has a change recorded keeps that first change
    df_new_ids = df_new_ids[~df_new_ids[col_old_ids].isin(known_old_ids)]
    # several new changes for the same old id: keep the first, drop the others
    df_new_ids = df_new_ids.drop_duplicates(
        subset=[col_old_ids], keep="first", ignore_index=True
    )

    assert (
        df_new_ids[col_old_ids].isin(known_old_ids).sum() == 0
    ), f"Attention: Multiple changes for an ags. Ambiguous information."

    # join existing changes (old -> link) with new changes (link -> new)
    link = "temp name"
    chained = df_existing_ids.rename(columns={col_new_ids: link}).merge(
        df_new_ids.rename(columns={col_old_ids: link}), on=link, how="left"
    )
    # existing changes without a follow-up keep their previous target
    no_follow_up = chained[col_new_ids].isna()
    chained.loc[no_follow_up, col_new_ids] = chained[link]
    # only keep A->C (from A->B & B->C) if A != C
    chained = chained[chained[col_old_ids] != chained[col_new_ids]]

    return pd.concat([chained.drop(columns=[link]), df_new_ids], ignore_index=True)

In [5]:
def _collect_ags_changes(
    years, col_type, type_m_ma, col_old_ags, col_new_ags, backward=False
):
    """
    Accumulates the yearly changes of ags over the given years.
    @param years: iterable of years; 'changes_ags_<year>.csv' is read for each year, in the given order
    @param col_type: column name giving type of areal unit
    @param type_m_ma: value of col_type to keep
    @param col_old_ags: column name of the old AGS in the yearly files and in the result
    @param col_new_ags: column name of the new AGS in the yearly files and in the result
    @param backward: if True, the labels of the two AGS columns are swapped before the changes are extracted
    @return: Dataframe with the columns col_old_ags and col_new_ags containing all collected changes
    """
    df_changes = pd.DataFrame(columns=[col_old_ags, col_new_ags])
    for year in years:
        df_changes_ags_year = pd.read_csv(
            f"data/intermediate_data/changes_ags/changes_ags_{year}.csv", sep=";"
        )
        if backward:
            # Swap the labels of the two AGS columns so that, in the result, the
            # column labelled col_old_ags holds the ids valid after the change.
            # NOTE(review): the column arguments passed to
            # get_changes_type_dependent are swapped as well, so the one-step
            # transitivity within a year is still resolved in the original
            # (forward) direction - confirm that this is intended.
            df_changes_ags_year = df_changes_ags_year.rename(
                {col_old_ags: col_new_ags, col_new_ags: col_old_ags}, axis=1
            )
            col_old_id, col_new_id = col_new_ags, col_old_ags
        else:
            col_old_id, col_new_id = col_old_ags, col_new_ags
        df_changes_year = get_changes_type_dependent(
            df_changes_ags_year,
            col_type=col_type,
            m_ma_type=type_m_ma,
            col_old_id=col_old_id,
            col_new_id=col_new_id,
        ).astype(int)
        df_changes = merge_new_ids(
            df_changes,
            df_new_ids=df_changes_year,
            col_old_ids=col_old_ags,
            col_new_ids=col_new_ags,
        )
    return df_changes


def fix_changes_ags(
    df_mapping_m_ma,
    t_start,
    t_end,
    col_type,
    type_m_ma,
    col_old_ags,
    col_new_ags,
    col_mapping_ma,
    col_mapping_m,
):
    """
    Extends the mapping of m to ma given by df_mapping_m_ma by the changes of ags occurring between t_start and t_end
    @param df_mapping_m_ma: Dataframe giving from m (AGS) to ma (RS). Areal units as at 31.12.2019.
    @param t_start: First year of changes to be considered
    @param t_end: Year of last changes in AGS to be considered (2019)
    @param col_type: column name giving type of areal unit
    @param type_m_ma: municipalities (m) or municipal associations (ma)
    @param col_old_ags: Column name of the old AGS in the 'changes_ags_<year>.csv' files and in the collected changes
    @param col_new_ags: Column name of the new AGS in the 'changes_ags_<year>.csv' files and in the collected changes
    @param col_mapping_ma: Column name of RS in df_mapping_m_ma
    @param col_mapping_m: Column name of AGS in df_mapping_m_ma
    @return: Copy of df_mapping_m_ma extended by one row per old AGS that is not yet part of the mapping;
        rows without a value in col_mapping_ma are dropped. df_mapping_m_ma itself is not modified.
    @raise AssertionError: if an AGS occurs more than once in the extended mapping
    """
    # reference year of the areal units used in df_mapping_m_ma
    t_inkar = 2019

    if t_end <= t_inkar:
        # data published earlier than inkar
        df_changes_m = _collect_ags_changes(
            range(t_start, t_end + 1), col_type, type_m_ma, col_old_ags, col_new_ags
        )
    else:
        # data published later than inkar: changes up to t_inkar are collected
        # forward, later changes are collected backward (latest year first)
        df_changes_m_forward = _collect_ags_changes(
            range(t_start, t_inkar + 1), col_type, type_m_ma, col_old_ags, col_new_ags
        )
        df_changes_m_backward = _collect_ags_changes(
            reversed(range(t_inkar + 1, t_end + 1)),
            col_type,
            type_m_ma,
            col_old_ags,
            col_new_ags,
            backward=True,
        )
        df_changes_m = pd.concat(
            [df_changes_m_forward, df_changes_m_backward], ignore_index=True
        )

    # df_changes_m yields a dataframe with all changes in ags between t_start and t_end

    # extend mapping of m to ma using the changes in ags: look up the ma of the new ags
    col_dummy = "transitivity column"
    df_mapping_m_ma_dummy = df_mapping_m_ma.rename({col_mapping_m: col_dummy}, axis=1)
    df_changes_m_map = df_changes_m.rename({col_new_ags: col_dummy}, axis=1).merge(
        df_mapping_m_ma_dummy, on=col_dummy, how="left"
    )
    # only consider changes in ags if there is no mapping to ma (rs) for old ags
    df_changes_m_map = df_changes_m_map[
        ~df_changes_m_map[col_old_ags].isin(list(df_mapping_m_ma[col_mapping_m]))
    ]
    df_changes_m_map = df_changes_m_map.drop([col_dummy], axis=1).rename(
        {col_old_ags: col_mapping_m}, axis=1
    )
    df_mapping_m_ma_extended = pd.concat(
        [df_mapping_m_ma, df_changes_m_map], ignore_index=True
    )
    # use the column names passed by the caller, not module-level globals
    assert (
        df_mapping_m_ma_extended[col_mapping_m].nunique()
        == df_mapping_m_ma_extended.shape[0]
    ), "Attention: The mapping of m to ma is not unique!"
    print(
        f"{t_start}-{t_end}: There are {df_changes_m_map[col_mapping_ma].isna().sum()} municipalities without a mapping value. Dropping these municipalities."
    )
    return df_mapping_m_ma_extended[
        df_mapping_m_ma_extended[col_mapping_ma].notna()
    ]

In [6]:
# One extended mapping is created and stored per data source. The sources only
# differ in the first year of ags changes to be considered; all other arguments
# of fix_changes_ags are shared.
t_end = 2019
mapping_kwargs = dict(
    t_end=t_end,
    col_type=col_destatis_type,
    type_m_ma=type_m,
    col_old_ags=col_destatis_old_ags,
    col_new_ags=col_destatis_new_ags,
    col_mapping_ma=col_id_ma,
    col_mapping_m=col_id_m,
)

# Mapping for Census data
t_start_zensus = 2011
df_mapping_2011_2019 = fix_changes_ags(
    df_map_m_ma, t_start=t_start_zensus, **mapping_kwargs
)
df_mapping_2011_2019.to_csv(
    f"data/intermediate_data/mapping_municipalities_{t_start_zensus}_{t_end}.csv",
    sep=";",
    index=False,
)

# Mapping for PV data
t_start_pv = 2000
df_mapping_2000_2019 = fix_changes_ags(
    df_map_m_ma, t_start=t_start_pv, **mapping_kwargs
)
df_mapping_2000_2019.to_csv(
    f"data/intermediate_data/mapping_municipalities_{t_start_pv}_{t_end}.csv",
    sep=";",
    index=False,
)

# Mapping for data from regional database
t_start_regiostat = 2017
df_mapping_2017_2019 = fix_changes_ags(
    df_map_m_ma, t_start=t_start_regiostat, **mapping_kwargs
)
df_mapping_2017_2019.to_csv(
    f"data/intermediate_data/mapping_municipalities_{t_start_regiostat}_{t_end}.csv",
    sep=";",
    index=False,
)

2011-2019: There are 0 municipalities without a mapping value. Dropping these municipalities.
2000-2019: There are 50 municipalities without a mapping value. Dropping these municipalities.
2017-2019: There are 0 municipalities without a mapping value. Dropping these municipalities.
