In [1]:
# Standard-library and third-party imports used by the notebook.
import os
import pandas as pd

# Move the working directory three levels up so that the project import below
# and the relative "data/..." paths used throughout the notebook resolve
# (presumably this lands in the repository root -- TODO confirm).
# NOTE(review): the path is relative, so re-running this cell without a kernel
# restart moves up another three levels.
os.chdir("../../..")
# Star import; presumably the source of the column-name constants used below
# (col_id_ma, col_radiation, ...) -- verify against utils.py.
from xai_green_tech_adoption.utils.utils import *

# Switch off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None

In [2]:
# --- target configuration ----------------------------------------------------
# The capacity limit and the two time bounds are interpolated into the file
# name of the PV data that is loaded when the target is prepared.
# NOTE(review): units are not visible here (presumably kWp / calendar years).
max_capacity = 10
t_start_target, t_end_target = 0, 2021

# The target column is divided by the denominator column when the target is
# prepared.
target_col, denominator_column = col_power_accum_pv, col_count_households

## 1. Prepare descriptive features

### 1.1. Load INKAR dataset and delete irrelevant columns

In [3]:
# Read the preprocessed INKAR table (semicolon-separated) and preview it.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# row index that was written into the CSV -- confirm whether it should be kept.
df_inkar = pd.read_csv(
    filepath_or_buffer="data/intermediate_data/preprocessed_inkar_data.csv",
    sep=";",
)
df_inkar.head(n=5)

Unnamed: 0,Name of municipality ass.,Code of municipality associations (RS),County code,State code,Arbeitslose,sozialversicherungspflichtig Beschäftigte am Arbeitsort,sozialversicherungspflichtig Beschäftigte am Wohnort,Bevölkerung gesamt,Bevölkerung männlich,Bevölkerung weiblich,...,Umsatz Bauhauptgewerbe,Anteil Bruttowertschöpfung Primärer Sektor,Anteil Bruttowertschöpfung Sekundärer Sektor,Anteil Bruttowertschöpfung Tertiärer Sektor,Beschäftigte im Handwerk,Kleinstbetriebe,Kleinbetriebe,Mittlere Unternehmen,Großunternehmen,Umsatz im Handwerk
0,"Flensburg, Stadt",10010000,1001,1,4174.0,44607.0,32323.0,90164.0,44904.0,45260.0,...,120.63,0.01,21.13,78.86,11.61,82.77,13.34,3.28,0.6,116.25
1,"Kiel, Landeshauptstadt",10020000,1002,1,10556.0,125483.0,91908.0,246794.0,120198.0,126596.0,...,111.89,0.02,16.81,83.17,7.99,83.3,12.79,3.16,0.75,88.41
2,"Lübeck, Hansestadt",10030000,1003,1,8400.0,99053.0,80035.0,216530.0,104032.0,112498.0,...,138.15,0.11,23.01,76.89,15.44,85.35,11.14,2.92,0.59,81.05
3,"Neumünster, Stadt",10040000,1004,1,3386.0,40641.0,30015.0,80196.0,39723.0,40473.0,...,112.16,0.28,24.1,75.62,10.73,82.8,13.3,3.36,0.54,128.06
4,"Brunsbüttel, Stadt",10510011,1051,1,335.0,7025.0,4711.0,12380.0,6240.0,6140.0,...,123.86,4.05,37.91,58.04,15.95,88.09,10.1,1.65,0.16,106.46


In [4]:
# Remove the state and county code columns from the INKAR table.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# the cell idempotent: re-running it no longer raises a KeyError for columns
# that were already dropped.
df_inkar = df_inkar.drop(columns=[col_id_s, col_id_c], errors="ignore")
df_inkar.head()

Unnamed: 0,Name of municipality ass.,Code of municipality associations (RS),Arbeitslose,sozialversicherungspflichtig Beschäftigte am Arbeitsort,sozialversicherungspflichtig Beschäftigte am Wohnort,Bevölkerung gesamt,Bevölkerung männlich,Bevölkerung weiblich,Erwerbsfähige Bevölkerung (15 bis unter 65 Jahre),Bevölkerung (mit Korrektur VZ 1987/Zensus 2011),...,Umsatz Bauhauptgewerbe,Anteil Bruttowertschöpfung Primärer Sektor,Anteil Bruttowertschöpfung Sekundärer Sektor,Anteil Bruttowertschöpfung Tertiärer Sektor,Beschäftigte im Handwerk,Kleinstbetriebe,Kleinbetriebe,Mittlere Unternehmen,Großunternehmen,Umsatz im Handwerk
0,"Flensburg, Stadt",10010000,4174.0,44607.0,32323.0,90164.0,44904.0,45260.0,60031.0,90164.0,...,120.63,0.01,21.13,78.86,11.61,82.77,13.34,3.28,0.6,116.25
1,"Kiel, Landeshauptstadt",10020000,10556.0,125483.0,91908.0,246794.0,120198.0,126596.0,169760.0,246794.0,...,111.89,0.02,16.81,83.17,7.99,83.3,12.79,3.16,0.75,88.41
2,"Lübeck, Hansestadt",10030000,8400.0,99053.0,80035.0,216530.0,104032.0,112498.0,138758.0,216530.0,...,138.15,0.11,23.01,76.89,15.44,85.35,11.14,2.92,0.59,81.05
3,"Neumünster, Stadt",10040000,3386.0,40641.0,30015.0,80196.0,39723.0,40473.0,51220.0,80196.0,...,112.16,0.28,24.1,75.62,10.73,82.8,13.3,3.36,0.54,128.06
4,"Brunsbüttel, Stadt",10510011,335.0,7025.0,4711.0,12380.0,6240.0,6140.0,7593.0,12380.0,...,123.86,4.05,37.91,58.04,15.95,88.09,10.1,1.65,0.16,106.46


In [5]:
# Rename the German INKAR column names to English using the lookup table
# stored alongside the intermediate data.
inkar_name_german = "German name in original Inkar data set"
inkar_name_english = "English name"
df_inkar_translation = pd.read_csv(
    "data/intermediate_data/variable_names_INKAR_engl_german.csv",
    sep=";",
    usecols=[inkar_name_german, inkar_name_english],
)
# German name -> English name, one entry per row of the lookup table
translation = dict(
    zip(
        df_inkar_translation[inkar_name_german],
        df_inkar_translation[inkar_name_english],
    )
)
df_inkar_engl = df_inkar.rename(columns=translation)

### 1.2. Load and add election results published by the Regional Database (Regionalstatistik), and the owner-occupation ratio and household sizes published by the Census 2011

First, add the 2017 election results published by the Regional Database.

In [6]:
# Vote-count columns read from the election data; reused below for the
# completeness check.
cols_election_votes = [
    col_vote_count_afd,
    col_vote_count_cdu,
    col_vote_count_fdp,
    col_vote_count_green,
    col_vote_count_left,
    col_vote_count_spd,
    col_vote_count_other,
]
df_election = pd.read_csv(
    "data/intermediate_data/preprocessed_election_data.csv",
    sep=";",
    usecols=[col_id_ma] + cols_election_votes,
)

df_descriptive = df_inkar_engl.merge(df_election, on=col_id_ma, how="left")
# check if election data is available for all municipal associations.
# A left merge keeps every key of the left frame, so testing the merge key for
# NaN cannot reveal unmatched rows; inspect the merged vote columns instead.
display(df_descriptive[df_descriptive[cols_election_votes].isna().any(axis=1)])

Unnamed: 0,Name of municipality ass.,Code of municipality associations (RS),unemployed,employees at place of work,employees at place of residence,population,male population,female population,population of working age,population (Census 2011),...,medium-sized enterprises,large-scale enterprises,business volume of skilled crafts and trades,CDU/CSU,SPD,The Greens,FDP,The Left,AfD,other parties


Second, add data on owner occupation and household sizes published by the Census 2011.

In [7]:
# Load the owner-occupation and household-size tables.
df_owner_occ = pd.read_csv(
    "data/intermediate_data/preprocessed_ownership_ratio.csv",
    sep=";",
)
df_hh_size = pd.read_csv(
    "data/intermediate_data/preprocessed_household_sizes_with_hh_counts.csv",
    sep=";",
)

# Attach both tables to the descriptive features; the left merges keep every
# row that is already in df_descriptive.
df_descriptive = df_descriptive.merge(
    df_owner_occ, on=col_id_ma, how="left"
).merge(df_hh_size, on=col_id_ma, how="left")

# check if data is available for all municipal associations
census_columns = [owner_occ_ratio, *col_hh_sizes]
rows_missing_census = df_descriptive[census_columns].isna().any(axis=1)
display(df_descriptive.loc[rows_missing_census])

Unnamed: 0,Name of municipality ass._x,Code of municipality associations (RS),unemployed,employees at place of work,employees at place of residence,population,male population,female population,population of working age,population (Census 2011),...,ownership occupation ratio,count households,share 1-person households,share 2-person households,share 3-person households,share 4-person households,share 5-person households,share 6+-person households,Municipality name,Name of municipality ass._y


### 1.3. Load and add radiation data

In [8]:
# Read the radiation table, restricted to the merge key and the radiation
# column, and attach it to the descriptive features.
radiation_columns = [col_id_ma, col_radiation]
df_radiation = pd.read_csv(
    "data/intermediate_data/ma_global_radiation.csv",
    sep=";",
    usecols=radiation_columns,
)
df_descriptive = pd.merge(df_descriptive, df_radiation, on=col_id_ma, how="left")

# check if data is available for all municipal associations
rows_missing_radiation = df_descriptive[col_radiation].isna()
display(df_descriptive.loc[rows_missing_radiation])

Unnamed: 0,Name of municipality ass._x,Code of municipality associations (RS),unemployed,employees at place of work,employees at place of residence,population,male population,female population,population of working age,population (Census 2011),...,count households,share 1-person households,share 2-person households,share 3-person households,share 4-person households,share 5-person households,share 6+-person households,Municipality name,Name of municipality ass._y,global radiation


## 2. Prepare and add target feature

In [9]:
# Load the PV data for the configured period and capacity limit and attach the
# target column to the descriptive features (the left merge keeps every row of
# df_descriptive).
df_pv_target = pd.read_csv(
    f"data/intermediate_data/pv_ma_{t_start_target}_{t_end_target}_max_{max_capacity}.csv",
    sep=";",
    usecols=[col_id_ma, target_col],
)
df_incl_target = df_descriptive.merge(df_pv_target, on=col_id_ma, how="left")

# normalise the target by the denominator column
df_incl_target[target_col] = (
    df_incl_target[target_col] / df_incl_target[denominator_column]
)

# check if PV systems are installed in all municipal associations
display(df_incl_target[df_incl_target[target_col].isna()])
# target is NaN -> no pv installations in ma -> change to zero
# NOTE(review): because the check runs after the division, a NaN can also stem
# from a missing (or 0/0) denominator -- confirm the denominator is complete.
df_incl_target[target_col] = df_incl_target[target_col].fillna(0)

# save input data; create the output folder first so that to_csv does not fail
# with an OSError when the folder does not exist yet
os.makedirs("data/input", exist_ok=True)
df_incl_target.to_csv("data/input/input.csv", sep=";", index=False)

Unnamed: 0,Name of municipality ass._x,Code of municipality associations (RS),unemployed,employees at place of work,employees at place of residence,population,male population,female population,population of working age,population (Census 2011),...,share 1-person households,share 2-person households,share 3-person households,share 4-person households,share 5-person households,share 6+-person households,Municipality name,Name of municipality ass._y,global radiation,accumulated power
