In [1]:
import os
import numpy as np
import pandas as pd

# NOTE: both chdir calls below are relative, so every execution of this cell
# moves the working directory further up; run it exactly once per kernel.
# Step two directory levels up before importing the project helpers
# (presumably so that the `utils` package can be found there -- TODO confirm).
os.chdir("../..")
# Star import: presumably the source of the `col_*` column-name constants used
# throughout this notebook (their definitions are not visible here).
from utils.utils import *
# Move up one more level; the relative "data/..." paths that are read and
# written further down are resolved against this directory.
os.chdir("..")

# Switch off pandas' chained-assignment (SettingWithCopy) warning for the session.
pd.options.mode.chained_assignment = None

In [2]:
# show every row of a dataframe rather than a truncated preview
pd.options.display.max_rows = None

In [3]:
# electorate / participation columns
list_voters_cols = [
    col_eligible_voters,
    col_electoral_turnout,
    col_vote_count_total,
]
# one vote-count column per party (group)
parties = [
    col_vote_count_cdu, col_vote_count_spd, col_vote_count_green,
    col_vote_count_fdp, col_vote_count_left, col_vote_count_afd,
    col_vote_count_other,
]
# columns kept for processing: id, name, voter columns, party columns
col_names = [
    col_id_regionalstatistik,
    col_name_regionalstatistik,
    *list_voters_cols,
    *parties,
]
# the raw file has an additional leading date column
col_names_import = ["Date", *col_names]

In [4]:
# Read the raw election export. Nine leading lines are skipped, the file's own
# header row is replaced by `col_names_import`, "-" marks missing values and
# numbers use a decimal comma.
raw_votes_path = (
    "data/raw_data/descriptive_features/regiostat_bundestagswahl_changed_encoding.csv"
)
read_options = dict(
    sep="\t",
    encoding="iso8859_15",
    skiprows=9,
    header=0,
    names=col_names_import,
    na_values="-",
    decimal=",",
    index_col=False,
)
df_votes_raw = pd.read_csv(raw_votes_path, **read_options)
# keep everything except the leading date column
df_votes_raw = df_votes_raw.loc[:, col_names]
# the final four rows contain nothing but NaNs, so cut them off
df_votes_raw = df_votes_raw.head(-4)

In [5]:
# Make sure all voter / party columns are numeric and the regional id is an integer.
df_votes_raw[list_voters_cols + parties] = df_votes_raw[
    list_voters_cols + parties
].apply(pd.to_numeric)
df_votes_raw[col_id_regionalstatistik] = df_votes_raw[col_id_regionalstatistik].astype(
    int
)

# City states are kept in a separate dataframe that is merged with the
# remaining data later. Their Regionalstatistik ids are translated to the ids
# used for municipality associations.
city_state_id_to_ma_id = {
    11: 110000000,  # Berlin
    2: 20000000,  # Hamburg
    4011: 40110000,  # Bremen
    4012: 40120000,  # Bremerhaven
}
is_city_state = df_votes_raw[col_id_regionalstatistik].isin(
    list(city_state_id_to_ma_id)
)
# `rename` without `inplace` returns a new frame, so the id rewrite below never
# touches (a slice of) df_votes_raw.
df_city_states = df_votes_raw.loc[is_city_state].rename(
    columns={
        col_id_regionalstatistik: col_id_ma,
        col_name_regionalstatistik: col_name_ma,
    }
)
df_city_states[col_id_ma] = df_city_states[col_id_ma].replace(city_state_id_to_ma_id)

# Keep only municipality rows: ids below 1000000 belong to cities and counties
# (incl. city states) and are dropped.
is_municipality = df_votes_raw[col_id_regionalstatistik] >= 1000000
# Districts of Berlin are dropped as well. "Berlin-" is matched literally and
# a missing name counts as "no match" so that the negation below cannot fail.
is_berlin_district = df_votes_raw[col_name_regionalstatistik].str.contains(
    "Berlin-", regex=False, na=False
)
df_votes_wo_cities = df_votes_raw.loc[is_municipality & ~is_berlin_district].rename(
    columns={
        col_id_regionalstatistik: col_id_m,
        col_name_regionalstatistik: col_name_m,
    }
)

In [6]:
# Report how complete the municipality-level election data is.
n_municipalities_total = df_votes_wo_cities.shape[0]
relevant_value_cols = parties + [
    col_electoral_turnout,
    col_eligible_voters,
    col_vote_count_total,
]
# rows with a gap in at least one relevant column
n_municipalities_incomplete = (
    df_votes_wo_cities[relevant_value_cols].isna().any(axis=1).sum()
)
# rows without a single party vote count
n_municipalities_without_votes = df_votes_wo_cities[parties].isna().all(axis=1).sum()

print(
    f"There are {n_municipalities_total} municipalities considered in the dataframe."
)
print(
    f"{n_municipalities_incomplete} municipalities have at least one relevant value missing."
)
print(
    f"Regionalstatistik does not provide any election data for {n_municipalities_without_votes} of those municipalities."
)

There are 13384 municipalities considered in the dataframe.
2704 municipalities have at least one relevant value missing.
Regionalstatistik does not provide any election data for 2444 of those municipalities.


In [7]:
# keep only municipalities that report a vote count for at least one party
df_votes_wo_cities = df_votes_wo_cities.dropna(subset=parties, how="all")

In [8]:
# A missing party count is treated as zero votes only if the party counts that
# are present already add up to the reported total vote count (NaNs are skipped
# by the sum). Rows where the numbers do not add up keep their NaNs.
# The check is computed once for the whole frame instead of re-running a
# row-wise apply for every party.
votes_add_up = df_votes_wo_cities[col_vote_count_total] == df_votes_wo_cities[
    parties
].sum(axis=1)
df_votes_wo_cities.loc[votes_add_up, parties] = df_votes_wo_cities.loc[
    votes_add_up, parties
].fillna(0)
print(
    f"There are {df_votes_wo_cities.isna().sum().sum()} remaining missing values in the dataframe."
)

There are 0 remaining missing values in the dataframe.


In [9]:
# Attach to every municipality the municipality association (id and name)
# listed for it in the mapping file.
mapping_m_to_ma_path = "data/intermediate_data/mapping_municipalities_2017_2019.csv"
df_map_m_to_ma = pd.read_csv(
    mapping_m_to_ma_path,
    usecols=[col_id_m, col_id_ma, col_name_ma],
    sep=";",
)
df_votes_to_merge = df_votes_wo_cities.copy()
df_votes_incl_map = pd.merge(
    df_votes_to_merge, df_map_m_to_ma, how="left", on=col_id_m
)
# a left join leaves NaN in the association id wherever the mapping has no entry
all_municipalities_mapped = df_votes_incl_map[col_id_ma].notna().all()
assert (
    all_municipalities_mapped
), "Attention: Not all municipalities are considered in the mapping to municipality associations."
display(df_votes_incl_map.head())

Unnamed: 0,Official municipality code (AGS),Municipality name,count eligible voters,electoral turnout (percentage),valid vote count,CDU/CSU,SPD,The Greens,FDP,The Left,AfD,other parties,Name of municipality ass.,Code of municipality associations (RS)
0,1051001,Albersdorf,2755.0,59.9,1638.0,656.0,344.0,133.0,229.0,99.0,138.0,39.0,Mitteldithmarschen,10515175.0
1,1051002,Arkebek,160.0,68.8,109.0,49.0,14.0,13.0,18.0,2.0,9.0,4.0,Mitteldithmarschen,10515175.0
2,1051003,Averlak,490.0,56.3,274.0,101.0,76.0,18.0,37.0,5.0,26.0,11.0,Burg-St. Michaelisdonn,10515163.0
3,1051004,Bargenstedt,745.0,62.8,465.0,206.0,101.0,25.0,51.0,34.0,37.0,11.0,Mitteldithmarschen,10515175.0
4,1051005,Barkenholm,143.0,78.3,110.0,39.0,24.0,8.0,16.0,4.0,19.0,0.0,Eider,10515169.0


In [10]:
# Aggregate the municipality-level results to municipality associations (ma).
# Re-assigning instead of dropping in place (and ignoring already-missing
# columns) keeps this cell re-runnable.
df_votes_incl_map = df_votes_incl_map.drop(
    columns=[col_id_m, col_name_m], errors="ignore"
)

# Turnout is a percentage and must not be summed across municipalities. The
# turnout of an association is the eligible-voter-weighted mean of its
# municipalities: sum(turnout_i * eligible_i) / sum(eligible_i).
col_turnout_x_eligible = "turnout_x_eligible_voters"
df_votes_weighted = df_votes_incl_map.assign(
    **{
        col_turnout_x_eligible: df_votes_incl_map[col_electoral_turnout]
        * df_votes_incl_map[col_eligible_voters]
    }
)
agg_spec = {
    col_name_ma: "first",
    col_eligible_voters: "sum",
    col_turnout_x_eligible: "sum",
    col_vote_count_total: "sum",
    **{party_col: "sum" for party_col in parties},
}
df_election_ma = df_votes_weighted.groupby(by=col_id_ma, as_index=False).agg(agg_spec)
# Rounded to one decimal, the precision visible in the municipality-level
# turnout values. An association without eligible voters yields NaN.
df_election_ma[col_electoral_turnout] = (
    df_election_ma[col_turnout_x_eligible] / df_election_ma[col_eligible_voters]
).round(1)
# drop the helper column and restore the usual column order
df_election_ma = df_election_ma[[col_id_ma, col_name_ma] + list_voters_cols + parties]
df_election_ma = pd.concat([df_election_ma, df_city_states], ignore_index=True)

# check if df_election_ma includes election data for all ma's contained in inkar dataset
df_ma_in_inkar = pd.read_csv(
    "data/intermediate_data/preprocessed_inkar_data.csv", sep=";", usecols=[col_id_ma]
)
is_missing_in_election = ~df_ma_in_inkar[col_id_ma].isin(df_election_ma[col_id_ma])
# RS // 10000 yields the city ids used by Regionalstatistik
list_missing_ids_cities = (
    (df_ma_in_inkar.loc[is_missing_in_election, col_id_ma] // 10000)
    .astype(int)
    .tolist()
)
print(
    f"Election data is missing for {len(list_missing_ids_cities)} municipality associations. Examination of the municipality associations revealed that data is missing for cities. Fixing these cities."
)

# Take the city rows from the raw data; `rename` returns a new frame, so the
# id conversion below does not write into (a slice of) df_votes_raw.
df_missing_election_data_cities = df_votes_raw.loc[
    df_votes_raw[col_id_regionalstatistik].isin(list_missing_ids_cities),
    [col_id_regionalstatistik, col_name_regionalstatistik] + list_voters_cols + parties,
].rename(
    columns={
        col_id_regionalstatistik: col_id_ma,
        col_name_regionalstatistik: col_name_ma,
    }
)
# transfer the keys/ ids of cities to RS of municipality associations
df_missing_election_data_cities[col_id_ma] = (
    df_missing_election_data_cities[col_id_ma] * 10000
)

df_election_ma = pd.concat(
    [df_election_ma, df_missing_election_data_cities], ignore_index=True
)
df_election_ma[col_id_ma] = df_election_ma[col_id_ma].astype(int)

assert (
    df_ma_in_inkar[col_id_ma].isin(df_election_ma[col_id_ma]).all()
), "Election data is not available for all municipality associations."

Election data is missing for 102 municipality associations. Examination of the municipality associations revealed that data is missing for cities. Fixing these cities.


In [11]:
# Express every party result as a share (in percent) of the total vote count
# instead of an absolute number, then store the result.
df_election_ma[parties] = (
    df_election_ma[parties].div(df_election_ma[col_vote_count_total], axis=0) * 100
)
df_election_ma.to_csv(
    "data/intermediate_data/preprocessed_election_data.csv",
    sep=";",
    index=False,
)