In [1]:
import json
import pandas as pd

### Process person data

#### Auxiliary functions

In [2]:
# The XML-to-JSON conversion represents an empty (self-closing) element as this
# marker dict. remove_self_closing replaces every occurrence of it by None.
_SELF_CLOSING = {'-self-closing': 'true'}


def remove_self_closing(elem):
    """Replace every ``{'-self-closing': 'true'}`` value nested in ``elem`` by ``None``.

    The structure is modified in place and walked recursively through nested
    dicts and lists. Anything that is neither a dict nor a list is ignored.
    ``elem`` itself is never replaced, only the values it contains.

    Args:
        elem: a dict or list (typically parsed JSON); other types are a no-op.

    Returns:
        None. The change is made in place.
    """
    if isinstance(elem, list):
        # Lists are addressed by position, dicts by key.
        entries = enumerate(elem)
    elif isinstance(elem, dict):
        # Only existing keys are reassigned, so the dict size never changes
        # while it is being iterated.
        entries = elem.items()
    else:
        return

    for key, value in entries:
        if value == _SELF_CLOSING:
            elem[key] = None
        elif isinstance(value, (dict, list)):
            remove_self_closing(value)

#### Import `__converted/MDB_STAMMDATEN.json`, reorganize the data and drop the parts we do not need

In [3]:
# Read the converted master data. The encoding is passed explicitly because the
# file contains non-ASCII names and the platform default encoding is not UTF-8
# everywhere (assumes the converter wrote UTF-8 -- TODO confirm).
with open("__converted/MDB_STAMMDATEN.json", "r", encoding="utf-8") as f:
    data = json.load(f)

persons_raw = data["DOCUMENT"]["MDB"]

persons = []  # one dict per MP who still holds a seat in term 19
retired = []  # (nachname, vorname) of MPs whose term-19 mandate has an end date
for p in persons_raw:
    person = {}

    # NAME is either a single dict or a list of dicts; for a list the last
    # entry is used.
    name = p["NAMEN"]["NAME"]
    name = name[-1] if isinstance(name, list) else name
    bio = p["BIOGRAFISCHE_ANGABEN"]
    # A single WAHLPERIODE is not wrapped in a list; normalize to a list.
    wp_raw = p["WAHLPERIODEN"]["WAHLPERIODE"]
    wp_raw = wp_raw if isinstance(wp_raw, list) else [wp_raw]

    # We don't need the data if the person is/was not a MP in the current term
    if "19" not in [wp["WP"] for wp in wp_raw]:
        continue

    person["nachname"]      = name["NACHNAME"]
    person["vorname"]       = name["VORNAME"]
    person["geburtsdatum"]  = bio["GEBURTSDATUM"]
    person["geburtsort"]    = bio["GEBURTSORT"]
    person["sterbedatum"]   = bio["STERBEDATUM"]
    person["geschlecht"]    = bio["GESCHLECHT"]
    person["familienstand"] = bio["FAMILIENSTAND"]
    person["religion"]      = bio["RELIGION"]
    person["beruf"]         = bio["BERUF"]
    person["anrede_titel"]  = name["ANREDE_TITEL"]
    person["akad_titel"]    = name["AKAD_TITEL"]
    person["vita"]          = bio["VITA_KURZ"]

    # "partei" is the display label, "partei_id" the grouping key: CDU and CSU
    # keep their own label but share one id, the other two special cases get
    # the same short value for both.
    person["partei"]        = bio["PARTEI_KURZ"]
    person["partei_id"]     = person["partei"]
    if bio["PARTEI_KURZ"] == "CDU" or bio["PARTEI_KURZ"] == "CSU":
        person["partei_id"] = "CDU/CSU"
    elif bio["PARTEI_KURZ"] == "Plos":
        person["partei"]    = "fraktionslos"
        person["partei_id"] = "fraktionslos"
    elif bio["PARTEI_KURZ"] == "BÜNDNIS 90/DIE GRÜNEN":
        person["partei"]    = "GRÜNE"
        person["partei_id"] = "GRÜNE"

    wahlperioden = []
    active = True
    for wp in wp_raw:
        # A term-19 entry whose MDBWP_BIS is not the empty-element marker has
        # an end date, i.e. the mandate is over. The person is discarded
        # below, so the remaining terms do not need to be collected.
        if wp["WP"] == "19" and wp["MDBWP_BIS"] != {'-self-closing': 'true'}:
            active = False
            break

        wahlperiode = {}
        wahlperiode["wp"]         = wp["WP"]
        wahlperiode["md_von"]     = wp["MDBWP_VON"]
        wahlperiode["md_bis"]     = wp["MDBWP_BIS"]
        wahlperiode["liste"]      = wp["LISTE"]
        wahlperiode["mandatsart"] = wp["MANDATSART"]
        wahlperiode["wkr_land"]   = wp["WKR_LAND"]
        wahlperiode["wkr_name"]   = wp["WKR_NAME"]
        wahlperiode["wkr_nummer"] = wp["WKR_NUMMER"]

        # Like WAHLPERIODE, a single INSTITUTION is not wrapped in a list.
        institutionen = wp["INSTITUTIONEN"]["INSTITUTION"]
        institutionen = institutionen if isinstance(institutionen, list) else [institutionen]
        wahlperiode["institutionen"] = institutionen

        wahlperioden.append(wahlperiode)

    person["wahlperioden"] = wahlperioden

    # Turn the empty-element markers copied from the raw data into None.
    remove_self_closing(person)

    # we only want persons that are currently in the parliament
    if not active:
        retired.append((person["nachname"], person["vorname"]))
        continue

    persons.append(person)

#### Import and map image URLs

In [4]:
# Load the image metadata: a list of entries that each carry at least a "name"
# ("Nachname, Vorname") and an "img" value. The encoding is explicit because
# the names contain umlauts and the platform default is not UTF-8 everywhere
# (assumes the file was written as UTF-8 -- TODO confirm).
with open("img/urls.json", "r", encoding="utf-8") as f:
    urls = json.load(f)

In [5]:
# The names in urls.json have to be brought in line with the names from
# stammdaten before the two data sets can be joined.

# Reference list: one (nachname, vorname) pair per person kept from stammdaten.
names = [(entry["nachname"], entry["vorname"]) for entry in persons]

# Manual overrides for names the generic clean-up in correct_names cannot fix.
# Key:   (nachname, vorname) as it comes out of that clean-up.
# Value: (nachname, vorname) as spelled in stammdaten.
CORRECTIONS = {
    ("Altenkamp", "Norbert"):
        ("Altenkamp", "Norbert Maria"),
    ("in der Beek", "Olaf"):
        ("In der Beek", "Olaf"),
    ("Mackensen-Geis", "Isabel"):
        ("Mackensen", "Isabel"),
    ("Michelbach", "(Univ Kyiv) Hans"):
        ("Michelbach", "Hans"),
    ("Merkel", "Angela Dorothea"):
        ("Merkel", "Angela"),
}

def check_names(urls, expected_names=None):
    """Verify that every expected (nachname, vorname) pair occurs in ``urls``.

    Args:
        urls: iterable of dicts whose "name" value has the form
            "Nachname, Vorname".
        expected_names: iterable of (nachname, vorname) tuples that must be
            present. Defaults to the module-level ``names`` list.

    Raises:
        AssertionError: for the first expected pair that has no entry in
            ``urls``; the missing pair is the exception argument. Raised
            explicitly so the check also runs when Python is started with -O.
        IndexError: if a "name" value does not contain ", ".
    """
    if expected_names is None:
        expected_names = names

    # A set gives constant-time membership tests instead of scanning a list
    # once per expected name.
    available = set()
    for url in urls:
        parts = url["name"].split(", ")
        available.add((parts[0], parts[1]))

    for name in expected_names:
        if name not in available:
            raise AssertionError(name)

def correct_names(urls, corrections=None):
    """Normalize the "name" of every entry in ``urls`` in place.

    Each name is expected as "Nachname, Vorname". From the last name a
    parenthesized suffix (everything from " (" on) is dropped. From the first
    name academic titles and nobility particles are removed. The resulting
    pair is then looked up in ``corrections`` and replaced if it is listed.

    Args:
        urls: iterable of dicts with a "name" value; modified in place.
        corrections: mapping (nachname, vorname) -> (nachname, vorname) with
            manual overrides. Defaults to the module-level ``CORRECTIONS``.

    Raises:
        IndexError: if a "name" value does not contain ", ".
    """
    if corrections is None:
        corrections = CORRECTIONS

    # Removed from the first-name part one after another. The order matters:
    # " von der" has to go before " von" and " de", otherwise a fragment of it
    # would be left behind. These are plain substring replacements, so they
    # match anywhere in the first name, not only at its start or end.
    removable = (
        "Dr. ",
        "h. c. ",
        "Prof. ",
        " von der",
        " von",
        " de",
        " Graf",
        " Freiherr",
    )

    for url in urls:
        parts = url["name"].split(", ")
        nachname, vorname = parts[0], parts[1]

        nachname = nachname.split(" (")[0]

        for token in removable:
            vorname = vorname.replace(token, "")

        nachname, vorname = corrections.get((nachname, vorname), (nachname, vorname))

        url["name"] = nachname + ", " + vorname
        
# Order matters: first normalize the names in `urls` in place, then verify that
# every (nachname, vorname) pair in `names` has a matching entry. check_names
# raises for the first pair without a match.
correct_names(urls)
check_names(urls)

In [6]:
# Index the image entries by their "name" string ("Nachname, Vorname") and copy
# the matching "img" value onto every person. A person without a matching entry
# raises KeyError.
d = dict((url["name"], url) for url in urls)
for person in persons:
    person["img"] = d["{}, {}".format(person["nachname"], person["vorname"])]["img"]

#### Export data to `___final/stammdaten.json`

In [7]:
# ensure_ascii=False writes umlauts and other non-ASCII characters verbatim, so
# the file encoding must be able to represent them. Use UTF-8 explicitly
# instead of the platform default, which may be a legacy code page.
with open("___final/stammdaten.json", "w", encoding="utf-8") as outfile:
    json.dump(persons, outfile, indent = 4, ensure_ascii = False)

#### Sanity check: duplicate names

In [8]:
# Names are meant to serve as ids, so they have to be unique. Compare every
# person with each person further down the list and print the name once for
# every pair that shares both nachname and vorname. No output means no
# duplicates.
for i, p1 in enumerate(persons):
    for p2 in persons[i + 1:]:
        if p1["nachname"] != p2["nachname"] or p1["vorname"] != p2["vorname"]:
            continue
        print(p1["nachname"], p1["vorname"])

### Process vote data

In [9]:
# Spellings used in the vote CSVs that differ from stammdaten.json:
# (name, vorname) in the CSV -> (nachname, vorname) in stammdaten.
# NOTE: this rebinds the module-level name CORRECTIONS, which correct_names in
# the image-URL section reads as well. Re-run that cell before calling
# correct_names again after this cell has been executed.
CORRECTIONS = {
    ("Özoguz", "Aydan"): ("Özoğuz", "Aydan"),
    ("Dagdelen", "Sevim"): ("Dağdelen", "Sevim")
}

# Flag columns of the vote CSV. The position in this list is the numeric code
# stored in the "vote" field of the result.
VOTE_COLUMNS = ["ja", "nein", "Enthaltung", "ungültig", "nichtabgegeben"]

# Parliamentary-group labels of the vote CSV -> ids used in stammdaten.json.
PARTY_ID_RENAMES = {
    "Fraktionslos": "fraktionslos",
    "BÜ90/GR": "GRÜNE",
}


def _vote_code(flags):
    """Return the position of the first flag equal to 1, or "nö" if none is set."""
    for code, flag in enumerate(flags):
        if flag == 1:
            return code
    return "nö"


def process_vote_data(filename, vote_nr):
    """Read one roll-call CSV and return its votes as JSON-compatible records.

    The file ``__converted/votes/<filename>`` is read as a semicolon-separated
    CSV. Only the parliamentary group, the name columns and the five vote
    flags are used. For every row:

    * the group label is mapped through ``PARTY_ID_RENAMES``,
    * everything from " (" on is cut off the last name,
    * the name is replaced if it is listed in ``CORRECTIONS``,
    * the five flag columns are folded into one code: 0 = ja, 1 = nein,
      2 = Enthaltung, 3 = ungültig, 4 = nichtabgegeben, and the string "nö"
      if no flag equals 1.

    Nothing is written to disk; the caller decides where the records go.

    Args:
        filename: file name inside ``__converted/votes/``.
        vote_nr: identifier of the vote. Currently not used by the function.

    Returns:
        A list of dicts with the keys "partei_id", "name", "vorname" and
        "vote", in the row order of the CSV.

    Raises:
        KeyError: if one of the required columns is missing from the CSV.
    """
    csv_data = pd.read_csv("__converted/votes/" + filename, sep=";")
    needed = ["Fraktion/Gruppe", "Name", "Vorname"] + VOTE_COLUMNS

    # Build the output rows in one pass instead of mutating the frame cell by
    # cell while iterating over it.
    rows = []
    for partei_id, nachname, vorname, *flags in csv_data[needed].itertuples(index=False, name=None):
        nachname = nachname.split(" (")[0]
        # Apply corrections for names that differ from stammdaten.json
        nachname, vorname = CORRECTIONS.get((nachname, vorname), (nachname, vorname))
        rows.append({
            "partei_id": PARTY_ID_RENAMES.get(partei_id, partei_id),
            "name": nachname,
            "vorname": vorname,
            "vote": _vote_code(flags),
        })

    # TODO: check if there are more names that require manual intervention:
    # every (name, vorname) pair that is not listed in `retired` should match a
    # (nachname, vorname) pair of `persons`. This check was disabled in the
    # original code and is still pending.

    votes = pd.DataFrame(rows, columns=["partei_id", "name", "vorname", "vote"])
    # The JSON round trip turns pandas/numpy scalars and missing values into
    # plain JSON-compatible Python objects.
    return json.loads(votes.to_json(orient = "records"))

In [10]:
# (CSV file name inside __converted/votes/, key of the vote in meta.json)
files = [
    ("20191220_2_xls-data.csv", "20191220_2"),
    ("20210610_4_xls-data.csv", "20210610_4"),
    ("20210624_1_xls-data.csv", "20210624_1"),
    ("20210624_2_xls-data.csv", "20210624_2"),
    ("20210624_4_xls-data.csv", "20210624_4"),
]

# meta.json maps each vote key to a dict of metadata. The encoding is explicit
# so reading does not depend on the platform default (assumes the file is
# UTF-8 -- TODO confirm).
with open("__converted/votes/meta.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Attach the processed records to the metadata of each vote. A key missing
# from meta.json raises KeyError. Entries of meta.json that are not listed in
# `files` are exported without a "votes" field.
for file, vote_nr in files:
    data[vote_nr]["votes"] = process_vote_data(file, vote_nr)

# The export is a plain list of the per-vote dicts; the keys are dropped.
l = list(data.values())

# ensure_ascii=False writes non-ASCII characters verbatim (the corrected names
# contain "ğ", which legacy code pages such as cp1252 cannot encode), so the
# output encoding is fixed to UTF-8.
with open("___final/votes.json", "w", encoding="utf-8") as outfile:
    json.dump(l, outfile, indent = 4, ensure_ascii = False)