# De Memorie van Toelichting overvraagd?
## Experimentanalyse 2

Copyright (c) 2024 Martijn Staal `<de-mvt-overvraagd [a t ] martijn-staal.nl>`

This file is available under the European Union Public License, v1.2 or later (EUPL-1.2).

SPDX-License-Identifier: EUPL-1.2

Import the dependencies that we use in the analysis.

In [1]:
import datetime
import json
import statistics

from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np

import kamerstuk

Load the citation information. These datasets consist of all metadata of the documents, and the citations found by nllegalcit.

In [2]:
# Base names (without extension) of the nllegalcit result files. The derived
# files written further on reuse these base names with a suffix.
arsaequi_data_base_filename = "arsaequi_results_2024-04-08"
arsaequi_data_filename = f"{arsaequi_data_base_filename}.json"
rechtspraak_data_base_filename = "rechtspraak_results_2024-03-29"
rechtspraak_data_filename = f"{rechtspraak_data_base_filename}.json"

# Read with an explicit encoding so the result does not depend on the platform
# locale; this matches how the derived JSON files are read and written below.
with open(arsaequi_data_filename, "rt", encoding="utf-8") as jsonfile:
    arsaequi_data = json.load(jsonfile)

with open(rechtspraak_data_filename, "rt", encoding="utf-8") as jsonfile:
    rechtspraak_data = json.load(jsonfile)

print(f"Totaal aantal artikelen in AA-dataset: {len(arsaequi_data)}")
print(f"Totaal aantal uitspraken in rechtspraak dataset: {len(rechtspraak_data)}")

Totaal aantal artikelen in AA-dataset: 4610
Totaal aantal uitspraken in rechtspraak dataset: 241906


As an example, this data looks like this:

In [3]:
# Show the record at the same (arbitrary) position in both datasets to
# illustrate what a single document looks like.
voorbeeld_index = 551

voorbeeldartikel_aa = arsaequi_data[voorbeeld_index]
pprint(voorbeeldartikel_aa)

voorbeelduitspraak = rechtspraak_data[voorbeeld_index]
pprint(voorbeelduitspraak)

{'archiefcode': 'AA19980694',
 'data': {'experiments': {'citations_all_1': {'citations': [],
                                              'datetime': '2024-04-09T08:21:59',
                                              'errors': [],
                                              'experiment': 'citations',
                                              'id': 'citations_all_1'}}},
 'issue': 'Ars Aequi 1998-7',
 'publicatiedatum': '1998-7-01',
 'rechtsgebieden': [],
 'tags': [],
 'titel': 'Nieuwe wetsvoorstellen',
 'url': 'https://arsaequi.nl/product/nieuwe-wetsvoorstellen-6/'}
{'data': {'experiments': {'citations_all_1': {'citations': [],
                                              'datetime': '2024-03-24T17:15:59',
                                              'errors': [],
                                              'experiment': 'citations',
                                              'id': 'citations_all_1'}}},
 'ecli': 'ECLI:NL:CRVB:1996:ZB6248',
 'inhoudsindicatie': '\n'
     

Then, to find out what type of Kamerstuk has actually been cited, we try to look up the metadata of each cited Kamerstuk using the KOOP SRU API. The code for this lookup can be found in the separate `kamerstuk.py` file.

The result is then saved in a separate json file, so that we only have to run this once.

In [4]:
def add_kamerstuk_metadata(documents: list) -> list:
    """Attach Kamerstuk metadata to every Kamerstuk citation, in place.

    For each citation of the "citations_all_1" experiment that carries an
    "ondernummer" key, the result of ``kamerstuk.get_kst_information`` is
    stored under the citation's "kst_info" key. When that lookup raises,
    a message is printed and "kst_info" is set to ``{"error": True}``.

    Returns the same ``documents`` list that was passed in.
    """
    for doc in documents:
        found_citations = doc["data"]["experiments"]["citations_all_1"]["citations"]

        for cit in found_citations:
            # Only KamerstukCitations have the "ondernummer" key, so anything
            # without it is not something nllegalcit considers a Kamerstuk.
            if "ondernummer" not in cit:
                continue

            try:
                metadata = kamerstuk.get_kst_information(cit["dossiernummer"], cit["ondernummer"])
            except Exception as exc:
                # Resolving failed; the most likely reason is that this is not
                # actually a citation to a Kamerstuk. Mark it and carry on.
                print(f"ERROR Could not get kst citation information for {cit}: {exc}, skipping")
                metadata = {"error": True}

            cit["kst_info"] = metadata

    return documents

# The datasets enriched with Kamerstuk metadata are cached on disk, so that
# the metadata only has to be resolved once.
arsaequi_data_with_kst_metadata_path = Path(f"{arsaequi_data_base_filename}_with_kamerstuk_metadata.json")
rechtspraak_data_with_kst_metadata_path = Path(f"{rechtspraak_data_base_filename}_with_kamerstuk_metadata.json")

# Ars Aequi: reuse the cached file when present, otherwise resolve and store.
if arsaequi_data_with_kst_metadata_path.exists():
    print("Results file with Kamerstuk metadata for Ars Aequi dataset exists, using that.")
    with arsaequi_data_with_kst_metadata_path.open("rt", encoding="utf-8") as jsonfile:
        arsaequi_data = json.load(jsonfile)
else:
    print("Results file with Kamerstuk metadata for Ars Aequi dataset does not exist, resolving metadata")
    add_kamerstuk_metadata(arsaequi_data)

    with arsaequi_data_with_kst_metadata_path.open("wt", encoding="utf-8") as jsonfile:
        json.dump(arsaequi_data, jsonfile)

# Rechtspraak: same procedure.
if rechtspraak_data_with_kst_metadata_path.exists():
    print("Results file with Kamerstuk metadata for rechtspraak dataset exists, using that.")
    with rechtspraak_data_with_kst_metadata_path.open("rt", encoding="utf-8") as jsonfile:
        rechtspraak_data = json.load(jsonfile)
else:
    print("Results file with Kamerstuk metadata for rechtspraak dataset does not exist, resolving metadata")
    add_kamerstuk_metadata(rechtspraak_data)

    with rechtspraak_data_with_kst_metadata_path.open("wt", encoding="utf-8") as jsonfile:
        json.dump(rechtspraak_data, jsonfile)

Results file with Kamerstuk metadata for Ars Aequi dataset exists, using that.
Results file with Kamerstuk metadata for rechtspraak dataset exists, using that.


Now that we have the metadata information about each citation to Kamerstukken, we can calculate statistics about the citations (verwijzingen) per document.

In [5]:
def get_citation_relative_age(citation: dict, document: dict) -> int:
    """Calculate the approximate relative age in years of a citation in a document.

    The document year is the first four characters of "uitspraakdatum" when the
    document has that key, and of "publicatiedatum" otherwise. The citation year
    comes from ``citation["kst_info"]["vergaderjaar"]``: for a value of the form
    "1997-1998" the part after the first dash is used, a value without a dash is
    used as a whole.

    Returns:
        Document year minus citation year.

    Raises:
        KeyError: if the document has neither date key, or the citation has no
            "kst_info"/"vergaderjaar".
        ValueError: if a year cannot be parsed as an integer.
    """
    date_key = "uitspraakdatum" if "uitspraakdatum" in document else "publicatiedatum"
    document_year = int(document[date_key][0:4])

    vergaderjaar = citation["kst_info"]["vergaderjaar"]
    if isinstance(vergaderjaar, str):
        parts = vergaderjaar.split("-")
        # "1997-1998" -> "1998"; a plain "1997" has no second part.
        citation_year = int(parts[1] if len(parts) > 1 else parts[0])
    else:
        # Non-string values (e.g. a bare number) are converted directly.
        citation_year = int(vergaderjaar)

    return document_year - citation_year

def find_information_per_document(dataset: list) -> list:
    """Summarise, per document, its citations to Kamerstukken.

    Every document in ``dataset`` gets a "docresults" dict (added in place) with,
    separately for citations whose kamerstuktype is "Memorie van toelichting"
    (prefix ``mvt_``) and for all other Kamerstuk citations (prefix
    ``overige_kst_``):

    - ``*_verwijzingen_count``: the number of such citations,
    - ``*_verwijzingen_dossiernummers``: the distinct dossiernummers cited
      (a list, in no particular order),
    - ``*_verwijzingen_leeftijden``: the relative age of each citation, as
      computed by ``get_citation_relative_age``.

    Citations without an "ondernummer" key and citations whose "kst_info"
    contains "error" are ignored.

    Returns the same ``dataset`` list that was passed in.
    """
    for document in dataset:
        # Sets while collecting, so that every dossiernummer is kept once.
        dossiernummers = {"mvt": set(), "overige_kst": set()}
        leeftijden = {"mvt": [], "overige_kst": []}

        for citation in document["data"]["experiments"]["citations_all_1"]["citations"]:
            if "ondernummer" not in citation:
                # Not a Kamerstuk citation.
                continue

            kst_info = citation["kst_info"]
            if "error" in kst_info:
                # No metadata available, skipping this citation
                continue

            is_mvt = kst_info["kamerstuktype"] == "Memorie van toelichting"
            group = "mvt" if is_mvt else "overige_kst"

            dossiernummers[group].add(kst_info["dossiernummer"])
            leeftijden[group].append(get_citation_relative_age(citation, document))

        # Sets are converted to lists so the result can be serialised as JSON.
        document["docresults"] = {
            "mvt_verwijzingen_count": len(leeftijden["mvt"]),
            "mvt_verwijzingen_dossiernummers": list(dossiernummers["mvt"]),
            "mvt_verwijzingen_leeftijden": leeftijden["mvt"],
            "overige_kst_verwijzingen_count": len(leeftijden["overige_kst"]),
            "overige_kst_verwijzingen_dossiernummers": list(dossiernummers["overige_kst"]),
            "overige_kst_verwijzingen_leeftijden": leeftijden["overige_kst"],
        }

    return dataset

# The per-document results are cached on disk as well, next to the input files.
arsaequi_data_with_docresults_path = Path(f"{arsaequi_data_base_filename}_with_docresults.json")
rechtspraak_data_with_docresults_path = Path(f"{rechtspraak_data_base_filename}_with_docresults.json")

# Ars Aequi: reuse the previous result when present, otherwise compute and store.
if arsaequi_data_with_docresults_path.exists():
    print("Results with docresults for Ars Aequi dataset exists, using previous result")
    with arsaequi_data_with_docresults_path.open("rt", encoding="utf-8") as jsonfile:
        arsaequi_data = json.load(jsonfile)
else:
    print("Results with docresults for Ars Aequi dataset does not yet exists")

    arsaequi_data = find_information_per_document(arsaequi_data)
    with arsaequi_data_with_docresults_path.open("wt", encoding="utf-8") as jsonfile:
        json.dump(arsaequi_data, jsonfile)

# Rechtspraak: same procedure.
if rechtspraak_data_with_docresults_path.exists():
    print("Results with docresults for rechtspraak dataset exists, using previous result")
    with rechtspraak_data_with_docresults_path.open("rt", encoding="utf-8") as jsonfile:
        rechtspraak_data = json.load(jsonfile)
else:
    print("Results with docresults for rechtspraak dataset does not yet exists")

    rechtspraak_data = find_information_per_document(rechtspraak_data)
    with rechtspraak_data_with_docresults_path.open("wt", encoding="utf-8") as jsonfile:
        json.dump(rechtspraak_data, jsonfile)


Results with docresults for Ars Aequi dataset exists, using previous result
Results with docresults for rechtspraak dataset exists, using previous result


In [15]:
def get_statistics_per_year(l: list, first_year: int = 1995, last_year: int = 2022) -> dict:
    """Get an overview of the documents per year in a specific (subset of a) dataset.

    Args:
        l: The documents to count. The year of a document is taken from the
            first four characters of its "uitspraakdatum" when that key is
            present, and of its "publicatiedatum" otherwise.
        first_year: First year (inclusive) of the overview.
        last_year: Last year (inclusive) of the overview.

    Returns:
        A dict with one entry per year from ``first_year`` up to and including
        ``last_year`` (years without documents map to 0), in ascending order.

    Raises:
        KeyError: if a document is dated outside the requested range of years,
            or has neither date key.
    """
    ret = dict.fromkeys(range(first_year, last_year + 1), 0)

    for document in l:
        date_key = "uitspraakdatum" if "uitspraakdatum" in document else "publicatiedatum"
        year = int(document[date_key][0:4])
        # Deliberately no default here: a year outside the range is an error.
        ret[year] += 1

    return ret

def filter_instantie_type(t: str, dataset: list) -> list:
    """Return the uitspraken from dataset whose "instantie_type" equals t.

    The order of the input is kept; the result is a new list.
    """
    return [entry for entry in dataset if entry["instantie_type"] == t]

In [22]:
def _mean_and_median(values: list) -> tuple:
    """Return ``(mean, median)`` of values, or ``(None, None)`` if values is empty."""
    if not values:
        # statistics.mean/median raise StatisticsError on empty input.
        return None, None
    return statistics.mean(values), statistics.median(values)


def calculate_statistics_for_dataset(dataset: list, dataset_name: str):
    """Aggregate the Kamerstuk citation statistics of a (subset of a) dataset.

    Every document must already carry the "docresults" dict and every Kamerstuk
    citation (one with an "ondernummer" key) its "kst_info" dict.

    Args:
        dataset: The documents to aggregate.
        dataset_name: Label stored under the "dataset" key of the result.

    Returns:
        A dict with:
        - document totals, overall and per year, plus per year the number and
          the fraction (0..1, despite the key name "percentage_...") of
          documents with at least one resolved Kamerstuk citation;
        - the number of Kamerstuk citations in total, with resolved metadata
          ("success") and without ("error");
        - the resolved citations counted per kamerstuktype and per kamer;
        - mean and median relative age of the MvT citations, of the other
          Kamerstuk citations and of both together (None when there are no
          such citations);
        - "within_document_results": how many documents cite only MvT's, only
          other Kamerstukken, both (and of those, how many cite both from the
          same dossier), or no Kamerstuk at all.
    """
    total_documents_per_year = get_statistics_per_year(dataset)

    documents_with_kst_citations = []
    kst_citations_error_count = 0
    kst_citations_success = []

    kst_mvt_citation_ages = []
    kst_non_mvt_citation_ages = []

    within_document_results = {
        "geen_enkele_kst_verwijzing": 0,
        "alleen_mvt_verwijzingen": 0,
        "gemixte_verwijzingen": 0, # both MvT and non-MvT
        "gemixte_verwijzingen_uit_zelfde_dossier": 0, # MvT and non-MvT from the same dossier are both cited, subset of "gemixte_verwijzingen"
        "alleen_niet_mvt_kst_verwijzingen": 0,
    }

    for document in dataset:
        docresults = document["docresults"]

        # MvT ages feed the "*_age_mvt" statistics, all other Kamerstuk ages
        # the "*_age_non_mvt" statistics.
        kst_mvt_citation_ages += docresults["mvt_verwijzingen_leeftijden"]
        kst_non_mvt_citation_ages += docresults["overige_kst_verwijzingen_leeftijden"]

        # Separate the Kamerstuk citations with resolved metadata from those
        # for which resolving failed.
        for citation in document["data"]["experiments"]["citations_all_1"]["citations"]:
            if "ondernummer" not in citation:
                continue
            if "error" in citation["kst_info"]:
                kst_citations_error_count += 1
            else:
                kst_citations_success.append(citation)

        # Classify the document by the kinds of Kamerstukken it cites.
        cites_mvt = docresults["mvt_verwijzingen_count"] > 0
        cites_other = docresults["overige_kst_verwijzingen_count"] > 0

        if cites_mvt and cites_other:
            within_document_results["gemixte_verwijzingen"] += 1
            mvt_dossiers = set(docresults["mvt_verwijzingen_dossiernummers"])
            if not mvt_dossiers.isdisjoint(docresults["overige_kst_verwijzingen_dossiernummers"]):
                within_document_results["gemixte_verwijzingen_uit_zelfde_dossier"] += 1
        elif cites_mvt:
            within_document_results["alleen_mvt_verwijzingen"] += 1
        elif cites_other:
            within_document_results["alleen_niet_mvt_kst_verwijzingen"] += 1
        else:
            within_document_results["geen_enkele_kst_verwijzing"] += 1

        if cites_mvt or cites_other:
            documents_with_kst_citations.append(document)

    kst_citations_success_count = len(kst_citations_success)
    kst_citations_total_count = kst_citations_success_count + kst_citations_error_count
    kst_all_citation_ages = kst_non_mvt_citation_ages + kst_mvt_citation_ages

    # Calculate the citations per type and originating chamber
    totals_per_kamerstuktype = {
        member.value: 0 for member in kamerstuk.KamerstukType.__members__.values()
    }
    totals_per_kamer = {
        "II": 0,
        "I": 0
    }

    for citation in kst_citations_success:
        # A KeyError means the key is missing from the metadata, or its value
        # is not one of the known types/chambers; report it and carry on.
        try:
            totals_per_kamerstuktype[citation["kst_info"]["kamerstuktype"]] += 1
        except KeyError:
            print(f"No kamerstuktype available for {citation}")

        try:
            totals_per_kamer[citation["kst_info"]["kamer"]] += 1
        except KeyError:
            print(f"No kamer available for {citation}")

    total_documents_with_kst_citations_per_year = get_statistics_per_year(documents_with_kst_citations)

    # Fraction of the documents of a year that cite a Kamerstuk; 0 for years
    # without any documents.
    percentage_documents_with_kst_citations_per_year = {}
    for year, documents_in_year in total_documents_per_year.items():
        if documents_in_year:
            percentage_documents_with_kst_citations_per_year[year] = (
                total_documents_with_kst_citations_per_year[year] / documents_in_year
            )
        else:
            percentage_documents_with_kst_citations_per_year[year] = 0

    average_age_mvt, median_age_mvt = _mean_and_median(kst_mvt_citation_ages)
    average_age_non_mvt, median_age_non_mvt = _mean_and_median(kst_non_mvt_citation_ages)
    average_age_all_kst, median_age_all_kst = _mean_and_median(kst_all_citation_ages)

    statistics_for_dataset = {
        "dataset": dataset_name,
        "documenten_total": len(dataset),
        "total_documents_per_year": total_documents_per_year,
        "total_documents_with_kst_citations_per_year": total_documents_with_kst_citations_per_year,
        "percentage_documents_with_kst_citations_per_year": percentage_documents_with_kst_citations_per_year,
        "kst_citations_error_count": kst_citations_error_count,
        "kst_citations_success_count": kst_citations_success_count,
        "total_citations": kst_citations_total_count,
        "totals_per_kamerstuktype": totals_per_kamerstuktype,
        "totals_per_kamer": totals_per_kamer,
        "average_age_mvt": average_age_mvt,
        "median_age_mvt": median_age_mvt,
        "average_age_non_mvt": average_age_non_mvt,
        "median_age_non_mvt": median_age_non_mvt,
        "average_age_all_kst": average_age_all_kst,
        "median_age_all_kst": median_age_all_kst,
        "within_document_results": within_document_results
    }

    return statistics_for_dataset


In [23]:
# Calculate the statistics and store each result as a JSON file.

# Ars Aequi is analysed as one dataset.
stats_arsaequi = calculate_statistics_for_dataset(arsaequi_data, "Ars Aequi")
stats_aa_path = Path(f"{arsaequi_data_base_filename}_statistics.json")

with stats_aa_path.open("wt", encoding="utf-8") as jsonfile:
    json.dump(stats_arsaequi, jsonfile)

# The rechtspraak dataset is analysed separately per type of instantie; each
# type gets its own statistics file.
instantie_types = ["TypeHr", "Parket", "TypeRvS", "TypeCRvB", "TypeCBb"]

for instantie in instantie_types:
    instantie_subdataset = filter_instantie_type(instantie, rechtspraak_data)
    stats_instantie = calculate_statistics_for_dataset(instantie_subdataset, instantie)

    stats_instantie_path = Path(f"{rechtspraak_data_base_filename}_{instantie}_statistics.json")
    with stats_instantie_path.open("wt", encoding="utf-8") as jsonfile:
        json.dump(stats_instantie, jsonfile)