# Data insights
This notebook provides helper methods for exploring the statistics and characteristics of the datasets used.

--- _run the following cell before proceeding to any section_ ---

In [1]:
import pandas as pd
from os.path import basename, join
from os import makedirs
import sys
# Extend the import search path with the parent directory.
sys.path.append('..')

# Every artefact written by this notebook goes into this folder;
# create it up front (no error if it already exists).
out_folder = 'insights_output'
makedirs(name=out_folder, exist_ok=True)

def save_unique_attribute_values(path_to_csv, attribute_name, output_dir=None):
    """Collect the distinct values of one CSV column, save them and return them.

    Only the requested column is read from ``path_to_csv``. Missing values are
    replaced by the empty string, every value is converted to ``str`` and
    stripped of surrounding whitespace, and the distinct values are sorted.
    The result is written as a one-column CSV named
    ``unique_<attribute_name>s_<basename of path_to_csv>``.

    Args:
        path_to_csv: path of the CSV file to inspect.
        attribute_name: name of the column whose distinct values are wanted.
            The printed messages and the output file name use this name with
            an ``s`` appended.
        output_dir: directory the result file is written to. Defaults to the
            notebook-level ``out_folder`` when omitted.

    Returns:
        pandas.Series with the sorted, stripped, distinct values.
    """
    # Resolve the destination lazily so the notebook-level default is only
    # needed when the caller does not pass an explicit directory.
    target_dir = out_folder if output_dir is None else output_dir
    file_name = basename(path_to_csv)

    print(f'Loading {attribute_name}s from {file_name}...')
    df = pd.read_csv(path_to_csv, usecols=[attribute_name])

    # Normalise BEFORE de-duplicating: otherwise 'x' and 'x ' are both kept
    # and the reported count is inflated. astype(str) keeps .str.strip() from
    # failing on non-string cells (e.g. numeric columns).
    values = df[attribute_name].fillna('').astype(str).str.strip()

    # sort_values() returns a new Series, so its result must be kept.
    uniques = pd.Series(values.unique()).sort_values().reset_index(drop=True)

    print(f'{file_name} has {len(uniques)} unique {attribute_name}s.')
    out_path = join(target_dir, f'unique_{attribute_name}s_{file_name}')
    uniques.to_csv(out_path, index=False, header=[attribute_name])
    print(f'The list of unique {attribute_name}s is saved to {out_path}.')
    return uniques

## General dataset stats
#### Rechtspraak cases raw

In [2]:
# Resolve the location of the raw Rechtspraak cases CSV once;
# the following cells in this section read from `path`.
from definitions.storage_handler import CSV_RS_CASES, get_path_raw
path = get_path_raw(CSV_RS_CASES)

In [10]:
# Attributes (column names) of the raw file.
# nrows=0 parses only the header row, so no data rows are loaded.
pd.read_csv(filepath_or_buffer=path, nrows=0)

Unnamed: 0,ecli,issued,language,creator,date_decision,zaaknummer,type,procedure,spatial,subject,relation,references,hasVersion,identifier2,title,inhoudsindicatie,info,full_text,source,jurisdiction_country


In [None]:
# Number of cases: load only the 'ecli' column and count its rows.
df = pd.read_csv(path, usecols=['ecli'])
df.shape[0]

## Unique instances
#### Rechtspraak cases raw

In [82]:
from definitions.storage_handler import CSV_RS_CASES, get_path_raw

# Distinct 'creator' values of the raw Rechtspraak cases.
unique_instances_RS_raw = save_unique_attribute_values(
    path_to_csv=get_path_raw(CSV_RS_CASES),
    attribute_name='creator',
)

Loading creators from RS_cases.csv...
RS_cases.csv has 1224 unique creators.
The list of unique creators is saved to insights_output/unique_creators_RS_cases.csv.


#### Rechtspraak cases processed

In [84]:
from definitions.storage_handler import CSV_RS_CASES, get_path_processed
from definitions.terminology.field_names import RS_CREATOR

# Distinct values of the RS_CREATOR column in the processed Rechtspraak cases.
unique_instances_RS_clean = save_unique_attribute_values(
    path_to_csv=get_path_processed(CSV_RS_CASES),
    attribute_name=RS_CREATOR,
)

Loading instances from RS_cases_clean.csv...
RS_cases_clean.csv has 1222 unique instances.
The list of unique instances is saved to insights_output/unique_instances_RS_cases_clean.csv.


#### Legal Intelligence cases raw

In [85]:
from definitions.storage_handler import CSV_LI_CASES, get_path_raw

# Distinct 'IssuingInstitution' values of the raw Legal Intelligence cases.
unique_instances_LI_raw = save_unique_attribute_values(
    path_to_csv=get_path_raw(CSV_LI_CASES),
    attribute_name='IssuingInstitution',
)

Loading IssuingInstitutions from LI_cases.csv...
LI_cases.csv has 51 unique IssuingInstitutions.
The list of unique IssuingInstitutions is saved to insights_output/unique_IssuingInstitutions_LI_cases.csv.


#### Legal Intelligence cases processed

In [86]:
from definitions.storage_handler import CSV_LI_CASES, get_path_processed
from definitions.terminology.field_names import LI_ISSUING_INSTITUTION

# Distinct values of the LI_ISSUING_INSTITUTION column in the processed
# Legal Intelligence cases.
unique_instances_LI_clean = save_unique_attribute_values(
    path_to_csv=get_path_processed(CSV_LI_CASES),
    attribute_name=LI_ISSUING_INSTITUTION,
)

Loading instances from LI_cases_clean.csv...
LI_cases_clean.csv has 33 unique instances.
The list of unique instances is saved to insights_output/unique_instances_LI_cases_clean.csv.


Differences between raw and clean sets

In [102]:
# save_unique_attribute_values already returns whitespace-stripped values, so
# the lists can be compared directly; no second strip pass or throwaway
# single-letter globals are needed.
# Each line shows the values present in one list but missing from its counterpart.
print('Uniquely in RS raw:', set(unique_instances_RS_raw).difference(set(unique_instances_RS_clean)))
print('Uniquely in RS clean:', set(unique_instances_RS_clean).difference(set(unique_instances_RS_raw)))
print('Uniquely in LI raw:', set(unique_instances_LI_raw).difference(set(unique_instances_LI_clean)))
print('Uniquely in LI clean:', set(unique_instances_LI_clean).difference(set(unique_instances_LI_raw)))

Uniquely in RS raw: {'Gerechtshof Den Haag', 'Rechtbank Den Haag', "Regionaal Medisch Tuchtcollege 's Gravenhage", 'Kamer voor het notariaat in het ressort Den Haag'}
Uniquely in RS clean: {"Regionaal Medisch Tuchtcollege 's-Gravenhage", "Kamer voor het notariaat in het ressort 's-Gravenhage"}
Uniquely in LI raw: {'Sector kanton Rechtbank Roermond', 'Sector kanton Rechtbank Breda', 'Sector kanton Rechtbank Rotterdam', 'Sector kanton Rechtbank Zutphen', 'Sector kanton Rechtbank Middelburg', 'Sector kanton Rechtbank Alkmaar', 'Sector kanton Rechtbank Haarlem', 'Sector kanton Rechtbank Leeuwarden', 'Sector kanton Rechtbank Utrecht', 'Sector kanton Rechtbank Maastricht', 'Sector kanton Rechtbank Almelo', 'Gerechtshof Den Haag', 'Sector kanton Rechtbank Amsterdam', 'Sector kanton Rechtbank Dordrecht', 'Sector kanton Rechtbank Arnhem', 'Rechtbank Den Haag', 'Sector kanton Rechtbank Groningen', 'Sector kanton Rechtbank Den Haag', 'Sector kanton Rechtbank Assen', "Sector kanton Rechtbank 's-He

Clear memory

In [107]:
%reset

Nothing done.


## Unique domains
#### Rechtspraak cases raw

In [117]:
from definitions.storage_handler import CSV_RS_CASES, get_path_raw

# Distinct 'subject' values of the raw Rechtspraak cases.
unique_domains_RS_raw = save_unique_attribute_values(
    path_to_csv=get_path_raw(CSV_RS_CASES),
    attribute_name='subject',
)

Loading subjects from RS_cases.csv...
RS_cases.csv has 32 unique subjects.
The list of unique subjects is saved to insights_output/unique_subjects_RS_cases.csv.


#### Rechtspraak cases processed

In [118]:
from definitions.storage_handler import CSV_RS_CASES, get_path_processed
from definitions.terminology.field_names import RS_SUBJECT

# Distinct values of the RS_SUBJECT column in the processed Rechtspraak cases.
unique_domains_RS_clean = save_unique_attribute_values(
    path_to_csv=get_path_processed(CSV_RS_CASES),
    attribute_name=RS_SUBJECT,
)

Loading domainss from RS_cases_clean.csv...
RS_cases_clean.csv has 32 unique domainss.
The list of unique domainss is saved to insights_output/unique_domainss_RS_cases_clean.csv.


#### Legal Intelligence cases raw

In [119]:
from definitions.storage_handler import CSV_LI_CASES, get_path_raw

# Distinct 'LawArea' values of the raw Legal Intelligence cases.
unique_domains_LI_raw = save_unique_attribute_values(
    path_to_csv=get_path_raw(CSV_LI_CASES),
    attribute_name='LawArea',
)

Loading LawAreas from LI_cases.csv...
LI_cases.csv has 572 unique LawAreas.
The list of unique LawAreas is saved to insights_output/unique_LawAreas_LI_cases.csv.


#### Legal Intelligence cases processed

In [120]:
from definitions.storage_handler import CSV_LI_CASES, get_path_processed
from definitions.terminology.field_names import LI_LAW_AREA

# Distinct values of the LI_LAW_AREA column in the processed
# Legal Intelligence cases.
unique_domains_LI_clean = save_unique_attribute_values(
    path_to_csv=get_path_processed(CSV_LI_CASES),
    attribute_name=LI_LAW_AREA,
)

Loading domainss from LI_cases_clean.csv...
LI_cases_clean.csv has 489 unique domainss.
The list of unique domainss is saved to insights_output/unique_domainss_LI_cases_clean.csv.


Differences between raw and clean sets

In [121]:
# Two-way comparison per dataset: values found in one list but not in the other.
print(
    'Uniquely in RS raw:',
    set(unique_domains_RS_raw) - set(unique_domains_RS_clean),
)
print(
    'Uniquely in RS clean:',
    set(unique_domains_RS_clean) - set(unique_domains_RS_raw),
)
print(
    'Uniquely in LI raw:',
    set(unique_domains_LI_raw) - set(unique_domains_LI_clean),
)
print(
    'Uniquely in LI clean:',
    set(unique_domains_LI_clean) - set(unique_domains_LI_raw),
)

Uniquely in RS raw: {'Bestuursrecht', 'Civiel recht', 'Strafrecht', 'Internationaal publiekrecht'}
Uniquely in RS clean: {'Bestuursrecht; Bestuursrecht-Algemeen/Overig/Niet-gelabeld', 'Civiel recht; Civiel recht-Algemeen/Overig/Niet-gelabeld', 'Internationaal publiekrecht; Internationaal publiekrecht-Algemeen/Overig/Niet-gelabeld', 'Strafrecht; Strafrecht-Algemeen/Overig/Niet-gelabeld'}
Uniquely in LI raw: {"['Arbeids/Sociaal Recht', 'Belastingrecht', 'Gezondheidsrecht', 'Internationaal Publiekrecht (niet EU)', 'Personen- en Familierecht', 'Staats- en Bestuursrecht']", "['Burgerlijk recht', 'Intellectuele eigendom', 'Ondernemingsrecht', 'Telecom/ICT/Mediarecht', 'Verbintenissenrecht']", "['Burgerlijk recht', 'Burgerlijke rechtsvordering', 'Internationaal Publiekrecht (niet EU)', 'Ruimtelijk Bestuursrecht/Milieurecht/Energierecht', 'Staats- en Bestuursrecht']", "['Belastingrecht', 'Gezondheidsrecht', 'Internationaal Publiekrecht (niet EU)', 'Onbekend', 'Staats- en Bestuursrecht', 'Straf

Clear memory

In [None]:
%reset

