# Process Gesetzessammlung

**Imports**

In [1]:
import os
import pandas as pd
from pandarallel import pandarallel
from dotenv import load_dotenv

# Notebook-wide pandas settings.
# Setting chained_assignment to None turns off pandas' chained-assignment warning.
pd.set_option("mode.chained_assignment", None)
# Show up to 500 rows / 500 sequence items before pandas truncates the display.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_seq_items", 500)

# Initialise pandarallel (needed for the `parallel_map` calls further down); no progress bar.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
from staatsarchiv_utils import read_XML_files
from staatsarchiv_utils import parse_XML_files
from staatsarchiv_utils import fix_missing_dates
from staatsarchiv_utils import add_missing_month_and_day_to_dates
from staatsarchiv_utils import parse_dates
from staatsarchiv_utils import clean_identifiers
from staatsarchiv_utils import generic_text_cleaning

In [3]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Required paths for this notebook. They are read with `os.environ[...]` rather than
# `os.getenv(...)` so that a missing variable fails right here with a KeyError naming it.
# With `os.getenv` a missing variable becomes None, and `DataFrame.to_parquet(None)`
# returns the serialized bytes instead of writing a file, i.e. the save steps would
# silently produce nothing.
DATA_INPUT_OS = os.environ["DATA_INPUT_OS"]  # location of the XML input, passed to read_XML_files
RAW_OUTPUT_OS = os.environ["RAW_OUTPUT_OS"]  # parquet target for the parsed, uncleaned data
PREP_OUTPUT_OS = os.environ["PREP_OUTPUT_OS"]  # parquet target for the cleaned data

**Prepare data**

In [4]:
# Collect the XML inputs found under DATA_INPUT_OS.
file_paths = read_XML_files(DATA_INPUT_OS)
# Parse the collected files into a single DataFrame.
df = parse_XML_files(file_paths)
# Persist the raw parse result; the next cell reloads it from RAW_OUTPUT_OS,
# so the cleaning steps can be re-run without parsing the XML again.
df.to_parquet(RAW_OUTPUT_OS)

9,527 files found.


100%|██████████| 9527/9527 [00:28<00:00, 333.15it/s]


In [5]:
# Start from the raw parquet snapshot written by the parsing cell.
df = pd.read_parquet(RAW_OUTPUT_OS)

# TODO: Here the correct url of zszh has to be set
link_prolog = "https://www.zentraleserien.zh.ch/os/"
# One link per record: URL prefix followed by the file name with ".xml" removed.
df = df.assign(
    new_link=link_prolog + df.filename.str.replace(".xml", "", regex=False)
)
# Every record must end up with its own distinct link.
assert df.new_link.nunique() == len(df)

df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9527 entries, 0 to 9526
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   path       9527 non-null   object
 1   date_when  9250 non-null   object
 2   date_from  277 non-null    object
 3   date_to    277 non-null    object
 4   date_text  9527 non-null   object
 5   ident      9527 non-null   object
 6   ref        9527 non-null   object
 7   title      9527 non-null   object
 8   text       9527 non-null   object
 9   filename   9527 non-null   object
 10  new_link   9527 non-null   object
dtypes: object(11)
memory usage: 104.1 MB


In [6]:
# Eyeball three randomly drawn records; transposed so each record is shown as a column.
df.sample(n=3).transpose()

Unnamed: 0,9283,953,1907
path,_01_data-input/OS/STAZH_OGD_eOSZH_V4_NER/OS_XM...,_01_data-input/OS/STAZH_OGD_eOSZH_V4_NER/OS_XM...,_01_data-input/OS/STAZH_OGD_eOSZH_V4_NER/OS_XM...
date_when,1973-11-05,1851-06-24,1857-08-05
date_from,,,
date_to,,,
date_text,05.11.1973,24.06.1851,05.08.1857
ident,StAZH OS 44 (S. 952),"StAZH OS 20, Suppl. 2 (S. 48, Eintrag 2)",StAZH OS 10 (S. 429-433)
ref,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...
title,Beschluss des Kantonsrates über die Erhöhung d...,"15. Gesetz betr. die Wahl, Amtsdauer und Entsc...",Bundesbeschluß betreffend die Eisenbahn von Zü...
text,"Der Kantonsrat, nach Einsichtnahme in einen An...","1 Die zwei Mitglieder, welche der Kanton Züric...",Die Bundesversammlung der schweizerischen Eidg...
filename,OS_44__S__952__t.xml,OS_20__Suppl__2__S__48__Eintrag_2__t.xml,OS_10__S__429-433__t.xml


In [7]:
# Fix missing dates: `date_when` is crucial. Where it is missing, we agreed to fill it in from `date_from`.
# Add missing month and day to dates: some dates lack the month and day.
# Clean identifiers: several `ident` values contain line breaks and multiple spaces. We agreed to remove these programmatically.

df = (
    df.pipe(fix_missing_dates)
    .pipe(add_missing_month_and_day_to_dates)
    .pipe(parse_dates)
    .pipe(clean_identifiers)
    # Use callables so that `year` and `identifier` are derived from the frame produced
    # by the steps above. Referring to the outer `df` here would only work if the helper
    # functions happen to modify their input in place.
    .assign(
        year=lambda frame: frame.date_when.dt.year,
        identifier=lambda frame: frame.index.astype(int),
    )
    # TODO: Rename here (new_link : link)
    .rename(columns={"new_link": "link", "date_when": "date", "ident": "stazh_ident"})
)

# Sanity checks.
assert df.date.isna().sum() == 0, "every record needs a date"
# Re-parsing with a fixed format must not turn any value into NaT.
assert (
    pd.to_datetime(df.date, errors="coerce", format="%Y-%m-%d").notna().all()
), "found dates that do not parse as %Y-%m-%d"
assert df.stazh_ident.str.contains("\n").sum() == 0, "line breaks left in stazh_ident"
assert df.identifier.nunique() == len(df), "identifier must be unique per record"
print(df.date.min(), df.date.max())

1803-01-01 00:00:00 1998-12-23 00:00:00


In [8]:
# Turn the numeric identifier into a string key with an "os_" prefix.
df["identifier"] = "os_" + df["identifier"].astype(str)

# Generic text cleaning, applied in parallel to both free-text columns.
for column in ("title", "text"):
    df[column] = df[column].parallel_map(generic_text_cleaning)

# Reduce to relevant columns and save to disk.
cols = ["identifier", "date", "year", "title", "text", "link", "stazh_ident", "ref"]
df = df.loc[:, cols]
df.to_parquet(PREP_OUTPUT_OS)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9527 entries, 0 to 9526
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   identifier   9527 non-null   object        
 1   date         9527 non-null   datetime64[ns]
 2   year         9527 non-null   int32         
 3   title        9527 non-null   object        
 4   text         9527 non-null   object        
 5   link         9527 non-null   object        
 6   stazh_ident  9527 non-null   object        
 7   ref          9527 non-null   object        
dtypes: datetime64[ns](1), int32(1), object(6)
memory usage: 155.5 MB
