# Process Amtsblatt

**Imports**

In [2]:
import os
import pandas as pd
from pandarallel import pandarallel
from dotenv import load_dotenv

# Notebook-wide pandas settings: switch off the chained-assignment warning and
# raise the display limits for rows and sequence items.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_seq_items", 500)

# Set up pandarallel (without progress bars) for the `parallel_*` calls below.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
from staatsarchiv_utils import read_XML_files
from staatsarchiv_utils import parse_XML_files
from staatsarchiv_utils import fix_missing_dates
from staatsarchiv_utils import add_missing_month_and_day_to_dates
from staatsarchiv_utils import parse_dates
from staatsarchiv_utils import clean_identifiers
from staatsarchiv_utils import generic_text_cleaning

In [4]:
# Load variables from a local .env file into the process environment.
load_dotenv()

# Fail fast when a path variable is missing or empty. `os.getenv` would
# otherwise hand back None, and e.g. `df.to_parquet(None)` returns bytes
# instead of writing a file, so the mistake would only surface much later.
_missing_env_vars = [
    name
    for name in ("DATA_INPUT_ABl", "RAW_OUTPUT_ABl", "PREP_OUTPUT_ABl")
    if not os.getenv(name)
]
if _missing_env_vars:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_env_vars)
    )

# Input location of the source XML files and the two parquet output targets.
DATA_INPUT_ABl = os.getenv("DATA_INPUT_ABl")
RAW_OUTPUT_ABl = os.getenv("RAW_OUTPUT_ABl")
PREP_OUTPUT_ABl = os.getenv("PREP_OUTPUT_ABl")

**Prepare data**

In [5]:
# Collect the XML input files for DATA_INPUT_ABl (project helper; presumably
# returns the list of paths that is reported as "files found" below).
file_paths = read_XML_files(DATA_INPUT_ABl)
# Parse the collected files into a DataFrame (project helper).
df = parse_XML_files(file_paths)
# Persist the raw parse result; the next cell reloads it from RAW_OUTPUT_ABl.
df.to_parquet(RAW_OUTPUT_ABl)

9,388 files found.


100%|██████████| 9388/9388 [02:02<00:00, 76.74it/s] 


In [6]:
# Reload the raw parse result that the previous cell wrote to disk.
df = pd.read_parquet(RAW_OUTPUT_ABl)

# TODO: Here the correct URL of ZSZH has to be set.
link_prolog = "https://www.zentraleserien.zh.ch/abl/"

# One link per document: URL prefix + filename with ".xml" removed.
# Vectorized string ops replace the row-wise apply; regex=False keeps the "."
# literal, which matches the semantics of plain `str.replace`.
df["new_link"] = link_prolog + df["filename"].str.replace(".xml", "", regex=False)

# Every document must map to its own link.
assert df["new_link"].nunique() == len(df), "new_link values are not unique"

df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9388 entries, 0 to 9387
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   path       9388 non-null   object
 1   date_when  9388 non-null   object
 2   date_from  0 non-null      object
 3   date_to    0 non-null      object
 4   date_text  9388 non-null   object
 5   ident      9388 non-null   object
 6   ref        9388 non-null   object
 7   title      9388 non-null   object
 8   text       9388 non-null   object
 9   filename   9388 non-null   object
 10  new_link   9388 non-null   object
dtypes: object(11)
memory usage: 87.7 MB


In [7]:
# Transposed preview of three random rows. The fixed random_state keeps the
# displayed sample identical across "Restart & Run All" executions.
df.sample(3, random_state=0).T

Unnamed: 0,2312,5237,4092
path,_01_data-input/ABl/08_XML_Segmentiert_mit_Scop...,_01_data-input/ABl/08_XML_Segmentiert_mit_Scop...,_01_data-input/ABl/08_XML_Segmentiert_mit_Scop...
date_when,1993-11-05,1985-03-29,1988-02-05
date_from,,,
date_to,,,
date_text,05.11.1993,29.03.1985,05.02.1988
ident,StAZH ABl 1993 (S. 1547-1556),StAZH ABl 1985 (S. 570-573),StAZH ABl 1988 (S. 300-301)
ref,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...
title,DER KANTONSRAT DES EIDGENÖSSISCHEN\n ...,2669 Bericht und Antrag des\n Regie...,[Aus den Verhandlungen des Regierungsrates] vo...
text,\nVerhandlungsgegenstände\n\t\t\t\t\n\n\n1.\n\...,\nDer Kantonsrat hat am 5. Dezember\n ...,\nDer Regierungsrat unterbreitet dem Kantonsra...
filename,ABl_1993__S__1547-1556_.xml,ABl_1985__S__570-573_.xml,ABl_1988__S__300-301_.xml


In [8]:
# Fix missing dates: Feature `date_when` is crucial. If this feature is missing, we agreed to fill in with the date from `date_from`.
# Add missing month and day to dates: Some dates are missing the month and day.
# Clean identifiers: Several ident values contain line breaks and multiple spaces. We agreed to remove these programmatically.
#
# `year` and `identifier` are passed to `assign` as callables so that they are
# computed from the frame produced by the preceding chain steps. Referencing
# the outer `df` inside the chain would read the frame bound *before* the chain
# started (where `date_when` is still an object column) and would only work if
# the helper functions happened to modify that frame in place.

df = (
    df.pipe(fix_missing_dates)
    .pipe(add_missing_month_and_day_to_dates)
    .pipe(parse_dates)
    .pipe(clean_identifiers)
    .assign(
        year=lambda frame: frame["date_when"].dt.year,
        identifier=lambda frame: frame.index.astype(int),
    )
    .rename(
        columns={
            "new_link": "link",  # TODO: Rename here (new_link : link)
            "date_when": "date",
            "ident": "stazh_ident",
        }
    )
)

# Sanity checks.
# 1) No document may be left without a date.
assert not df["date"].isna().any()
# 2) Re-parsing the dates with the YYYY-MM-DD format must not yield any NaT.
tmp = pd.to_datetime(df["date"], format="%Y-%m-%d", errors="coerce")
assert tmp.notna().all()
# 3) The cleaned archive identifiers must be free of line breaks.
assert df["stazh_ident"].str.contains("\n").sum() == 0
# 4) The generated identifier must be distinct for every row.
assert len(df) == df["identifier"].nunique()
# Show the covered date range.
print(df["date"].min(), df["date"].max())

1980-01-11 00:00:00 2001-12-21 00:00:00


In [9]:
# Prefix the identifiers with the collection tag. Values that already carry
# the prefix are left untouched, so re-running this cell cannot produce
# "abl_abl_<n>" identifiers and write them to disk.
df["identifier"] = [
    value if str(value).startswith("abl_") else "abl_" + str(value)
    for value in df["identifier"]
]

# Generic text cleaning (project helper, applied in parallel via pandarallel).
df["title"] = df["title"].parallel_map(generic_text_cleaning)
df["text"] = df["text"].parallel_map(generic_text_cleaning)

# Reduce to relevant columns and save to disk.
cols = ["identifier", "date", "year", "title", "text", "link", "stazh_ident", "ref"]
df = df[cols]
df.to_parquet(PREP_OUTPUT_ABl)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9388 entries, 0 to 9387
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   identifier   9388 non-null   object        
 1   date         9388 non-null   datetime64[ns]
 2   year         9388 non-null   int32         
 3   title        9388 non-null   object        
 4   text         9388 non-null   object        
 5   link         9388 non-null   object        
 6   stazh_ident  9388 non-null   object        
 7   ref          9388 non-null   object        
dtypes: datetime64[ns](1), int32(1), object(6)
memory usage: 133.4 MB
