# Process Regierungsratsbeschlüsse


**Imports**


In [1]:
import os
import pandas as pd
from pandarallel import pandarallel
from dotenv import load_dotenv

# Notebook-wide pandas settings.
pd.set_option("mode.chained_assignment", None)  # disable chained-assignment warnings
pd.set_option("display.max_rows", 500)  # print up to 500 rows before truncating
pd.set_option("display.max_seq_items", 500)  # print up to 500 items of a sequence
# Initialise pandarallel with progress bars disabled; `parallel_map` is used
# further down for the text cleaning of the title and text columns.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
from staatsarchiv_utils import read_XML_files
from staatsarchiv_utils import parse_XML_files
from staatsarchiv_utils import fix_missing_dates
from staatsarchiv_utils import fix_incomplete_dates
from staatsarchiv_utils import parse_dates
from staatsarchiv_utils import clean_identifiers
from staatsarchiv_utils import generic_text_cleaning

In [3]:
# Load variables from a .env file (if present) into the process environment.
load_dotenv()

# Input location of the XML files and the two parquet output paths.
DATA_INPUT_RRB = os.getenv("DATA_INPUT_RRB")
RAW_OUTPUT_RRB = os.getenv("RAW_OUTPUT_RRB")
PREP_OUTPUT_RRB = os.getenv("PREP_OUTPUT_RRB")

# Fail fast on missing configuration. With a path of None, `DataFrame.to_parquet`
# returns the serialized bytes instead of writing a file, so the result of the
# slow parsing step would be discarded without any error.
_missing_env = [
    name
    for name in ("DATA_INPUT_RRB", "RAW_OUTPUT_RRB", "PREP_OUTPUT_RRB")
    if not os.getenv(name)
]
if _missing_env:
    raise RuntimeError(
        "Missing environment variable(s): " + ", ".join(_missing_env)
    )

**Prepare data**


In [4]:
# Collect the XML file paths below DATA_INPUT_RRB (called with remove_memberlists=True).
file_paths = read_XML_files(DATA_INPUT_RRB, remove_memberlists=True)
# Parse all collected files into one DataFrame; this is the slow step of the notebook.
df = parse_XML_files(file_paths)  # 9min
# Persist the raw parse result; the next cell reloads it from this parquet file.
df.to_parquet(RAW_OUTPUT_RRB)

479,146 files found.


100%|██████████| 479146/479146 [06:36<00:00, 1208.74it/s]


In [5]:
# Start from the raw parse result written to RAW_OUTPUT_RRB by the previous cell.
df = pd.read_parquet(RAW_OUTPUT_RRB)

# TODO: Here the correct URL of ZSZH has to be set.
link_prolog = "https://www.zentraleserien.zh.ch/rrb/"
# Build the link from the file name. Only the trailing ".xml" extension is
# stripped (a plain replace would also remove ".xml" inside the name), and the
# vectorised string method avoids calling a Python lambda for every row.
df["new_link"] = link_prolog + df.filename.str.removesuffix(".xml")
# Every document must get its own link.
assert df.new_link.nunique() == len(df), "new_link is not unique per document"

df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479146 entries, 0 to 479145
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   path       479146 non-null  object
 1   date_when  479144 non-null  object
 2   date_from  2 non-null       object
 3   date_to    2 non-null       object
 4   date_text  479146 non-null  object
 5   ident      479146 non-null  object
 6   ref        479146 non-null  object
 7   title      479146 non-null  object
 8   text       479146 non-null  object
 9   filename   479146 non-null  object
 10  new_link   479146 non-null  object
dtypes: object(11)
memory usage: 1.7 GB


In [6]:
# Spot-check four randomly drawn rows (no random_state is set, so the rows differ between runs).
df.sample(4)

Unnamed: 0,path,date_when,date_from,date_to,date_text,ident,ref,title,text,filename,new_link
302368,_01_data-input/RRB/TKR_RRB_1903_1995_OCR _XML_...,1969-11-27,,,27.11.1969,StAZH MM 3.127 RRB 1969/5313,https://suche.staatsarchiv.djiktzh.ch/detail.a...,Nationalstrassen.,In der Gemeinde Birmensdorf ist der Bau der N ...,MM_3_127_RRB_1969_5313_t.xml,https://www.zentraleserien.zh.ch/rrb/MM_3_127_...
156071,_01_data-input/RRB/TKR_RRB_1903_1995_OCR _XML_...,1963-02-07,,,07.02.1963,StAZH MM 3.107 RRB 1963/0469,https://suche.staatsarchiv.djiktzh.ch/detail.a...,Amt für Luftverkehr (Diensttelefone).,Mit Beschlüssen Nr. 3367 vom 23. Juli 1959 und...,MM_3_107_RRB_1963_0469_t.xml,https://www.zentraleserien.zh.ch/rrb/MM_3_107_...
447073,_01_data-input/RRB/TKR_RRB_Transkripte_XML_NER...,1818-10-13,,,13.10.1818,StAZH MM 1.68 RRB 1818/0874,https://suche.staatsarchiv.djiktzh.ch/detail.a...,"Dem Schmid Jacob Bodmer wird bewilligt, sein e...","Es haben UHHerren und Obern, nach Anhörung ein...",MM_1_68_RRB_1818_0874.xml,https://www.zentraleserien.zh.ch/rrb/MM_1_68_R...
430544,_01_data-input/RRB/TKR_RRB_Transkripte_XML_NER...,1808-11-01,,,01.11.1808,StAZH MM 1.27 RRB 1808/1237,https://suche.staatsarchiv.djiktzh.ch/detail.a...,Veränderung in der Canzleydirection des Canton...,Da der Lob[liche] Stand AppenzellInner Rhoden ...,MM_1_27_RRB_1808_1237.xml,https://www.zentraleserien.zh.ch/rrb/MM_1_27_R...


In [7]:
# Fix missing dates: `date_when` is crucial. Where it is missing, we agreed to fill it in with the date from `date_from`.
# Fix incomplete dates: More than 100 dates cannot be parsed because the day is missing. We therefore use the first day of the time range as the day.
# Clean identifiers: Several `ident` values contain line breaks and multiple spaces. We agreed to remove these programmatically.

df = (
    df.pipe(fix_missing_dates)
    .pipe(fix_incomplete_dates)
    .pipe(parse_dates)
    .pipe(clean_identifiers)
)

# Sanity checks: every row has a date, every date survives a strict re-parse,
# and no identifier contains a line break any more.
assert df.date_when.notna().all()
assert (
    pd.to_datetime(df.date_when, errors="coerce", format="%Y-%m-%d").notna().all()
)
assert not df.ident.str.contains("\n", na=False).any()
print(df.date_when.min(), df.date_when.max())

1803-04-25 00:00:00 1995-12-20 00:00:00


In [8]:
# Derive the year and an "rrb_<index value>" identifier for every row.
df["year"] = df["date_when"].dt.year
df["identifier"] = [f"rrb_{row_number}" for row_number in df.index.astype(int)]

# Switch to the final column names in a single call.
# TODO: Rename here (new_link : link)
df.rename(
    columns={"new_link": "link", "date_when": "date", "ident": "stazh_ident"},
    inplace=True,
)

# Generic text cleaning of both free-text columns, run in parallel.
for text_column in ("title", "text"):
    df[text_column] = df[text_column].parallel_map(generic_text_cleaning)

# Reduce to relevant columns and save to disk.
cols = [
    "identifier",
    "date",
    "year",
    "title",
    "text",
    "link",
    "stazh_ident",
    "ref",
]
df = df.loc[:, cols]
df.to_parquet(PREP_OUTPUT_RRB)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479146 entries, 0 to 479145
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   identifier   479146 non-null  object        
 1   date         479146 non-null  datetime64[ns]
 2   year         479146 non-null  int32         
 3   title        479146 non-null  object        
 4   text         479146 non-null  object        
 5   link         479146 non-null  object        
 6   stazh_ident  479146 non-null  object        
 7   ref          479146 non-null  object        
dtypes: datetime64[ns](1), int32(1), object(6)
memory usage: 2.6 GB
