# Process Regierungsratsbeschlüsse

**Imports**

In [2]:
import os
import pandas as pd
from pandarallel import pandarallel
from dotenv import load_dotenv

# Notebook-wide pandas settings: disable the chained-assignment warning and
# allow up to 500 rows / sequence items to be displayed before truncation.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_seq_items", 500)

# Initialise pandarallel (needed for the `parallel_map` calls further down),
# with progress bars switched off.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
from staatsarchiv_utils import read_XML_files
from staatsarchiv_utils import parse_XML_files
from staatsarchiv_utils import fix_missing_dates
from staatsarchiv_utils import fix_incomplete_dates
from staatsarchiv_utils import parse_dates
from staatsarchiv_utils import clean_identifiers
from staatsarchiv_utils import generic_text_cleaning

In [4]:
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to look up.

    Returns:
        The non-empty value of the variable.

    Raises:
        RuntimeError: If the variable is unset or empty. Failing here gives a
            clear message instead of an obscure error later, when ``None``
            would be passed on as a path.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set. "
            "Define it in the .env file or in the shell environment."
        )
    return value


# Input location handed to `read_XML_files`.
DATA_INPUT_RRB = _require_env("DATA_INPUT_RRB")
# Parquet target for the raw, unprocessed parse result.
RAW_OUTPUT_RRB = _require_env("RAW_OUTPUT_RRB")
# Parquet target for the cleaned, final data set.
PREP_OUTPUT_RRB = _require_env("PREP_OUTPUT_RRB")

**Prepare data**

In [5]:
# Gather the input files from DATA_INPUT_RRB (called with
# `remove_memberlists=True`), parse them into a single DataFrame and persist
# the raw result as parquet, which the next cell reads back from disk.
# NOTE(review): `read_XML_files` presumably returns file paths, going by the
# variable name - confirm in staatsarchiv_utils.
file_paths = read_XML_files(DATA_INPUT_RRB, remove_memberlists=True)
df = parse_XML_files(file_paths)  # slow: roughly 9-14 min for ~479k files
df.to_parquet(RAW_OUTPUT_RRB)

479,146 files found.


100%|██████████| 479146/479146 [13:42<00:00, 582.73it/s] 


In [6]:
# Reload the raw parse result from disk.
df = pd.read_parquet(RAW_OUTPUT_RRB)

# TODO: Here the correct URL of ZSZH has to be set.
link_prolog = "https://www.zentraleserien.zh.ch/rrb/"

# Build the link from the file name. Only a trailing ".xml" extension is
# stripped (vectorised), so a ".xml" occurring elsewhere in a name stays intact.
df["new_link"] = link_prolog + df.filename.str.removesuffix(".xml")

# Every record needs its own, non-missing link; `nunique` ignores NaN, so this
# check also fails if any link could not be built.
assert df.new_link.nunique() == len(df), "Links built from file names are not unique."

df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479146 entries, 0 to 479145
Data columns (total 11 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   path       479146 non-null  object
 1   date_when  479144 non-null  object
 2   date_from  2 non-null       object
 3   date_to    2 non-null       object
 4   date_text  479146 non-null  object
 5   ident      479146 non-null  object
 6   ref        479146 non-null  object
 7   title      479146 non-null  object
 8   text       479146 non-null  object
 9   filename   479146 non-null  object
 10  new_link   479146 non-null  object
dtypes: object(11)
memory usage: 1.8 GB


In [7]:
# Spot-check a few records. The fixed seed keeps the displayed sample stable
# when the notebook is re-run from a fresh kernel.
df.sample(4, random_state=42)

Unnamed: 0,path,date_when,date_from,date_to,date_text,ident,ref,title,text,filename,new_link
469657,_01_data-input/RRB/TKR_RRB_Transkripte_XML_NER...,1848-04-05,,,05.04.1848,StAZH MM 2.100 RRB 1848/0476,https://suche.staatsarchiv.djiktzh.ch/detail.a...,Note der preuß. Gesandtschaft betr. angebliche...,Von den nachstehenden Schreiben ist lediglich ...,MM_2_100_RRB_1848_0476.xml,https://www.zentraleserien.zh.ch/rrb/MM_2_100_...
388669,_01_data-input/RRB/TKR_RRB_Transkripte_XML_NER...,1881-03-19,,,19.03.1881,StAZH MM 2.231 RRB 1881/0488,https://suche.staatsarchiv.djiktzh.ch/detail.a...,Kantonalbank; Benutzung des Amtsblattes.,"Der Regierungsrath hat, in Sachen der Zürcher ...",MM_2_231_RRB_1881_0488.xml,https://www.zentraleserien.zh.ch/rrb/MM_2_231_...
246290,_01_data-input/RRB/TKR_RRB_1903_1995_OCR _XML_...,1984-08-08,,,08.08.1984,StAZH MM 3.172 RRB 1984/2999,https://suche.staatsarchiv.djiktzh.ch/detail.a...,"Universität Zürich-Irchel, zweite Bauetappe.",Mit Kantonsratsbeschluss vom 7. November 1977 ...,MM_3_172_RRB_1984_2999_t.xml,https://www.zentraleserien.zh.ch/rrb/MM_3_172_...
375461,_01_data-input/RRB/TKR_RRB_Transkripte_XML_NER...,1833-02-09,,,09.02.1833,StAZH MM 2.10 RRB 1833/0231,https://suche.staatsarchiv.djiktzh.ch/detail.a...,Bestätiung mehrerer von dem Erziehungsrathe ge...,Nach Anhörung eines von dem Erziehungsrathe mi...,MM_2_10_RRB_1833_0231.xml,https://www.zentraleserien.zh.ch/rrb/MM_2_10_R...


In [8]:
# Repairs applied below, in this order:
# - Missing dates: `date_when` is crucial. Where it is missing, we agreed to
#   fill it in with the date from `date_from`.
# - Incomplete dates: more than 100 dates can't be parsed because the day is
#   missing. We therefore add the first day of the time range as the day.
# - Identifiers: several `ident` values contain line breaks and multiple
#   spaces. We agreed to remove these programmatically.
df = (
    df.pipe(fix_missing_dates)
    .pipe(fix_incomplete_dates)
    .pipe(parse_dates)
    .pipe(clean_identifiers)
)

# Sanity checks: no date is missing, every date survives a re-parse with the
# ISO format, and no identifier contains a line break any more.
assert not df.date_when.isna().any()
assert (
    pd.to_datetime(df.date_when, errors="coerce", format="%Y-%m-%d")
    .notna()
    .all()
)
assert df.ident.str.contains("\n").sum() == 0

# Show the covered date range.
print(df.date_when.min(), df.date_when.max())

1803-04-25 00:00:00 1995-12-20 00:00:00


In [9]:
# Derive the year from `date_when` and build the record id "rrb_<row index>".
df["year"] = df.date_when.dt.year
df["identifier"] = "rrb_" + df.index.astype(int).astype(str)

# Switch to the final column names in one pass.
# TODO (carried over): `new_link` is renamed to `link` here.
df = df.rename(
    columns={"new_link": "link", "date_when": "date", "ident": "stazh_ident"}
)

# Generic text cleaning, run in parallel via pandarallel.
df["title"] = df["title"].parallel_map(generic_text_cleaning)
df["text"] = df["text"].parallel_map(generic_text_cleaning)

# Keep only the relevant columns, write the result to disk and report the
# final schema and memory footprint.
cols = ["identifier", "date", "year", "title", "text", "link", "stazh_ident", "ref"]
df = df[cols]
df.to_parquet(PREP_OUTPUT_RRB)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479146 entries, 0 to 479145
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   identifier   479146 non-null  object        
 1   date         479146 non-null  datetime64[ns]
 2   year         479146 non-null  int32         
 3   title        479146 non-null  object        
 4   text         479146 non-null  object        
 5   link         479146 non-null  object        
 6   stazh_ident  479146 non-null  object        
 7   ref          479146 non-null  object        
dtypes: datetime64[ns](1), int32(1), object(6)
memory usage: 2.6 GB
