# Process Kantonsratsprotokolle

**Imports**

In [1]:
import os
import pandas as pd
from pandarallel import pandarallel
from dotenv import load_dotenv

# Notebook-wide settings.
# Turn off pandas' chained-assignment warning for this notebook.
pd.set_option("mode.chained_assignment", None)
# Let pandas display up to 500 rows / sequence items before truncating.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_seq_items", 500)
# Set up pandarallel (used for the `parallel_map` calls), without progress bars.
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
from staatsarchiv_utils import read_XML_files
from staatsarchiv_utils import parse_XML_files
from staatsarchiv_utils import fix_missing_dates
from staatsarchiv_utils import fix_incomplete_dates
from staatsarchiv_utils import parse_dates
from staatsarchiv_utils import clean_identifiers
from staatsarchiv_utils import generic_text_cleaning

In [3]:
# Load variables from a `.env` file (if one is found) into the environment.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: If the variable is unset or empty. Failing here gives a
            clear message instead of an obscure error further down: with a
            path of None, `DataFrame.to_parquet` returns the serialized bytes
            instead of writing a file, so a missing output path would go
            unnoticed until a later cell tries to read the data back.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set. "
            "Define it in the environment or in the .env file."
        )
    return value


# Location handed to `read_XML_files` to find the XML input.
DATA_INPUT_KRP = _require_env("DATA_INPUT_KRP")
# Parquet path for the raw, unprocessed parse result.
RAW_OUTPUT_KRP = _require_env("RAW_OUTPUT_KRP")
# Parquet path for the cleaned data set.
PREP_OUTPUT_KRP = _require_env("PREP_OUTPUT_KRP")

**Prepare data**

In [4]:
# Step 1: look up the XML input files via DATA_INPUT_KRP.
file_paths = read_XML_files(DATA_INPUT_KRP)
# Step 2: parse the files into a single DataFrame.
df = parse_XML_files(file_paths)
# Step 3: persist the unprocessed result so later cells can start from disk.
df.to_parquet(path=RAW_OUTPUT_KRP)

44,418 files found.


100%|██████████| 44418/44418 [01:42<00:00, 435.46it/s]


In [5]:
# Start from the raw parse result stored at RAW_OUTPUT_KRP.
df = pd.read_parquet(RAW_OUTPUT_KRP)

# Sanity check that we haven't imported duplicated data.
assert not df.duplicated().any(), "Raw data contains duplicated rows."

print(f"{len(df):,} documents in data set.\n")

# TODO: Here the correct url of zszh has to be set
link_prolog = "https://www.zentraleserien.zh.ch/krp/"
# Build the document link from the file name. Only a trailing ".xml" is
# stripped; a plain `.replace(".xml", "")` would also remove the sequence from
# the middle of a file name.
df["new_link"] = link_prolog + df.filename.str.removesuffix(".xml")
# Every document must end up with its own link.
assert df.new_link.nunique() == len(df), "Document links are not unique."

df.info(memory_usage="deep")

44,418 documents in data set.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44418 entries, 0 to 44417
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   path       44418 non-null  object
 1   date_when  44321 non-null  object
 2   date_from  97 non-null     object
 3   date_to    97 non-null     object
 4   date_text  44418 non-null  object
 5   ident      44418 non-null  object
 6   ref        44418 non-null  object
 7   title      44418 non-null  object
 8   text       44418 non-null  object
 9   filename   44418 non-null  object
 10  new_link   44418 non-null  object
dtypes: object(11)
memory usage: 874.6 MB


In [6]:
# Cleaning steps, applied in this order:
# 1. fix_missing_dates: Feature `date_when` is crucial. Where it is missing, we
#    agreed to fill it in with the date from `date_from`.
# 2. fix_incomplete_dates: More than 100 dates can't be parsed because the day
#    is missing. The first day of the time range is therefore added as the day.
# 3. parse_dates: presumably converts `date_when` to datetimes (the column is
#    used with `.dt` afterwards) - TODO confirm in staatsarchiv_utils.
# 4. clean_identifiers: Several `ident` values contain line breaks and multiple
#    spaces. We agreed to remove these programmatically.
df = (
    df
    .pipe(fix_missing_dates)
    .pipe(fix_incomplete_dates)
    .pipe(parse_dates)
    .pipe(clean_identifiers)
)

# Sanity checks.
# Every document has a date ...
assert df.date_when.notna().all()
# ... that is still valid when coerced with the expected format ...
reparsed_dates = pd.to_datetime(df.date_when, format="%Y-%m-%d", errors="coerce")
assert not reparsed_dates.isna().any()
# ... and no identifier contains a line break any more.
assert not df.ident.str.contains("\n", na=False).any()
# Show the covered time span.
print(f"{df.date_when.min()} {df.date_when.max()}")

1803-04-18 00:00:00 1995-04-10 00:00:00


In [7]:
# Add the derived columns and switch to the final column names in one pass.
# NOTE(review): identifiers are built from the row index, so they only stay
# stable while the row order of the raw data is unchanged - confirm that the
# file order is deterministic.
df = df.assign(
    year=df.date_when.dt.year,
    identifier=[f"krp_{position}" for position in df.index.astype(int)],
).rename(
    columns={
        "new_link": "link",  # link built earlier from the file name
        "date_when": "date",
        "ident": "stazh_ident",
    }
)

# Generic text cleaning, run in parallel on both free-text columns.
for text_column in ("title", "text"):
    df[text_column] = df[text_column].parallel_map(generic_text_cleaning)

# Reduce to relevant columns and save to disk.
cols = ["identifier", "date", "year", "title", "text", "link", "stazh_ident", "ref"]
df = df.loc[:, cols]
df.to_parquet(path=PREP_OUTPUT_KRP)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44418 entries, 0 to 44417
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   identifier   44418 non-null  object        
 1   date         44418 non-null  datetime64[ns]
 2   year         44418 non-null  int32         
 3   title        44418 non-null  object        
 4   text         44418 non-null  object        
 5   link         44418 non-null  object        
 6   stazh_ident  44418 non-null  object        
 7   ref          44418 non-null  object        
dtypes: datetime64[ns](1), int32(1), object(6)
memory usage: 843.6 MB
