# Chunk and embed data

**Imports**

In [1]:
import pandas as pd
from pandarallel import pandarallel
import os
import re
from dotenv import load_dotenv

# Disable pandas' chained-assignment (SettingWithCopy) warning and raise the
# display limits so that up to 500 rows / sequence items are shown.
pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500
# Initialise pandarallel (provides `parallel_apply` used below) with progress bars.
pandarallel.initialize(progress_bar=True)

# Suppress Hugging Face warning about tokenizers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# NOTE(review): this import sits below the environment variable on purpose,
# presumably because the module loads a tokenizer at import time - confirm
# before moving it up to the other imports.
from staatsarchiv_utils import chunk_text

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [2]:
load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`.

    `os.getenv` silently returns None for unset variables, which would only
    surface later as a confusing error inside `pd.read_parquet` / `to_parquet`.
    Fail right here instead, naming the variable that is missing.

    Raises:
        RuntimeError: If the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name!r} is not set. "
            "Define it in your environment or in the .env file."
        )
    return value


# Input paths: one parquet file per document series.
PREP_OUTPUT_KRP = _require_env("PREP_OUTPUT_KRP")
PREP_OUTPUT_RRB = _require_env("PREP_OUTPUT_RRB")
PREP_OUTPUT_OS = _require_env("PREP_OUTPUT_OS")
PREP_OUTPUT_ABl = _require_env("PREP_OUTPUT_ABl")

# Output paths for the combined corpus, the chunked corpus and the embeddings.
DATA_OUTPUT_FULL = _require_env("DATA_OUTPUT_FULL")
DATA_OUTPUT_CHUNKS = _require_env("DATA_OUTPUT_CHUNKS")
DATA_EMBEDDINGS = _require_env("DATA_EMBEDDINGS")

# Prepare corpora

Load and concatenate data sets.

Data sets that we ingest here need to have the following properties:
- `identifier`: Unique identifier of the document. Consists of dataframe index prefixed with a signifier for the document series, e.g. `krp-`.
- `date`: Date of creation of the document.
- `year`: Creation year, derived from `date`.
- `title`: Cleaned title of document.
- `text`: Cleaned text of document.
- `link`: Link to the document on the search portal of Staatsarchiv or zh.ch.
- `stazh_ident`: Original identifier, derived from XML tag `ident`.

Additional properties:
- `series`: Document series the record belongs to (KRP, RRB, GSZH or Amtsblatt), stored as `krp`, `rrb`, `os` or `abl`.
- `word_count`: Number of words in text, used to filter out short or empty documents.
- `business_no`: Business number of RRBs. We don't have these for KRPs and GSZH yet.

In [4]:
# Series label -> path of the preprocessed parquet file for that series.
corpus_paths = {
    "krp": PREP_OUTPUT_KRP,
    "rrb": PREP_OUTPUT_RRB,
    "os": PREP_OUTPUT_OS,
    "abl": PREP_OUTPUT_ABl,
}

corpora = []
for series_label, corpus_path in corpus_paths.items():
    corpus = pd.read_parquet(corpus_path)
    # To get more semantic signal we add titles to texts.
    corpus["text"] = corpus["title"] + " " + corpus["text"]
    corpus["series"] = series_label
    corpora.append(corpus)

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(corpora, ignore_index=True)
# Release the per-series frames; only the combined frame is used from here on.
del corpora, corpus

# Add word count to filter out short documents in search app.
# Note that `text` already contains the title at this point.
# split() without an argument splits on any run of whitespace and ignores
# leading/trailing whitespace. split(" ") would count an empty string for every
# extra space, treat newlines/tabs as part of a word, and report 2 "words" for
# a document whose title and text are both empty.
df["word_count"] = df["text"].apply(lambda text: len(text.split()))

In [None]:
# Additional text cleaning for semantic search.
# Mostly remove digits and single characters that do not add meaning.
DIGITS = re.compile(r"\d+")
MULTIPLE_SPACES = re.compile(r"[\s]+")
# A run of two or more punctuation marks. Group 1 ends up holding the last mark
# of the run, so substituting r"\1" collapses the run to that single mark.
REPEATED_PUNCTUATION = re.compile(r"([!?.,;:•-]+){2,}")
# One or more consecutive single characters, each delimited by whitespace.
# The whole run is matched at once: the simpler r"\s.\s" consumes the trailing
# whitespace, so the next single character has no leading whitespace left to
# match and every second one in a run survives ("x a b c y" -> "x b y").
SINGLE_CHARACTERS = re.compile(r"\s(?:.\s)+")


def semantic_text_cleaning(d):
    """Strip tokens that carry little semantic signal from a document text.

    Steps, in order:
      1. Replace every run of digits with a space.
      2. Collapse every run of whitespace into a single space.
      3. Collapse runs of punctuation marks into the last mark of the run.
      4. Remove single characters that stand alone between whitespace
         (including runs of them, e.g. the ". ." left over from a date).
      5. Strip leading and trailing whitespace.

    Single characters at the very start or end of the text are not delimited by
    whitespace on both sides and are therefore kept.

    Args:
        d: The document text as a string.

    Returns:
        The cleaned text.
    """
    d = DIGITS.sub(" ", d)
    d = MULTIPLE_SPACES.sub(" ", d)
    d = REPEATED_PUNCTUATION.sub(r"\1", d)
    d = SINGLE_CHARACTERS.sub(" ", d)
    return d.strip()


# Clean every document text in parallel and keep the result next to the original text.
df["text_semantic"] = df["text"].parallel_apply(semantic_text_cleaning)

In [6]:
# Persist the combined corpus, including `text_semantic`, to DATA_OUTPUT_FULL.
df.to_parquet(DATA_OUTPUT_FULL)
# memory_usage="deep" inspects the object columns themselves, so the reported
# size reflects the actual strings rather than just the pointers.
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542479 entries, 0 to 542478
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   identifier     542479 non-null  object        
 1   date           542479 non-null  datetime64[ns]
 2   year           542479 non-null  int32         
 3   title          542479 non-null  object        
 4   text           542479 non-null  object        
 5   link           542479 non-null  object        
 6   stazh_ident    542479 non-null  object        
 7   ref            542479 non-null  object        
 8   series         542479 non-null  object        
 9   word_count     542479 non-null  int64         
 10  text_semantic  542479 non-null  object        
dtypes: datetime64[ns](1), int32(1), int64(1), object(8)
memory usage: 7.1 GB


# Chunk documents

In [7]:
# Reload the persisted corpus without the `text_semantic` column, which is not
# used for chunking below.
df = pd.read_parquet(DATA_OUTPUT_FULL).drop(columns=["text_semantic"])
# `series` has only a handful of distinct values; store it as a categorical.
df["series"] = df["series"].astype("category")
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 542479 entries, 0 to 542478
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   identifier   542479 non-null  object        
 1   date         542479 non-null  datetime64[ns]
 2   year         542479 non-null  int32         
 3   title        542479 non-null  object        
 4   text         542479 non-null  object        
 5   link         542479 non-null  object        
 6   stazh_ident  542479 non-null  object        
 7   ref          542479 non-null  object        
 8   series       542479 non-null  category      
 9   word_count   542479 non-null  int64         
dtypes: category(1), datetime64[ns](1), int32(1), int64(1), object(6)
memory usage: 2.3 GB


In [None]:
# Chunk every document in parallel. Rows are shuffled first (sample(frac=1)).
# NOTE(review): presumably the shuffle spreads long documents evenly across the
# workers - confirm. The final row order is re-established by the merge below.
results = df.sample(frac=1).parallel_apply(
    chunk_text, axis=1, max_token_count=500, overlap_tokens=100
)

# Each element of `results` is an iterable of (identifier, chunk_text) records
# for one document; flatten them into one long frame.
df_chunks = pd.DataFrame(
    [chunk for document_chunks in results.tolist() for chunk in document_chunks],
    columns=["identifier", "chunk_text"],
)

# Attach the document metadata to every chunk. The full text is dropped because
# each row now carries its own chunk of it.
df_chunks = df.drop(columns=["text"]).merge(df_chunks, on="identifier")

df_chunks.info(memory_usage="deep")
df_chunks.to_parquet(DATA_OUTPUT_CHUNKS)