# Index Data

**Imports**

In [2]:
import pandas as pd
from pandarallel import pandarallel
import os
from sentence_transformers import SentenceTransformer
import weaviate
from weaviate.classes.config import Property, DataType
import weaviate.classes as wvc
import weaviate.classes.config as wc

# Silence pandas' chained-assignment (SettingWithCopy) warning notebook-wide.
pd.options.mode.chained_assignment = None
# Let pandas display up to 500 rows / sequence items before truncating output.
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500
# Set up pandarallel (used later via `parallel_apply`) with a progress bar.
pandarallel.initialize(progress_bar=True)

# Suppress the Hugging Face warning about tokenizers parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from utils import chunk_text
import tiktoken

# Tokenizer used below to count the tokens of every document text.
enc = tiktoken.encoding_for_model("gpt-4o")

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


# Load Data

In [3]:
# Columns kept for the downstream steps ("ref" is renamed to "link" below).
cols = ["identifier", "date", "title", "ref", "text", "token_count"]

# Read the raw documents, count tokens per text with `enc`, drop very long
# documents (more than 5,000 tokens) for simplicity, and tidy up the frame.
df = (
    pd.read_parquet("_data/01_KRP_1994.parq")
    .assign(
        token_count=lambda frame: frame["text"].map(
            lambda text: len(enc.encode(text))
        )
    )
    .loc[lambda frame: frame["token_count"] <= 5_000, cols]
    .rename(columns={"ref": "link"})
    .reset_index(drop=True)
)

df.info(memory_usage="deep")
# Persist the selection so that later sections can start from this file.
df.to_parquet("_data/02_KRP_selec.parq", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 333 entries, 0 to 332
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   identifier   333 non-null    object        
 1   date         333 non-null    datetime64[ns]
 2   title        333 non-null    object        
 3   link         333 non-null    object        
 4   text         333 non-null    object        
 5   token_count  333 non-null    int64         
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 5.4 MB


# Chunk Documents

In [4]:
# Reload the filtered documents written at the end of the "Load Data" step.
df = pd.read_parquet("_data/02_KRP_selec.parq")

In [5]:
# We shuffle the dataframe to make sure that parallel processing is more efficient.
# The fixed seed makes the shuffle (and thus the worker partitioning) reproducible.
results = df.sample(frac=1, random_state=42).parallel_apply(
    chunk_text, max_token_count=1000, overlap_tokens=200, axis=1
)

# `chunk_text` lives in the local `utils` module; each result is assumed to be a
# list of (identifier, chunk_text) pairs — TODO confirm against utils.chunk_text.
df_chunks = pd.DataFrame(
    [pair for pairs in results.tolist() for pair in pairs],
    columns=["identifier", "chunk_text"],
)

# Re-attach the document metadata to every chunk. Note that `token_count` still
# holds the token count of the whole source document, not of the single chunk.
df_chunks = pd.merge(df.drop(columns=["text"]), df_chunks, on="identifier")

df_chunks.info(memory_usage="deep")
df_chunks.to_parquet("_data/03_KRP_chunks.parq")

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=34), Label(value='0 / 34'))), HBox…

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 867 entries, 0 to 866
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   identifier   867 non-null    object        
 1   date         867 non-null    datetime64[ns]
 2   title        867 non-null    object        
 3   link         867 non-null    object        
 4   token_count  867 non-null    int64         
 5   chunk_text   867 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(4)
memory usage: 4.2 MB


# Embed Documents

In [6]:
# Reload the chunked documents written at the end of the "Chunk Documents" step.
df = pd.read_parquet("_data/03_KRP_chunks.parq")

In [None]:
# Load the embedding model.
# `device`: use "cuda" for CUDA GPUs, "mps" for Mac, "cpu" for CPU.
model_path = "jinaai/jina-embeddings-v2-base-de"
model = SentenceTransformer(model_path, device="mps", trust_remote_code=True)

# Report the model's default limit, then cap the sequence length at 1200 tokens.
print("Max Sequence Length:", model.max_seq_length)
model.max_seq_length = 1200

Max Sequence Length: 8192


In [None]:
# Encode every chunk text in batches of 16; embeddings are normalized and
# returned without conversion to a tensor.
chunk_texts = df["chunk_text"].values
embeddings = model.encode(
    chunk_texts,
    batch_size=16,
    show_progress_bar=True,
    normalize_embeddings=True,
    convert_to_tensor=False,
)

In [9]:
# Attach one embedding per chunk row and persist the result for the indexing step.
df = df.assign(embeddings=list(embeddings))
df.to_parquet("_data/04_KRP_embed.parq")

# Index Data

In [10]:
# Reload the embedded chunks and make sure `date` is parsed as a datetime.
df = pd.read_parquet("_data/04_KRP_embed.parq").assign(
    date=lambda frame: pd.to_datetime(frame["date"], format="%Y-%m-%d")
)

In [None]:
# Start an embedded Weaviate instance that persists its data in `_weaviate_index/`.
# If that fails, fall back to connecting to an instance that is already listening
# on ports 8079 (HTTP) / 50050 (gRPC) — presumably an embedded instance started
# by another process; confirm for your setup.
try:
    client = weaviate.connect_to_embedded(persistence_data_path="_weaviate_index/")
    print("Connected to embedded")
except Exception as exc:
    # Catch `Exception` rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still propagate, and report the cause.
    print(f"Embedded start failed ({exc!r}); trying local instance instead.")
    client = weaviate.connect_to_local(
        port=8079,
        grpc_port=50050,
    )
    print("Connected to local")

print(client.is_live())
print(client.is_ready())

# If the collection already exists, delete it.
client.collections.delete("research_app")

In [12]:
# # Get the meta endpoint description of Weaviate.
# display(client.get_meta())

# Ping Weaviate's live and ready state; both calls are expected to print True
# before the collection is created.
print(client.is_live())
print(client.is_ready())

True
True


In [None]:
# BM25 settings for lexical search. Stopword handling can be tuned here as well
# (stopwords_additions / stopwords_preset / stopwords_removals).
bm25_settings = wc.Configure.inverted_index(bm25_b=0.75, bm25_k1=1.2)

# All stored fields are plain text properties.
text_properties = [
    Property(name=field, data_type=DataType.TEXT)
    for field in ("identifier", "title", "text")
]

# No vectorizer is configured: vectors are computed in this notebook and passed
# in explicitly at ingestion time.
client.collections.create(
    "research_app",
    vectorizer_config=wc.Configure.Vectorizer.none(),
    inverted_index_config=bm25_settings,
    properties=text_properties,
)

In [14]:
# Print the name of every collection known to this Weaviate instance.
collection_names = [cfg.name for cfg in client.collections.list_all().values()]
for name in collection_names:
    print(name)

# # For detailed information about all collections use:
# schema = client.collections.list_all(simple=False)
# print(schema)

Research_app


In [None]:
# # Delete collection
# client.collections.delete("Research_app")

In [15]:
# Ingest data: one Weaviate object per chunk, with its precomputed embedding
# supplied as the vector.
collection = client.collections.get("Research_app")

with collection.batch.dynamic() as batch:
    for record in df.to_dict(orient="records"):
        batch.add_object(
            properties={
                "identifier": record["identifier"],
                "title": record["title"],
                "text": record["chunk_text"],
            },
            # Convert the stored array to a plain list of floats for the client.
            vector=record["embeddings"].tolist(),
        )

# The batch context manager does not raise on per-object errors, so surface
# them explicitly instead of silently ending up with an incomplete index.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(
        f"{len(failed_objects)} objects failed to import; "
        f"first error: {failed_objects[0].message}"
    )

In [16]:
# Count all objects in the collection as a sanity check after ingestion.
collection = client.collections.get("Research_app")
response = collection.aggregate.over_all(total_count=True)

total_count = response.total_count
print(total_count)

867


## Test Lexical Search

In [17]:
# Run a BM25 (keyword) query against the collection.
collection = client.collections.get("Research_app")
response = collection.query.bm25(
    query="Was ist zu Steuerreformen entschieden worden?",
    # query_properties=["title"], # Define which fields to search over.
    offset=0,
    limit=100,
    auto_limit=4,
    return_metadata=wvc.query.MetadataQuery(score=True, distance=True, certainty=True),
    # Filter examples; they need a matching property (e.g. "year"), which the
    # schema created in this notebook does not define:
    # filters=wvc.query.Filter.by_property("year").equal(2012),
    # filters=wvc.query.Filter.by_property("year").less_than(2012),
)

# Several chunks can belong to the same document. Keep only the first (i.e.
# best-ranked) hit per document identifier; a set gives O(1) membership checks.
seen = set()
final_results = []

for item in response.objects:
    identifier = item.properties["identifier"]
    if identifier in seen:
        continue
    seen.add(identifier)
    final_results.append(item.properties["title"])

for elem in final_results:
    print(elem)

Einzelinitiative Odile Wild, Zürich, vom 9. Juni 1994 betreffend Gleichstellung von Mann und Frau im Steuergesetz KR-Nr. 193/1994
Dringliche Interpellation Christian Bretscher, Birmensdorf, und Franziska Troesch-Schnyder, Zollikon, vom 28. März 1994 betreffend Schutz der öffentlichen Sicherheit im Zusammenhang mit bedingten Entlassungen und Urlaubsgewährungen schriftlich begründet KR-Nr. 98/1994, RRB-Nr. 1140/20. 4. 1994
Einzelinitiative Julia Anderegg, Zürich, vom 30. August 1993 betreffend Änderung des Tierschutzgesetzes KR-Nr. 251/1993
Einzelinitiative Roland Bachmann, Horgen, vom 3. Juni 1992 betreffend Änderung des Gesetzes über das Vorschlagsrecht des Volkes Anhebung des Quorums für Einzelinitiativen Bericht und Antrag des Regierungsrates vom 15. Dezember 1993 und geänderter Antrag der Kommission vom 27. Januar 1994 3358a
Mitteilungen
Postulat Franziska Frey-Wettstein, Zürich, und Mitunterzeichnende vom 21. September 1992 betreffend kontrollierte Opiatabgabe schriftlich begründet

## Test Hybrid Search

In [18]:
# Load the same embedding model and sequence-length cap used for the chunks, so
# that this section also works when the embedding cells above were skipped.
# `device`: use "cuda" for CUDA GPUs, "mps" for Mac, "cpu" for CPU.
model_path = "jinaai/jina-embeddings-v2-base-de"
model = SentenceTransformer(model_path, device="mps", trust_remote_code=True)
model.max_seq_length = 1200


def embed_query(query: str):
    """Embed a search query with the notebook-level `model`.

    Args:
        query: The query text to encode.

    Returns:
        The normalized embedding produced by `model.encode` (not converted to a
        tensor; presumably a NumPy array — confirm against the
        sentence-transformers version in use).
    """
    return model.encode(query, convert_to_tensor=False, normalize_embeddings=True)

In [19]:
query = "Was ist zu Steuerreformen entschieden worden?"
query_embedding = embed_query(query)

# Hybrid search combines the keyword query with the query vector;
# `alpha=0.7` is the weighting between the two signals.
collection = client.collections.get("Research_app")
response = collection.query.hybrid(
    query=query,
    # `.tolist()` yields plain Python floats, consistent with how the vectors
    # were passed at ingestion (`list(...)` would yield NumPy scalars).
    vector=query_embedding.tolist(),
    limit=20,
    auto_limit=10,
    alpha=0.7,
    fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
)

# Several chunks can belong to the same document. Keep only the first (i.e.
# best-ranked) hit per document identifier; a set gives O(1) membership checks.
seen = set()
final_results = []

for item in response.objects:
    identifier = item.properties["identifier"]
    if identifier in seen:
        continue
    seen.add(identifier)
    final_results.append(item.properties["title"])

for elem in final_results:
    print(elem)

Verhandlungsgegenstände Geschäftsordnung
Einzelinitiative Odile Wild, Zürich, vom 9. Juni 1994 betreffend Gleichstellung von Mann und Frau im Steuergesetz KR-Nr. 193/1994
Einzelinitiative Hans Heusser, Zürich, vom 26. April 1994 betreffend Änderung von Paragraph 31 des Steuergesetzes KR-Nr. 132/1994
Verordnung über die Quellensteuer für ausländische Arbeitnehmer Quellensteuerverordnung Antrag des Regierungsrates vom 2. Februar 1994 und gleichlautender Antrag der Kommission vom 5. Mai 1994 3374 Verordnung über die Quellensteuer für natürliche und juristische Personen ohne steuerrechtlichen Wohnsitz oder Aufenthalt in der Schweiz Quellensteuerverordnung Antrag des Regierungsrates vom 2. Februar 1994 und gleichlautender Antrag der Kommission vom 5. Mai 1994 3375
Statuten der Versicherungskasse für das Staatspersonal Änderung Antrag des Regierungsrates vom 24. November 1993 und gleichlautender Antrag der Kommission vom 23. Juni 1994 3352
Einzelinitiative Beat Müller, Zürich, vom 5. Juli 19