# Hybrid Search with Weaviate

**Imports**

In [None]:
import os

import pandas as pd
import torch
import weaviate
import weaviate.classes as wvc
import weaviate.classes.config as wc
from dotenv import load_dotenv
from pandarallel import pandarallel
from sentence_transformers import SentenceTransformer
from weaviate.classes.config import Property, DataType

# pandas options: turn off the chained-assignment check and raise the display
# limits for rows and sequence items to 500.
pd.set_option("mode.chained_assignment", None)
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_seq_items", 500)

# Initialise pandarallel with its progress bar enabled.
pandarallel.initialize(progress_bar=True)

# Suppress the Hugging Face warning about tokenizers.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

**Constants**

In [3]:
# Load the variables of a local .env file into the process environment.
load_dotenv()

# Each constant mirrors the environment variable of the same name and is None
# when that variable is not set.
PREP_OUTPUT_KRP = os.environ.get("PREP_OUTPUT_KRP")
PREP_OUTPUT_RRB = os.environ.get("PREP_OUTPUT_RRB")
PREP_OUTPUT_GSZH = os.environ.get("PREP_OUTPUT_GSZH")

DATA_OUTPUT_FULL = os.environ.get("DATA_OUTPUT_FULL")
DATA_OUTPUT_CHUNKS = os.environ.get("DATA_OUTPUT_CHUNKS")
DATA_EMBEDDINGS = os.environ.get("DATA_EMBEDDINGS")

# Load data

In [4]:
# Read the parquet file referenced by DATA_EMBEDDINGS and prepare it in one chain:
#   * drop the stored `word_count` and `year` columns,
#   * parse `date` and localise it to UTC,
#   * re-derive `year` from the parsed date (it ends up as the last column).
df = (
    pd.read_parquet(DATA_EMBEDDINGS)
    .drop(columns=["word_count", "year"])
    .assign(date=lambda frame: pd.to_datetime(frame["date"]).dt.tz_localize("UTC"))
    .assign(year=lambda frame: frame["date"].dt.year)
)

In [5]:
# `DataFrame.info()` prints its report itself and returns None, so it is called
# directly; wrapping it in `display()` only adds a stray "None" to the output.
df.info(memory_usage="deep")

# Show ten randomly sampled rows, transposed so that each row becomes a column.
df.sample(10).T

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1196936 entries, 0 to 1196935
Data columns (total 10 columns):
 #   Column       Non-Null Count    Dtype              
---  ------       --------------    -----              
 0   identifier   1196936 non-null  object             
 1   date         1196936 non-null  datetime64[ns, UTC]
 2   title        1196936 non-null  object             
 3   link         1196936 non-null  object             
 4   stazh_ident  1196936 non-null  object             
 5   ref          1196936 non-null  object             
 6   series       1196936 non-null  category           
 7   chunk_text   1196936 non-null  object             
 8   embeddings   1196936 non-null  object             
 9   year         1196936 non-null  int32              
dtypes: category(1), datetime64[ns, UTC](1), int32(1), object(7)
memory usage: 3.2 GB


None

Unnamed: 0,42671,437429,1120181,163208,510832,10568,342425,538117,852668,47404
identifier,krp_6275,rrb_103572,os_1889,krp_22124,rrb_138494,krp_2043,rrb_59013,rrb_151139,rrb_300601,krp_6579
date,1981-02-16 00:00:00+00:00,1929-07-17 00:00:00+00:00,1910-11-09 00:00:00+00:00,1928-08-27 00:00:00+00:00,1956-04-26 00:00:00+00:00,1907-02-19 00:00:00+00:00,1969-07-03 00:00:00+00:00,1946-09-20 00:00:00+00:00,1900-02-22 00:00:00+00:00,1987-11-23 00:00:00+00:00
title,Gesetz über die Jugendhilfe Antrag des Regieru...,Augustfeier.,Statuten der Witwen- und Waisenstiftung für di...,Verhandlungsgegenstände.,Landwirtschaftliche Winterschule Wädenswil Gut...,"Gesetz betreffend die Wahlen und Abstimmungen,...",Strassen.,Kanalisation.,Vormundschaft.,"Interpellation Th. Isler FDP, Rüschlikon vom 3..."
link,https://www.zentraleserien.zh.ch/krp/MM_24_104...,https://www.zentraleserien.zh.ch/rrb/MM_3_43_R...,https://www.zentraleserien.zh.ch/os/OS_29__S__...,https://www.zentraleserien.zh.ch/krp/MM_24_56_...,https://www.zentraleserien.zh.ch/rrb/MM_3_93_R...,https://www.zentraleserien.zh.ch/krp/MM_24_47_...,https://www.zentraleserien.zh.ch/rrb/MM_3_126_...,https://www.zentraleserien.zh.ch/rrb/MM_3_73_R...,https://www.zentraleserien.zh.ch/rrb/MM_3_14_R...,https://www.zentraleserien.zh.ch/krp/MM_24_121...
stazh_ident,StAZH MM 24.104 KRP 1981/100/0002,StAZH MM 3.43 RRB 1929/1596,StAZH OS 29 (S. 4-8),StAZH MM 24.56 KRP 1928/083/0645,StAZH MM 3.93 RRB 1956/1338,StAZH MM 24.47 KRP 1907/050/0313,StAZH MM 3.126 RRB 1969/2969,StAZH MM 3.73 RRB 1946/2962,StAZH MM 3.14 RRB 1900/0335,StAZH MM 24.121 KRP 1987/030/0012
ref,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...,https://suche.staatsarchiv.djiktzh.ch/detail.a...
series,krp,rrb,os,krp,rrb,krp,rrb,rrb,rrb,krp
chunk_text,"Ein Aspekt, der eher als Begleitmusik zu betra...",Die von Ihnen genannten Städte Zürich und Wint...,Paragraph 7. Die Jahresprämie beträgt Fr. 134 ...,Verhandlungsgegenstände. Das Einladungsschreib...,"Je nach den Bodenverhältnissen, die sich währe...",Wir sollen in Berücksichtigung der Verhältniss...,"Dieser Betrag wird dem Baukonto Nr. 216, Diels...",Kanalisation. Am 9. August 1946 legte der Geme...,Vormundschaft. A. Am 12. September 1899 starb ...,"Die kilometerlangen Staus, die jeweils in den ..."
embeddings,"[0.008183597, 0.032262735, -0.032888006, 0.036...","[0.06320367, -0.030679815, -0.10145651, -0.005...","[0.034152642, -0.052723937, -0.022596657, 0.03...","[0.037741706, -0.0052157263, -0.059702527, 0.0...","[0.018297978, -0.033366762, -0.02372579, 0.005...","[0.04490772, -0.0054620416, -0.059707154, 0.00...","[-0.014549205, 0.04603853, -0.0035448235, 0.02...","[-0.033636056, 0.0109341275, -0.014067844, -0....","[0.009306606, -0.0066017574, -0.054891843, -0....","[0.018641187, 0.0025090538, -0.004500187, -0.0..."
year,1981,1929,1910,1928,1956,1907,1969,1946,1900,1987


# Weaviate

In [None]:
# Connect to an embedded Weaviate instance; `client` is used by all cells below.
# NOTE(review): none of the cells shown here calls `client.close()` — consider
# closing the client once you are done.
client = weaviate.connect_to_embedded()
# Use this line instead if Weaviate is already running, e.g. started by the Streamlit app.
# client = weaviate.connect_to_local(port=8079, grpc_port=50050)

In [7]:
# Fetch and show the description returned by Weaviate's meta endpoint.
server_meta = client.get_meta()
display(server_meta)

# Print Weaviate's live state first, then its ready state.
for probe in (client.is_live, client.is_ready):
    print(probe())

{'hostname': 'http://127.0.0.1:8079',
 'modules': {'generative-openai': {'documentationHref': 'https://platform.openai.com/docs/api-reference/completions',
   'name': 'Generative Search - OpenAI'},
  'qna-openai': {'documentationHref': 'https://platform.openai.com/docs/api-reference/completions',
   'name': 'OpenAI Question & Answering Module'},
  'ref2vec-centroid': {},
  'reranker-cohere': {'documentationHref': 'https://txt.cohere.com/rerank/',
   'name': 'Reranker - Cohere'},
  'text2vec-cohere': {'documentationHref': 'https://docs.cohere.ai/embedding-wiki/',
   'name': 'Cohere Module'},
  'text2vec-huggingface': {'documentationHref': 'https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task',
   'name': 'Hugging Face Module'},
  'text2vec-openai': {'documentationHref': 'https://platform.openai.com/docs/guides/embeddings/what-are-embeddings',
   'name': 'OpenAI Module'}},
 'version': '1.23.7'}

True
True


In [None]:
# Create the "stazh" collection only if it does not exist yet, so that this
# cell can be re-run against an instance that already holds the collection
# without raising an error.
if not client.collections.exists("stazh"):
    client.collections.create(
        "stazh",
        # No vectorizer module: the ingestion cell passes the vectors explicitly.
        vectorizer_config=wc.Configure.Vectorizer.none(),
        # BM25 parameters of the inverted index used for keyword search.
        inverted_index_config=wc.Configure.inverted_index(
            bm25_b=0.75,
            bm25_k1=1.2,
            # stopwords_additions=None,
            # stopwords_preset=None,
            # stopwords_removals=None,
        ),
        # One property per DataFrame column that is stored with each object.
        properties=[
            Property(name="identifier", data_type=DataType.TEXT),
            Property(name="date", data_type=DataType.DATE),
            Property(name="year", data_type=DataType.INT),
            Property(name="title", data_type=DataType.TEXT),
            Property(name="link", data_type=DataType.TEXT),
            Property(name="stazh_ident", data_type=DataType.TEXT),
            Property(name="series", data_type=DataType.TEXT),
            Property(name="chunk_text", data_type=DataType.TEXT),
            Property(name="ref", data_type=DataType.TEXT),
        ],
    )

In [None]:
# Print the name of every collection on this Weaviate instance.
collection_configs = client.collections.list_all()
for config in collection_configs.values():
    print(config.name)

# Print the detailed (non-simplified) configuration of all collections.
schema = client.collections.list_all(simple=False)
print(schema)

In [9]:
# Delete the collection (uncomment to run; this removes the collection and all of its objects).
# client.collections.delete("stazh")

In [None]:
# Ingest data
# DataFrame columns that are stored as object properties, in schema order.
PROPERTY_COLUMNS = [
    "identifier",
    "date",
    "year",
    "title",
    "link",
    "stazh_ident",
    "series",
    "chunk_text",
    "ref",
]

collection = client.collections.get("stazh")

with collection.batch.dynamic() as batch:
    for data in df.to_dict(orient="records"):
        batch.add_object(
            properties={column: data[column] for column in PROPERTY_COLUMNS},
            # The embeddings come from the DataFrame and are passed as a plain list.
            vector=data["embeddings"].tolist(),
        )

# The batch does not raise for individual objects that could not be imported,
# so report them explicitly instead of failing silently.
failed_objects = collection.batch.failed_objects
if failed_objects:
    print(f"{len(failed_objects)} objects failed to import.")
    print(f"First error: {failed_objects[0].message}")

In [None]:
# Print only the first object yielded by the collection iterator (nothing is
# printed if the collection is empty).
collection = client.collections.get("stazh")
first_item = next(iter(collection.iterator()), None)
if first_item is not None:
    print(first_item)

In [None]:
# Aggregate over the whole collection and print the total number of objects.
collection = client.collections.get("stazh")
response = collection.aggregate.over_all(total_count=True)
total_count = response.total_count
print(total_count)

# Lexical search

In [None]:
# Keyword (BM25) search for a single query term.
collection = client.collections.get("stazh")
response = collection.query.bm25(
    query="Steuerreform",
    # query_properties=["title"],  # Restrict the search to specific properties.
    offset=0,
    limit=100,
    auto_limit=4,
    return_metadata=wvc.query.MetadataQuery(score=True, distance=True, certainty=True),
    # filters=wvc.query.Filter.by_property("year").less_than(2012),
)

# De-duplicate the hits by `identifier`: only the first hit per identifier is
# kept. A set makes each membership check constant-time.
seen = set()
final_results = []

for item in response.objects:
    identifier = item.properties["identifier"]
    if identifier in seen:
        continue
    seen.add(identifier)
    # Title and series are stored as consecutive entries, so each prints on its own line.
    final_results.extend((item.properties["title"], item.properties["series"]))

for elem in final_results:
    print(elem)

# Vector search

In [16]:
model_path = "jinaai/jina-embeddings-v2-base-de"

# Pick the compute device at runtime instead of hard-coding "mps": MPS is still
# preferred when available, with CUDA and CPU as fallbacks, so the notebook
# also runs on machines without an MPS device.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = SentenceTransformer(
    model_path,
    # NOTE: this allows code shipped with the model repository to be executed.
    trust_remote_code=True,
    device=device,
)
model.max_seq_length = 512


def embed_query(query):
    """Encode `query` with the module-level `model` and return the embedding.

    The embedding is requested normalised (`normalize_embeddings=True`) and
    without conversion to a tensor (`convert_to_tensor=False`).
    """
    embedding = model.encode(
        query,
        convert_to_tensor=False,
        normalize_embeddings=True,
    )
    return embedding



In [None]:
query = "Steuerreform"
query_embedding = embed_query(query)

collection = client.collections.get("stazh")
response = collection.query.near_vector(
    # `.tolist()` yields plain Python floats, the same form used at ingestion.
    near_vector=query_embedding.tolist(),
    # No `target_vector` here: the collection is created without named
    # vectors, so there is no vector called "text" to target.
    limit=20,
    auto_limit=3,
    return_metadata=wvc.query.MetadataQuery(distance=True),
)

# De-duplicate the hits by `identifier`: only the first hit per identifier is
# kept. A set makes each membership check constant-time.
seen = set()
final_results = []

for item in response.objects:
    identifier = item.properties["identifier"]
    if identifier in seen:
        continue
    seen.add(identifier)
    # Title and series are stored as consecutive entries, so each prints on its own line.
    final_results.extend((item.properties["title"], item.properties["series"]))

for elem in final_results:
    print(elem)

# Hybrid search

In [None]:
query = "Steuerreform"
query_embedding = embed_query(query)

# Inclusive range of years the search is restricted to.
YEAR_FROM = 1803
YEAR_TO = 1995

collection = client.collections.get("stazh")
response = collection.query.hybrid(
    query=query,
    # `.tolist()` yields plain Python floats, the same form used at ingestion.
    vector=query_embedding.tolist(),
    limit=5,
    auto_limit=2,
    # Weighting between the keyword part and the vector part of the search.
    alpha=0.7,
    fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
    filters=wvc.query.Filter.by_property("year").greater_or_equal(YEAR_FROM)
    & wvc.query.Filter.by_property("year").less_or_equal(YEAR_TO),
)

# De-duplicate the hits by `identifier`: only the first hit per identifier is
# kept. A set makes each membership check constant-time.
seen = set()
final_results = []

for item in response.objects:
    identifier = item.properties["identifier"]
    if identifier in seen:
        continue
    seen.add(identifier)
    # Title and series are stored as consecutive entries, so each prints on its own line.
    final_results.extend((item.properties["title"], item.properties["series"]))

for elem in final_results:
    print(elem)

# Search by document

In [None]:
ident = "StAZH ABl 1987 (S. 1079)"

collection = client.collections.get("stazh")
# Look up the object(s) whose `stazh_ident` matches `ident`. The filter must
# use `ident`; the name `year` is not defined anywhere in this notebook.
response = collection.query.fetch_objects(
    filters=wvc.query.Filter.by_property("stazh_ident").equal(ident)
)

if not response.objects:
    # Without this guard an unknown identifier fails with a bare IndexError.
    print(f"No object found with stazh_ident {ident!r}.")
else:
    # Use the first match as the reference object of the similarity search.
    uuid = response.objects[0].uuid

    response = collection.query.near_object(near_object=uuid)

    for item in response.objects:
        print(
            item.properties["title"],
            item.properties["year"],
        )