# Hybrid search for the metadata catalogue of the Canton of Zurich
In this notebook we prepare the data for our search application and setup the search index.

- We load the data from the API. 
- We prepare the data and lemmatize the text data (for lexical search).
- We embed via the OpenAI embedding API. 
- We test the embeddings with cosine similarity.
- We setup the Weaviate index.
- We create a collection with our data.
- We test the index in regard to lexical, and vector search as well as the combination of both - hybrid search. The latter is what we use in the app.

# Imports

In [2]:
import pandas as pd
from pandarallel import pandarallel
import numpy as np

pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500
pandarallel.initialize(progress_bar=False)

from time import time
import os
import re
import requests
from dotenv import load_dotenv

import warnings
from bs4 import MarkupResemblesLocatorWarning
from sklearn.metrics.pairwise import cosine_similarity
import weaviate
from weaviate.classes.config import Property, DataType
import weaviate.classes as wvc
import weaviate.classes.config as wc
import spacy
from sentence_transformers import SentenceTransformer

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

load_dotenv()

# Initialize sentence-transformers model
embedding_model = SentenceTransformer('intfloat/multilingual-e5-small')

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


**Constants**

In [3]:
DATA_FOLDER = "_data/"
DATASETS = DATA_FOLDER + "01_data.parq"
DATA_WITH_EMBEDDINGS = DATA_FOLDER + "02_data_embedded.parq"

BASELINK_API = "https://www.web.statistik.zh.ch/ogd/daten/zhweb.json"

# Dataset links are composed of this baselink and the identifier for each dataset.
BASELINK_DATASHOP = (
    "https://www.zh.ch/de/politik-staat/statistik-daten/datenkatalog.html#/datasets/"
)

# Load and parse data

In [4]:
def retrieve_mdv_metadata():
    """Retrieve metadata for all datasets from the data catalog API.

    Args:
        None

    Returns:
        raw (list): Metadata for all datasets.
    """
    try:
        raw = requests.get(BASELINK_API)
        if raw.status_code == 200:
            raw = raw.json()
            raw = raw["dataset"]
            return raw
    except Exception as e:
        print(e)
        return None


def clean_text(d):
    """Clean text. Remove square brackets that often enclose units or other information. Replace multiple spaces with single spaces and strip leading and trailing spaces.

    Args:
        d (str): Text to clean.

    Returns:
        str: Cleaned text.
    """
    # Remove square brackets.
    d = re.sub(r"(\[|\])", " ", d)
    # Replace multiple spaces with single space.
    d = re.sub(r"[\s]+", " ", d)
    d = d.strip()
    return d


def parse_and_prepare_metadata(raw):
    """Parse and prepare metadata for all available datasets.

    Args:
        raw (list): Raw metadata for all available datasets.

    Returns:
        pd.DataFrame: Parsed and prepared metadata for all datasets.
    """

    identifier = []
    links = []
    titles = []
    descriptions = []
    themes = []
    keywords = []
    distributions = []
    is_study = []

    # Iterate over all datasets and retrieve metadata.
    for dataset in raw:
        identifier.append(dataset["identifier"])

        # The link to the dataset is composed of the base link and the dataset identifier.
        links.append(BASELINK_DATASHOP + dataset["identifier"])

        titles.append(dataset["title"].strip())

        if dataset["description"] not in ["NA", "keine", "null"]:
            descriptions.append(dataset["description"].strip())
        else:
            descriptions.append("")

        if isinstance(dataset["theme"], list):
            kwds = [x for x in dataset["theme"] if x != None]
            themes.append(" ".join(kwds))
        else:
            themes.append("")

        if isinstance(dataset["keyword"], list):
            keywords.append(" ".join(dataset["keyword"]))
        else:
            keywords.append("")

        # Go through all resources and retrieve their textual content.
        tmp_res = []
        for resource in dataset["distribution"]:
            is_pdf = False
            if resource["format"] == "PDF":
                is_pdf = True
            tmp_res.append(resource["title"])
            if isinstance(resource["description"], list):
                tmp_res.append(resource["description"])
            else:
                tmp_res.append("")
        is_study.append(is_pdf)

        # Join textual content of dataset and resources.
        distributions.append(" ".join(tmp_res))

    df = pd.DataFrame(identifier, columns=["identifier"])
    df["link"] = links
    df["title"] = titles
    df["description"] = descriptions
    df["theme"] = themes
    df["keyword"] = keywords
    df["distribution"] = distributions
    df["is_study"] = is_study

    return df

In [5]:
raw = retrieve_mdv_metadata()
df = parse_and_prepare_metadata(raw)
df.title = df.title.apply(lambda x: x.strip())
df.description = df.description.apply(lambda x: x.strip())

# Combine title and description for embedding.
df["text_for_embedding"] = df.title + " " + df.description
df["text_for_embedding"] = df["text_for_embedding"].apply(lambda x: x.strip())
df.to_parquet(DATASETS)
df.info(memory_usage="deep")
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 813 entries, 0 to 812
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   identifier          813 non-null    object
 1   link                813 non-null    object
 2   title               813 non-null    object
 3   description         813 non-null    object
 4   theme               813 non-null    object
 5   keyword             813 non-null    object
 6   distribution        813 non-null    object
 7   is_study            813 non-null    bool  
 8   text_for_embedding  813 non-null    object
dtypes: bool(1), object(8)
memory usage: 1.5 MB


Unnamed: 0,identifier,link,title,description,theme,keyword,distribution,is_study,text_for_embedding
0,364@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,MIV-Anteil (Modal Split) [%],Grundlage für die Berechnung des MIV-Anteils s...,http://publications.europa.eu/resource/authori...,bezirke gemeinden kanton_zuerich ogd verkehr v...,MIV-Anteil (Modal Split) [%],False,MIV-Anteil (Modal Split) [%] Grundlage für die...
1,363@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,ÖV-Anteil (Modal Split) [%],Grundlage für die Berechnung des ÖV-Anteils si...,http://publications.europa.eu/resource/authori...,bezirke gemeinden kanton_zuerich oeffentlicher...,ÖV-Anteil (Modal Split) [%],False,ÖV-Anteil (Modal Split) [%] Grundlage für die ...
2,369@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,"MIV-Wege Quell-, Ziel- und Binnenverkehr [Anz.]",Modelliertes Verkehrsaufkommen als Summe aller...,http://publications.europa.eu/resource/authori...,bezirke gemeinden kanton_zuerich ogd verkehr v...,"MIV-Wege Quell-, Ziel- und Binnenverkehr [Anz.]",False,"MIV-Wege Quell-, Ziel- und Binnenverkehr [Anz...."
3,366@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,"ÖV-Wege Quell-, Ziel- und Binnenverkehr [Anz.]",Modelliertes Verkehrsaufkommen als Summe aller...,http://publications.europa.eu/resource/authori...,bezirke gemeinden kanton_zuerich oeffentlicher...,"ÖV-Wege Quell-, Ziel- und Binnenverkehr [Anz.]",False,"ÖV-Wege Quell-, Ziel- und Binnenverkehr [Anz.]..."
4,57@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,Hotelbetten [Anz.],Anzahl Hotelbetten im Juni des Erhebungsjahres...,http://publications.europa.eu/resource/authori...,bezirke gastgewerbe gemeinden hotellerie kanto...,Hotelbetten [Anz.],False,Hotelbetten [Anz.] Anzahl Hotelbetten im Juni ...


# Lemmatize text

In [7]:
nlp = spacy.load("de_core_news_lg")

LETTERS_AND_DIGITS = re.compile(r"[^a-zäüöA-ZÜÄÖ0-9.]")
MULTIPLE_SPACES = re.compile(r"\s+")


def prepare_for_lexical_search(text, lower=False, remove_umlauts=False):
    """Lemmatize text, and optionally lower case and remove umlauts for lexical search.

    Args:
        text (str): Text to process.

    Keyword Arguments:
        lower (bool): Lower case text (default: {True}).
        remove_umlauts (bool): Remove umlauts from text (default: {True}).

    Returns:
        str: Lemmatized text, optionally lower cased, and without umlauts.
    """
    doc = nlp(text)
    text = " ".join([token.lemma_ if token.is_alpha else token.text for token in doc])
    text = re.sub(LETTERS_AND_DIGITS, " ", text)
    text = re.sub(MULTIPLE_SPACES, " ", text)
    if lower:
        text = text.lower()
    if remove_umlauts:
        text = text.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue")
    return text

In [8]:
df["title_lemma"] = df.title.parallel_apply(lambda x: prepare_for_lexical_search(x))
df["description_lemma"] = df.description.parallel_apply(
    lambda x: prepare_for_lexical_search(x)
)
df["theme_lemma"] = df.theme.parallel_apply(lambda x: prepare_for_lexical_search(x))
df["distribution_lemma"] = df.distribution.parallel_apply(
    lambda x: prepare_for_lexical_search(x)
)

Have a quick look at the data.

In [9]:
df.sample(2).T

Unnamed: 0,719,702
identifier,1522@stadt-winterthur,1281@statistisches-amt-kanton-zuerich
link,https://www.zh.ch/de/politik-staat/statistik-d...,https://www.zh.ch/de/politik-staat/statistik-d...
title,"Betriebe, Beschäftigte, Vollzeitäquivalente in...",Archiv Wahlresultate Gemeindebehördenwahlen im...
description,"Betriebe, Beschäftigte und Vollzeitäquivalente...",Ergebnisse der Erneuerungswahlen der Zürcher G...
theme,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/resource/authori...
keyword,beschaeftigte betriebe branchen ogd vollzeitae...,demokratie gemeinden gemeindewahlen kanton_zue...
distribution,"Betriebe, Beschäftigte, Vollzeitäquivalente na...",Dokumentation Gemeindebehördenwahlen Ergebnis...
is_study,False,False
text_for_embedding,"Betriebe, Beschäftigte, Vollzeitäquivalente in...",Archiv Wahlresultate Gemeindebehördenwahlen im...
title_lemma,Betrieb beschäftigter vollzeitäquivalenter in ...,Archiv Wahlresultate Gemeindebehördenwahle in ...


# Embed data

We use the [intfloat/multilingual-e5-small](https://huggingface.co/intfloat/multilingual-e5-small) model from sentence-transformers for embeddings. This is a multilingual model that supports German and has a context length of 512 tokens.

We also tested [Jina AI jina-embeddings-v2-base-de](https://huggingface.co/jinaai/jina-embeddings-v2-base-de) with good results.

In [10]:
def embed_with_sentence_transformer(texts):
    """Embed text using sentence-transformers.

    Args:
        texts (list or str): Text(s) to embed. Can be a single string or list of strings.

    Returns:
        list or np.ndarray: Embeddings as a list (for single text) or numpy array (for multiple texts).
    """
    if isinstance(texts, str):
        return embedding_model.encode([texts])[0].tolist()
    else:
        return embedding_model.encode(texts).tolist()

In [11]:
df["embedding"] = embed_with_sentence_transformer(df.text_for_embedding.tolist())

In [12]:
cols = [
    "identifier",
    "link",
    "title",
    "title_lemma",
    "description",
    "description_lemma",
    "text_for_embedding",
    "theme",
    "theme_lemma",
    "keyword",
    "distribution",
    "distribution_lemma",
    "is_study",
    "embedding",
]

df = df[cols]

In [13]:
df.sample(1).T

Unnamed: 0,605
identifier,644@statistisches-amt-kanton-zuerich
link,https://www.zh.ch/de/politik-staat/statistik-d...
title,Hohe finanzielle Stabilität trotz Finanzkrise
title_lemma,hoch finanziell Stabilität trotz Finanzkrise
description,
description_lemma,
text_for_embedding,Hohe finanzielle Stabilität trotz Finanzkrise
theme,http://publications.europa.eu/resource/authori...
theme_lemma,http publications.europa.eu resource authority...
keyword,gemeindefinanzen statistik.info


In [14]:
df.to_parquet(DATA_WITH_EMBEDDINGS)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 813 entries, 0 to 812
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   identifier          813 non-null    object
 1   link                813 non-null    object
 2   title               813 non-null    object
 3   title_lemma         813 non-null    object
 4   description         813 non-null    object
 5   description_lemma   813 non-null    object
 6   text_for_embedding  813 non-null    object
 7   theme               813 non-null    object
 8   theme_lemma         813 non-null    object
 9   keyword             813 non-null    object
 10  distribution        813 non-null    object
 11  distribution_lemma  813 non-null    object
 12  is_study            813 non-null    bool  
 13  embedding           813 non-null    object
dtypes: bool(1), object(13)
memory usage: 4.6 MB


# Test embeddings

Perform a quick check with scikit's cosine similarity function.

In [15]:
df = pd.read_parquet(DATA_WITH_EMBEDDINGS)

In [16]:
query = "wohnungsnot in zürich"
show_n_results = 20

# Vector search based on sentence-transformers embeddings.
now = time()
embedding = embed_with_sentence_transformer(query)
cosine_sim = cosine_similarity([embedding], df.embedding.tolist())
top_k = np.argsort(cosine_sim[0])
top_k = top_k[::-1]
display(df.iloc[top_k][["title"]].head(show_n_results).values)

array([['Räumliche Segregation in der Agglomeration Zürich'],
       ['Kleineres Armutsrisiko dank günstiger Wohnung'],
       ['Zürcher Immobilienpreise'],
       ['Bürgerrecht, Ferien und Zweitwohnungen'],
       ['Mobilisierung jenseits des Gewohnten'],
       ['Personenfreizügigkeit unter Druck'],
       ['Abfall im Kanton Zürich'],
       ['Abfall im Kanton Zürich'],
       ['Abfall im Kanton Zürich'],
       ['Zürcher Immobilienhandel'],
       ['Zuwanderung und Wohnungsmarkt'],
       ['Wohnungswechsel'],
       ['Dynamischer Zürcher Wohnungsmarkt'],
       ['Bevölkerungsdynamik und Wohnungsbau'],
       ['Die Sozialhilfe im Kanton Zürich 2001'],
       ['Sicheres Wohnen im Alter und Jugendmusik'],
       ['Volkszählung 2000'],
       ['Segregation in der Stadt Winterthur und Umgebung'],
       ['Sterbezahl stabil - trotz wachsender Bevölkerung'],
       ['Entspannung auf dem Wohnungsmarkt?']], dtype=object)

# Implement search

Now we create the actual search index for lexical and semantic search.

Note that [Weaviate's](https://weaviate.io/developers/weaviate) default location for the index is `~/.local/share/weaviate`. If you want to use the app on a different machine, simply copy the files from this location to the same location on the other machine. You can also set a specific path with parameter `persistence_data_path` – see [documentation here](https://weaviate.io/developers/weaviate/installation/embedded).

In [17]:
df = pd.read_parquet(DATA_WITH_EMBEDDINGS)

In [18]:
# This will either connect to an existing Weaviate instance or create a new one.
# We only can execute this once, as is. If we want to run it again, we need to restart the kernel.
# More info regarding the Embedded Weaviate client:
# https://weaviate.io/developers/weaviate/installation/embedded
# By default the index data is stored in ~/.local/share/weaviate/

client = weaviate.connect_to_embedded()

# # Use this line if another application on your machine is already using Weaviate on the default port.
# # This notebook also counts as another application. If your notebook is running, you need to use a different port for the Streamlit app. Otherwise, you get an error.
# client = weaviate.connect_to_local(port=8079, grpc_port=50050)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
{"action":"startup","build_git_commit":"62dcafac32","build_go_version":"go1.24.3","build_image_tag":"HEAD","build_wv_version":"1.30.5","level":"info","msg":"Feature flag LD integration disabled: could not locate WEAVIATE_LD_API_KEY env variable","time":"2025-12-27T13:14:03+01:00"}
{"action":"startup","build_git_commit":"62dcafac32","build_go_version":"go1.24.3","build_image_tag":"HEAD","build_wv_version":"1.30.5","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2025-12-27T13:14:03+01:00"}
{"action":"startup","auto_schema_enabled":{},"build_git

In [19]:
# # Get the meta endpoint description of Weaviate.
# display(client.get_meta())

# # Ping Weaviate’s live and ready state.
# print(client.is_live())
# print(client.is_ready())

In [20]:
# Delete collection.
client.collections.delete("MDV")

In [21]:
# First we need to create a schema that defines how data is stored, organized and retrieved in Weaviate.
# A schema is called a "collection". We can define as many collections as we want.

client.collections.create(
    "MDV",
    vectorizer_config=wc.Configure.Vectorizer.none(),
    inverted_index_config=wvc.config.Configure.inverted_index(
        bm25_b=0.75,  # default 0.75
        bm25_k1=1.2,  # default 1.2
        stopwords_additions=None,
        stopwords_preset=None,
        stopwords_removals=None,
    ),
    properties=[
        Property(name="identifier", data_type=DataType.TEXT),
        Property(name="link", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="title_lemma", data_type=DataType.TEXT),
        Property(name="description", data_type=DataType.TEXT),
        Property(name="description_lemma", data_type=DataType.TEXT),
        Property(name="theme", data_type=DataType.TEXT),
        Property(name="theme_lemma", data_type=DataType.TEXT),
        Property(name="keyword", data_type=DataType.TEXT),
        Property(name="distribution", data_type=DataType.TEXT),
        Property(name="distribution_lemma", data_type=DataType.TEXT),
        Property(name="is_study", data_type=DataType.BOOL),
    ],
)

<weaviate.collections.collection.sync.Collection at 0x13111c050>

{"action":"hnsw_prefill_cache_async","build_git_commit":"62dcafac32","build_go_version":"go1.24.3","build_image_tag":"HEAD","build_wv_version":"1.30.5","level":"info","msg":"not waiting for vector cache prefill, running in background","time":"2025-12-27T13:14:42+01:00","wait_for_cache_prefill":false}
{"build_git_commit":"62dcafac32","build_go_version":"go1.24.3","build_image_tag":"HEAD","build_wv_version":"1.30.5","level":"info","msg":"Created shard mdv_yDbVlW70J6fg in 4.828375ms","time":"2025-12-27T13:14:42+01:00"}
{"action":"hnsw_vector_cache_prefill","build_git_commit":"62dcafac32","build_go_version":"go1.24.3","build_image_tag":"HEAD","build_wv_version":"1.30.5","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2025-12-27T13:14:42+01:00","took":41917}


In [22]:
# # List all collections.
# for v in client.collections.list_all().values():
#     print(v.name)

# # # Get detailed information about all collections.
# # schema = client.collections.list_all(simple=False)
# # print(schema)

In [23]:
# Now we ingest data into the collection in the form of "items".

collection = client.collections.get("MDV")

with collection.batch.dynamic() as batch:
    for data in df.to_dict(orient="records"):
        properties = {
            "identifier": data["identifier"],
            "link": data["link"],
            "title": data["title"],
            "title_lemma": data["title_lemma"],
            "description": data["description"],
            "description_lemma": data["description_lemma"],
            "theme": data["theme"],
            "theme_lemma": data["theme_lemma"],
            "keyword": data["keyword"],
            "distribution": data["distribution"],
            "distribution_lemma": data["distribution_lemma"],
            "is_study": data["is_study"],
        }
        batch.add_object(properties=properties, vector=data["embedding"])

In [24]:
# List all items in the collection.
collection = client.collections.get("MDV")
for item in collection.iterator():
    print(item)
    break

Object(uuid=_WeaviateUUIDInt('004986e0-cf57-4405-80a4-536b76b70a98'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'link': 'https://www.zh.ch/de/politik-staat/statistik-daten/datenkatalog.html#/datasets/35@statistisches-amt-kanton-zuerich', 'theme_lemma': 'http publications.europa.eu resource authority data theme SOCI', 'description_lemma': 'Anzahl verkauft Einfamilienhaus EFH . der letzter drei Jahr sein provisorisch .', 'is_study': False, 'distribution': 'Verkäufe EFH [Anz.] ', 'theme': 'http://publications.europa.eu/resource/authority/data-theme/SOCI', 'keyword': 'bezirke eigentum eigentumswechsel einfamilienhaus gemeinden handaenderungen immobilien kanton_zuerich ogd', 'title_lemma': 'Verkauf EFH Anz . ', 'description': 'Anzahl verkaufter Einfamilienhäuser (EFH). Die letzten drei Jahre sind provisorisch.', 'identifier': '35@statistisches-amt-kanto

In [25]:
# Get total count of all items in the collection.
collection = client.collections.get("MDV")
response = collection.aggregate.over_all(total_count=True)

# Check if the total count of items in the collection is equal to the number of items in the DataFrame.
# If the assertion fails, try to re-run the previous cells.
assert response.total_count == len(df)

print(f"The collection contains {response.total_count} items.")

The collection contains 813 items.


## Lexical search

[Weaviate documentation for lexical search](https://weaviate.io/developers/weaviate/search/bm25)

In [26]:
collection = client.collections.get("MDV")

In [27]:
# Set fields to search in with BM25.
# We exclude the identifier and link fields.
# We give twice the weight to the title fields.

query_properties = [
    "title^2",
    "description",
    "title_lemma^2",
    "description_lemma",
    "theme",
    "theme_lemma",
    "keyword",
    "distribution",
    "distribution_lemma",
]

In [28]:
query = "Löhne in Zürich"
query = prepare_for_lexical_search(query)

# See documentation for more details about parameters:
# https://weaviate.io/developers/weaviate/search/bm25
response = collection.query.bm25(
    query=query,
    query_properties=query_properties,
    offset=0,
    limit=20,
    # auto_limit=10,
    return_metadata=wvc.query.MetadataQuery(
        score=True,
        explain_score=True,
    ),
)
for item in response.objects:
    print(
        item.properties["title"],
        # item.properties["description"],
        # item.metadata.score,
        # item.metadata.explain_score,
    )

Löhne im Kanton Zürich- Branchen, Tätigkeiten, Alter, Geschlecht
Löhne, Teilzeitarbeit und Boni.
Löhne im Kanton Zürich liegen deutlich über schweizerischem Mittel
Kontaktpersonen des Volksschulamts des Kantons Zürich nach Schulgemeinde und Themen
Kanton Zürich in Zahlen
Lohnstudie Kanton Zürich
Gebietsstammdaten Kanton Zürich
Abfall im Kanton Zürich
Abfall im Kanton Zürich
Veloverkehr im Kanton Zürich
Mobilität im Kanton Zürich
Sozialhilfebezug im Kanton Zürich
Abfall im Kanton Zürich
Strukturatlas des Kantons Zürich
Siedlungsstruktur im Kanton Zürich
Treibstoffverbrauch im Kanton Zürich
Strukturatlas des Kantons Zürich
Autofahren im Kanton Zürich
Fahrzeugflotte der kantonalen Verwaltung Zürich
Datengrundlage zur ZHweb-Seite «Autoflotte»


## Semantic search

[Weaviate documentation for semantic search](https://weaviate.io/developers/weaviate/search/similarity)

In [29]:
query = "Löhne in Zürich"
query_embedding = embed_with_sentence_transformer(query)

# See documentation for more details about parameters:
# https://weaviate.io/developers/weaviate/search/similarity
collection = client.collections.get("MDV")
response = collection.query.near_vector(
    near_vector=query_embedding,
    limit=20,
    # auto_limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True, certainty=True),
)

for item in response.objects:
    print(
        item.properties["title"],
        item.metadata.distance,
        item.metadata.certainty,
    )

Löhne im Kanton Zürich liegen deutlich über schweizerischem Mittel 0.08438634872436523 0.9578068256378174
Frauenlöhne, Männerlöhne. 0.0978999137878418 0.9510500431060791
Löhne im Kanton Zürich- Branchen, Tätigkeiten, Alter, Geschlecht 0.10274624824523926 0.9486268758773804
Lohngefälle im Kanton Zürich 0.1120295524597168 0.9439852237701416
Frauenlöhne, Männerlöhne. Vollzeitlöhne, Teilzeitlöhne. 0.11550843715667725 0.9422457814216614
Räumliche Segregation in der Agglomeration Zürich 0.11629301309585571 0.9418535232543945
Lohnstudie Kanton Zürich 0.11719709634780884 0.941401481628418
Löhne, Teilzeitarbeit und Boni. 0.11751610040664673 0.941241979598999
Wer bezahlt die soziale Sicherheit? 0.12348628044128418 0.9382568597793579
Arbeitskräfteballung in Zürich 0.12401676177978516 0.9379916191101074
Abfall im Kanton Zürich 0.12511283159255981 0.9374436140060425
Abfall im Kanton Zürich 0.12511283159255981 0.9374436140060425
Abfall im Kanton Zürich 0.12511283159255981 0.9374436140060425
Die Sozi

## Hybrid search

[Weaviate documentation for hybrid search - the combination of lexical and vector search](https://weaviate.io/developers/weaviate/search/hybrid)

In [30]:
query = "Löhne in Zürich"
query_embedding = embed_with_sentence_transformer(query)
query = prepare_for_lexical_search(query)

# See documentation for more details about parameters:
# https://weaviate.io/developers/weaviate/search/hybrid
collection = client.collections.get("MDV")
response = collection.query.hybrid(
    query=query,
    vector=query_embedding,
    limit=40,
    # auto_limit=2,
    alpha=0.7,
    fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
    # return_metadata=wvc.query.MetadataQuery(distance=True),
)

for item in response.objects:
    print(
        item.properties["title"],
        # item.properties["year"],
        # item.metadata.score,
        # item.metadata.distance,
        # item.metadata.certainty,
    )

Löhne im Kanton Zürich liegen deutlich über schweizerischem Mittel
Löhne im Kanton Zürich- Branchen, Tätigkeiten, Alter, Geschlecht
Löhne, Teilzeitarbeit und Boni.
Frauenlöhne, Männerlöhne.
Lohngefälle im Kanton Zürich
Lohnstudie Kanton Zürich
Räumliche Segregation in der Agglomeration Zürich
Frauenlöhne, Männerlöhne. Vollzeitlöhne, Teilzeitlöhne.
Arbeitskräfteballung in Zürich
Wer bezahlt die soziale Sicherheit?
Abfall im Kanton Zürich
Abfall im Kanton Zürich
Abfall im Kanton Zürich
Die Sozialhilfe im Kanton Zürich 2001
Gripen, Mindestlohn und Kirchensteuer
Zürcher Gemeindefinanzen 2001
Volkszählung 2000
Reiche Rentner?
Zürcher Gemeindefinanzen 2004
Freiwillige und ihre Leistungen im Kanton Zürich und der Schweiz
Wirtschaftskraft in den Zürcher Regionen
Teilzeitarbeit boomt
Arbeitskräfte im Kanton Zürich immer besser qualifiziert
Der «Ernährereffekt»
Kontaktpersonen des Volksschulamts des Kantons Zürich nach Schulgemeinde und Themen
Familienergänzende Kinderbetreuung im Kanton Zürich
