# Hybrid search for the metadata catalogue of the Canton of Zurich
In this notebook we prepare the data for our search application and setup the search index.

- We load the data from the API. 
- We prepare the data and lemmatize the text data (for lexical search).
- We embed via the OpenAI embedding API. 
- We test the embeddings with cosine similarity.
- We setup the Weaviate index.
- We create a collection with our data.
- We test the index in regard to lexical, and vector search as well as the combination of both - hybrid search. The latter is what we use in the app.

# Imports

In [1]:
import pandas as pd
from pandarallel import pandarallel
import numpy as np

pd.options.mode.chained_assignment = None
pd.options.display.max_rows = 500
pd.options.display.max_seq_items = 500
pandarallel.initialize(progress_bar=False)

from time import time
import os
import re
import requests
from dotenv import load_dotenv

import warnings
from bs4 import MarkupResemblesLocatorWarning
from sklearn.metrics.pairwise import cosine_similarity
import weaviate
from weaviate.classes.config import Property, DataType
import weaviate.classes as wvc
import weaviate.classes.config as wc
import spacy
from openai import OpenAI

warnings.filterwarnings("ignore")
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
client_openai = OpenAI(api_key=OPENAI_API_KEY)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


**Constants**

In [2]:
DATA_FOLDER = "_data/"
DATASETS = DATA_FOLDER + "01_data.parq"
DATA_WITH_EMBEDDINGS = DATA_FOLDER + "02_data_embedded.parq"

BASELINK_API = "https://www.web.statistik.zh.ch/ogd/daten/zhweb.json"

# Dataset links are composed of this baselink and the identifier for each dataset.
BASELINK_DATASHOP = (
    "https://www.zh.ch/de/politik-staat/statistik-daten/datenkatalog.html#/datasets/"
)

# Load and parse data

In [3]:
def retrieve_mdv_metadata():
    """Retrieve metadata for all datasets from the data catalog API.

    Args:
        None

    Returns:
        raw (list): Metadata for all datasets.
    """
    try:
        raw = requests.get(BASELINK_API)
        if raw.status_code == 200:
            raw = raw.json()
            raw = raw["dataset"]
            return raw
    except Exception as e:
        print(e)
        return None


def clean_text(d):
    """Clean text. Remove square brackets that often enclose units or other information. Replace multiple spaces with single spaces and strip leading and trailing spaces.

    Args:
        d (str): Text to clean.

    Returns:
        str: Cleaned text.
    """
    # Remove square brackets.
    d = re.sub(r"(\[|\])", " ", d)
    # Replace multiple spaces with single space.
    d = re.sub(r"[\s]+", " ", d)
    d = d.strip()
    return d


def parse_and_prepare_metadata(raw):
    """Parse and prepare metadata for all available datasets.

    Args:
        raw (list): Raw metadata for all available datasets.

    Returns:
        pd.DataFrame: Parsed and prepared metadata for all datasets.
    """

    identifier = []
    links = []
    titles = []
    descriptions = []
    themes = []
    keywords = []
    distributions = []
    is_study = []

    # Iterate over all datasets and retrieve metadata.
    for dataset in raw:
        identifier.append(dataset["identifier"])

        # The link to the dataset is composed of the base link and the dataset identifier.
        links.append(BASELINK_DATASHOP + dataset["identifier"])

        titles.append(dataset["title"].strip())

        if dataset["description"] not in ["NA", "keine", "null"]:
            descriptions.append(dataset["description"].strip())
        else:
            descriptions.append("")

        if isinstance(dataset["theme"], list):
            kwds = [x for x in dataset["theme"] if x != None]
            themes.append(" ".join(kwds))
        else:
            themes.append("")

        if isinstance(dataset["keyword"], list):
            keywords.append(" ".join(dataset["keyword"]))
        else:
            keywords.append("")

        # Go through all resources and retrieve their textual content.
        tmp_res = []
        for resource in dataset["distribution"]:
            is_pdf = False
            if resource["format"] == "PDF":
                is_pdf = True
            tmp_res.append(resource["title"])
            if isinstance(resource["description"], list):
                tmp_res.append(resource["description"])
            else:
                tmp_res.append("")
        is_study.append(is_pdf)

        # Join textual content of dataset and resources.
        distributions.append(" ".join(tmp_res))

    df = pd.DataFrame(identifier, columns=["identifier"])
    df["link"] = links
    df["title"] = titles
    df["description"] = descriptions
    df["theme"] = themes
    df["keyword"] = keywords
    df["distribution"] = distributions
    df["is_study"] = is_study

    return df

In [4]:
raw = retrieve_mdv_metadata()
df = parse_and_prepare_metadata(raw)
df.title = df.title.apply(lambda x: x.strip())
df.description = df.description.apply(lambda x: x.strip())

# Combine title and description for embedding.
df["text_for_embedding"] = df.title + " " + df.description
df["text_for_embedding"] = df["text_for_embedding"].apply(lambda x: x.strip())
df.to_parquet(DATASETS)
df.info(memory_usage="deep")
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   identifier          795 non-null    object
 1   link                795 non-null    object
 2   title               795 non-null    object
 3   description         795 non-null    object
 4   theme               795 non-null    object
 5   keyword             795 non-null    object
 6   distribution        795 non-null    object
 7   is_study            795 non-null    bool  
 8   text_for_embedding  795 non-null    object
dtypes: bool(1), object(8)
memory usage: 1.4 MB


Unnamed: 0,identifier,link,title,description,theme,keyword,distribution,is_study,text_for_embedding
0,214@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,Schül. Berufsschule [pro 1000 Einw.],Anzahl BerufsschülerInnen gemessen an der zivi...,http://publications.europa.eu/resource/authori...,berufsschule bezirke gemeinden kanton_zuerich ...,Schül. Berufsschule [pro 1000 Einw.],False,Schül. Berufsschule [pro 1000 Einw.] Anzahl Be...
1,216@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,Schül. Mittelschule [pro 1000 Einw.],Anzahl MittelschülerInnen inkl. Langzeitgymnas...,http://publications.europa.eu/resource/authori...,bezirke gemeinden kanton_zuerich mittelschulen...,Schül. Mittelschule [pro 1000 Einw.],False,Schül. Mittelschule [pro 1000 Einw.] Anzahl Mi...
2,217@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,Schül. Oberstufe [pro 1000 Einw.],Anzahl OberstufenschülerInnen (exklusiv Langze...,http://publications.europa.eu/resource/authori...,bezirke gemeinden kanton_zuerich schueler seku...,Schül. Oberstufe [pro 1000 Einw.],False,Schül. Oberstufe [pro 1000 Einw.] Anzahl Obers...
3,218@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,Schül. Primarschule [pro 1000 Einw.],Anzahl PrimarschülerInnen gemessen an der zivi...,http://publications.europa.eu/resource/authori...,bezirke gemeinden kanton_zuerich primarschule ...,Schül. Primarschule [pro 1000 Einw.],False,Schül. Primarschule [pro 1000 Einw.] Anzahl Pr...
4,229@statistisches-amt-kanton-zuerich,https://www.zh.ch/de/politik-staat/statistik-d...,Neuerstellte EFH [Whg.],Neu erstellte Einfamilienhäuser,http://publications.europa.eu/resource/authori...,bezirke einfamilienhaus gemeinden immobilien k...,Neuerstellte EFH [Whg.],False,Neuerstellte EFH [Whg.] Neu erstellte Einfamil...


# Lemmatize text

In [5]:
nlp = spacy.load("de_core_news_lg")

LETTERS_AND_DIGITS = re.compile(r"[^a-zäüöA-ZÜÄÖ0-9.]")
MULTIPLE_SPACES = re.compile(r"\s+")


def prepare_for_lexical_search(text, lower=False, remove_umlauts=False):
    """Lemmatize text, and optionally lower case and remove umlauts for lexical search.

    Args:
        text (str): Text to process.

    Keyword Arguments:
        lower (bool): Lower case text (default: {True}).
        remove_umlauts (bool): Remove umlauts from text (default: {True}).

    Returns:
        str: Lemmatized text, optionally lower cased, and without umlauts.
    """
    doc = nlp(text)
    text = " ".join([token.lemma_ if token.is_alpha else token.text for token in doc])
    text = re.sub(LETTERS_AND_DIGITS, " ", text)
    text = re.sub(MULTIPLE_SPACES, " ", text)
    if lower:
        text = text.lower()
    if remove_umlauts:
        text = text.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue")
    return text

In [6]:
df["title_lemma"] = df.title.parallel_apply(lambda x: prepare_for_lexical_search(x))
df["description_lemma"] = df.description.parallel_apply(
    lambda x: prepare_for_lexical_search(x)
)
df["theme_lemma"] = df.theme.parallel_apply(lambda x: prepare_for_lexical_search(x))
df["distribution_lemma"] = df.distribution.parallel_apply(
    lambda x: prepare_for_lexical_search(x)
)

Have a quick look at the data.

In [7]:
df.sample(2).T

Unnamed: 0,745,578
identifier,2202@statistisches-amt-kanton-zuerich,567@statistisches-amt-kanton-zuerich
link,https://www.zh.ch/de/politik-staat/statistik-d...,https://www.zh.ch/de/politik-staat/statistik-d...
title,Resultate und Wahlbeteiligung Nationalratswahl...,Sitzverlegungen von Firmen im Kanton Zürich 2002
description,Echtzeitresultate und -wahlbeteiligungen der E...,
theme,http://publications.europa.eu/resource/authori...,http://publications.europa.eu/resource/authori...
keyword,nationalratswahlen wahlen wahlergebnisse wahla...,unternehmensstruktur statistik.info
distribution,Nationalratswahl 2023: Parteiresultate Ebene K...,Sitzverlegungen von Firmen im Kanton Zürich 2002
is_study,False,True
text_for_embedding,Resultate und Wahlbeteiligung Nationalratswahl...,Sitzverlegungen von Firmen im Kanton Zürich 2002
title_lemma,Resultat und Wahlbeteiligung Nationalratswahl ...,Sitzverlegung von Firma in Kanton Zürich 2002


# Embed data

For our prototype app we use OpenAIs embeddings for convienience. 

We also tested these open source models with [SentenceTransformers](https://sbert.net/) with very good results: 

- [PM-AI/bi-encoder_msmarco_bert-base_german](https://huggingface.co/PM-AI/bi-encoder_msmarco_bert-base_german) - 350 tokens context length
- [Jina AI jina-embeddings-v2-base-de](https://huggingface.co/jinaai/jina-embeddings-v2-base-de) - 8192 tokens context length

In [9]:
def embed_with_openai(texts, model="text-embedding-3-small"):
    """Embed text using OpenAIs embedding API.

    Args:
        texts (list): List of texts to embed.
        model (str): OpenAI model to use (default: {"text-embedding-3-small"}).

    Returns:
        list: List of embeddings.
    """
    response = client_openai.embeddings.create(input=texts, model=model)
    return [x.embedding for x in response.data]

In [10]:
df["embedding_openai"] = embed_with_openai(df.text_for_embedding.values)

In [11]:
cols = [
    "identifier",
    "link",
    "title",
    "title_lemma",
    "description",
    "description_lemma",
    "text_for_embedding",
    "theme",
    "theme_lemma",
    "keyword",
    "distribution",
    "distribution_lemma",
    "is_study",
    "embedding_openai",
]

df = df[cols]

In [12]:
df.sample(1).T

Unnamed: 0,177
identifier,563@awel-kanton-zuerich
link,https://www.zh.ch/de/politik-staat/statistik-d...
title,Lufttemperatur und Luftfeuchte LoRa-Sensor-Mes...
title_lemma,Lufttemperatur und Luftfeuchte LoRa Sensor Mes...
description,Seit Frühjahr 2019 betreibt das AWEL des Kanto...
description_lemma,seit Frühjahr 2019 betreiben der AWEL der Kant...
text_for_embedding,Lufttemperatur und Luftfeuchte LoRa-Sensor-Mes...
theme,http://publications.europa.eu/resource/authori...
theme_lemma,http publications.europa.eu resource authority...
keyword,luft lufttemperaturen ogd


In [13]:
df.to_parquet(DATA_WITH_EMBEDDINGS)
df.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 795 entries, 0 to 794
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   identifier          795 non-null    object
 1   link                795 non-null    object
 2   title               795 non-null    object
 3   title_lemma         795 non-null    object
 4   description         795 non-null    object
 5   description_lemma   795 non-null    object
 6   text_for_embedding  795 non-null    object
 7   theme               795 non-null    object
 8   theme_lemma         795 non-null    object
 9   keyword             795 non-null    object
 10  distribution        795 non-null    object
 11  distribution_lemma  795 non-null    object
 12  is_study            795 non-null    bool  
 13  embedding_openai    795 non-null    object
dtypes: bool(1), object(13)
memory usage: 11.4 MB


# Test embeddings

Perform a quick check with scikit's cosine similarity function.

In [14]:
df = pd.read_parquet(DATA_WITH_EMBEDDINGS)

In [15]:
query = "wohnungsnot in zürich"
show_n_results = 20

# Vector search based on OpenAI embeddings.
now = time()
embedding_openai = embed_with_openai(query)
cosine_sim = cosine_similarity(embedding_openai, df.embedding_openai.tolist())
top_k = np.argsort(cosine_sim[0])
top_k = top_k[::-1]
display(df.iloc[top_k][["title"]].head(show_n_results).values)

array([['Kleineres Armutsrisiko dank günstiger Wohnung'],
       ['Wohnungswechsel'],
       ['Leerwohnungszahlen'],
       ['Entspannung auf dem Wohnungsmarkt?'],
       ['Ersatzneubau von Wohnungen immer wichtiger'],
       ['Bevölkerungsdynamik und Wohnungsbau'],
       ['Dynamischer Zürcher Wohnungsmarkt'],
       ['Gemeinnützige Wohnungen in Gemeinden des Kantons Zürich [Anz.]'],
       ['Zuwanderung und Wohnungsmarkt'],
       ['Anteil Gemeinnützige Wohnungen in Gemeinden des Kantons Zürich [%]'],
       ['Abfall im Kanton Zürich'],
       ['Abfall im Kanton Zürich'],
       ['Abfall im Kanton Zürich'],
       ['Kantonaler Wohnungsleerstand bleibt niedrig, aber Anzeichen für Umschwung auf Wohnungsmarkt'],
       ['Kantonaler Wohnungsleerstand immer noch rückläufig'],
       ['Aus- und Einzugsmieten im Kanton Zürich'],
       ['Zürcher Haushalte im Wandel der Zeit'],
       ['Schwache Konjunktur bremst Bevölkerungswachstum im Kanton Zürich'],
       ['Gebäude und Wohnungen im Kant

# Implement search

Now we create the actual search index for lexical and semantic search.

Note that [Weaviates](https://weaviate.io/developers/weaviate) default location for the index is `~/.local/share/weaviate`. If you want to use the app on a different machine, simply copy the files from this location to the same location on the other machine. You can also set a specific path with parameter `persistence_data_path` – see [documentation here](https://weaviate.io/developers/weaviate/installation/embedded).

In [16]:
df = pd.read_parquet(DATA_WITH_EMBEDDINGS)

In [17]:
# This will either connect to an existing Weaviate instance or create a new one.
# We only can execute this once, as is. If we want to run it again, we need to restart the kernel.
# More info regarding the Embedded Weaviate client:
# https://weaviate.io/developers/weaviate/installation/embedded
# By default the index data is stored in ~/.local/share/weaviate/

client = weaviate.connect_to_embedded()

# # Use this line if another application on your machine is already using Weaviate on the default port.
# # This notebook also counts as another application. If your notebook is running, you need to use a different port for the Streamlit app. Otherwise, you get an error.
# client = weaviate.connect_to_local(port=8079, grpc_port=50050)

Started /Users/patrickarnecke/.cache/weaviate-embedded: process ID 4665


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-07-17T17:25:39+02:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-07-17T17:25:39+02:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-07-17T17:25:39+02:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50050","time":"2024-07-17T17:25:39+02:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-07-17T17:25:39+02:00"}


{"level":"info","msg":"Completed loading shard mdv_epJ0bgfBKeWH in 3.908291ms","time":"2024-07-17T17:25:39+02:00"}
{"action":"hnsw_vector_cache_prefill","count":3000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-07-17T17:25:39+02:00","took":81708}
{"level":"info","msg":"Completed loading shard rrbs_mRNSfQC7t9fQ in 315.293792ms","time":"2024-07-17T17:25:40+02:00"}
{"action":"hnsw_vector_cache_prefill","count":237352,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-07-17T17:25:45+02:00","took":5043900334}


In [None]:
# # Get the meta endpoint description of Weaviate.
# display(client.get_meta())

# # Ping Weaviate’s live and ready state.
# print(client.is_live())
# print(client.is_ready())

In [19]:
# Delete collection.
client.collections.delete("MDV")

In [21]:
# First we need to create a schema that defines how data is stored, organized and retrieved in Weaviate.
# A schema is called a "collection". We can define as many collections as we want.

client.collections.create(
    "MDV",
    vectorizer_config=wc.Configure.Vectorizer.none(),
    inverted_index_config=wvc.config.Configure.inverted_index(
        bm25_b=0.75,  # default 0.75
        bm25_k1=1.2,  # default 1.2
        stopwords_additions=None,
        stopwords_preset=None,
        stopwords_removals=None,
    ),
    properties=[
        Property(name="identifier", data_type=DataType.TEXT),
        Property(name="link", data_type=DataType.TEXT),
        Property(name="title", data_type=DataType.TEXT),
        Property(name="title_lemma", data_type=DataType.TEXT),
        Property(name="description", data_type=DataType.TEXT),
        Property(name="description_lemma", data_type=DataType.TEXT),
        Property(name="theme", data_type=DataType.TEXT),
        Property(name="theme_lemma", data_type=DataType.TEXT),
        Property(name="keyword", data_type=DataType.TEXT),
        Property(name="distribution", data_type=DataType.TEXT),
        Property(name="distribution_lemma", data_type=DataType.TEXT),
        Property(name="is_study", data_type=DataType.BOOL),
    ],
)

{"level":"info","msg":"Created shard mdv_OO9LVECxFH3G in 18.866666ms","time":"2024-07-17T17:26:13+02:00"}
{"action":"hnsw_vector_cache_prefill","count":1000,"index_id":"main","level":"info","limit":1000000000000,"msg":"prefilled vector cache","time":"2024-07-17T17:26:14+02:00","took":5410375}


<weaviate.collections.collection.Collection at 0x166bfd310>

In [23]:
# # List all collections.
# for v in client.collections.list_all().values():
#     print(v.name)

# # # Get detailed information about all collections.
# # schema = client.collections.list_all(simple=False)
# # print(schema)

In [24]:
# Now we ingest data into the collection in the form of "items".

collection = client.collections.get("MDV")

with collection.batch.dynamic() as batch:
    for data in df.to_dict(orient="records"):
        properties = {
            "identifier": data["identifier"],
            "link": data["link"],
            "title": data["title"],
            "title_lemma": data["title_lemma"],
            "description": data["description"],
            "description_lemma": data["description_lemma"],
            "theme": data["theme"],
            "theme_lemma": data["theme_lemma"],
            "keyword": data["keyword"],
            "distribution": data["distribution"],
            "distribution_lemma": data["distribution_lemma"],
            "is_study": data["is_study"],
        }
        batch.add_object(properties=properties, vector=data["embedding_openai"])

In [25]:
# List all items in the collection.
collection = client.collections.get("MDV")
for item in collection.iterator():
    print(item)
    break

Object(uuid=_WeaviateUUIDInt('004af901-ce28-4f6e-a731-77055a09c281'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'description': 'Entwicklungstrends eines Wirtschaftszweigs', 'theme_lemma': 'http publications.europa.eu resource authority data theme AGRI', 'identifier': '493@statistisches-amt-kanton-zuerich', 'is_study': True, 'keyword': 'wirtschaft statistik.info', 'distribution': 'Strukturwandel in der Zürcher Landwirtschaft ', 'title_lemma': 'Strukturwandel in der Zürcher Landwirtschaft', 'theme': 'http://publications.europa.eu/resource/authority/data-theme/AGRI', 'distribution_lemma': 'Strukturwandel in der Zürcher Landwirtschaft', 'description_lemma': 'Entwicklungstrends ein Wirtschaftszweig', 'link': 'https://www.zh.ch/de/politik-staat/statistik-daten/datenkatalog.html#/datasets/493@statistisches-amt-kanton-zuerich', 'title': 'Strukturwandel in 

In [26]:
# Get total count of all items in the collection.
collection = client.collections.get("MDV")
response = collection.aggregate.over_all(total_count=True)

# Check if the total count of items in the collection is equal to the number of items in the DataFrame.
# If the assertion fails, try to re-run the previous cells.
assert response.total_count == len(df)

print(f"The collection contains {response.total_count} items.")

The collection contains 795 items.


## Lexical search

[Weaviate documentation for lexical search](https://weaviate.io/developers/weaviate/search/bm25)

In [27]:
collection = client.collections.get("MDV")

In [28]:
# Set fields to search in with BM25.
# We exclude the identifier and link fields.
# We give twice the weight to the title fields.

query_properties = [
    "title^2",
    "description",
    "title_lemma^2",
    "description_lemma",
    "theme",
    "theme_lemma",
    "keyword",
    "distribution",
    "distribution_lemma",
]

In [29]:
query = "Löhne in Zürich"
query = prepare_for_lexical_search(query)

# See documentation for more details about parameters:
# https://weaviate.io/developers/weaviate/search/bm25
response = collection.query.bm25(
    query=query,
    query_properties=query_properties,
    offset=0,
    limit=20,
    # auto_limit=10,
    return_metadata=wvc.query.MetadataQuery(
        score=True,
        explain_score=True,
    ),
)
for item in response.objects:
    print(
        item.properties["title"],
        # item.properties["description"],
        # item.metadata.score,
        # item.metadata.explain_score,
    )

Löhne, Teilzeitarbeit und Boni.
Löhne im Kanton Zürich liegen deutlich über schweizerischem Mittel
Löhne im Kanton Zürich- Branchen, Tätigkeiten, Alter, Geschlecht
Kontaktpersonen des Volksschulamts des Kantons Zürich nach Schulgemeinde und Themen
Kanton Zürich in Zahlen
Lohnstudie Kanton Zürich
Nutzungsdichte im Kanton Zürich
Strukturatlas des Kantons Zürich
Autofahren im Kanton Zürich
Sozialhilfebezug im Kanton Zürich
Abfall im Kanton Zürich
Abfall im Kanton Zürich
Veloverkehr im Kanton Zürich
Siedlungsstruktur im Kanton Zürich
Mobilität im Kanton Zürich
Strukturatlas des Kantons Zürich
Abfall im Kanton Zürich
Treibstoffverbrauch im Kanton Zürich
Fahrzeugflotte der kantonalen Verwaltung Zürich
Datengrundlage zur ZHweb-Seite «Autoflotte»


## Semantic search

[Weaviate documentation for semantic search](https://weaviate.io/developers/weaviate/search/similarity)

In [30]:
query = "Löhne in Zürich"
query_embedding = embed_with_openai(query)[0]

# See documentation for more details about parameters:
# https://weaviate.io/developers/weaviate/search/similarity
collection = client.collections.get("MDV")
response = collection.query.near_vector(
    near_vector=query_embedding,
    limit=20,
    # auto_limit=2,
    return_metadata=wvc.query.MetadataQuery(distance=True, certainty=True),
)

for item in response.objects:
    print(
        item.properties["title"],
        item.metadata.distance,
        item.metadata.certainty,
    )

Löhne im Kanton Zürich liegen deutlich über schweizerischem Mittel 0.24334806203842163 0.8783259391784668
Löhne im Kanton Zürich- Branchen, Tätigkeiten, Alter, Geschlecht 0.32373493909835815 0.8381325006484985
Lohngefälle im Kanton Zürich 0.3252621293067932 0.8373689651489258
Löhne, Teilzeitarbeit und Boni. 0.3402245044708252 0.8298877477645874
Frauenlöhne, Männerlöhne. Vollzeitlöhne, Teilzeitlöhne. 0.3444167375564575 0.8277916312217712
Frauenlöhne, Männerlöhne. 0.363132119178772 0.818433940410614
Lohnstudie Kanton Zürich 0.3684327006340027 0.8157836198806763
Teilzeitarbeit boomt 0.42636024951934814 0.7868198752403259
Arbeitskräfteballung in Zürich 0.4340125322341919 0.782993733882904
35-Stunden Woche im Kanton Zürich? 0.4363228678703308 0.7818385362625122
Abfall im Kanton Zürich 0.4381587505340576 0.7809206247329712
Abfall im Kanton Zürich 0.4381587505340576 0.7809206247329712
Abfall im Kanton Zürich 0.4381587505340576 0.7809206247329712
Lehrstellenmangel in Dienstleistungsbranchen 0.

## Hybrid search

[Weviate documentation for hybrid search - the combination of lexical and vector search](https://weaviate.io/developers/weaviate/search/hybrid)

In [31]:
query = "Löhne in Zürich"
query_embedding = embed_with_openai(query)[0]
query = prepare_for_lexical_search(query)

# See documentation for more details about parameters:
# https://weaviate.io/developers/weaviate/search/hybrid
collection = client.collections.get("MDV")
response = collection.query.hybrid(
    query=query,
    vector=query_embedding,
    limit=40,
    # auto_limit=2,
    alpha=0.7,
    fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE,
    # return_metadata=wvc.query.MetadataQuery(distance=True),
)

for item in response.objects:
    print(
        item.properties["title"],
        # item.properties["year"],
        # item.metadata.score,
        # item.metadata.distance,
        # item.metadata.certainty,
    )

Löhne im Kanton Zürich liegen deutlich über schweizerischem Mittel
Löhne, Teilzeitarbeit und Boni.
Löhne im Kanton Zürich- Branchen, Tätigkeiten, Alter, Geschlecht
Lohngefälle im Kanton Zürich
Frauenlöhne, Männerlöhne. Vollzeitlöhne, Teilzeitlöhne.
Lohnstudie Kanton Zürich
Frauenlöhne, Männerlöhne.
Teilzeitarbeit boomt
Arbeitskräfteballung in Zürich
Abfall im Kanton Zürich
Abfall im Kanton Zürich
Abfall im Kanton Zürich
35-Stunden Woche im Kanton Zürich?
Firmengründungen, Wirtschaftsstruktur und Steuerbelastung
Lehrstellenmangel in Dienstleistungsbranchen
Zentrale Lagen begehrt
Prämienverbilligung: Wer profitiert – und wer bezahlt
Gripen, Mindestlohn und Kirchensteuer
Reiche Rentner?
Der Preis des Bodens
Unbezahlte Arbeit im Kanton Zürich
Woher kommen die Steuern?
Kontaktpersonen des Volksschulamts des Kantons Zürich nach Schulgemeinde und Themen
Dienstleistungen gewinnen die Oberhand
Sitzverlegungen von Firmen im Kanton Zürich 2002
Immer mehr Hochqualifizierte
Die Sozialhilfe im Kanto