In [1]:
import meilisearch
import requests
import json

from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
base_url = 'http://localhost:7700'
api_key = 'y9iPmYwIjjm6uqLZ9SejDViU6SbkfufLZTfuffufMzfPJM'

headers = {
  "Authorization": f"Bearer {api_key}",
  "Content-Type": "application/json"
}

client = meilisearch.Client(base_url, api_key)


In [3]:
client.version()

{'commitSha': 'e20b91210d1b5da82072289a9198435276661174',
 'commitDate': '2025-01-23T13:41:34.000000000Z',
 'pkgVersion': '1.12.7'}

In [4]:
index = client.index("publications")

## Model setup

In [12]:
model = SentenceTransformer('intfloat/multilingual-e5-large')

---

# Config meilisearch

In [31]:
requests.get(f"{base_url}/experimental-features", headers=headers).json()

{'vectorStore': True,
 'metrics': False,
 'logsRoute': False,
 'editDocumentsByFunction': False,
 'containsFilter': False}

In [38]:
requests.patch(
  f"{base_url}/experimental-features",
  json={ "vectorStore": True },
  headers=headers
).json()

{'vectorStore': True,
 'metrics': False,
 'logsRoute': False,
 'editDocumentsByFunction': False,
 'containsFilter': False}

In [25]:
index.reset_settings()

TaskInfo(task_uid=64, index_uid='publications', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2025, 1, 27, 19, 31, 11, 130715))

In [26]:
index.update_settings({
  "embedders": {
    "default": {
      "source": "userProvided",
      "dimensions": 1024
    }
  }
})

TaskInfo(task_uid=65, index_uid='publications', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2025, 1, 27, 19, 31, 13, 434230))

In [27]:
index.get_settings()['embedders']

{'default': UserProvidedEmbedder(source='userProvided', dimensions=1024)}

In [29]:
index.update_distinct_attribute("document_id")

TaskInfo(task_uid=66, index_uid='publications', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2025, 1, 27, 19, 31, 19, 495236))

In [30]:
index.get_distinct_attribute()


'document_id'

---

# Retrieval

## Prepare documents

In [32]:
raw_documents = [
    [
        "Artificial Intelligence (AI) has rapidly transformed various industries, from healthcare to finance. It is driven by algorithms that analyze large datasets, enabling machines to perform tasks previously thought exclusive to humans.",
        "Machine Learning, a subset of AI, allows systems to learn and improve from data without explicit programming. Applications range from recommendation systems to autonomous vehicles.",
        "Natural Language Processing (NLP) is another critical AI component, powering technologies like chatbots and voice assistants. It enables machines to understand, interpret, and respond to human language.",
        "AI ethics have become a significant concern, addressing issues like bias in algorithms, data privacy, and job displacement. Transparent and accountable AI development is a growing priority.",
        "Despite its challenges, AI continues to innovate and improve, offering solutions to complex problems and creating opportunities for businesses and individuals."
    ],
    [
        "Blockchain technology underpins cryptocurrencies like Bitcoin and Ethereum, offering a decentralized and secure method for recording transactions.",
        "Smart contracts, powered by blockchain, allow automated and trustless agreements between parties. These are widely used in decentralized finance (DeFi) applications.",
        "The technology's transparency and immutability make it valuable for supply chain management, ensuring authenticity and traceability of goods.",
        "Non-fungible tokens (NFTs) have gained popularity as digital assets representing ownership of unique items like art and collectibles on the blockchain.",
        "Challenges like scalability, energy consumption, and regulatory scrutiny continue to shape the future of blockchain adoption."
    ],
    [
        "Cloud computing has revolutionized the IT industry by providing scalable and flexible resources over the internet. Companies can access computing power, storage, and services on demand.",
        "Public cloud platforms like AWS, Microsoft Azure, and Google Cloud dominate the market, offering a range of services from infrastructure to AI tools.",
        "Hybrid cloud solutions enable businesses to combine on-premises infrastructure with cloud resources, offering the best of both worlds for flexibility and security.",
        "Cloud computing supports innovation through tools like serverless computing, which allows developers to focus on code without managing infrastructure.",
        "Security and data privacy remain critical considerations, driving advancements in encryption and compliance frameworks in cloud environments."
    ],
    [
        "The Internet of Things (IoT) connects everyday devices to the internet, creating a network of 'smart' devices that can communicate and automate tasks.",
        "Smart home technology, including thermostats, lights, and security cameras, is a prominent IoT application, enhancing convenience and energy efficiency.",
        "IoT is revolutionizing industries like healthcare through wearable devices that monitor patient health in real-time, enabling proactive care.",
        "Industrial IoT (IIoT) improves manufacturing processes by enabling predictive maintenance, optimizing resource use, and enhancing operational efficiency.",
        "As IoT devices proliferate, concerns about security vulnerabilities and data breaches are increasing, requiring robust solutions and standards."
    ],
    [
        "Cybersecurity is a critical concern in the digital age, with threats ranging from phishing attacks to ransomware targeting individuals and organizations.",
        "Advanced security measures, like multi-factor authentication (MFA) and end-to-end encryption, are becoming standard practices to protect data.",
        "Cybersecurity frameworks, such as Zero Trust Architecture, focus on continuous verification and limiting access to sensitive information.",
        "Ethical hacking and penetration testing are essential for identifying and mitigating vulnerabilities before malicious actors exploit them.",
        "The growing importance of cybersecurity has led to a demand for skilled professionals and the development of cutting-edge technologies like AI-powered threat detection."
    ]
]


documents = []

for (idx, document) in enumerate(raw_documents):
    documents.append({
        "id": idx,
        "embeddings": model.encode(document),
        "paragraphs": document,
    })

In [22]:
index.delete_all_documents()

TaskInfo(task_uid=63, index_uid='publications', status='enqueued', type='documentDeletion', enqueued_at=datetime.datetime(2025, 1, 27, 19, 30, 55, 964221))

## Add documents


In [38]:
for (i, document) in enumerate(documents):
    id = document["id"]
    embeddings = document["embeddings"]

    for (j, embedding) in enumerate(embeddings):
        index.add_documents([{
            "id": f"{id}-{j}",
            "document_id": id,
            "_vectors": {
                "default": embedding.tolist()
            },
            "text": document["paragraphs"][j],
        }], "id")

## Query documents

In [35]:
query = "autonomne vozidla"
embedding = model.encode([query])[0].tolist()

In [36]:
index.search(None, {
  # "limit": 10,
  # "offset": 5,
  "vector": embedding,
  "showRankingScore": True,
  "hitsPerPage": 2,
  "hybrid": {
    "embedder": "default",
    "semanticRatio": 1
  }
})

{'hits': [],
 'query': '',
 'processingTimeMs': 1,
 'hitsPerPage': 2,
 'page': 1,
 'totalPages': 0,
 'totalHits': 0,
 'semanticHitCount': 0}

In [37]:
docs = index.get_documents().results

for doc in docs:
  print({
    "id": doc.id,
    "text": doc.text,
    "document_id": doc.document_id
  })

print(len(docs))

0
