In [41]:
import meilisearch
import requests
import json

from sentence_transformers import SentenceTransformer

In [42]:
base_url = 'http://localhost:7700'
api_key = 'y9iPmYwIjjm6uqLZ9SejDViU6SbkfufLZTfuffufMzfPJM'

headers = {
  "Authorization": f"Bearer {api_key}",
  "Content-Type": "application/json"
}

client = meilisearch.Client(base_url, api_key)


In [43]:
client.version()

{'commitSha': '885b9f07b10ce7c22cbbbe0d6f3468ad1fbc4fd4',
 'commitDate': '2024-09-23T16:19:53.000000000Z',
 'pkgVersion': '1.10.2'}

In [44]:
index = client.index("publications")

## Model setup

In [45]:
model = SentenceTransformer('intfloat/multilingual-e5-large')

---

# Config meilisearch

In [28]:
requests.get(f"{base_url}/experimental-features", headers=headers).json()

{'vectorStore': True,
 'metrics': False,
 'logsRoute': False,
 'editDocumentsByFunction': False,
 'containsFilter': False}

In [38]:
requests.patch(
  f"{base_url}/experimental-features",
  json={ "vectorStore": True },
  headers=headers
).json()

{'vectorStore': True,
 'metrics': False,
 'logsRoute': False,
 'editDocumentsByFunction': False,
 'containsFilter': False}

In [30]:
index.reset_settings()

TaskInfo(task_uid=21, index_uid='publications', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2024, 9, 30, 22, 55, 54, 666697))

In [31]:
index.update_settings({
  "embedders": {
    "default": {
      "source": "userProvided",
      "dimensions": 1024
    }
  }
})

TaskInfo(task_uid=22, index_uid='publications', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2024, 9, 30, 22, 55, 56, 408900))

In [5]:
index.get_settings()['embedders']

{'default': UserProvidedEmbedder(source='userProvided', dimensions=1024)}

In [32]:
index.update_distinct_attribute("document_id")

TaskInfo(task_uid=23, index_uid='publications', status='enqueued', type='settingsUpdate', enqueued_at=datetime.datetime(2024, 9, 30, 22, 56, 0, 905548))

In [80]:
index.get_distinct_attribute()


'document_id'

---

# Retrieval

## Prepare documents

In [54]:
with open("documents.json") as f:
  raw_documents = json.load(f)

documents = []

for (idx, document) in enumerate(raw_documents):
  documents.append({
    "id": idx,
    "embeddings": model.encode(document["paragraphs"]),
    "paragraphs": document["paragraphs"],
  })


In [10]:
index.delete_all_documents()

TaskInfo(task_uid=136, index_uid='publications', status='enqueued', type='documentDeletion', enqueued_at=datetime.datetime(2024, 10, 25, 20, 27, 41, 108743))

## Add documents


In [12]:
for (i, document) in enumerate(documents):
  id = document["id"]
  embeddings = document["embeddings"]
  
  for (j, embedding) in enumerate(embeddings):
    index.add_documents([{
      "id": f"{id}-{j}",
      "document_id": id,
      "_vectors": {
        "default": embedding.tolist()
      },
      "text": document["paragraphs"][j],
    }])


## Query documents

In [55]:
query = "autonomne vozidla"
embedding = model.encode([query])[0].tolist()

In [56]:
index.search(None, {
  # "limit": 10,
  # "offset": 5,
  "vector": embedding,
  "showRankingScore": True,
  "hitsPerPage": 2,
  "hybrid": {
    "embedder": "default",
    "semanticRatio": 1
  }
})

{'hits': [{'id': '0-2',
   'text': 'In recent years, deep learning—a subset of machine learning that uses neural networks—has gained significant attention. Techniques like convolutional neural networks (CNNs) for image processing and recurrent neural networks (RNNs) for sequential data have led to breakthroughs in fields such as natural language processing and computer vision.',
   'document_id': 0,
   '_rankingScore': 0.873306393623352},
  {'id': '1-2',
   'text': 'Consensus algorithms like Paxos and Raft are vital in maintaining consistency across distributed systems, especially in environments prone to network failures. These algorithms ensure that all nodes in the system agree on the same state, even if some nodes experience downtime or unreliable connections.',
   'document_id': 1,
   '_rankingScore': 0.8723658323287964}],
 'query': '',
 'processingTimeMs': 1,
 'hitsPerPage': 2,
 'page': 1,
 'totalPages': 6,
 'totalHits': 11,
 'semanticHitCount': 2}

In [57]:
docs = index.get_documents().results

for doc in docs:
  print({
    "id": doc.id,
    "text": doc.text,
    "document_id": doc.document_id
  })

print(len(docs))

{'id': '0-0', 'text': 'Machine learning (ML) algorithms enable systems to learn patterns and make predictions based on data. The most common types of machine learning algorithms include supervised learning, unsupervised learning, and reinforcement learning. Supervised learning uses labeled data to train models, while unsupervised learning deals with unstructured data to identify hidden patterns.', 'document_id': 0}
{'id': '0-1', 'text': 'Some popular supervised learning algorithms include linear regression, decision trees, and support vector machines (SVMs). Each algorithm is designed for specific tasks, such as classification or regression, and has its strengths depending on the problem domain and the size of the dataset.', 'document_id': 0}
{'id': '0-2', 'text': 'In recent years, deep learning—a subset of machine learning that uses neural networks—has gained significant attention. Techniques like convolutional neural networks (CNNs) for image processing and recurrent neural networks 