# Setup

1. Run the search worker inside Docker
```shell
docker run --name search-worker-test -p 8080:8080 -v ./index:/mounted_store/index hakes-searchworker:v1
```
2. Run MongoDB inside Docker
```shell
docker run -d -p 27017:27017 --name mongo-test mongo
```
3. Configure environment variables
```shell
export SEARCH_WORKER_ADDR=http://host:port
export MONGO_ADDR=mongodb://host:port
export HF_API_KEY=api_key
```

In [None]:
import os
import sys
sys.path.append('..')
sys.path.append('../../../python/')

import numpy as np
import random
import torch
from hakesclient import Client, ClientConfig
from hakesclient.components.store import Store
from hakesclient.components.embedder import Embedder
from hakesclient.extensions.mongodb import MongoDB
from hakesclient.extensions.huggingface import HuggingFaceEmbedder
from hakes.index.build import init_hakes_params, build_dataset, train_hakes_params, recenter_ivf

INDEX_DIR = "../../../index/"
COLLECTION_NAME = "poc"

# Init index

When calling `load_collection` in the search worker, a missing index file will cause a `checkpoint not found error`. Therefore, the initial index file must be created manually.

> NOTE: Calling `load_collection` twice in the search worker can cause the container to terminate unexpectedly.

In [13]:
def init_index(collection_name):
    os.makedirs(os.path.join(INDEX_DIR, COLLECTION_NAME), exist_ok=True)
    os.makedirs(os.path.join(INDEX_DIR, COLLECTION_NAME, 'checkpoint_0'), exist_ok=True)
    dir = os.path.join(INDEX_DIR, collection_name, 'checkpoint_0')

    seed = 0
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    data = np.random.randn(10000, 768).astype(np.float32)
    data = data / np.linalg.norm(data, axis=1, keepdims=True)

    index = init_hakes_params(data, 384, 100, "ip")
    index.set_fixed_assignment(True)
    index.save_as_hakes_index(os.path.join(dir, "findex.bin"))
    sample_ratio = 0.1
    dataset = build_dataset(data, sample_ratio=sample_ratio, nn=50)
    train_hakes_params(
        model=index,
        dataset=dataset,
        epochs=3,
        batch_size=128,
        lr_params={"vt": 1e-4, "pq": 1e-4, "ivf": 0},
        loss_weight={
            "vt": "rescale",
            "pq": 1,
            "ivf": 0,
        },
        temperature=1,
        loss_method="hakes",
        device="cpu",
    )
    recenter_ivf(index, data, sample_ratio)
    index.save_as_hakes_index(os.path.join(dir, "uindex.bin"))

# init index if index checkpoint files of specified collection not exist
if not (os.path.isdir(os.path.join(INDEX_DIR, COLLECTION_NAME)) and any(os.scandir(os.path.join(INDEX_DIR, COLLECTION_NAME)))):
    init_index(COLLECTION_NAME)

# Build clients


In [14]:
config = ClientConfig(search_worker_addrs=[os.getenv("SEARCH_WORKER_ADDR")])
client = Client(config)
store: Store = MongoDB(os.getenv('MONGO_ADDR'), "hakes", COLLECTION_NAME)
embedder: Embedder = HuggingFaceEmbedder(os.getenv('HF_API_KEY'), 'google/embeddinggemma-300m')

# Add doc

In [15]:
doc = {
        'key': 'doc1',
        'value': '高血压病人可以口服党参的。党参有降血脂，降血压的作用，可以彻底消除血液中的垃圾，从而对冠心病以及心血管疾病的患者都有一定的稳定预防工作作用，因此平时口服党参能远离三高的危害。另外党参除了益气养血，降低中枢神经作用，调整消化系统功能，健脾补肺的功能。'
    }

# add doc to store and return xid
success, xids = store.put([doc['key']], [doc['value']], None)
if not success:
    raise RuntimeError("Failed to add document to the store")
xids = [int.from_bytes(xid, "big", signed=True) for xid in xids]
# embed doc
vector = embedder.embed_text([doc['value']])
# add vector to index with xid
client.add(COLLECTION_NAME, vector, xids)
# test checkpoint function
client.checkpoint(COLLECTION_NAME)

{'msg': 'checkpoint success', 'status': True}

# Search

In [16]:
question = "我有高血压这两天女婿来的时候给我拿了些党参泡水喝，您好高血压可以吃党参吗？"
question_vector = embedder.embed_text([question])

xids = client.search(COLLECTION_NAME, question_vector, 3, 20, 5, "IP")['ids'][0]
xids = [x.to_bytes(8, "big", signed=True) for x in xids]
doc = store.get_by_ids(xids)

# generate prompt
prompt = f"根据以下内容，简要回答用户的问题。\n\n"
for i, d in enumerate(doc):
    if not d:
        continue
    prompt += f"内容{i+1}：{d}\n\n"
prompt += f"用户的问题：{question}\n简要回答："

print(prompt)

(1, 768)
search result: {'ids': [[24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, -1, -1, -1]], 'msg': 'search success', 'scores': [[0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, 0.27845263481140137, -0.6673946976661682, -0.6673946976661682, -0.6673946976661682]], 'status': True}
(1, 768)
根据以下内容，简要回答用户的问题。

内容2：高血压病人可以口服党参的。党参有降血脂，降血压的作用，可以彻底消除血液中的垃圾，从而对冠心病以及心血管疾病的患者都有一定的稳定预防工作作用，因此平时口服党参能远离三高的危害。另外党参除了益气养血，降低中枢神经作用，调整消化系统功能，健脾补肺的功能。

用户的问题：我有高血压这两天女婿来的时候给我拿了些党参泡水喝，您好高血压可以吃党参吗？
简要回答：
