In [None]:
!pip install -U pymilvus
!pip install -U 'pymilvus[model]'
!pip install -U scikit-learn

In [1]:
# Toy corpus for the demo: three short English sentences. These strings are
# what gets vectorized with TF-IDF, embedded, and inserted into the "text"
# field of the collection later in the notebook.
docs = [
    "Artificial intelligence was founded as an academic discipline in 1956.",
    "Alan Turing was the first person to conduct substantial research in AI.",
    "Born in Maida Vale, London, Turing was raised in southern England.",
]

In [2]:
import random
import string

import numpy as np
import pymilvus
from pymilvus import (
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection, AnnSearchRequest, RRFRanker, connections,
)
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from sklearn.feature_extraction.text import TfidfVectorizer



In [5]:
# Classic sparse baseline: learn a TF-IDF vocabulary from the demo sentences,
# then encode those same sentences against it (one row per document).
vectorizer = TfidfVectorizer()
vectorizer.fit(docs)
X = vectorizer.transform(docs)
# Leave the matrix as the last expression so the notebook shows its summary.
X

<3x27 sparse matrix of type '<class 'numpy.float64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [7]:
# Print the learned vocabulary, then the dense view of the TF-IDF matrix.
# A single print with sep="\n" emits the same text as two consecutive prints.
print(vectorizer.get_feature_names_out(), X.todense(), sep="\n")

['1956' 'academic' 'ai' 'alan' 'an' 'artificial' 'as' 'born' 'conduct'
 'discipline' 'england' 'first' 'founded' 'in' 'intelligence' 'london'
 'maida' 'person' 'raised' 'research' 'southern' 'substantial' 'the' 'to'
 'turing' 'vale' 'was']
[[0.33907746 0.33907746 0.         0.         0.33907746 0.33907746
  0.33907746 0.         0.         0.33907746 0.         0.
  0.33907746 0.20026461 0.33907746 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.20026461]
 [0.         0.         0.3119513  0.3119513  0.         0.
  0.         0.         0.3119513  0.         0.         0.3119513
  0.         0.18424347 0.         0.         0.         0.3119513
  0.         0.3119513  0.         0.3119513  0.3119513  0.3119513
  0.23724701 0.         0.18424347]
 [0.         0.         0.         0.         0.         0.
  0.         0.32751633 0.         0.         0.32751633 0.
  0.         0.38687284 0.         0.32751633 0.32751633 0.

In [8]:
# Natural-language question used as the search query; it is embedded with the
# same embedding function as the documents and searched against them.
query: str = "Who started AI research?"

In [10]:
# Show which pymilvus client version this notebook ran against.
# `pymilvus` is imported in the notebook's import cell.
pymilvus.__version__

'2.4.0'

In [11]:
# Build the BGE-M3 embedding function on CPU with fp16 disabled. Calling it
# yields both a dense and a sparse representation per text.
ef = BGEM3EmbeddingFunction(
    device="cpu",
    use_fp16=False,
)
# Dimensionality of the dense output; needed for the dense vector field's `dim`.
dense_dim = ef.dim["dense"]

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


Fetching 23 files:   0%|          | 0/23 [00:00<?, ?it/s]

In [13]:
# Embed the corpus and the query with the same function. The query is wrapped
# in a one-element list because the function is called with a list of texts.
docs_embeddings, query_embeddings = ef(docs), ef([query])

In [14]:
# Inspect the embedding result: a dict with a "dense" entry (one float32 array
# per document) and a "sparse" entry (a CSR sparse array, one row per document).
docs_embeddings

{'dense': [array([-0.02505933, -0.00142187,  0.04015452, ..., -0.0209493 ,
          0.02623649,  0.00324106], dtype=float32),
  array([ 0.00118473,  0.00649282, -0.0073576 , ..., -0.01446302,
          0.04243686, -0.01794804], dtype=float32),
  array([ 0.00415292, -0.01014923,  0.00098095, ..., -0.02559674,
          0.08084681,  0.00141655], dtype=float32)],
 'sparse': <3x250002 sparse array of type '<class 'numpy.float32'>'
 	with 43 stored elements in Compressed Sparse Row format>}

In [15]:
# Open the "default" connection to a Milvus server on localhost:19530.
# NOTE(review): assumes a Milvus instance is already running locally.
connections.connect(alias="default", host="localhost", port="19530")

In [19]:
# Name of the demo collection, kept in one place so the existence check, the
# drop, and the create below always refer to the same collection.
collection_name = "sparse_dense_demo"

# Collection layout: an auto-generated string primary key, the raw text, and
# one sparse plus one dense vector per document.
fields = [
    # Use auto generated id as primary key
    FieldSchema(name="pk", dtype=DataType.VARCHAR,
                is_primary=True, auto_id=True, max_length=100),
    # Original sentence, returned with search hits via output_fields.
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=512),
    # Sparse vectors carry no fixed dimension in the schema.
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    # Dense vectors must match the embedding model's dense output size.
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR,
                dim=dense_dim)
]
schema = CollectionSchema(fields, "")

# Start from a clean collection so the notebook is re-runnable: without this,
# every re-run inserts the same documents again into the existing collection
# and searches return duplicate hits.
# WARNING: this permanently deletes any existing collection with this name.
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)
col = Collection(collection_name, schema)

In [20]:
# One index per vector field: an inverted index scored by inner product for
# the sparse vectors, and a FLAT index scored by cosine similarity for the
# dense vectors.
sparse_index = dict(index_type="SPARSE_INVERTED_INDEX", metric_type="IP")
dense_index = dict(index_type="FLAT", metric_type="COSINE")

col.create_index(field_name="sparse_vector", index_params=sparse_index)
# Last expression: its returned status is what the cell displays.
col.create_index(field_name="dense_vector", index_params=dense_index)

Status(code=0, message=)

In [21]:
# Column-oriented insert: one list per non-auto-generated field, in schema
# order (text, sparse_vector, dense_vector). The "pk" column is omitted
# because the schema declares it auto_id.
entities = [
    docs,
    docs_embeddings["sparse"],
    docs_embeddings["dense"],
]
col.insert(data=entities)
# Flush the collection after inserting.
col.flush()

In [23]:
# Load the collection before searching it.
# NOTE(review): presumably required by Milvus before any search — confirm.
col.load()

In [24]:
# Per-field search parameters; the metrics match the ones the indexes were
# created with (IP for sparse, COSINE for dense).
sparse_search_params = {"metric_type": "IP"}
dense_search_params = {"metric_type": "COSINE"}

# One ANN request per vector field, each asking for its own top 2 candidates.
sparse_req = AnnSearchRequest(
    data=query_embeddings["sparse"],
    anns_field="sparse_vector",
    param=sparse_search_params,
    limit=2,
)
dense_req = AnnSearchRequest(
    data=query_embeddings["dense"],
    anns_field="dense_vector",
    param=dense_search_params,
    limit=2,
)

# Combine both candidate lists with the RRF ranker and keep the best 2 hits,
# returning the stored text alongside each hit.
res = col.hybrid_search(
    reqs=[sparse_req, dense_req],
    rerank=RRFRanker(),
    limit=2,
    output_fields=["text"],
)

In [25]:
# Display the hybrid search hits: each carries an id, a fused distance score,
# and the requested "text" field.
res

['["id: 448695871357611268, distance: 0.032786883413791656, entity: {\'text\': \'Alan Turing was the first person to conduct substantial research in AI.\'}", "id: 448695871357611267, distance: 0.016129031777381897, entity: {\'text\': \'Artificial intelligence was founded as an academic discipline in 1956.\'}"]']