Let's define a few tables in Pydantic style.

In [None]:
from pgvector.sqlalchemy import Vector
from sqlmodel import Field, SQLModel, Column, Relationship

from microsearch import TrigramIndex, FullTextIndex


# The Meta table contains the metadata of a document.
class Meta(SQLModel, table=True):
    # Primary key; None until a value is assigned (presumably by the database
    # on insert).
    id: int | None = Field(default=None, primary_key=True)
    # Title of the document.
    title: str

    # This is how you define a relationship to the document(s) referencing this
    # Meta object. It is paired with the `meta` attribute on Document.
    documents: list["Document"] | None = Relationship(back_populates="meta")



# The Document table stores a piece of text together with its embedding vector
# and a link to the Meta row that describes it.
class Document(SQLModel, table=True):
    # Primary key; None until a value is assigned (presumably by the database
    # on insert).
    id: int | None = Field(default=None, primary_key=True)
    text: str

    # This column will store embedding vectors for semantic search.
    # The field defaults to None, so the annotation is optional to match.
    vector: list[float] | None = Field(
        default=None, sa_column=Column(Vector(dim=384))
    )

    # Foreign key to Meta object and property-like accessor to Meta object.
    meta_id: int | None = Field(default=None, foreign_key="meta.id")
    meta: Meta | None = Relationship(back_populates="documents")

    # This will create two indices over the text column: one for trigram
    # search and one for full-text search.
    __table_args__ = (
        TrigramIndex(table="document", column="text"),
        FullTextIndex(table="document", column="text"),
    )


Next, create the tables and indices in the database.

In [None]:
from sqlmodel import create_engine

# Connection string of the PostgreSQL database used throughout this notebook.
DATABASE_URL = "postgresql://admin:admin@localhost:5432/db"

# echo=False: do not log the emitted SQL statements.
engine = create_engine(DATABASE_URL, echo=False)

# Create everything registered on the SQLModel metadata in the database.
SQLModel.metadata.create_all(engine)

In [None]:
# Optional cleanup: uncomment this cell to drop both tables and start over.
# from sqlmodel import Session, text

# with Session(engine) as session:
#     session.exec(text("DROP TABLE document"))
#     session.exec(text("DROP TABLE meta"))
#     session.commit()

Define an embedding function for vector search.

In [None]:
import ollama

def embed(text: str) -> list[float]:
    """Return the embedding vector of ``text`` computed by Ollama.

    The ``all-minilm:33m`` model is used. Only one input is sent, so the
    first (and only expected) embedding of the response is returned.
    """
    response = ollama.embed(model="all-minilm:33m", input=text)
    return response.embeddings[0]

Download and index example data.

In [None]:
from itertools import batched

import httpx
from sqlmodel import Session


# Download the book. Fail loudly on an HTTP error status instead of silently
# chunking, embedding and storing the body of an error page.
response = httpx.get("https://www.gutenberg.org/cache/epub/2591/pg2591.txt")
response.raise_for_status()
book = response.text

# Split the book into chunks of (at most) 100 whitespace-separated words and
# turn each chunk into a Document with its own Meta row and embedding vector.
docs = []
for no, words in enumerate(batched(book.split(), n=100)):
    chunk = " ".join(words)
    meta = Meta(title=f"Document number {no}")
    doc = Document(text=chunk, vector=embed(chunk), meta=meta)
    docs.append(doc)


# Store all documents in a single transaction.
with Session(engine) as session:
    session.add_all(docs)
    session.commit()

Perform a hybrid search with the RRF (reciprocal rank fusion) reranker.

In [None]:
# %%timeit -n 5

from microsearch import Result, microsearch, weighted_reciprocal_rank, wrapped

query = "the queen bee"
# Embed the query once and reuse the vector for the semantic search below.
query_vec = embed(query)


def ident(doc: Result[Document]) -> int | None:
    """Return unique (hashable) property of the object.

    The primary key of the wrapped Document is used, so the same document
    found by several searches is recognised as one item.
    """
    return doc.item.id


with microsearch(engine) as use:
    # Run the full-text, semantic and trigram searches and merge their
    # rankings into a single one.
    docs, scores = weighted_reciprocal_rank(
        arrays=[
            use.fulltext(table=Document, query=query),
            use.semantic(table=Document, query=query_vec),
            use.trigram(table=Document, query=query, strict=True),
        ],
        ident_fn=ident,
    )

    # len(docs)
    # Print the first 20 results: rank, result kind, score, and the first
    # 60 characters of the document text.
    for i, doc in enumerate(docs[:20], start=1):
        print(
            f"{i:>5} | {doc.kind:^8} | {doc.score:>5.4f} | ",
            wrapped(doc.item.text[:60] + "..."),
            sep="",
        )