Neural Search using Bert transformer and qdrant database

Test out the matching process for words/sentences using a pretrained sentence transformer and the Qdrant vector database.

The main idea is taken from the article: https://blog.qdrant.tech/neural-search-tutorial-3f034ab13adc.

The transformer model used: https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2

Qdrant documentation https://qdrant.tech/documentation/


Qdrant is installed and run via Docker; see the quick start guide:

https://qdrant.tech/documentation/quick_start/


In [13]:
from sentence_transformers import SentenceTransformer
from typing import Optional, List
from qdrant_client import QdrantClient
import numpy as np
from qdrant_client.models import Distance
from typing import Optional
from qdrant_client.conversions.common_types import Record
from qdrant_client.models import VectorParams


# Path to the text file with the words/sentences on which the matching should be done (read line by line)
file_path = "qdrant_data/data.txt"
# Path of the .npy file in which the computed embeddings are stored
out_file_path = "qdrant_data/word_embeddings.npy"

In [8]:
# 1. Transform the words to vectors using the prebuilt model

def transform_names_to_vectors(file_path: str, batch_size: Optional[int] = 30000):
    """
    Encode every line of a text file into an embedding and save the result.

    Each line of ``file_path`` is stripped of surrounding whitespace and encoded
    with the pretrained ``paraphrase-multilingual-mpnet-base-v2`` sentence
    transformer on the CPU. The embeddings are then written via
    ``_dump_to_file`` to the module-level ``out_file_path``.

    Args:
        file_path: Path to a UTF-8 text file with one word/sentence per line.
        batch_size: Accepted for backward compatibility only; the encoding
            step does not use it.
            TODO(review): decide whether this should chunk the input or be
            forwarded to ``model.encode``.

    Raises:
        OSError: If ``file_path`` cannot be opened.
    """
    # Read the input first: a missing file fails fast (before the model is
    # loaded) and the file handle is closed before the slow encoding step.
    # The encoding is explicit because the model is multilingual and the
    # platform default encoding may not be UTF-8.
    with open(file_path, encoding="utf-8") as file:
        names = [line.strip() for line in file]

    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2', device="cpu")
    names_embeddings = model.encode(names)
    _dump_to_file(out_file_path=out_file_path, vectors=names_embeddings)


def _dump_to_file(out_file_path: str, vectors: List):
    """Persist ``vectors`` at ``out_file_path`` using ``numpy.save``."""
    np.save(file=out_file_path, arr=vectors)


In [14]:
# Encode the entries of the input file and store their embeddings on disk
transform_names_to_vectors(file_path=file_path)

In [28]:
# Read the word embeddings and ingest them into the Qdrant database

# Name of the Qdrant collection that is (re)created and searched
COLLECTION_NAME = "my_first_collection"
# Vector size configured for the collection; expected to equal the length of
# the embeddings produced by the model -- confirm against the model card
DIM = 768

def ingest_vectors_to_qdrant(file_path: str = out_file_path):
    """
    Load saved embeddings and upload them to a freshly recreated collection.

    The embeddings are read from ``file_path`` with ``numpy.load``. The
    collection ``COLLECTION_NAME`` is recreated on the Qdrant instance at
    ``localhost:6333`` with vector size ``DIM`` and dot-product distance, and
    every embedding is uploaded as a record whose id is its row index.

    Args:
        file_path: Path to the ``.npy`` file holding the embeddings.
    """
    # Load before touching the database so a missing file does not cause the
    # collection to be recreated.
    vectors = np.load(file_path)

    def _iter_records():
        # Lazily build one record per embedding; the row index is the point id.
        for point_id, row in enumerate(vectors):
            yield Record(id=point_id, vector=row.tolist())

    qdrant = QdrantClient(host="localhost", port=6333)
    collection_config = VectorParams(size=DIM, distance=Distance.DOT)
    qdrant.recreate_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=collection_config,
    )
    qdrant.upload_records(
        collection_name=COLLECTION_NAME,
        records=_iter_records(),
        parallel=2,
    )



In [29]:
# Load the stored embeddings and upload them to the Qdrant collection
ingest_vectors_to_qdrant(file_path=out_file_path)

In [30]:
# Perform the search in the database for a given sentence

_SEARCH_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'
# Holds the loaded encoder so repeated searches do not reload the model.
_search_model_cache = {}


def _get_search_model():
    """Return the sentence encoder used for queries, loading it on first use."""
    if "model" not in _search_model_cache:
        _search_model_cache["model"] = SentenceTransformer(_SEARCH_MODEL_NAME, device="cpu")
    return _search_model_cache["model"]


def search(sentence: str, limit: int = 5):
    """
    Return the points in ``COLLECTION_NAME`` closest to ``sentence``.

    The sentence is encoded with the sentence transformer on the CPU and the
    resulting vector is sent as a query to the Qdrant instance at
    ``localhost:6333``.

    Args:
        sentence: Text to search for.
        limit: Maximum number of closest points to return (default 5).

    Returns:
        The hits returned by ``QdrantClient.search``.
    """
    # Loading the transformer is expensive, so it is done once and reused.
    query_vector = _get_search_model().encode(sentence).tolist()
    client = QdrantClient(host="localhost", port=6333)

    return client.search(
        collection_name=COLLECTION_NAME,
        query_vector=query_vector,
        limit=limit,
    )

In [31]:
# Query with a correctly spelled sentence
search(sentence ="Qdrant is a vector similarity search engine.") 

[ScoredPoint(id=0, version=0, score=7.943817, payload={}, vector=None),
 ScoredPoint(id=2, version=0, score=4.656517, payload={}, vector=None),
 ScoredPoint(id=3, version=0, score=3.063758, payload={}, vector=None),
 ScoredPoint(id=1, version=0, score=0.86934197, payload={}, vector=None),
 ScoredPoint(id=4, version=0, score=0.6023959, payload={}, vector=None)]

In [32]:
# Same query written with typos, to compare the returned ids and scores
search(sentence ="Qdrat is veccor similaily seatch enhine.") 

[ScoredPoint(id=0, version=0, score=3.9752455, payload={}, vector=None),
 ScoredPoint(id=2, version=0, score=3.2339041, payload={}, vector=None),
 ScoredPoint(id=3, version=0, score=1.3033721, payload={}, vector=None),
 ScoredPoint(id=4, version=0, score=1.1483693, payload={}, vector=None),
 ScoredPoint(id=1, version=0, score=1.0606894, payload={}, vector=None)]