In [1]:
# wget https://storage.googleapis.com/generall-shared-data/startups_demo.json

In [3]:
# !pip install "qdrant-client[fastembed]>=1.8.2"

In [1]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

import json
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: If the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        # Raise instead of calling `exit(...)`: inside a Jupyter kernel `exit`
        # is IPython's exit helper, not `sys.exit`, so the message would not
        # surface as an error in the cell output.
        raise RuntimeError(f"{name} environment variable not set")
    return value


# Shared Qdrant client used by the setup cells below.
client = QdrantClient(
    url=_require_env("QDRANT_API_URL"),
    api_key=_require_env("QDRANT_KEY"),
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Select the dense embedding model first, then the sparse one.
dense_model_name = "sentence-transformers/all-MiniLM-L6-v2"
sparse_model_name = "prithivida/Splade_PP_en_v1"

client.set_model(dense_model_name)
# comment this line to use dense vectors only
client.set_sparse_model(sparse_model_name)

In [6]:
# client.delete_collection("startups")

True

In [4]:
# Create the "startups" collection only when it is missing, so re-running
# this cell skips the creation step.
collection_missing = not client.collection_exists("startups")
if collection_missing:
    dense_config = client.get_fastembed_vector_params()
    # comment this line (and the matching argument below) to use dense vectors only
    sparse_config = client.get_fastembed_sparse_vector_params()
    client.create_collection(
        collection_name="startups",
        vectors_config=dense_config,
        sparse_vectors_config=sparse_config,
    )

In [8]:
import json

payload_path = "startups_demo.json"
metadata = []
documents = []

# The file holds one JSON object per line. Read it as UTF-8 explicitly so the
# result does not depend on the platform's default text encoding.
with open(payload_path, encoding="utf-8") as fd:
    for line in fd:
        # Tolerate blank lines (e.g. an extra newline at the end of the file)
        # instead of letting json.loads fail on them.
        if not line.strip():
            continue
        obj = json.loads(line)
        # "description" becomes the document text; every remaining field is
        # kept as that document's metadata.
        documents.append(obj.pop("description"))
        metadata.append(obj)

In [None]:
# Upload the documents into the "startups" collection:
# - documents: the texts that get encoded and stored as vectors
# - metadata: extra data stored alongside each document
# - parallel: 0 means "encode on all available CPU cores"
#   (this requires wrapping the code in an `if __name__ == '__main__'` block)
encode_workers = 0
client.add(
    collection_name="startups",
    documents=documents,
    metadata=metadata,
    parallel=encode_workers,
)

In [5]:
# Download the pre-processed startups archive and unpack it.
# Per the captured output below, the archive contains dense_vectors.npy,
# sparse_vectors.json and payload.json.
!wget https://storage.googleapis.com/dataset-startup-search/startup-list-com/startups_hybrid_search_processed_40k.tar.gz
!tar -xvf startups_hybrid_search_processed_40k.tar.gz

--2025-01-31 19:24:54--  https://storage.googleapis.com/dataset-startup-search/startup-list-com/startups_hybrid_search_processed_40k.tar.gz
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.0.207, 142.251.0.207, 172.217.192.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.0.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 149774371 (143M) [application/x-gzip]
Saving to: ‘startups_hybrid_search_processed_40k.tar.gz’


2025-01-31 19:25:05 (15.8 MB/s) - ‘startups_hybrid_search_processed_40k.tar.gz’ saved [149774371/149774371]

x dense_vectors.npy
x sparse_vectors.json
x payload.json


In [6]:
from qdrant_client import QdrantClient, models

collection_name = "startups"

class HybridSearcher:
    """Text search over a Qdrant collection.

    The client is configured with a dense embedding model and a sparse one,
    and `search` sends the raw query text to the client's `query` method.
    """

    DENSE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    SPARSE_MODEL = "prithivida/Splade_PP_en_v1"

    def __init__(self, collection_name):
        """Connect to Qdrant and register the embedding models.

        Args:
            collection_name: Name of the collection that `search` queries.

        Raises:
            RuntimeError: If QDRANT_API_URL or QDRANT_KEY is unset or empty.
        """
        self.collection_name = collection_name
        # initialize Qdrant client
        self.qdrant_client = QdrantClient(
            url=self._require_env("QDRANT_API_URL"),
            api_key=self._require_env("QDRANT_KEY"),
        )
        self.qdrant_client.set_model(self.DENSE_MODEL)
        # comment this line to use dense vectors only
        self.qdrant_client.set_sparse_model(self.SPARSE_MODEL)

    @staticmethod
    def _require_env(name):
        """Return environment variable `name`, raising RuntimeError if it is unset or empty."""
        value = os.getenv(name)
        if not value:
            # Raise instead of calling `exit(...)`: inside a Jupyter kernel
            # `exit` is IPython's exit helper, not `sys.exit`, so the message
            # would not surface as an error in the cell output.
            raise RuntimeError(f"{name} environment variable not set")
        return value

    def search(self, text: str, city: "str | None" = "Berlin", limit: int = 5) -> list:
        """Return the metadata of the hits closest to `text`.

        Args:
            text: Query text passed to the client as `query_text`.
            city: Only return hits whose "city" field matches this value.
                Defaults to "Berlin" (the previously hard-coded value);
                pass None to search without any filter.
            limit: Maximum number of hits to return.

        Returns:
            A list with the `metadata` attribute of every hit, in the order
            the client returned them.
        """
        city_filter = None
        if city is not None:
            # Restrict results to records whose "city" field equals `city`.
            city_filter = models.Filter(
                must=[
                    models.FieldCondition(
                        key="city",
                        match=models.MatchValue(value=city),
                    )
                ]
            )

        search_result = self.qdrant_client.query(
            collection_name=self.collection_name,
            query_text=text,
            query_filter=city_filter,  # None means "no filter"
            limit=limit,
        )

        # Select and return metadata
        return [hit.metadata for hit in search_result]