In [1]:
! wget https://storage.googleapis.com/generall-shared-data/startups_demo.json

--2025-01-30 22:54:56--  https://storage.googleapis.com/generall-shared-data/startups_demo.json
Resolving storage.googleapis.com (storage.googleapis.com)... 142.250.0.207, 64.233.190.207, 64.233.186.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.250.0.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 22205751 (21M) [application/json]
Saving to: ‘startups_demo.json’


2025-01-30 22:54:59 (11.4 MB/s) - ‘startups_demo.json’ saved [22205751/22205751]



In [5]:
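# Uncomment to install everything this notebook imports into the kernel's environment:
# %pip install sentence-transformers numpy pandas tqdm qdrant-client python-dotenv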

In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np
import json
import pandas as pd
from tqdm.notebook import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Sentence-embedding model, pinned to the CPU here.
# Pass device="cuda" instead if a GPU is available.
model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")

In [8]:
# The dump is read as JSON Lines (one JSON object per line), hence lines=True.
df = pd.read_json("./startups_demo.json", lines=True)

In [9]:
# Encode "<alt>. <description>" for every startup, in DataFrame row order,
# so row i of `vectors` corresponds to row i of `df`.
vectors = model.encode(
    [
        alt + ". " + description
        for alt, description in zip(df["alt"], df["description"])
    ],
    show_progress_bar=True,
)

Batches: 100%|██████████| 1265/1265 [01:39<00:00, 12.76it/s]


In [11]:
# Sanity check: one embedding row per encoded text.
vectors.shape
# > (40474, 384)  i.e. (number of encoded texts, embedding dimension)

(40474, 384)

In [12]:
# Persist the embeddings so they can be reloaded later without re-encoding.
# allow_pickle=False makes np.save refuse object arrays instead of pickling them.
np.save("startup_vectors.npy", vectors, allow_pickle=False)

In [13]:
# Import client library
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance

import json
import os
from dotenv import load_dotenv

# Pull QDRANT_API_URL / QDRANT_KEY from a local .env file, if one exists.
load_dotenv()


def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: If the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        # Raise instead of calling exit(...): inside a Jupyter kernel `exit`
        # is IPython's exit helper rather than sys.exit, so the message would
        # not be reported as an error.
        raise RuntimeError(f"{name} environment variable not set")
    return value


client = QdrantClient(
    url=_require_env("QDRANT_API_URL"),
    api_key=_require_env("QDRANT_KEY"),
)

In [14]:
# Create the collection only when it does not exist yet; an existing
# "startups" collection is left untouched, so this cell is safe to re-run.
if not client.collection_exists("startups"):
    client.create_collection(
        collection_name="startups",
        # size must equal the embedding width (vectors.shape[1] == 384);
        # points are compared by cosine distance.
        vectors_config=VectorParams(size=384, distance=Distance.COSINE),
    )

In [15]:
# Parse every startup record up front and close the file immediately.
# A list (unlike a lazy `map` over an open handle) can be iterated more than
# once, so re-running the upload cell does not see an exhausted iterator,
# and no file descriptor is left open. JSON text is UTF-8, so the encoding is
# pinned rather than taken from the platform locale.
with open("./startups_demo.json", encoding="utf-8") as fd:
    payload = [json.loads(line) for line in fd]

# Load all vectors into memory; a numpy array is itself iterable row by row.
# If RAM is a concern, np.load(..., mmap_mode="r") memory-maps the file instead.
vectors = np.load("./startup_vectors.npy")

In [16]:
# Send the vectors and their payloads to the "startups" collection in batches.
client.upload_collection(
    collection_name="startups",
    payload=payload,
    vectors=vectors,
    batch_size=256,  # number of vectors sent per request
    ids=None,  # no explicit ids: they are assigned automatically
)

In [21]:
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer


class NeuralSearcher:
    """Semantic search over a Qdrant collection.

    Queries are embedded with the ``all-MiniLM-L6-v2`` sentence-transformers
    model (on CPU) and matched against the vectors stored in the collection.
    """

    def __init__(self, collection_name):
        """Connect to Qdrant and load the encoder model.

        Args:
            collection_name: Name of the Qdrant collection to query.

        Raises:
            RuntimeError: If ``QDRANT_API_URL`` or ``QDRANT_KEY`` is unset
                or empty.
        """
        self.collection_name = collection_name
        # Resolve the connection settings first so a missing variable fails
        # fast, before the comparatively slow model load.
        url = self._require_env("QDRANT_API_URL")
        api_key = self._require_env("QDRANT_KEY")
        # Initialize encoder model
        self.model = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
        # Initialize Qdrant client
        self.qdrant_client = QdrantClient(url=url, api_key=api_key)

    @staticmethod
    def _require_env(name):
        """Return environment variable ``name``; raise if unset or empty."""
        import os  # local import: the class does not rely on an earlier cell

        value = os.getenv(name)
        if not value:
            # Raise instead of calling exit(...): inside a Jupyter kernel
            # `exit` is IPython's exit helper rather than sys.exit, so the
            # message would not be reported as an error.
            raise RuntimeError(f"{name} environment variable not set")
        return value

    def search(self, text: str, limit: int = 5):
        """Return the payloads of the points closest to ``text``.

        Args:
            text: Free-text query to embed and search for.
            limit: Maximum number of results to return (default 5).

        Returns:
            A list with the stored payload of each hit, in the order the
            hits are returned by Qdrant.
        """
        # Convert the text query into a plain list of floats
        vector = self.model.encode(text).tolist()

        # Ask Qdrant for the vectors closest to the query vector
        response = self.qdrant_client.query_points(
            collection_name=self.collection_name,
            query=vector,
            query_filter=None,  # no filtering for now
            limit=limit,
        )
        # Each hit carries an id, a similarity score and the stored payload;
        # only the payload is of interest here.
        return [hit.payload for hit in response.points]
        
        