In [1]:
import os
from typing import List, Dict

from pymilvus import MilvusClient, DataType
from sentence_transformers import SentenceTransformer
from pypdf import PdfReader
from dotenv import load_dotenv
load_dotenv()

# ========= CONFIGURATION =========

# Set these as environment variables OR hard-code (not recommended)
MILVUS_HOST = os.getenv("MILVUS_HOST")        # gRPC host from watsonx.data Milvus service
MILVUS_PORT = os.getenv("MILVUS_PORT", "443") # gRPC port from service
MILVUS_API_KEY = os.getenv("MILVUS_API_KEY")  # IBM Cloud API key

# Collection names
PUBLIC_COLLECTION = "offerings_public"
MANAGERS_COLLECTION = "offerings_managers_only"

# Embedding model (384 dimensions)
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
EMBEDDING_DIM = 384  # fixed for this model

# print(MILVUS_HOST)
# print(MILVUS_PORT)
# print(MILVUS_API_KEY)

  from pkg_resources import DistributionNotFound, get_distribution
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def connect_milvus() -> MilvusClient:
    if not (MILVUS_HOST and MILVUS_PORT and MILVUS_API_KEY):
        raise RuntimeError("Set MILVUS_HOST, MILVUS_PORT and MILVUS_API_KEY first.")
	

    milvus_uri = f"https://ibmlhapikey_michal.kordyzon@pl.ibm.com:{MILVUS_API_KEY}@{MILVUS_HOST}:{MILVUS_PORT}"

    client = MilvusClient(
        uri=milvus_uri,
        secure=True,
    )
    return client


In [3]:
# Connect & list collections

from pprint import pprint

client = connect_milvus()
print("Connected to IBM Milvus.")

print("\nCollections in Milvus:")
pprint(client.list_collections())


Connected to IBM Milvus.

Collections in Milvus:
['offerings_managers_only', 'offerings_public']


In [4]:
# Inspect schema & indexes
# Public collection
print("\n--- Schema: offerings_public ---")
schema_public = client.describe_collection(PUBLIC_COLLECTION)
pprint(schema_public)

print("\n--- Stats: offerings_public ---")
stats_public = client.get_collection_stats(PUBLIC_COLLECTION)
pprint(stats_public)


--- Schema: offerings_public ---
{'aliases': [],
 'auto_id': True,
 'collection_id': 462349868925826679,
 'collection_name': 'offerings_public',
 'consistency_level': 2,
 'created_timestamp': 462485509794693123,
 'description': '',
 'enable_dynamic_field': False,
 'fields': [{'auto_id': True,
             'description': '',
             'field_id': 100,
             'is_primary': True,
             'name': 'id',
             'params': {},
             'type': <DataType.INT64: 5>},
            {'description': '',
             'field_id': 101,
             'name': 'offering_id',
             'params': {'max_length': 256},
             'type': <DataType.VARCHAR: 21>},
            {'description': '',
             'field_id': 102,
             'name': 'text',
             'params': {'max_length': 2048},
             'type': <DataType.VARCHAR: 21>},
            {'description': '',
             'field_id': 103,
             'name': 'embedding',
             'params': {'dim': 384},
          

In [5]:
# Inspect schema & indexes
# Managers-only collection
print("\n--- Schema: offerings_managers_only ---")
schema_managers = client.describe_collection(MANAGERS_COLLECTION)
pprint(schema_managers)

print("\n--- Stats: offerings_managers_only ---")
stats_managers = client.get_collection_stats(MANAGERS_COLLECTION)
pprint(stats_managers)

# --- Schema: offerings_managers_only ---
# {'aliases': [],
#  'auto_id': True,
#  'collection_id': 462191026214080938,
#  'collection_name': 'offerings_managers_only',
#  'consistency_level': 2,
#  'created_timestamp': 462422309374263299,
#  'description': '',
#  'enable_dynamic_field': False,
#  'fields': [{'auto_id': True,
#              'description': '',
#              'field_id': 100,
#              'is_primary': True,
#              'name': 'id',
#              'params': {},
#              'type': <DataType.INT64: 5>},
#             {'description': '',
#              'field_id': 101,
#              'name': 'offering_id',
#              'params': {'max_length': 256},
#              'type': <DataType.VARCHAR: 21>},
#             {'description': '',
#              'field_id': 102,
#              'name': 'text',
#              'params': {'max_length': 2048},
#              'type': <DataType.VARCHAR: 21>},
#             {'description': '',
#              'field_id': 103,
#              'name': 'embedding',
#              'params': {'dim': 384},
#              'type': <DataType.FLOAT_VECTOR: 101>}],
#  'num_partitions': 1,
#  'num_shards': 1,
#  'properties': {}}

# --- Stats: offerings_managers_only ---
# {'row_count': 0}



--- Schema: offerings_managers_only ---
{'aliases': [],
 'auto_id': True,
 'collection_id': 462349868925826699,
 'collection_name': 'offerings_managers_only',
 'consistency_level': 2,
 'created_timestamp': 462485510319243267,
 'description': '',
 'enable_dynamic_field': False,
 'fields': [{'auto_id': True,
             'description': '',
             'field_id': 100,
             'is_primary': True,
             'name': 'id',
             'params': {},
             'type': <DataType.INT64: 5>},
            {'description': '',
             'field_id': 101,
             'name': 'offering_id',
             'params': {'max_length': 256},
             'type': <DataType.VARCHAR: 21>},
            {'description': '',
             'field_id': 102,
             'name': 'text',
             'params': {'max_length': 2048},
             'type': <DataType.VARCHAR: 21>},
            {'description': '',
             'field_id': 103,
             'name': 'embedding',
             'params': {'dim': 38

In [6]:
# Load collections & sample some rows
client.load_collection(PUBLIC_COLLECTION)
client.load_collection(MANAGERS_COLLECTION)
print("Collections loaded.")


Collections loaded.


In [7]:
# Sample from offerings_public
print("\nSample rows from offerings_public:")

rows_public = client.query(
    collection_name=PUBLIC_COLLECTION,
    filter="offering_id == 'offering_xyz'",
    output_fields=["id", "offering_id", "text"],
    limit=5,
)

for r in rows_public:
    print("----")
    pprint(r)



Sample rows from offerings_public:
----
{'id': 462349868925426229,
 'offering_id': 'offering_xyz',
 'text': 'FintechNova – Token-Based Insurance Portfolio (Public Overview) '
         'FintechNova is introducing a next-generation suite of token-based '
         'insurance products designed for digital-first customers who value '
         'transparency, automation, and rapid claim settlement. All products '
         'are built on our proprietary NovaChain technology, which enables '
         'secure token issuance, premium automation, and instant micro-payouts '
         'using smart contracts. This document provides an overview of our '
         'five flagship insurance products. 1. TravelFlex Token Insurance '
         'Purpose: A flexible, on-demand travel insurance designed for '
         'customers who want instant protection without lengthy forms. Key '
         'Features: • Pay-as-you-travel: Smart contracts activate and '
         'deactivate coverage based on geolocation signa

In [8]:
# Sample from offerings_managers_only
print("\nSample rows from offerings_managers_only:")

rows_managers = client.query(
    collection_name=MANAGERS_COLLECTION,
    filter="offering_id == 'offering_xyz'",
    output_fields=["id", "offering_id", "text"],
    limit=5,
)

for r in rows_managers:
    print("----")
    pprint(r)



Sample rows from offerings_managers_only:
----
{'id': 462349868925426232,
 'offering_id': 'offering_xyz',
 'text': 'FintechNova – Internal Strategy, Revenue Model, Costs & Risks for '
         'Token-Based Insurance Portfolio Confidential – For Managers Only '
         'This document outlines the internal financials, strategy, '
         'operational risks, and cost structure behind FintechNova’s new '
         'token-based insurance product line. Information contained here must '
         'not be shared with non-managerial staff or external parties. 1. '
         'Strategic Objectives (2025–2028) 1. Capture 3% of digital '
         'micro-insurance market in EU/CEE within 36 months. 2. Achieve 45% '
         'automated claim resolution, reducing operational cost by ~52%. 3. '
         'Establish NovaChain as the leading insurance smart-contract platform '
         'in the region. 4. Expand into Asia-Pacific in FY2027 with TravelFlex '
         'and GadgetSecure. 5. Integrate AI-drive

In [9]:
# (Optional) Inspect a vector
print("\nOne row with embedding from offerings_public:")

rows_with_vec = client.query(
    collection_name=PUBLIC_COLLECTION,
    filter="offering_id == 'offering_xyz'",
    output_fields=["id", "offering_id", "text", "embedding"],
    limit=1,
)

pprint(rows_with_vec[0])
print("\nEmbedding length:", len(rows_with_vec[0]["embedding"]))



One row with embedding from offerings_public:
{'embedding': [np.float32(-0.069086835),
               np.float32(0.0660844),
               np.float32(0.04644627),
               np.float32(-0.03041381),
               np.float32(0.046152275),
               np.float32(0.0062247063),
               np.float32(0.11905965),
               np.float32(0.061887607),
               np.float32(0.030657107),
               np.float32(0.013776384),
               np.float32(0.008934268),
               np.float32(0.03811596),
               np.float32(-0.0116381375),
               np.float32(-0.017073773),
               np.float32(0.046980996),
               np.float32(-0.06222403),
               np.float32(0.041162044),
               np.float32(0.003875597),
               np.float32(-0.047203038),
               np.float32(0.06952149),
               np.float32(0.034009438),
               np.float32(-0.017327312),
               np.float32(-0.065095),
               np.float32(0.031030