In [None]:
# Step 1: Install necessary libraries (run in a cell if needed)
!pip install -q -r requirements.txt

from pymilvus import connections, Collection, CollectionSchema, FieldSchema, DataType
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Shake out the MinIO connection

In [None]:
#!/usr/bin/env python3
"""
Shake-out test for a MinIO deployment on Kubernetes.

Environment variables:
  AWS_S3_ENDPOINT        – MinIO service DNS name (e.g. minio.minio.svc.cluster.local)
  AWS_ACCESS_KEY_ID      – MinIO access key
  AWS_SECRET_ACCESS_KEY  – MinIO secret key
  AWS_DEFAULT_REGION     – Dummy value; boto3 still expects one
"""
import os
import sys

import boto3
from botocore.client import Config
from botocore.exceptions import BotoCoreError, ClientError


def main() -> None:
    """Connect to MinIO and list its buckets as a connectivity check.

    Connection settings come from the AWS_S3_ENDPOINT, AWS_ACCESS_KEY_ID,
    AWS_SECRET_ACCESS_KEY and AWS_DEFAULT_REGION environment variables, each
    with a fallback default defined below.

    On success the discovered bucket names (if any) are printed. On a
    BotoCoreError or ClientError the failure is printed to stderr and the
    process exits with status 1.
    """
    endpoint = os.getenv("AWS_S3_ENDPOINT", "minio.minio.svc.cluster.local")
    # NOTE(review): the fallback credentials below are only suitable for a
    # throw-away test deployment; set the environment variables for anything else.
    access_key = os.getenv("AWS_ACCESS_KEY_ID", "minio")
    secret_key = os.getenv("AWS_SECRET_ACCESS_KEY", "test")
    # The trailing `or` also covers the variable being set to an empty string.
    region = os.getenv("AWS_DEFAULT_REGION", "us-east-1") or "us-east-1"

    # Only prepend a scheme when the endpoint does not already carry one;
    # otherwise a value such as "http://minio:9000" would become "http://http://minio:9000".
    endpoint_url = endpoint if "://" in endpoint else f"http://{endpoint}"

    try:
        s3 = boto3.client(
            "s3",
            endpoint_url=endpoint_url,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_key,
            region_name=region,
            config=Config(signature_version="s3v4"),
        )

        resp = s3.list_buckets()
        buckets = [b["Name"] for b in resp.get("Buckets", [])]

        if buckets:
            print("🟢 Connection succeeded – buckets discovered:")
            for name in buckets:
                print(f"  • {name}")
        else:
            print("🟢 Connected but no buckets found.")

    except (BotoCoreError, ClientError) as exc:
        print(f"🔴 MinIO connectivity test failed: {exc}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()

# Shake out the Milvus connectivity


## Create the Milvus database

In [None]:
import os

from pymilvus import MilvusClient

# This is the name of the collection that this program will use.
collection_name = "shakeout_collection"

# Milvus endpoint. Defaults to the in-cluster service address; set MILVUS_URI
# to point the notebook at a different deployment without editing the code.
milvus_uri = os.getenv(
    "MILVUS_URI",
    "http://milvus-service.milvus.svc.cluster.local:19530",
)

# Create the client object
milvus_client = MilvusClient(
    uri=milvus_uri,
    db_name="default"
)

In [None]:
# Start from a clean slate: a collection left over from a previous run is removed
# before this run creates its own.
collection_already_exists = milvus_client.has_collection(collection_name)
if collection_already_exists:
    milvus_client.drop_collection(collection_name)

In [None]:
# 1. Describe the layout of the collection: an explicit integer primary key
#    plus one float-vector field holding the embedding.

# Vector width; this is defined by the embedding model we use.
embedding_dim = 384

schema = MilvusClient.create_schema(
    auto_id=False,
    enable_dynamic_field=False,
)
schema.add_field(
    field_name="id",
    datatype=DataType.INT64,
    is_primary=True,
    auto_id=False,
)
schema.add_field(
    field_name="embedding",
    datatype=DataType.FLOAT_VECTOR,
    dim=embedding_dim,
)

In [None]:
# Create the collection and attach the schema defined above.
# No metric_type is passed here: the similarity metric is set on the index
# (COSINE, created further down), so naming a different metric at this point
# would contradict it.
milvus_client.create_collection(
    collection_name=collection_name,
    schema=schema,
    consistency_level="Strong",
)

# Debug print statement to make sure we can see the collection
print(f"Collection list: {milvus_client.list_collections()}")

## Create test data for the vector database

In [None]:
# Load a small sentence-embedding model from Hugging Face.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Three words go into the database. The expectation is that the embedding model
# places the two most similar words (cat and dog) closer to each other than to pumpkin.
terms = ["dog", "cat", "pumpkin"]
embeddings = model.encode(terms)

# Show the first 5 values of each embedding. Observe that each term is now a vector.
for position, word in enumerate(terms):
    vec = embeddings[position]
    print(f"Embedding for '{word}' (first 5 values):\n{vec[:5]}\n")

In [None]:
# Shape the embeddings into the "list of dictionaries" layout that matches the
# schema defined for the vector database, one dictionary per term. E.g.
# [
#    {"id": 0, "embedding": [0.1, 0.2, ...]},
#    {"id": 1, "embedding": [0.3, 0.4, ...]},
#    {"id": 2, "embedding": [0.6, 0.2, ...]}
# ]
data = []
for row_id, row in enumerate(embeddings):
    # tolist() converts the numpy row into a plain Python list of floats.
    data.append({"id": row_id, "embedding": row.tolist()})

In [None]:
# Write the prepared rows into the collection; the returned insert summary is
# shown as the cell output.
milvus_client.insert(
    collection_name=collection_name,
    data=data,
)

In [None]:
# Create an index on the vector field.
# MilvusClient.create_index() needs the collection name and an IndexParams
# object (built via prepare_index_params()), not a bare dictionary.
index_params = milvus_client.prepare_index_params()
index_params.add_index(
    field_name="embedding",
    index_type="IVF_FLAT",
    metric_type="COSINE",
    params={"nlist": 128},
)
milvus_client.create_index(
    collection_name=collection_name,
    index_params=index_params,
)

# Load the collection through the client so that it can be searched.
milvus_client.load_collection(collection_name=collection_name)

## Test data retrieval

In [None]:
# Prove we can retrieve data from the database
def search(term, limit=3):
    """Return the stored vectors most similar to `term`.

    The term is embedded with the notebook's `model` and searched against the
    "embedding" field of `collection_name` using the COSINE metric.

    Args:
        term: The text to look up.
        limit: Maximum number of hits to return (defaults to 3).

    Returns:
        Whatever `milvus_client.search` returns for the single query vector.
    """
    # encode() is given a one-element list, so this is a batch of one query vector;
    # tolist() hands Milvus plain Python lists.
    query_vectors = model.encode([term]).tolist()
    return milvus_client.search(
        collection_name=collection_name,
        data=query_vectors,
        anns_field="embedding",
        search_params={"metric_type": "COSINE"},
        limit=limit,
    )

results_dog = search("dog")
print(f"Results: {results_dog}")

In [None]:
# Compare dog (the first embedding) against cat and pumpkin (the remaining
# embeddings) using cosine similarity computed locally with scikit-learn.
cos_sim = cosine_similarity(embeddings[:1], embeddings[1:])

# The single result row holds one score per compared term, in order.
dog_vs_cat, dog_vs_pumpkin = cos_sim[0]
print("Similarity (dog vs cat):", dog_vs_cat)
print("Similarity (dog vs pumpkin):", dog_vs_pumpkin)

In [None]:
# Visualise the result: project the embeddings down to two dimensions with PCA
# and label each point with its term.
pca = PCA(n_components=2)
reduced = pca.fit_transform(embeddings)

fig, ax = plt.subplots()
ax.scatter(reduced[:, 0], reduced[:, 1])
for term, (x_coord, y_coord) in zip(terms, reduced):
    ax.annotate(term, (x_coord, y_coord))
ax.set_title("PCA Projection of Embeddings")
plt.show()

In [None]:
# Clean up: drop the shake-out collection while the client is still open,
# then close the Milvus connection.
milvus_client.drop_collection(collection_name)
milvus_client.close()