Init qdrant

`docker run -p 6333:6333 -p 6334:6334 \
    -v $(pwd)/qdrant_storage:/qdrant/storage:z \
    qdrant/qdrant:latest`

In [27]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance


In [28]:

# Local Qdrant example – adjust for Cloud (host, api_key, etc.)
client = QdrantClient(host="localhost", port=6333)


In [29]:
# Cell 2: load YANG catalog
import json

CATALOG_PATH = "../data/sensor_catalog.jsonl"  # or .json

all_rows = []

with open(CATALOG_PATH, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        all_rows.append(json.loads(line))

len(all_rows)


54063

In [30]:
from openai import OpenAI
from qdrant_client.models import PointStruct

client_oa = OpenAI()  # uses OPENAI_API_KEY from env

EMBEDDING_MODEL = "text-embedding-3-small"
DIMENSION = 1536  # for text-embedding-3-small
def get_embedding(text: str) -> list[float]:
    resp = client_oa.embeddings.create(
        model=EMBEDDING_MODEL,
        input=text,
    )
    return resp.data[0].embedding

In [31]:
from dataclasses import dataclass
from typing import List, Dict, Any

import os
import glob

# Cell 2: load & chunk YANG files

YANG_ROOT = "../data/yang/vendor/cisco/xr/701"  # adapt if your path is different

@dataclass
class Chunk:
    id: int
    file_path: str
    chunk_index: int
    text: str

def load_yang_files(root: str) -> List[str]:
    pattern = os.path.join(root, "**", "*.yang")
    files = glob.glob(pattern, recursive=True)
    return files

def chunk_text(text: str, max_chars: int = 1000) -> List[str]:
    # naive fixed-size char chunking
    return [text[i:i+max_chars] for i in range(0, len(text), max_chars)]

def build_chunks(root: str, max_chunks: int = None) -> List[Chunk]:
    chunks: List[Chunk] = []
    files = load_yang_files(root)
    cid = 0
    for f in files:
        try:
            with open(f, "r", encoding="utf-8", errors="ignore") as fh:
                content = fh.read()
        except Exception as e:
            print(f"Skipping {f}: {e}")
            continue

        pieces = chunk_text(content)
        for i, piece in enumerate(pieces):
            if max_chunks is not None and len(chunks) >= max_chunks:
                print(f"Stopping early: reached {max_chunks} chunks")
                return chunks  # ← early return
            
            # add a tiny header so the model knows where this comes from
            text = f"FILE: {os.path.basename(f)}\nCHUNK: {i}\n{piece}"
            chunks.append(Chunk(id=cid, file_path=f, chunk_index=i, text=text))
            cid += 1
    print(f"Loaded {len(files)} files, created {len(chunks)} chunks.")
    return chunks

chunks = build_chunks(YANG_ROOT,max_chunks=10)


Stopping early: reached 10 chunks


In [None]:
# Create new collection for raw YANG chunks
collection_name = "fixed_window_embeddings"

if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=DIMENSION, distance=Distance.COSINE),
    )

print(f"Created collection: {collection_name}")

Created collection: fixed_window_embeddings


In [33]:
# Embed and upload only your 10 chunks

points = []
for chunk in chunks:  # ← only your 10 chunks
    vector = get_embedding(chunk.text)
    
    points.append(PointStruct(
        id=chunk.id,
        vector=vector,
        payload={
            "module": os.path.basename(chunk.file_path),
            "file_path": chunk.file_path,
            "chunk_index": chunk.chunk_index,
            "text_preview": chunk.text[:500] + "..." if len(chunk.text) > 500 else chunk.text,
            "source": "raw_yang_module"
        }
    ))

client.upsert(collection_name=collection_name, points=points)
print(f"Uploaded {len(points)} chunks to {collection_name}")

Uploaded 10 chunks to fixed_window_embeddings


In [34]:
# Example: Scroll all points in a collection (replace 'your_collection' with your actual collection name)
points = client.scroll(
    collection_name="fixed_window_embeddings",
    limit=10,  # Fetch first 10 points; use with_offset for pagination
    with_vectors=True,  # Include the actual vector values
    with_payload=True   # Include metadata/payloads
)

# Print the results
for point in points[0][:3]:
    print(f"Point ID: {point.id}")
    print(f"Vector values (first 5 dims): {point.vector[:5]}...")  # Vectors are lists of floats; slice to avoid spam
    print(f"Payload: {point.payload}")
    print("---")

Point ID: 0
Vector values (first 5 dims): [-0.027288899, 0.03474736, 0.028422786, -0.02413921, -0.014047609]...
Payload: {'module': 'openconfig-local-routing.yang', 'file_path': '../data/yang/vendor/cisco/xr/701/openconfig-local-routing.yang', 'chunk_index': 0, 'text_preview': 'FILE: openconfig-local-routing.yang\nCHUNK: 0\nmodule openconfig-local-routing {\n  yang-version 1;\n  namespace "http://openconfig.net/yang/local-routing";\n  prefix oc-loc-rt;\n\n  import openconfig-inet-types {\n    prefix inet;\n  }\n  import openconfig-policy-types {\n    prefix oc-pt;\n  }\n  import openconfig-extensions {\n    prefix oc-ext;\n  }\n  import openconfig-interfaces {\n    prefix oc-if;\n  }\n\n  organization\n    "OpenConfig working group";\n  contact\n    "OpenConfig working group\n     www.opencon...', 'source': 'raw_yang_module'}
---
Point ID: 1
Vector values (first 5 dims): [0.0026591269, 0.050561566, 0.02311765, -0.02153843, 0.0030788144]...
Payload: {'module': 'openconfig-local-routing.

In [35]:
if not client.collection_exists("catalog_embeddings"):
    client.create_collection(
        collection_name="catalog_embeddings",
        vectors_config=VectorParams(size=1536, distance="Cosine"),
    )

In [36]:
from qdrant_client.models import PointStruct

points = []

for idx, row in enumerate(all_rows[:10]):
    vector = get_embedding(row["search_text"])

    point = PointStruct(
        id=idx,  # ✅ valid: unsigned integer
        vector=vector,
        payload={
            "yang_id": row["id"],
            "module": row["module"],
            "path": row["path"],
            "protocol_tag": row["protocol_tag"],
            "category": row["category"],
            "kind": row["kind"],
            "leaf_count": row["leaf_count"],
            "description": row["description"],
            "leaf_names": row["leaf_names"],
        },
    )
    points.append(point)

client.upsert(
    collection_name="catalog_embeddings",
    points=points,
)


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
# Example: Scroll all points in a collection (replace 'your_collection' with your actual collection name)
points = client.scroll(
    collection_name="catalog_embeddings",
    limit=10,  # Fetch first 10 points; use with_offset for pagination
    with_vectors=True,  # Include the actual vector values
    with_payload=True   # Include metadata/payloads
)

# Print the results
for point in points[0][:3]:
    print(f"Point ID: {point.id}")
    print(f"Vector values (first 5 dims): {point.vector[:5]}...")  # Vectors are lists of floats; slice to avoid spam
    print(f"Payload: {point.payload}")
    print("---")

Point ID: 0
Vector values (first 5 dims): [-0.0036967276, 0.03387373, 0.03239265, -0.009961457, -0.042616878]...
Payload: {'yang_id': 0, 'module': 'Cisco-IOS-XR-tunnel-ip-ma-oper', 'path': 'Cisco-IOS-XR-tunnel-ip-ma-oper:tunnel-ip-ma', 'protocol_tag': 'tunnel', 'category': ['state', 'stats', 'tunnels'], 'kind': 'container', 'leaf_count': 92, 'description': 'Tunnel Ip ma parameters', 'leaf_names': ['address-family', 'adjacency', 'afi', 'bandwidth', 'base-caps-state', 'bfd-session-state', 'cap-ipv4-transport-supported', 'cap-ipv6-transport-supported', 'check-point-id', 'convergence-state', 'destination-address-length', 'destination-prefix-list', 'endpt-count', 'endpt-prod', 'eod-recvd', 'ep-app-id', 'flag-bits', 'flags', 'flags-other', 'gre-cap-checksum-supported', 'gre-cap-ipv4-transport-supported', 'gre-cap-ipv6-transport-supported', 'gre-cap-key-supported', 'gre-cap-max-mtu-supported', 'gre-cap-max-tunnels-supported', 'gre-cap-mgre-ipv4-transport-supported', 'gre-cap-mgre-ipv6-transpo