In [1]:
# Necessary packages
import weaviate
import weaviate.classes as wvc
from weaviate.classes.init import Auth
from weaviate.exceptions import (
    WeaviateStartUpError,
    UnexpectedStatusCodeException,
    AuthenticationFailedException
)
from dotenv import load_dotenv
import os
import json
from pprint import pprint
from langchain_openai import OpenAIEmbeddings
from langchain_weaviate.vectorstores import WeaviateVectorStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.document_loaders import JSONLoader
from langchain_weaviate.vectorstores import WeaviateVectorStore

In [2]:
# Set Weaviate and OpenAI credentials
load_dotenv()
httpHost = os.getenv("WCD_HTTP_HOST")
grpcHost = os.getenv("WCD_GRPC_HOST")
wcdApiKey = os.getenv("WCD_API_KEY")
openaiApiKey = os.getenv("OPENAI_API_KEY")

In [3]:
# Function to create the weaviate client that connects to cloud cluster
def create_weaviate_client():
    try:
        # Initialize the Weaviate client connection to the cloud cluster
        #client = weaviate.connect_to_custom(
        client = weaviate.connect_to_weaviate_cloud(
            # http_host=httpHost,
            # http_port=443,
            # http_secure=True,
            # grpc_host=grpcHost,
            # grpc_port=443,
            # grpc_secure=True,
            cluster_url=httpHost,
            auth_credentials=Auth.api_key(wcdApiKey),
            headers={
                "X-OpenAI-Api-Key": openaiApiKey
            }
        )
        
        # Check if the client is ready to accept queries
        if client.is_ready():
            print("Weaviate connection established successfully!")
            return client
        else:
            raise WeaviateStartUpError("Weaviate is not ready.")
    # Handle exceptions
    except WeaviateStartUpError as e:
        print(f"Weaviate failed to start: {e}")
    except UnexpectedStatusCodeException as e:
        print(f"Received unexpected status code: {e}")
    except AuthenticationFailedException as e:
        print(f"Authentication failed: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [4]:
# Use the function to create the Weaviate client
client = create_weaviate_client()

Weaviate connection established successfully!


In [5]:
# clear collection before creating it
client.collections.delete("SAL_Interfaces_AstroChat")

# lets make sure its vectorizer is what the one we want
collection = client.collections.create(
    name="SAL_Interfaces_AstroChat",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
    generative_config=wvc.config.Configure.Generative.openai(),
)

In [6]:
#text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50)

# import JSON file
loader = JSONLoader(file_path="./data/flat_sal_interfaces.json",
                    jq_schema='.[]',
                    text_content=False
                )
data = loader.load()
pprint(data)
# docs = loader.load_and_split(text_splitter)
# print(f"GOT {len(docs)} docs for SAL Interfaces")

# db = WeaviateVectorStore.from_documents(
#     docs,
#     embedding=OpenAIEmbeddings(),
#     client=client
# )

# retriever = WeaviateVectorStore.from_documents(
#         docs,
#         embedding=OpenAIEmbeddings(),
#         client=client,
#         index_name="SAL_Interfaces_AstroChat",
#         text_key="text"
#     ).as_retriever(
#         search_type="similarity", search_kwargs={"k": 6}
# )

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
# Example querie to weaviate
salInterfaces = client.collections.get("SAL_Interfaces_AstroChat")

response = salInterfaces.query.near_text(
    query="wind speed",
    limit=2
)

pprint.pp(response.objects[0].properties)

In [None]:
# Example generative querie to weaviate
salInterfaces = client.collections.get("SAL_Interfaces_AstroChat")
response = salInterfaces.generate.near_text(
    query="ataos",
    limit=2,
    grouped_task="what is ATAOS?"
)

print(response.generated)

In [None]:
# EXTRA: Code used to flatten the YAML file into a JSON - First attempt was to use YAML
# Convert the YAML to a JSON (then tried to use JSON)
with open('sal_interfaces.yaml', 'r') as file:
    yaml_data = yaml.safe_load(file)

jsonData = json.dumps(yaml_data, indent=4)

# Save JSON if needed
with open('sal_interfaces.json', 'w') as jsonFile:
    jsonFile.write(jsonData)


# Nested JSON not useful for weaviate - Flatten the JSON - Final attempt
with open('sal_interfaces.json', 'r') as f:
    data = json.load(f)

# List to store the flattened items
flattenedItems = []


def process_subsystem(subsystem_name, subsystem_data):
    '''
    Function to process and flatten each subsystem's commands, events, or telemetry
    '''
    # Check if it's a command, event, or telemetry set
    for key, value in subsystem_data.items():
        if isinstance(value, dict) and ('SALCommandSet' in key or 'SALEventSet' in key or 'SALTelemetrySet' in key):
            commandsOrEvents = value.get('SALCommand') or value.get('SALEvent') or value.get('SALTelemetry')

            # Check if commandsOrEvents is a list (properly structured)
            if isinstance(commandsOrEvents, list):
                # Iterate over each command/event/telemetry and extract items
                for commandOrEvents in commandsOrEvents:
                    if isinstance(commandOrEvents, dict):  # Ensure it's a dictionary
                        topic = commandOrEvents.get('EFDB_Topic')
                        description = commandOrEvents.get('Description')
                        items = commandOrEvents.get('item', [])

                        # If 'item' is a list, process each one
                        if isinstance(items, list):
                            for item in items:
                                if isinstance(item, dict):
                                    flattenedItems.append({
                                        'Subsystem': subsystem_name,
                                        'EFDB_Topic': topic,
                                        'Description': description,
                                        'EFDB_Name': item.get('EFDB_Name'),
                                        'Item_Description': item.get('Description'),
                                        'Count': item.get('Count'),
                                        'IDL_Type': item.get('IDL_Type'),
                                        'Units': item.get('Units')
                                    })
                        # If 'item' is a single dict, process it directly
                        elif isinstance(items, dict):
                            flattenedItems.append({
                                'Subsystem': subsystem_name,
                                'EFDB_Topic': topic,
                                'Description': description,
                                'EFDB_Name': items.get('EFDB_Name'),
                                'Item_Description': items.get('Description'),
                                'Count': items.get('Count'),
                                'IDL_Type': items.get('IDL_Type'),
                                'Units': items.get('Units')
                            })

# Process each subsystem in the JSON file
for subsystem, content in data.items():
    process_subsystem(subsystem, content)

# Save the flattened data into a new JSON file
with open('simplified_sal_interfaces_with_item_description.json', 'w') as f:
    json.dump(flattenedItems, f, indent=4)