In [None]:
# Necessary packages
import weaviate
import weaviate.classes as wvc
from dotenv import load_dotenv
import os
import json
import pprint

In [None]:
# OpenAI credentials - always needed (environment is populated by load_dotenv first)
load_dotenv()
openaiApiKey = os.environ.get("OPENAI_API_KEY")

In [None]:
# Weaviate Cloud credentials - only needed when connecting to the cloud cluster
wcdUrl = os.environ.get("WEAVIATE_URL")
wcdApiKey = os.environ.get("WEAVIATE_API_KEY")

In [None]:
# Create the client against a local instance and check that it is ready
# The OpenAI key is forwarded as a request header
openaiHeaders = {"X-OpenAI-Api-Key": openaiApiKey}

client = weaviate.connect_to_local(
    host="0.0.0.0",
    port=8080,
    grpc_port=50051,
    headers=openaiHeaders,
)

isReady = client.is_ready()
print(isReady)

In [None]:
# Create client and test remote (Weaviate Cloud) connection
# The Python client takes snake_case keyword arguments (cluster_url,
# auth_credentials); the camelCase spellings are rejected with a TypeError.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=wcdUrl,
    auth_credentials=wvc.init.Auth.api_key(wcdApiKey),
    headers={
        "X-OpenAI-Api-Key": openaiApiKey
    }
)

print(client.is_ready())

In [None]:
# Run this only when the collection has to be deleted to start over
collectionName = "SAL_Interfaces_AstroChat"
client.collections.delete(collectionName)

In [None]:
# Create collection
# All properties are TEXT with the same vectorization and tokenization settings,
# so they are generated from the list of property names (order is preserved).
textPropertyNames = [
    "subsystem",
    "efdb_topic",
    "description",
    "efdb_name",
    "item_description",
    "idl_type",
    "units",
]

salInterfaces = client.collections.create(
    name="SAL_Interfaces_AstroChat",
    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_openai(),
    generative_config=wvc.config.Configure.Generative.openai(model="gpt-4-1106-preview"),
    properties=[
        wvc.config.Property(
            name=propertyName,
            data_type=wvc.config.DataType.TEXT,
            vectorize_property_name=True,
            tokenization=wvc.config.Tokenization.LOWERCASE,
        )
        for propertyName in textPropertyNames
    ],
)

In [None]:
# Populate the collection
# Load the list of flat records produced by the flattening step
with open('flat_sal_interfaces.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Map the capitalized JSON keys onto the collection's property names
salInterfaceObjs = [
    {
        "subsystem": d["Subsystem"],
        "efdb_topic": d["EFDB_Topic"],
        "description": d["Description"],
        "efdb_name": d["EFDB_Name"],
        "item_description": d["Item_Description"],
        "idl_type": d["IDL_Type"],
        "units": d["Units"]
    }
    for d in data
]

salInterfaces = client.collections.get("SAL_Interfaces_AstroChat")
insertResult = salInterfaces.data.insert_many(salInterfaceObjs)

# Per-object failures are reported in the returned result, not raised,
# so surface them here instead of discarding the result.
if insertResult.has_errors:
    print(f"{len(insertResult.errors)} of {len(salInterfaceObjs)} objects failed to insert:")
    for failedIndex, failure in insertResult.errors.items():
        print(failedIndex, failure)
else:
    print(f"Inserted {len(salInterfaceObjs)} objects")

In [None]:
# Example query to weaviate
salInterfaces = client.collections.get("SAL_Interfaces_AstroChat")

response = salInterfaces.query.near_text(query="wind speed", limit=2)

# Two objects are requested, but only the first returned one is printed
firstObject = response.objects[0]
pprint.pp(firstObject.properties)

In [None]:
# Example generative query to weaviate
salInterfaces = client.collections.get("SAL_Interfaces_AstroChat")

# Search arguments plus the task applied to the whole group of results
generativeArgs = {
    "query": "ataos",
    "limit": 2,
    "grouped_task": "what is ATAOS?",
}
response = salInterfaces.generate.near_text(**generativeArgs)

print(response.generated)

In [None]:
# EXTRA: Code used to flatten the YAML file into a JSON - First attempt was to use YAML
# Convert the YAML to a JSON (then tried to use JSON)
# PyYAML is imported here, not in the top import cell, because only this
# optional cell needs it; without the import yaml.safe_load raises NameError.
import yaml

with open('sal_interfaces.yaml', 'r') as file:
    yaml_data = yaml.safe_load(file)

jsonData = json.dumps(yaml_data, indent=4)

# Save JSON if needed
with open('sal_interfaces.json', 'w') as jsonFile:
    jsonFile.write(jsonData)


# The nested JSON is not useful for weaviate, so it gets flattened - Final attempt
with open('sal_interfaces.json', 'r') as f:
    data = json.loads(f.read())

# Accumulator for the flattened items
flattenedItems = list()


def _as_dict_list(node):
    '''
    Normalize a node that is either a single dict or a list into a list of dicts
    (non-dict entries are dropped; any other type yields an empty list)
    '''
    if isinstance(node, dict):
        return [node]
    if isinstance(node, list):
        return [entry for entry in node if isinstance(entry, dict)]
    return []


def _flatten_item(subsystem_name, topic, description, item):
    '''
    Build one flat record from a topic's fields and one of its items
    '''
    return {
        'Subsystem': subsystem_name,
        'EFDB_Topic': topic,
        'Description': description,
        'EFDB_Name': item.get('EFDB_Name'),
        'Item_Description': item.get('Description'),
        'Count': item.get('Count'),
        'IDL_Type': item.get('IDL_Type'),
        'Units': item.get('Units')
    }


def process_subsystem(subsystem_name, subsystem_data, out=None):
    '''
    Function to process and flatten each subsystem's commands, events, or telemetry

    subsystem_name: value stored under 'Subsystem' in every produced record
    subsystem_data: dict holding the subsystem's sets; entries whose key contains
        'SALCommandSet', 'SALEventSet' or 'SALTelemetrySet' are processed.
        Anything that is not a dict is ignored.
    out: list that receives the flat records; defaults to the module-level
        flattenedItems list

    Returns None; the records are appended to the target list.
    '''
    if out is None:
        out = flattenedItems

    # A subsystem without content (e.g. None) has nothing to flatten
    if not isinstance(subsystem_data, dict):
        return

    setMarkers = ('SALCommandSet', 'SALEventSet', 'SALTelemetrySet')
    for key, value in subsystem_data.items():
        # Only command, event, or telemetry sets are of interest
        if not isinstance(value, dict):
            continue
        if not any(marker in key for marker in setMarkers):
            continue

        topics = value.get('SALCommand') or value.get('SALEvent') or value.get('SALTelemetry')

        # Both the topics and their 'item' entry may be a list or a single dict;
        # a single dict is handled the same way as a one-element list.
        for topicDef in _as_dict_list(topics):
            topic = topicDef.get('EFDB_Topic')
            description = topicDef.get('Description')
            for item in _as_dict_list(topicDef.get('item', [])):
                out.append(_flatten_item(subsystem_name, topic, description, item))

# Flatten every subsystem found in the JSON file
for subsystem, content in data.items():
    process_subsystem(subsystem, content)

# Write the flattened records to a new JSON file
# NOTE(review): the populate cell reads 'flat_sal_interfaces.json', which is a
# different name from the one written here - confirm which one is intended.
with open('simplified_sal_interfaces_with_item_description.json', 'w') as f:
    f.write(json.dumps(flattenedItems, indent=4))