In [5]:
from IPython.display import display, clear_output, Javascript, Markdown
import ipywidgets as ipw
import utils
import json
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai
import numpy as np

In [None]:
# --- Configuration and openBIS connection ---------------------------------
# Notebook settings are read from config.json; the ELN connection settings
# ("url" and "token") are whatever utils.get_aiidalab_eln_config() returns.
CONFIG = utils.read_json("config.json")
CONFIG_ELN = utils.get_aiidalab_eln_config()
# Alternative source: CONFIG_ELN = utils.read_json("eln_config.json")
OPENBIS_SESSION, SESSION_DATA = utils.connect_openbis(
    CONFIG_ELN["url"],
    CONFIG_ELN["token"],
)

# --- Prompt row: label, free-text input and the send button ---------------
prompt_label = ipw.HTML(
    value="<span style='font-weight: bold; font-size: 12px;'>Prompt:</span>",
)
prompt_textarea = utils.Textarea(
    layout=ipw.Layout(width="800px", height="100px"),
    style={"description_width": "110px"},
)
enter_button = utils.Button(
    description="",
    disabled=False,
    button_style="",
    tooltip="Enter",
    icon="arrow-right",
    layout=ipw.Layout(width="100px", height="50px"),
)
prompt_hbox = ipw.HBox(children=[prompt_label, prompt_textarea, enter_button])

# --- Chat row: label and the HTML widget the conversation is appended to --
chat_label = ipw.HTML(
    value="<span style='font-weight: bold; font-size: 12px;'>Chat:</span>",
)
chat_html = ipw.HTML(
    value="<div style='border: 1px solid grey; padding: 10px; margin: 10px;'>",
)
chat_hbox = ipw.HBox(children=[chat_label, chat_html])

# --- Navigation ------------------------------------------------------------
quit_button = utils.Button(
    description="",
    disabled=False,
    button_style="",
    tooltip="Main menu",
    icon="home",
    layout=ipw.Layout(width="100px", height="50px"),
)

# HTML snippet assembled from the "save_home_buttons_settings" config entry.
increase_buttons_size = utils.HTML(data="".join(CONFIG["save_home_buttons_settings"]))

# Google Gemini 2.5 Flash
# The API key is read from a JSON file that must provide an "api_key" entry.
google_api_key = utils.read_json("/home/jovyan/gemini_api.json")
genai.configure(api_key=google_api_key["api_key"])
model_name = "models/gemini-2.5-flash"
# Earlier system prompt, currently disabled. It described the layout of the
# openBIS object descriptions and asked for HTML-formatted answers.
# system_instruction = """
# You are a materials science expert working with nanotech materials. Your name is Nanotech Mastermind Bot. 
# You are going to be used to be asked questions about data stored in openBIS.
# Remember that experiments are measurements that are linked to sample preparation steps, i.e., measurements that are performed in the labs physically. Simulations are measurements performed using computers and mostly with an atomistic model as parent and a WFMS UUID.
# Examples of the data are:
# - Object of type MOLECULE is named 702a, is identified by 20250401115041317-146 and it was registered in 2025-04-01 13:50:41. It contains the following properties:\n\tempa_number: 702\n\tbatch: a\n\treceive_date: 2025-02-10\n\tmolecule_concept: 20250401115037354-143.
# - Object of type MEASUREMENT_SESSION is named STM - 125a Au(111) STM, is identified by 20250425083729250-4473 and it was registered in 2025-04-25 10:37:29. It contains the following properties:\n\tdefault_object_view: IMAGING_GALLERY_VIEW\n\twfms_uuid: 72cdc725-13e4-46dc-9ccd-aed3179463a9\n\tlevel_theory: {\"method\": \"dft\", \"method_properties\": {\"uks\": false, \"charge\": 0, \"plus_u\": false, \"vdw_corr\": \"\", \"xc_functional\": \"PBE\", \"spin_orbit_coupling\": false}}\n\tbias_voltages: ['{\"has_unit\": \"unit:V\", \"has_value\": -2.0}', '{\"has_unit\": \"unit:V\", \"has_value\": 2.0}', '{\"has_unit\": \"unit:V\", \"has_value\": 0.04}']\n\tisovalues: ['{\"has_unit\": \"eV/Bohr**3\", \"has_value\": 0.000001}', '{\"has_unit\": \"eV/Bohr**3\", \"has_value\": 0.0000001}']\n\theights: ['{\"has_unit\": \"unit:ANGSTROM\", \"has_value\": 4.0}', '{\"has_unit\": \"unit:ANGSTROM\", \"has_value\": 6.0}']\n\tp_tip: 0.0\n\tinput_parameters: {}\n\toutput_parameters: {\"step\": 0, \"energy\": -30110.20223108, \"energy_au\": -30110.20223108, \"energy_scf\": -30110.20223108, \"cell_a_angs\": 41.271984, \"cell_b_angs\": 40.848671, \"cell_c_angs\": 48.118239, \"max_grad_au\": null, \"max_step_au\": null, \"rms_grad_au\": null, \"rms_step_au\": null, \"cell_alp_deg\": 90.0, \"cell_bet_deg\": 90.0, \"cell_gam_deg\": 90.0, \"edens_rspace\": -0.0000000002, \"energy_units\": \"a.u.\", \"pressure_bar\": null, \"scf_converged\": true, \"cell_vol_angs3\": 81122.814091, \"bandgap_spin1_au\": 0.000054850000000009, \"bandgap_spin2_au\": 0.000054850000000009, \"dispersion_energy_au\": null}\n\tIt is connected to the objects with the following identifiers: 20250425083722979-4471 20250425083712025-4468 20250403132856365-3369 20250403132917000-3379.
# First object is a molecule which contains different properties like empa number, batch, receive date, and molecule concept. The molecule concept is a property of type OBJECT and, therefore, is defined by the permID of this object in openBIS. For getting the data about this object, you should go through the text and find the object that says that is identified by that permID.
# Second object is a measurement which contains different object connections like: 20250425083722979-4471, 20250425083712025-4468, 20250403132856365-3369, and 20250403132917000-3379. This means that the object is connected to these four objects and to find information about them, you must go through the data and find the objects that are identified by these permIDs.
# Remember that when answering never tell the users the identifiers (permIDs) of the objects available in openBIS.
# Format the answers in HTML format.
# """
# Active system prompt: the model is asked to answer every question with the
# openBIS entity types the question refers to, chosen from the list below.
# NOTE(review): unlike the disabled prompt above, this one does not request
# HTML output, yet ask_chatbot inserts the reply into an HTML widget as-is.
system_instruction = """
You are a materials science expert working with nanotech materials. Your name is Nanotech Mastermind Bot.
You are going to be used to be asked questions about data stored in openBIS.
Your goal is to determine the entities that the user is talking about, e.g.:
User: Which experiments and simulations were done using DBBA?
The entities here are EXPERIMENT  because of experiments, SIMULATION because of simulations and MOLECULE and MOLECULE_CONCEPT because of DBBA.
User: Which experiments and simulations were done using Au111?
The entities here are EXPERIMENT  because of experiments, SIMULATION because of simulations and CRYSTAL and CRYSTAL_CONCEPT because of Au111.
To every question the user asks, answer with the entities. You have to understand what the user is talking about.
Possible entities are:
UNKNOWN, SEARCH_QUERY, GENERAL_ELN_SETTINGS, GENERAL_PROTOCOL, STORAGE, STORAGE_POSITION, SUPPLIER, PRODUCT, REQUEST, ORDER, PUBLICATION, AIIDA_NODE, ANALYSIS, ANNEALING, ATOMISTIC_MODEL, BAND_STRUCTURE, CHEMICAL_CONCEPT, CHEMICAL, CHEMIST, CODE, COATING, COMPONENT, COOLDOWN, CRYSTAL_CONCEPT, CRYSTAL, DELAMINATION, DEPOSITION, DOSING, DRAFT, ETCHING, FIELD_EMISSION, FISHING, GEOMETRY_OPTIMISATION, GRANT, INSTITUTION, INSTRUMENT, LIGHT_IRRADIATION, MANUFACTURER, MEAN_FIELD_HUBBARD, MEASUREMENT_SESSION, MECHANICAL_PRESSING, MINIMUM_ENERGY_POTENTIAL, MOLECULE, MOLECULE_CONCEPT, OBSERVABLE, PDOS, PERSON, POTENTIAL_ENERGY_CALCULATION, PREPARATION, PROCESS_STEP, PROTOCOL, PUBLICATION_CUSTOM, REACTION_PRODUCT, REACTION_PRODUCT_CONCEPT, RESULTS, RINSE, ROOM, SAMPLE, SETTING, SIMULATION, SOFTWARE, SPUTTERING, SUPPLIER_CUSTOM, UNCLASSIFIED_ACTION, UNCLASSIFIED_SIMULATION, VIBRATIONAL_SPECTROSCOPY, IMAGING_SAMPLE, STORAGE_CUSTOM, TEST_OBJECT, CRYOSTAT, DEPOSITION_TOOL, ELECTRONICS, GAUGE, HEATER, ION_BEAM, PUMP, SENSOR, THZ_STM, TIP_STM-AFM, VACUUM_CHAMBER, DEVICE_SUBSTRATE, LOCATION, SUBSTANCE, BOTTLE_OF_GAS, 2D_LAYER_MATERIAL, WAFER, WAFER_SUBSTRATE, WIRE
"""
# Chat model bound to the system prompt above; used by count_tokens and
# ask_chatbot.
model = genai.GenerativeModel(
    model_name = model_name, 
    system_instruction = system_instruction
)
# Token budget that ask_chatbot enforces on the history before each request
# (1048576 == 2**20). Presumably the model's input context size - confirm
# against the Gemini model documentation.
GEMINI_TOKEN_LIMIT = 1048576
# Conversation history as a list of {"role": ..., "parts": [{"text": ...}]}
# dicts. It is assigned an empty list again in the data-loading cell.
MESSAGES = []

In [None]:
def close_notebook(b):
    """Button callback that sends the browser back to the home notebook.

    Args:
        b: The widget that triggered the click (unused).
    """
    redirect_script = utils.Javascript(data = 'window.location.replace("home.ipynb")')
    display(redirect_script)

def count_tokens(history):
    """Return the total token count of a conversation history.

    Only the text of the first part of each message is counted; every
    message costs one `model.count_tokens` call.

    Args:
        history: Iterable of message dicts shaped like
            {"role": ..., "parts": [{"text": ...}, ...]}.

    Returns:
        The sum of `total_tokens` over all messages (0 for an empty history).
    """
    return sum(
        model.count_tokens(entry["parts"][0]["text"]).total_tokens
        for entry in history
    )

def ask_chatbot(change):
    """Send the current prompt to the model and append the exchange to the chat.

    The prompt is read from `prompt_textarea`, shown in `chat_html`, and added
    to the module-level `MESSAGES` history. The oldest messages are dropped
    while the history exceeds `GEMINI_TOKEN_LIMIT`, but the prompt that was
    just added is never dropped. On success the reply is appended to both
    `MESSAGES` and `chat_html`.

    Blank prompts are ignored. If the request fails, the user turn is removed
    from `MESSAGES` again (so the history never holds two consecutive user
    turns) and the error is shown in the chat instead of being lost inside
    the widget callback.

    Note: enriching the prompt with openBIS data through
    `load_chatbot_using_similarity` is currently not wired in.

    Args:
        change: The widget that triggered the callback (unused).
    """
    import html

    prompt = prompt_textarea.value
    if not prompt or not prompt.strip():
        return

    # The prompt is free text typed by the user: escape it so markup in the
    # question is displayed literally instead of being rendered.
    chat_html.value = chat_html.value + f"<p><b>User:</b> {html.escape(prompt)}</p>"
    MESSAGES.append({"role": "user", "parts": [{"text": prompt}]})

    try:
        total_tokens = count_tokens(MESSAGES)

        # Drop the oldest turns first; keep at least the prompt just added.
        while total_tokens > GEMINI_TOKEN_LIMIT and len(MESSAGES) > 1:
            MESSAGES.pop(0)
            total_tokens = count_tokens(MESSAGES)

        response = model.generate_content(MESSAGES)
        # Accessing .text can itself raise when the reply has no usable text.
        response_text = response.text
    except Exception as error:
        # Deliberately broad: any failure must roll back the pending user turn
        # and be reported to the user.
        MESSAGES.pop()
        chat_html.value = (
            chat_html.value
            + f"<p><b>Agent:</b> <i>Request failed: {html.escape(str(error))}</i></p>"
        )
        return

    MESSAGES.append({"role": "model", "parts": [{"text": response_text}]})
    # The reply is inserted unescaped so that HTML produced by the model is
    # rendered in the chat widget.
    chat_html.value = chat_html.value + "<p><b>Agent:</b> " + response_text + "</p>"
    

def get_embeddings(openbis_objects, embeddings_filepath):
    """Compute and cache text embeddings for openBIS object descriptions.

    Embeddings already present in the JSON cache are reused; only objects
    whose permId is missing are sent to the embedding model. The cache file
    is rewritten even if an embedding request fails part-way, so embeddings
    obtained before the failure are kept for the next run.

    Args:
        openbis_objects: Mapping of permId -> dict with a "details" string.
        embeddings_filepath: Path of the JSON cache (permId -> embedding).

    Raises:
        Whatever `genai.embed_content` raises; the exception propagates after
        the cache has been written.
    """
    try:
        openbis_objects_embeddings = utils.read_json(embeddings_filepath)
    except FileNotFoundError:
        openbis_objects_embeddings = {}

    try:
        for obj_permid, obj in tqdm(openbis_objects.items()):
            if obj_permid in openbis_objects_embeddings:
                continue

            response = genai.embed_content(model="models/text-embedding-004", content = obj["details"])
            openbis_objects_embeddings[obj_permid] = response["embedding"]
    finally:
        # Persist on success and on failure so paid-for embeddings are not lost.
        utils.create_json(openbis_objects_embeddings, embeddings_filepath)

def _describe_openbis_object(obj, obj_permid, obj_type):
    """Build the text description of one openBIS object and list its parents.

    Returns:
        Tuple (description string, list of parent permIds).
    """
    obj_props = obj.props.all()

    # Objects expose their display name either as "name" or as "$name".
    if "name" in obj_props:
        obj_name = obj_props.get("name", "")
    else:
        obj_name = obj_props.get("$name", "")

    obj_string = (
        f"- Object of type {obj_type} is named {obj_name}, is identified by {obj_permid} "
        f"and it was registered in {obj.registrationDate}."
    )

    # Only properties with a truthy value are listed; the name was used above.
    props_string = "".join(
        f"\n\t{key}: {value}"
        for key, value in obj_props.items()
        if key not in ["name", "$name"] and value
    )
    if props_string:
        obj_string += f" It contains the following properties:{props_string}"

    obj_parents_permids = []
    if obj.parents:
        obj_string += "\n\tIt is connected to the objects with the following identifiers:"
        for parent_ref in obj.parents:
            # Resolve each parent reference to read its permId.
            parent_obj = OPENBIS_SESSION.get_objects(parent_ref)[0]
            obj_string += f" {parent_obj.permId}"
            obj_parents_permids.append(parent_obj.permId)

        obj_string += "."

    return obj_string, obj_parents_permids


def retrieve_openbis_objects(details_filepath):
    """Describe every openBIS object as text and cache the result as JSON.

    Objects already present in the cache and objects of type
    GENERAL_ELN_SETTINGS are skipped. The cache maps each permId to
    {"details": <description>, "parents": [<parent permIds>]}. It is rewritten
    even if processing fails part-way, so completed descriptions are kept.

    Args:
        details_filepath: Path of the JSON cache file.
    """
    objects = OPENBIS_SESSION.get_objects(attrs = ["parents"])

    try:
        dict_objects = utils.read_json(details_filepath)
    except FileNotFoundError:
        dict_objects = {}

    try:
        for obj in tqdm(objects):
            obj_permid = obj.permId
            obj_type = obj.attrs.type
            if obj_permid in dict_objects or obj_type == "GENERAL_ELN_SETTINGS":
                continue

            obj_string, obj_parents_permids = _describe_openbis_object(obj, obj_permid, obj_type)
            dict_objects[obj_permid] = {"details": obj_string, "parents": obj_parents_permids}
    finally:
        # Persist on success and on failure so finished work is not repeated.
        utils.create_json(dict_objects, details_filepath)

def get_parent_objects(obj, list_of_objects):
    """Describe an openBIS object and, recursively, all of its ancestors.

    Args:
        obj: openBIS object exposing `permId`, `props.all()`, `attrs.type`,
            `registrationDate` and `parents`.
        list_of_objects: Dict mapping permId -> description string. It is
            updated in place; objects already present are left untouched.

    Returns:
        The same dict, with a description added for `obj` and for every
        ancestor that was not yet present. Ancestors are inserted before the
        object that references them.
    """
    obj_permid = obj.permId

    # Already described: nothing to do (also avoids fetching the properties).
    if obj_permid in list_of_objects:
        return list_of_objects

    obj_props = obj.props.all()

    # The display name is stored as "name" or "$name"; fall back instead of
    # raising KeyError when only "$name" (or neither) is present.
    if "name" in obj_props:
        obj_name = obj_props["name"]
    else:
        obj_name = obj_props.get("$name", "")

    obj_type = obj.attrs.type
    obj_regist_date = obj.registrationDate
    obj_string = f"- Object of type {obj_type} is named {obj_name}, is identified by {obj_permid} and it was registered in {obj_regist_date}."

    # Only properties with a truthy value are listed; the name was used above.
    props_string = "".join(
        f"\n\t{key}: {value}"
        for key, value in obj_props.items()
        if key not in ["name", "$name"] and value
    )
    if props_string:
        obj_string += f" It contains the following properties:{props_string}"

    if obj.parents:
        obj_string += " It is connected to the objects with the following identifiers:"
        for parent_ref in obj.parents:
            parent_obj = OPENBIS_SESSION.get_object(parent_ref, attrs = ['parents'])
            obj_string += f" {parent_obj.permId}"
            list_of_objects = get_parent_objects(parent_obj, list_of_objects)

        obj_string += "."

    # The dict is keyed by permId, so store under the permId directly (testing
    # the description text against the keys can never match).
    list_of_objects[obj_permid] = obj_string

    return list_of_objects

def load_chatbot():
    """Return the descriptions of all cached openBIS objects as one text block.

    Every object in `OPENBIS_OBJECTS_DETAILS` contributes its "details"
    string. Identical descriptions appear once, and lines follow the order of
    the cache, so the output is reproducible between runs.

    Traversing the connections of each object is unnecessary here: starting
    from every object, the union of all reachable descriptions is exactly the
    set of all descriptions.

    Returns:
        Newline-joined descriptions ("" when the cache is empty).
    """
    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_details = dict.fromkeys(
        obj_details["details"] for obj_details in OPENBIS_OBJECTS_DETAILS.values()
    )
    return "\n".join(unique_details)

def load_chatbot_using_similarity(prompt, threshold = 0.35):
    """Collect the openBIS descriptions that are relevant to a prompt.

    The prompt is embedded and compared (cosine similarity) with the cached
    embedding of every object in `OPENBIS_OBJECTS_DETAILS`. Objects scoring
    above `threshold` are kept together with every object connected to them,
    as determined by `get_connected_objects_strings`.

    Args:
        prompt: The user's question.
        threshold: Minimum cosine similarity (exclusive) for an object to be
            considered relevant.

    Returns:
        Newline-joined descriptions of the relevant objects and their
        connections, in no particular order; "" if nothing is relevant or the
        cache is empty.

    Raises:
        KeyError: If an object has no entry in `OPENBIS_OBJECTS_EMBEDDINGS`.
    """
    if not OPENBIS_OBJECTS_DETAILS:
        return ""

    prompt_embedding = genai.embed_content(model="models/text-embedding-004", content=prompt)
    prompt_embedding = np.array(prompt_embedding["embedding"]).reshape(1, -1)

    permids = list(OPENBIS_OBJECTS_DETAILS)
    embeddings = np.array([OPENBIS_OBJECTS_EMBEDDINGS[permid] for permid in permids])
    # Force one row per object. Squeezing instead would turn a single-object
    # matrix into a 1-D vector, which cosine_similarity rejects.
    embeddings = embeddings.reshape(len(permids), -1)

    similarity_scores = cosine_similarity(prompt_embedding, embeddings)[0]

    # One visited set is shared by all traversals: an object reached earlier
    # has already contributed everything connected to it.
    relevant_complete_documents = set()
    visited = set()
    for permid, score in zip(permids, similarity_scores):
        if score > threshold:
            get_connected_objects_strings(permid, relevant_complete_documents, visited)

    return "\n".join(relevant_complete_documents)

def get_connected_objects_strings(object_permid, connected_objects = None, visited = None):
    """Collect the descriptions of an object and of everything connected to it.

    An object B counts as connected to A when B's permId occurs anywhere in
    A's description text (as a parent identifier or inside a property value).
    Connections are followed transitively. The traversal uses an explicit
    stack instead of recursion, so large connected groups cannot exceed the
    interpreter's recursion limit.

    NOTE(review): matching is by substring, so a permId that is a prefix of
    another permId would also match - confirm this cannot happen in the data.

    Args:
        object_permid: permId of the starting object; must be a key of
            `OPENBIS_OBJECTS_DETAILS`.
        connected_objects: Optional set that receives the description strings;
            a new set is created when omitted.
        visited: Optional set of permIds that were already processed; it is
            updated in place and such objects are skipped.

    Returns:
        The set of description strings (the `connected_objects` argument when
        one was given).
    """
    if connected_objects is None:
        connected_objects = set()

    if visited is None:
        visited = set()

    pending = [object_permid]
    while pending:
        current_permid = pending.pop()

        # Avoid revisiting the same object
        if current_permid in visited:
            continue

        visited.add(current_permid)

        current_text = OPENBIS_OBJECTS_DETAILS[current_permid]["details"]
        connected_objects.add(current_text)

        pending.extend(
            candidate_permid
            for candidate_permid in OPENBIS_OBJECTS_DETAILS
            if candidate_permid not in visited and candidate_permid in current_text
        )

    return connected_objects

In [None]:
# Cache files shared by the retrieval and embedding steps. Each path is
# defined once so the writer and the reader cannot drift apart.
OPENBIS_DETAILS_FILEPATH = "/home/jovyan/chatbot_data/openBIS_data.json"
OPENBIS_EMBEDDINGS_FILEPATH = "/home/jovyan/chatbot_data/openBIS_embeddings.json"

# Step 1: refresh the cached text descriptions of the openBIS objects and
# load them once (the same data feeds the embedding step and the chatbot).
# print("Retrieving data from openBIS...")
retrieve_openbis_objects(OPENBIS_DETAILS_FILEPATH)
OPENBIS_OBJECTS_DETAILS = utils.read_json(OPENBIS_DETAILS_FILEPATH)
openbis_objects = OPENBIS_OBJECTS_DETAILS  # name kept for any later cell that uses it

# Step 2: embed the objects that have no cached embedding yet, then load the
# complete embedding cache.
# print("Computing embeddings for openBIS objects...")
get_embeddings(openbis_objects, OPENBIS_EMBEDDINGS_FILEPATH)
OPENBIS_OBJECTS_EMBEDDINGS = utils.read_json(OPENBIS_EMBEDDINGS_FILEPATH)

# Start with an empty conversation history.
MESSAGES = []

# prompt_data = load_chatbot()
# MESSAGES.append({"role": "user", "parts": [{"text": f"This is the relevant data obtained from openBIS for your prompt: {prompt_data}"}]})
# MESSAGES.append({"role": "model", "parts": [{"text": "Ok feel free to ask questions."}]})

Retrieving data from openBIS...


100%|██████████| 1678/1678 [00:13<00:00, 122.08it/s]


Computing embeddings for openBIS objects...


100%|██████████| 1677/1677 [00:00<00:00, 3194299.64it/s]


# Nanotech@surfaces AI Agent

In [None]:
# Render the interface from top to bottom: button styling snippet, chat
# history, prompt row and the main-menu button.
for ui_element in (increase_buttons_size, chat_hbox, prompt_hbox, quit_button):
    display(ui_element)

# Connect the buttons to their handlers.
enter_button.on_click(ask_chatbot)
quit_button.on_click(close_notebook)