In [1]:
# Whitelist of the most important search keys: one dotted key per line,
# blank lines are dropped.
limited_keys = [
    line.strip()
    for line in """
entry_type
published
publish_time
nomad_version
external_db
origin
main_author.user_id
writers.user_id
writer_groups
viewers.user_id
viewer_groups
domain
quantities
results.material.material_name
results.material.structural_type
results.material.dimensionality
results.material.elements
results.material.n_elements
results.material.elements_exclusive
results.material.chemical_formula_descriptive
results.material.chemical_formula_reduced
results.material.chemical_formula_hill
results.material.chemical_formula_iupac
results.material.symmetry.bravais_lattice
results.material.symmetry.crystal_system
results.material.symmetry.hall_number
results.material.symmetry.hall_symbol
results.material.symmetry.point_group
results.material.symmetry.space_group_number
results.material.symmetry.space_group_symbol
results.material.symmetry.structure_name
results.material.symmetry.strukturbericht_designation
results.method.method_name
results.method.simulation.program_name
results.method.simulation.dft.basis_set_type
results.method.simulation.dft.core_electron_treatment
results.method.simulation.dft.spin_polarized
results.method.simulation.dft.scf_threshold_energy_change
results.method.simulation.dft.van_der_Waals_method
results.method.simulation.dft.relativity_method
results.method.simulation.dft.smearing_kind
results.method.simulation.dft.smearing_width
results.method.simulation.dft.jacobs_ladder
results.method.simulation.dft.xc_functional_type
results.method.simulation.dft.xc_functional_names
results.properties.available_properties
results.properties.electronic.band_gap.value
results.properties.electronic.band_gap.type
results.properties.geometry_optimization.convergence_tolerance_energy_difference
results.properties.geometry_optimization.convergence_tolerance_force_maximum
results.properties.geometry_optimization.final_force_maximum
results.properties.geometry_optimization.final_energy_difference
results.properties.geometry_optimization.final_displacement_maximum
""".splitlines()
    if line.strip()
]

In [2]:
# produce some documentation of the search keys
from nomad.datamodel import EntryArchive
from nomad.metainfo import Reference, MEnum
from nomad.metainfo.elasticsearch_extension import entry_index
import json

# Build `search_keys`: for every whitelisted search quantity, a small
# JSON-serialisable record with its `repeats` flag, description and type.
allowed_keys = set(limited_keys)  # built once instead of once per quantity
# Keys containing any of these fragments are never documented.
excluded_fragments = ('__suggestion', 'optimade', 'topology', 'eln')

search_keys = dict()
# The index mapping is created on demand if it does not exist yet.
if not entry_index.doc_type.mapping:
    entry_index.doc_type.create_mapping(EntryArchive.m_def)

for key, value in entry_index.doc_type.quantities.items():
    annotation = value.annotation

    # `type_doc` (not `type`) so the builtin is not shadowed for later cells.
    try:
        # Reference-typed quantities are skipped.
        if isinstance(annotation.definition.type, Reference):
            continue

        if isinstance(annotation.definition.type, MEnum):
            # Enumerations are documented by listing their allowed values.
            type_doc = list(annotation.definition.type)
        else:
            type_doc = annotation.definition.type.__name__
    except Exception:
        # Best-effort fallback (e.g. a type object without `__name__`).
        # `Exception` rather than a bare `except` so that KeyboardInterrupt
        # and SystemExit still propagate.
        type_doc = str(annotation.definition.type)

    # Annotations with a field name are addressed as "<key>.<field>".
    if annotation.field:
        key = f'{key}.{annotation.field}'

    if any(fragment in key for fragment in excluded_fragments):
        continue
    if key.startswith('data'):
        continue
    if not annotation.definition.description:
        continue
    if key not in allowed_keys:
        continue

    search_keys[key] = dict(
        repeats=value.repeats,
        description=annotation.definition.description,
        type=type_doc,
    )

# Number of whitespace-separated words in the serialised documentation
# (presumably a rough proxy for how much prompt space it takes).
len(json.dumps(search_keys).split())

Schema is deprecated, use plugins. ()


1338

In [3]:
# Some example queries, passed verbatim into the prompt as the
# `example_queries` template variable.
# NOTE: the text holds two separate JSON objects back to back, so it is not a
# single valid JSON document; in this notebook it is only used as prompt text.
example_queries = """
{
  "query": {
    "results.method.simulation.program_name:any": [  
      "VASP"
    ],
    "results.material.elements:any": [
      "C",
      "O"
    ],
    "results.properties.available_properties:all": [
      "dos_electronic"
    ]
  }
}

{
  "query": {
    "results.method.simulation.program_name:any": [  
      "exciting"
    ],
    "results.material.elements:any": [
      "Ti",
      "O"
    ],
    "results.material.symmetry.crystal_system": "cubic",
    "results.properties.available_properties:all": [
      "dos_electronic", "band_gap", "band_structure"
    ]
  }
}  
"""

In [4]:
# A prompt template with the keys and examples plus some instructions.
# `{search_keys}` and `{example_queries}` are template variables; their values
# are supplied in the `chain.invoke(...)` calls. This text is shared: the
# task-specific templates embed it via f-string interpolation.
base_template = '''
```
{search_keys}
```

There is a search API for a database for computational materials science data comprising mostly DFT 
calculations and simulations. This API has a search function that allows to search based the following keys.
The keys are given above as JSON object where the keys are the keys and the values contain a description and type.

These are the only available keys. Do not invent new keys! It is important to use the full keys. 
From these keys (and only those keys) you can create search queries like these:

```
{example_queries}
```

There needs to be a top level "query" key. Multiple criteria can be combined with "and", "or", and "not" operators.
If you want to pass multiple values to a key, use the ":any" (some values match) and ":all" (all values match) suffix on the keys. Don't use a $ sign.
'''

In [5]:
# Prompt for turning a free-text user request into a search query.
# It is the shared base template followed by the task-specific instructions;
# `{input}` stays a template variable that is filled in at invoke time.
generate_search_template = (
    '\n'
    + base_template
    + '\n\n'
    + 'This describes what we want to search: {input}\n'
    + '\n'
    + 'Generate a search query. Your output has to be valid JSON and only valid JSON.\n'
)

In [6]:
# Generating the search query
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import ChatPromptTemplate
import json
from langchain_community.llms import Ollama

# Configure the model. The endpoint is passed to the constructor instead of
# being assigned to the instance afterwards, so it is set together with the
# other model fields.
# NOTE(review): the endpoint is a hard-coded internal address; consider moving
# it into a configuration cell or environment variable.
llm = Ollama(
    model="llama3:70b",
    base_url='http://172.28.105.30/backend',
    temperature=0.05,
)

prompt = ChatPromptTemplate.from_template(generate_search_template)
output_parser = JsonOutputParser()

# prompt -> model -> JSON parser: the chain result is the parsed query.
chain = prompt | llm | output_parser

api_query = chain.invoke({
    "search_keys": json.dumps(search_keys, indent=2),
    "example_queries": example_queries,
    "input": "I am looking for VASP simulations of bulk materials with common metals for elements and have a dos available."})

print(json.dumps(api_query, indent=2))

{
  "query": {
    "results.method.simulation.program_name:any": [
      "VASP"
    ],
    "results.material.elements:any": [
      "Li",
      "Be",
      "Na",
      "Mg",
      "Al",
      "K",
      "Ca"
    ],
    "results.properties.available_properties:all": [
      "dos_electronic"
    ]
  }
}


In [11]:
# Running the query

import requests

def search_database(
    query_json: dict,
    url: str = "https://nomad-lab.eu/prod/v1/api/v1/entries/query",
    timeout: float = 60.0,
) -> dict:
    """Send a query to the search API of the database and return the decoded response.

    Args:
        query_json: The request body; it is sent as JSON in a POST request.
        url: The API endpoint to post the query to.
        timeout: Seconds to wait for the server before giving up, so that an
            unresponsive server cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body of the response. This is the whole response
        object, not just a hit count.

    Raises:
        RuntimeError: If the server answers with a status code other than 200.
            The message contains the status code and the response text.
    """
    # Send a POST request to the API endpoint with the query JSON object
    response = requests.post(url, json=query_json, timeout=timeout)

    # Anything but 200 OK is treated as a failure. The original raised the
    # undefined name `Error`, which surfaced as a NameError instead.
    if response.status_code != 200:
        raise RuntimeError(f"Error. Status code {response.status_code}, {response.text}")

    return response.json()

# Ask for a single entry per page before sending the query; the cell then
# displays the pagination part of the response.
api_query["pagination"] = {"page_size": 1}
api_result = search_database(api_query)
api_result["pagination"]

{'page_size': 1,
 'order_by': 'entry_id',
 'order': 'asc',
 'total': 1434133,
 'next_page_after_value': '---9UIuzNJpS4IKamInkdUpmcn3_'}

In [9]:
# Prompt for reading the number of hits out of the API response.
# It is the shared base template followed by the executed query, the response
# and the extraction instructions; `{api_query}` and `{api_result}` stay
# template variables that are filled in at invoke time.
report_template = (
    '\n'
    + base_template
    + '\n\n'
    'I ran the following query on the API:\n'
    '\n'
    '{api_query}\n'
    '\n'
    'The api responsed with the following JSON:\n'
    '\n'
    '{api_result}\n'
    '\n'
    'The amount is NOT! given by page_size. The total amount of entries \n'
    'is given by "pagination.total". Please extract the total number!\n'
    '\n'
    'Only respond with the total number. After this point, no more words!\n'
)

In [12]:
# Let the model report the total number of hits. No output parser is attached
# here, so the chain result is whatever the model returns.
prompt = ChatPromptTemplate.from_template(report_template)
chain = prompt | llm
report_inputs = dict(
    search_keys=json.dumps(search_keys, indent=2),
    example_queries=example_queries,
    api_query=json.dumps(api_query, indent=2),
    api_result=json.dumps(api_result["pagination"], indent=2),
)
chain.invoke(report_inputs)

'1434133'