In [2]:
import json
def load_json(data_path):
    """Load and return the JSON document stored at ``data_path``.

    The file is opened as UTF-8 explicitly, matching the other ``open`` calls
    in this notebook, so decoding does not depend on the platform's default
    text encoding.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value, exactly as produced by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    with open(data_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


In [4]:
# Read the SWEET test terms: one term per line, with the trailing newline removed.
with open("SWEET/test/sweet_test_re_types.txt", "r", encoding="utf-8") as file:
    content = file.readlines()

test_data = [raw_line.strip('\n') for raw_line in content]

print(test_data)

['abyssal trench', 'acoustic wave', 'acoustics', 'aeolian', 'air sea exchange', 'airborne toxic control measure', 'archive', 'archiving', 'atcm', 'atmosphere ocean exchange', 'attenuation coefficient', 'autotroph', 'average', 'bag', 'bdoc', 'bearing', 'bed load', 'belief', 'biodegradable dissolved organic carbon', 'boundary', 'brightness', 'broad spectrum', 'budget', 'budget equation', 'c horizon', 'calving', 'capillary', 'capillary action', 'carbon neutral', 'carbon neutrality', 'category', 'characteristics', 'chemical process', 'classification', 'classify', 'clump', 'clumping', 'cluster', 'coastal', 'coastal region', 'coefficient of thermal expansion', 'compression', 'conic', 'conic section', 'continuous', 'convergent boundary', 'convergent margin', 'core mantle boundary', 'creep', 'crosswalk', 'curate', 'curation', 'cyclotron frequency', 'decision support system', 'decompression', 'deep focus earthquake', 'deep sea trench', 'description', 'dike', 'direct use', 'directed graph', 'dir

In [5]:
# Relation names interpolated into both LLM prompts below (clustering and
# triple generation) as the list of clustering relationships.
test_relations = ['disjointWith', 'equivalentClass']

In [None]:
import os

from google import genai

# Model used for the first (clustering) request.
model = "gemini-2.5-pro"

# Read the API key from the environment instead of embedding it in the
# notebook, so the key never ends up in version control or shared outputs.
_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before running this notebook."
    )
client = genai.Client(api_key=_api_key)

In [8]:
# Clustering prompt. The actual terms and relationships are interpolated from
# `test_data` and `test_relations`; the 'hasPart' / 'isComposedOf' / 'hasState'
# material is kept only as a worked example so the instructions do not
# contradict the relationship list that is really supplied.
prompt = f"""
You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning, specializing in the SWEET (Semantic Web for Earth and Environmental Terminology) ontology. Developed by NASA’s Jet Propulsion Laboratory, SWEET is a user-friendly, standardized framework for organizing Earth and environmental science concepts. It includes terms like Earth realms (e.g., 'Ocean', 'Atmosphere'), substances (e.g., 'Water', 'Ice'), natural phenomena, and processes, connected by relationships such as 'hasState' (e.g., Water turns into Ice), 'hasPart' (e.g., Ocean includes Sea), and 'isComposedOf'. This structure makes scientific data clear and computer-readable, enabling better connections across environmental information.

Your task is to group a list of terms into clusters based on specific relationships from the SWEET ontology, using your domain knowledge to determine connections. You’ll receive two inputs:
1. A list of terms, for example: ['Ocean', 'Sea', 'River', 'Lake', 'Water', 'Ice', 'Glacier', 'Atmosphere', 'Oxygen', 'CarbonDioxide', 'Rock', 'Magma'].
2. A list of clustering relationships, for example: ['hasPart', 'isComposedOf', 'hasState'].

Here’s what you need to do:
- Build the Graph:
  - Treat each term as a point (node) in a graph.
  - Using your knowledge of the SWEET ontology, check every pair of terms in the input list to see if they could be connected by any of the clustering relationships you are given. If a pair can be connected by one of these relationships (e.g., with the example inputs above, 'Ocean' and 'Sea' via 'hasPart', or 'Water' and 'Ice' via 'hasState'), create a two-way (undirected) link between them.
  - Handle special cases:
    - If the terms list is empty, return an empty list of clusters: [].
    - If the clustering relationships list is empty, place each term in its own cluster.
    - If no valid connections can be inferred between any pairs based on SWEET ontology knowledge, each term forms its own cluster.
- Form Clusters:
  - Group terms into clusters where terms are connected directly or indirectly through the clustering relationships.
  - Terms with no connections form their own single-term clusters.
  - Ensure every term from the input list appears in the output, even if alone.
- Output:
  - Return a Python list of lists, where each inner list contains the terms in a cluster.
  - Don’t include relationships in the output, only the grouped terms.
  - Sort clusters alphabetically by the first term in each cluster.
  - Within each cluster, sort terms alphabetically.

Task:
Group the following terms into clusters using the provided clustering relationships. For each pair of terms, use your SWEET ontology knowledge to determine if they can be connected by any relationship in the List of Clustering Relationships below. If they can, place them in the same cluster. Repeat for all pairs:
- List of Terms: {test_data}
- List of Clustering Relationships: {test_relations}

For example, with the clustering relationships ['hasPart', 'isComposedOf', 'hasState'] you would infer connections such as: 'Ocean' hasPart 'Sea', 'Water' hasState 'Ice', 'Glacier' isComposedOf 'Ice', 'River' isComposedOf 'Water', 'Lake' isComposedOf 'Water', 'Sea' isComposedOf 'Water', 'Magma' hasState 'Rock', 'Atmosphere' isComposedOf 'Oxygen', 'Atmosphere' isComposedOf 'CarbonDioxide'. Apply the same kind of reasoning using only the relationships in the List of Clustering Relationships above.

Return the result as a Python list of lists, where each inner list represents a cluster of terms.
"""


In [9]:
# Request a JSON-typed reply; a later cell parses the text with json.loads.
generation_config = {"response_mime_type": "application/json"}

# Send the clustering prompt to the currently selected model.
response = client.models.generate_content(
    model=model,
    contents=prompt,
    config=generation_config,
)

In [10]:
# Keep the raw text of the clustering reply and display it for inspection.
response_1 = response.text
response_1

'[\n  [\n    "abyssal trench",\n    "deep sea trench"\n  ],\n  [\n    "acoustic wave",\n    "sound wave"\n  ],\n  [\n    "acoustics"\n  ],\n  [\n    "aeolian",\n    "eolian"\n  ],\n  [\n    "air sea exchange",\n    "atmosphere ocean exchange"\n  ],\n  [\n    "airborne toxic control measure",\n    "atcm"\n  ],\n  [\n    "archive",\n    "archiving"\n  ],\n  [\n    "attenuation coefficient",\n    "extinction coefficient"\n  ],\n  [\n    "autotroph",\n    "heterotroph"\n  ],\n  [\n    "average",\n    "mean"\n  ],\n  [\n    "bag",\n    "multiset"\n  ],\n  [\n    "bdoc",\n    "biodegradable dissolved organic carbon",\n    "dissolved organic carbon",\n    "doc"\n  ],\n  [\n    "bearing"\n  ],\n  [\n    "bed load",\n    "suspended load",\n    "wash load"\n  ],\n  [\n    "belief",\n    "trust"\n  ],\n  [\n    "boundary"\n  ],\n  [\n    "brightness",\n    "specific intensity"\n  ],\n  [\n    "broad spectrum"\n  ],\n  [\n    "budget",\n    "budget equation"\n  ],\n  [\n    "c horizon",\n    "hori

In [11]:
# Parse the raw JSON reply; the displayed result below is a list of term clusters.
final_output = json.loads(response_1)

In [12]:
# Display the parsed clusters.
final_output

[['abyssal trench', 'deep sea trench'],
 ['acoustic wave', 'sound wave'],
 ['acoustics'],
 ['aeolian', 'eolian'],
 ['air sea exchange', 'atmosphere ocean exchange'],
 ['airborne toxic control measure', 'atcm'],
 ['archive', 'archiving'],
 ['attenuation coefficient', 'extinction coefficient'],
 ['autotroph', 'heterotroph'],
 ['average', 'mean'],
 ['bag', 'multiset'],
 ['bdoc',
  'biodegradable dissolved organic carbon',
  'dissolved organic carbon',
  'doc'],
 ['bearing'],
 ['bed load', 'suspended load', 'wash load'],
 ['belief', 'trust'],
 ['boundary'],
 ['brightness', 'specific intensity'],
 ['broad spectrum'],
 ['budget', 'budget equation'],
 ['c horizon',
  'horizon',
  'parent rock',
  'substrata',
  'substratum',
  'surface soil',
  'topsoil'],
 ['calving', 'ice calving'],
 ['capillary', 'capillary action'],
 ['carbon neutral', 'carbon neutrality'],
 ['category'],
 ['characteristics'],
 ['chemical process', 'reaction'],
 ['classification', 'classify'],
 ['clump', 'clumping', 'clus

In [13]:
# Rebinds `model` (previously "gemini-2.5-pro"): every generate_content call
# executed after this cell uses this value instead.
model="gemini-2.5-flash"

In [None]:
from tqdm import tqdm

# The JSON response mode is identical for every request, so build it once.
generation_config = {"response_mime_type": "application/json"}

# Raw JSON text of the triples returned for each cluster; response_final[i]
# corresponds to final_output[i].
response_final = []

# Wrap the list itself (not the enumerate object) so tqdm can read its length
# and show a total / ETA.
for i, term_cluster in enumerate(tqdm(final_output)):
    # A cluster with fewer than two terms has no pairs; the prompt asks for an
    # empty list in that case, so record it directly and skip the API call.
    if len(term_cluster) < 2:
        response_final.append("[]")
        continue

    prompt=f"""
You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning, specializing in the SWEET (Semantic Web for Earth and Environmental Terminology) ontology. Developed by NASA’s Jet Propulsion Laboratory, SWEET is a user-friendly, standardized framework for organizing Earth and environmental science concepts. It includes terms like Earth realms (e.g., 'Ocean', 'Atmosphere'), substances (e.g., 'Water', 'Ice'), natural phenomena, and processes, linked by relationships such as 'hasState' (e.g., Water turns into Ice), 'hasPart' (e.g., Ocean includes Sea), and 'isComposedOf'. This structure makes scientific data clear and computer-readable, enabling better connections across environmental information.

Your task is to take a single cluster of terms and a list of clustering relationships, and generate all possible relationship triples (head, relation, tail) between terms within that cluster, using only the specified relationships. You’ll receive two inputs:
1. A single cluster, which is a list of terms (e.g., ['abyssal trench', 'deep sea trench']).
2. A list of clustering relationships: {test_relations}.

Here’s what you need to do:
- Process the Cluster:
  - Examine all pairs of terms within the given cluster.
  - Using your knowledge of the SWEET ontology, determine if a pair can be connected by one of the clustering relationships in the list above. If so, create a triple (head, relation, tail).
  - Only include triples where both head and tail are terms within the cluster and the relation is in the clustering relationships list.
- Handle Special Cases:
  - If the cluster has only one term, return an empty list, as no pairs are possible.
  - If no valid relationships can be inferred for any pair in the cluster based on SWEET ontology knowledge, return an empty list.
  - If the cluster is empty, return an empty list: [].
  - If the clustering relationships list is empty, return an empty list.
- Output:
  - Return a Python list containing triples (as tuples) for the given cluster.
  - Each triple is in the format (head, relation, tail), where head and tail are terms from the cluster, and relation is one of the clustering relationships.
  - Sort the triples alphabetically by head, then relation, then tail.

Task:
For the given cluster of terms, generate all possible relationship triples (head, relation, tail) using the given clustering relationships {test_relations}. Use your SWEET ontology knowledge to infer valid relationships within the cluster.

Here is the list of terms {term_cluster}

Return the result as a Python list of triples for the given cluster.
"""

    response = client.models.generate_content(
            contents= prompt,
            config=generation_config,
            model=model
        )

    response_text = response.text
    if not response_text:
        # NOTE(review): response.text can apparently be empty/None when the
        # reply carries no text part; store "no triples" so the later
        # json.loads pass does not fail after all requests have been made.
        print(f"Cluster {i}: empty response, recording no triples")
        response_text = "[]"

    response_final.append(response_text)
    # Print every tenth reply as a lightweight sanity check.
    if i%10==0:
        print(response_text)

In [15]:
# Display the raw per-cluster reply strings collected by the loop.
response_final

['[\n  [\n    "abyssal trench",\n    "equivalentClass",\n    "deep sea trench"\n  ],\n  [\n    "deep sea trench",\n    "equivalentClass",\n    "abyssal trench"\n  ]\n]',
 '[\n    [\n        "acoustic wave",\n        "equivalentClass",\n        "sound wave"\n    ],\n    [\n        "sound wave",\n        "equivalentClass",\n        "acoustic wave"\n    ]\n]',
 '[]',
 '[\n  [\n    "aeolian",\n    "equivalentClass",\n    "eolian"\n  ],\n  [\n    "eolian",\n    "equivalentClass",\n    "aeolian"\n  ]\n]',
 '[\n  [\n    "air sea exchange",\n    "equivalentClass",\n    "atmosphere ocean exchange"\n  ],\n  [\n    "atmosphere ocean exchange",\n    "equivalentClass",\n    "air sea exchange"\n  ]\n]',
 '[\n  [\n    "airborne toxic control measure",\n    "equivalentClass",\n    "atcm"\n  ],\n  [\n    "atcm",\n    "equivalentClass",\n    "airborne toxic control measure"\n  ]\n]',
 '[\n  [\n    "archive",\n    "disjointWith",\n    "archiving"\n  ],\n  [\n    "archiving",\n    "disjointWith",\n    "ar

In [16]:
# Flatten the per-cluster JSON replies into one list of
# {"head", "relation", "tail"} records, reading each triple positionally.
final_prep_output = [
    {
        "head": triple[0],
        "relation": triple[1],
        "tail": triple[2],
    }
    for raw_reply in response_final
    for triple in json.loads(raw_reply)
]

In [17]:
# Number of triple records collected across all clusters.
len(final_prep_output)

291

In [18]:
# Display the flattened triple records.
final_prep_output

[{'head': 'abyssal trench',
  'relation': 'equivalentClass',
  'tail': 'deep sea trench'},
 {'head': 'deep sea trench',
  'relation': 'equivalentClass',
  'tail': 'abyssal trench'},
 {'head': 'acoustic wave',
  'relation': 'equivalentClass',
  'tail': 'sound wave'},
 {'head': 'sound wave',
  'relation': 'equivalentClass',
  'tail': 'acoustic wave'},
 {'head': 'aeolian', 'relation': 'equivalentClass', 'tail': 'eolian'},
 {'head': 'eolian', 'relation': 'equivalentClass', 'tail': 'aeolian'},
 {'head': 'air sea exchange',
  'relation': 'equivalentClass',
  'tail': 'atmosphere ocean exchange'},
 {'head': 'atmosphere ocean exchange',
  'relation': 'equivalentClass',
  'tail': 'air sea exchange'},
 {'head': 'airborne toxic control measure',
  'relation': 'equivalentClass',
  'tail': 'atcm'},
 {'head': 'atcm',
  'relation': 'equivalentClass',
  'tail': 'airborne toxic control measure'},
 {'head': 'archive', 'relation': 'disjointWith', 'tail': 'archiving'},
 {'head': 'archiving', 'relation': 'd

In [19]:
import json

# Persist the predicted triples as pretty-printed JSON, keeping non-ASCII
# characters as-is in the UTF-8 output file.
serialized_predictions = json.dumps(final_prep_output, indent=2, ensure_ascii=False)
with open("predictions_SWEET_cluster_llm_re_pairs.json", "w", encoding="utf-8") as out_file:
    out_file.write(serialized_predictions)
