In [1]:
import json
def load_json(data_path):
    """Load a JSON document from disk and return the parsed Python object.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, str, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, so non-ASCII text is read the same way on every machine.
    with open(data_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


In [2]:
# Read the term list: one term per line, with only newline characters
# stripped from the ends of each line.
with open("blind domain/D4-Blind-Types.txt", "r", encoding="utf-8") as terms_file:
    raw_lines = terms_file.readlines()

test_data = [raw_line.strip('\n') for raw_line in raw_lines]

print(test_data)

['acellular anatomical structure', 'adipose tissue', 'aerobic', 'aggregated', 'amictic lake', 'anaerobic', 'anterior segment of eyeball', 'anterior uvea', 'articular system', 'biome', 'blood vessel', 'bone element', 'bone tissue', 'caldera', 'circulatory system', 'clay', 'closed', 'collective plant organ structure', 'continental shelf', 'continuant', 'continuant fiat boundary', 'cowpea (pulse) food product', 'cowpea vegetable food product', 'crevasse', 'crevice', 'dead cave', 'dense connective tissue', 'digestive system', 'dimictic lake', 'disposition', 'distributed', 'ecoregion', 'ecozone', 'edible', 'elastic', 'entire sense organ system', 'excretory system', 'fluid astronomical body part', 'formal settlement', 'generically dependent continuant', 'glacier', 'hematopoietic system', 'holomictic lake', 'hydroform', 'hydrophilicity', 'hydrophobicity', 'illegal settlement', 'immaterial anatomical entity', 'immaterial entity', 'immune system', 'impermeable', 'independent continuant', 'inedi

In [3]:
# Relation names that are interpolated into the prompts below.
test_relations = [
    "consider",
    "disconnected_from",
    "disjointWith",
    "equivalentClass",
    "seeAlso",
]

In [None]:
import os

from google import genai

# Model used for the domain-extraction and clustering requests.
model = "gemini-2.5-flash"

# Read the API key from the environment so that no credential is ever stored
# in (or accidentally committed with) the notebook.
_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before running this cell."
    )
client = genai.Client(api_key=_api_key)

In [5]:
# Prompt asking the model to name and describe the knowledge domain.
# The f-string interpolates the term list (test_data) and the relation list
# (test_relations) directly into the text.
prompt_domain_extraction= f"""You are an expert AI assistant specializing in knowledge representation, ontology, and domain analysis. Your task is to analyze a given list of terms and a list of relationship names to identify the specific knowledge domain they belong to.

**Instructions:**

1.  **Analyze Inputs:** You will be given a list of terms and a list of potential relationships. Carefully examine these items to find the common theme or scientific field that connects them. Consider what subject matter these terms represent (e.g., biology, geography, technology, food production).
2.  **Generate Domain Name:** Based on your analysis, create a concise and descriptive name for the domain. For example, "Food Science and Production" or "Earth and Environmental Science."
3.  **Generate Domain Description:** Write a 4-5 line description of this domain. This description should explain:
    *   The general area of study or industry it covers.
    *   The types of concepts the terms represent within this domain.
    *   How the relationships typically connect these concepts.
    *   The overall purpose of structuring knowledge in this domain (e.g., for scientific analysis, traceability, data interoperability).
4.  **Format the Output:** Return a Python dictionary containing two keys: `domain_name` and `domain_description`.

**Task:**
Analyze the following inputs and generate a domain name and description.

-   **List of Terms:** {test_data}
-   **List of Relationships:** {test_relations}

Return the result as a Python dictionary."""

In [6]:
# Request a JSON reply so the response text can be parsed with json.loads.
generation_config = {"response_mime_type": "application/json"}

response = client.models.generate_content(
    model=model,
    contents=prompt_domain_extraction,
    config=generation_config,
)

# Fail early with a readable message instead of an opaque TypeError/KeyError
# further down: the reply may be empty or may not follow the requested schema.
if not response.text:
    raise ValueError("Domain-extraction request returned no text.")
domain = json.loads(response.text)
if not isinstance(domain, dict):
    raise ValueError(
        f"Expected a JSON object for the domain, got {type(domain).__name__}."
    )
_missing_keys = {"domain_name", "domain_description"} - domain.keys()
if _missing_keys:
    raise ValueError(f"Domain response is missing keys: {sorted(_missing_keys)}")

In [7]:
# Display the parsed domain dictionary.
domain

{'domain_name': 'Natural Systems Ontology',
 'domain_description': 'This domain focuses on the formal representation of knowledge spanning biological and Earth sciences. It integrates concepts related to living organisms (e.g., anatomy, tissues, systems, biological processes) and the physical environment (e.g., geological features, water bodies, biomes, material properties). Leveraging foundational ontological principles, it defines precise relationships to structure and categorize entities. The primary purpose is to establish a coherent, machine-readable framework for semantic interoperability, data integration, and robust reasoning across diverse scientific disciplines.'}

In [8]:
# Clustering prompt. The f-string interpolates the extracted domain name and
# description together with the term list (test_data) and the relation list
# (test_relations), and asks for the terms grouped into a list of lists.
prompt = f"""You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning. You will be provided with a domain name and a description to set the context for your task.

Your primary task is to group a list of terms into clusters based on a specific set of relationships, using your knowledge of the identified domain to determine the connections.

**Context (Provided as Input):**
-   **Domain Name:** {domain['domain_name']}
-   **Domain Description:** {domain['domain_description']}

**Instructions:**

1.  **Build the Graph:**
    *   Treat each term from the input list as a point (node) in a graph.
    *   Using your knowledge of the **identified domain** (contextualized by the provided name and description), check every pair of terms to see if they could be connected by any of the **clustering relationships**. If a valid connection exists, create a two-way (undirected) link between them.
2.  **Form Clusters:**
    *   Group terms into clusters where terms are connected either directly or indirectly through the specified clustering relationships.
    *   Terms with no connections form their own single-term clusters.
    *   Ensure every term from the input list appears in the output.
3.  **Handle Special Cases:**
    *   If the terms list is empty, return an empty list of clusters: [].
    *   If the clustering relationships list is empty, place each term in its own cluster.
4.  **Format the Output:**
    *   Return a Python list of lists, where each inner list contains the terms in a cluster.
    *   Do not include the relationships in the output, only the grouped terms.
    *   Sort the clusters alphabetically by the first term in each cluster.
    *   Within each cluster, sort the terms alphabetically.

**Task:**
Group the following terms into clusters using the provided clustering relationships and the domain context. For each pair of terms, use your knowledge of the domain to determine if they can be connected by the specified relationships.

-   **List of Terms:** {test_data}
-   **List of Clustering Relationships:** {test_relations}

Return the result as a Python list of lists, where each inner list represents a cluster of terms."""


In [10]:
# Print the rendered prompt, then show its character count as the cell value.
# (Building a tuple with print() inside it displayed a stray None.)
print(prompt)
len(prompt)

You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning. You will be provided with a domain name and a description to set the context for your task.

Your primary task is to group a list of terms into clusters based on a specific set of relationships, using your knowledge of the identified domain to determine the connections.

**Context (Provided as Input):**
-   **Domain Name:** Natural Systems Ontology
-   **Domain Description:** This domain focuses on the formal representation of knowledge spanning biological and Earth sciences. It integrates concepts related to living organisms (e.g., anatomy, tissues, systems, biological processes) and the physical environment (e.g., geological features, water bodies, biomes, material properties). Leveraging foundational ontological principles, it defines precise relationships to structure and categorize entities. The primary purpose is to establish a coherent, machine-readable framework for semantic interoper

(5517, None)

In [11]:
# Send the clustering prompt, requesting a JSON-formatted reply.
generation_config = dict(response_mime_type="application/json")

response = client.models.generate_content(
    model=model,
    contents=prompt,
    config=generation_config,
)

In [12]:
# Keep the raw text of the clustering response and display it.
response_1 = response.text
response_1

'[\n  [\n    "acellular anatomical structure",\n    "adipose tissue",\n    "aerobic",\n    "aggregated",\n    "amictic lake",\n    "anaerobic",\n    "anterior segment of eyeball",\n    "anterior uvea",\n    "articular system",\n    "biome",\n    "blood vessel",\n    "bone element",\n    "bone tissue",\n    "caldera",\n    "circulatory system",\n    "clay",\n    "closed",\n    "collective plant organ structure",\n    "continental shelf",\n    "continuant",\n    "continuant fiat boundary",\n    "cowpea (pulse) food product",\n    "cowpea vegetable food product",\n    "crevasse",\n    "crevice",\n    "dead cave",\n    "dense connective tissue",\n    "digestive system",\n    "dimictic lake",\n    "disposition",\n    "distributed",\n    "ecoregion",\n    "ecozone",\n    "edible",\n    "elastic",\n    "entire sense organ system",\n    "excretory system",\n    "fluid astronomical body part",\n    "formal settlement",\n    "generically dependent continuant",\n    "glacier",\n    "hematopoietic

In [13]:
# Parse the raw response text as JSON.
# NOTE(review): presumably a list of term clusters, as the prompt requests —
# the structure is not validated here.
final_output = json.loads(response_1)

In [14]:
# Display the parsed clustering result.
final_output

[['acellular anatomical structure',
  'adipose tissue',
  'aerobic',
  'aggregated',
  'amictic lake',
  'anaerobic',
  'anterior segment of eyeball',
  'anterior uvea',
  'articular system',
  'biome',
  'blood vessel',
  'bone element',
  'bone tissue',
  'caldera',
  'circulatory system',
  'clay',
  'closed',
  'collective plant organ structure',
  'continental shelf',
  'continuant',
  'continuant fiat boundary',
  'cowpea (pulse) food product',
  'cowpea vegetable food product',
  'crevasse',
  'crevice',
  'dead cave',
  'dense connective tissue',
  'digestive system',
  'dimictic lake',
  'disposition',
  'distributed',
  'ecoregion',
  'ecozone',
  'edible',
  'elastic',
  'entire sense organ system',
  'excretory system',
  'fluid astronomical body part',
  'formal settlement',
  'generically dependent continuant',
  'glacier',
  'hematopoietic system',
  'holomictic lake',
  'hydroform',
  'hydrophilicity',
  'hydrophobicity',
  'illegal settlement',
  'immaterial anatomical

In [15]:
# Rebind the model name; requests issued after this cell use gemini-2.5-pro.
model = "gemini-2.5-pro"

In [16]:
from tqdm import tqdm

# Collects the raw response text of one request per cluster.
response_final = []

# The request configuration and model are the same for every cluster, so
# build/print them once instead of on every iteration.
generation_config = {"response_mime_type": "application/json"}
print(model)

# Wrap the list itself (not the enumerate iterator) so tqdm can read its
# length and show a real total/ETA instead of "0it".
for i, term_cluster in enumerate(tqdm(final_output)):
    prompt=f"""You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning. Your task will be guided by a specific domain context provided to you.

**Context (Provided as Input):**
-   **Domain Name:** {domain['domain_name']}
-   **Domain Description:** {domain['domain_description']}

Your task is to take a single cluster of terms and a list of relationships, and generate all possible relationship triples (head, relation, tail) *between terms within that cluster*. You must use your knowledge of the identified domain and only use the specified relationships.

**Instructions:**

1.  **Process the Cluster:**
    *   Examine all possible pairs of terms within the given input cluster.
    *   Using your knowledge of the **identified domain** (as described in the provided context), determine if a pair can be connected by one of the clustering relationships. If a valid connection exists, create a relationship triple `(head, relation, tail)`.
    *   Only include triples where both the `head` and `tail` are terms from the input cluster and the `relation` is in the provided list of clustering relationships.
2.  **Handle Special Cases:**
    *   If the cluster contains one or zero terms, return an empty list, as no pairs are possible.
    *   If no valid relationships can be inferred for any pair in the cluster based on your knowledge of the domain, return an empty list.
    *   If the list of clustering relationships is empty, return an empty list.
3.  **Format the Output:**
    *   Return a Python list containing the generated triples, where each triple is a tuple.
    *   Each triple must be in the format `(head, relation, tail)`.
    *   Sort the final list of triples alphabetically by head, then by relation, and then by tail.

**Task:**
For the given cluster of terms, and guided by the provided domain context, generate all possible relationship triples using the given clustering relationships {test_relations}. Use your knowledge of the identified domain to infer valid relationships only between terms within the cluster.

-   **Cluster of Terms:** {term_cluster}
-   **List of Clustering Relationships:** {test_relations}

Return the result as a Python list of triples.
"""
    response = client.models.generate_content(
        contents=prompt,
        config=generation_config,
        model=model,
    )
    response_final.append(response.text)
    # Echo every tenth response as a spot check on the output.
    if i % 10 == 0:
        print(response.text)

0it [00:00, ?it/s]

gemini-2.5-pro


1it [01:49, 109.14s/it]

[
  [
    "aerobic",
    "disjointWith",
    "anaerobic"
  ],
  [
    "aggregated",
    "disjointWith",
    "distributed"
  ],
  [
    "amictic lake",
    "disjointWith",
    "holomictic lake"
  ],
  [
    "amictic lake",
    "disjointWith",
    "meromictic lake"
  ],
  [
    "anaerobic",
    "disjointWith",
    "aerobic"
  ],
  [
    "articular system",
    "seeAlso",
    "musculoskeletal system"
  ],
  [
    "articular system",
    "seeAlso",
    "skeletal system"
  ],
  [
    "biome",
    "seeAlso",
    "ecoregion"
  ],
  [
    "biome",
    "seeAlso",
    "ecozone"
  ],
  [
    "biome",
    "seeAlso",
    "marine ecosystem"
  ],
  [
    "bone element",
    "seeAlso",
    "bone tissue"
  ],
  [
    "bone tissue",
    "disjointWith",
    "muscle tissue"
  ],
  [
    "bone tissue",
    "disjointWith",
    "neural tissue"
  ],
  [
    "bone tissue",
    "seeAlso",
    "bone element"
  ],
  [
    "caldera",
    "seeAlso",
    "volcanic crater"
  ],
  [
    "circulatory system",
    "disj




In [17]:
# Display the list of raw response texts collected by the loop.
response_final

['[\n  [\n    "aerobic",\n    "disjointWith",\n    "anaerobic"\n  ],\n  [\n    "aggregated",\n    "disjointWith",\n    "distributed"\n  ],\n  [\n    "amictic lake",\n    "disjointWith",\n    "holomictic lake"\n  ],\n  [\n    "amictic lake",\n    "disjointWith",\n    "meromictic lake"\n  ],\n  [\n    "anaerobic",\n    "disjointWith",\n    "aerobic"\n  ],\n  [\n    "articular system",\n    "seeAlso",\n    "musculoskeletal system"\n  ],\n  [\n    "articular system",\n    "seeAlso",\n    "skeletal system"\n  ],\n  [\n    "biome",\n    "seeAlso",\n    "ecoregion"\n  ],\n  [\n    "biome",\n    "seeAlso",\n    "ecozone"\n  ],\n  [\n    "biome",\n    "seeAlso",\n    "marine ecosystem"\n  ],\n  [\n    "bone element",\n    "seeAlso",\n    "bone tissue"\n  ],\n  [\n    "bone tissue",\n    "disjointWith",\n    "muscle tissue"\n  ],\n  [\n    "bone tissue",\n    "disjointWith",\n    "neural tissue"\n  ],\n  [\n    "bone tissue",\n    "seeAlso",\n    "bone element"\n  ],\n  [\n    "caldera",\n    "s

In [18]:
# Flatten every per-cluster JSON response into one list of
# {"head", "relation", "tail"} records, reading each triple by position.
final_prep_output = [
    {
        "head": triple[0],
        "relation": triple[1],
        "tail": triple[2],
    }
    for raw_response in response_final
    for triple in json.loads(raw_response)
]

In [19]:
# Number of triple records collected.
len(final_prep_output)

208

In [20]:
# Display the flattened triple records.
final_prep_output

[{'head': 'aerobic', 'relation': 'disjointWith', 'tail': 'anaerobic'},
 {'head': 'aggregated', 'relation': 'disjointWith', 'tail': 'distributed'},
 {'head': 'amictic lake',
  'relation': 'disjointWith',
  'tail': 'holomictic lake'},
 {'head': 'amictic lake',
  'relation': 'disjointWith',
  'tail': 'meromictic lake'},
 {'head': 'anaerobic', 'relation': 'disjointWith', 'tail': 'aerobic'},
 {'head': 'articular system',
  'relation': 'seeAlso',
  'tail': 'musculoskeletal system'},
 {'head': 'articular system',
  'relation': 'seeAlso',
  'tail': 'skeletal system'},
 {'head': 'biome', 'relation': 'seeAlso', 'tail': 'ecoregion'},
 {'head': 'biome', 'relation': 'seeAlso', 'tail': 'ecozone'},
 {'head': 'biome', 'relation': 'seeAlso', 'tail': 'marine ecosystem'},
 {'head': 'bone element', 'relation': 'seeAlso', 'tail': 'bone tissue'},
 {'head': 'bone tissue', 'relation': 'disjointWith', 'tail': 'muscle tissue'},
 {'head': 'bone tissue', 'relation': 'disjointWith', 'tail': 'neural tissue'},
 {'he

In [21]:
import json

# Serialise the triple records as indented JSON (non-ASCII characters kept
# as-is) and write them to the predictions file.
predictions_json = json.dumps(final_prep_output, indent=2, ensure_ascii=False)
with open("predictions_BILND_llm_re_pairs.json", "w", encoding="utf-8") as f:
    f.write(predictions_json)
