In [24]:
import json
def load_json(data_path):
    """Load and return the parsed contents of a JSON file.

    Args:
        data_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of the platform default so that
    # non-ASCII terms (e.g. "Alfvén number") load identically on every OS.
    with open(data_path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


In [25]:
# Load the terms to classify: one term per line of the input file, with the
# newline characters stripped from both ends of each line.
test_data = []
with open("C11-Blind-Types.txt", "r", encoding="utf-8") as terms_file:
    test_data.extend(raw_line.strip('\n') for raw_line in terms_file.readlines())

print(test_data)

['protein mass fraction', 'area unit', 'carbon dioxide equivalent mass per energy unit', 'number manual firmness 3.5', 'reddening (U-B)', 'prefixed hertz', 'Streptococcus uberis count (specific)', 'starch mass fraction', 'white light magnitude', 'prefixed metre per prefixed secon (time) squared', 'quantity of dimension one', 'color area fraction', 'quantity', 'magnetic field', 'admittance', 'radiant intensity', 'power unit', 'coverage', 'prefixed litre', 'potential difference', 'I magnitude', 'mustard powder mass fraction', 'area fraction unit', 'information capacity', 'Réaumur temperature', 'Staphylococcus aureus count (volumetric)', 'number malformed flowers', 'permeability (earth science) unit', 'mass unit', 'half-life', 'number color 4', 'responsivity unit', 'Stanton number for mass transfer unit', 'radiant flux', 'right ascension', 'Cousins magnitude', 'total density parameter', 'Streptococcus uberis count (volumetric)', 'number leaves', 'Strouhal number', 'number wilted flowers',

In [26]:
# Number of terms read from the input file.
len(test_data)

798

In [None]:
import os

from google import genai

# Model name passed to the generate_content requests that follow.
model = "gemini-2.5-flash"

# Never store an API key in the notebook itself: read it from the environment
# and fail early with a clear message when it is missing.
_api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not _api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable "
        "before creating the Gemini client."
    )
client = genai.Client(api_key=_api_key)

In [28]:
# Prompt asking the model to name and describe the knowledge domain that the
# loaded terms belong to. The complete `test_data` list is interpolated into
# the f-string, so this cell must run after the terms have been loaded.
prompt_domain_extraction = f"""You are an expert AI assistant specializing in knowledge representation, ontology, and domain analysis. Your task is to analyze a given list of terms to identify the specific knowledge domain they belong to.

**Instructions:**

1.  **Analyze Input:** You will be given a single list of terms. Carefully examine these terms to find the common theme or scientific field that connects them. Consider what subject matter these terms represent (e.g., biology, geography, technology, food production).
2.  **Generate Domain Name:** Based on your analysis, create a concise and descriptive name for the domain. For example, "Food Science and Production" or "Earth and Environmental Science."
3.  **Generate Domain Description:** Write a 4-5 line description of this domain. This description should explain:
    *   The general area of study or industry it covers.
    *   The types of concepts the terms represent within this domain.
    *   How concepts within this domain are typically related (e.g., one can be a part of another, one can be transformed into another, or one can be an ingredient of another).
    *   The overall purpose of structuring knowledge in this domain (e.g., for scientific analysis, traceability, data interoperability).
4.  **Format the Output:** Return a Python dictionary containing two keys: `domain_name` and `domain_description`.

**Task:**
Analyze the following list of terms and generate a domain name and description.

-   **List of Terms:** {test_data}

Return the result as a Python dictionary."""

In [29]:
# Request a JSON reply so the response body can be parsed with json.loads.
generation_config = {"response_mime_type": "application/json"}
print(len(prompt_domain_extraction))  # prompt size in characters
response = client.models.generate_content(
    model=model,
    contents=prompt_domain_extraction,
    config=generation_config,
)

# Fail with a clear message instead of an opaque TypeError from json.loads
# when the reply carries no text.
if not response.text:
    raise ValueError("Domain extraction returned no text to parse.")
domain = json.loads(response.text)

# Later cells index domain['domain_name'] and domain['domain_description'],
# so verify the reply has that shape before continuing.
if not isinstance(domain, dict) or not {"domain_name", "domain_description"} <= domain.keys():
    raise ValueError(f"Unexpected domain-extraction response: {domain!r}")

20052


In [30]:
# Show the domain name and description parsed from the model's reply.
domain

{'domain_name': 'Scientific and Technical Quantification Systems',
 'domain_description': 'This domain focuses on the systematic representation and definition of measurable physical quantities, dimensionless numbers, and observable attributes across diverse scientific and technical fields. It encompasses fundamental units, derived quantities, and specialized metrics found in areas like astrophysics, fluid dynamics, food science, and microbiology, alongside qualitative observations often assigned numerical scales. Concepts are typically related by measurement relationships, part-whole compositions for complex quantities, or categorizations of observed phenomena. The primary purpose is to ensure consistent data interpretation, enable interoperability across different scientific and industrial datasets, and facilitate rigorous quantitative analysis.'}

In [None]:
# Prompt asking the model to group the terms into clusters connected by
# 'is-a' / 'part-of' links. It interpolates the domain name and description
# from `domain` and the complete `test_data` list.
prompt_cluster = f"""You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning. You will be provided with a domain name and a description to set the context for your task.

Your primary task is to group a list of terms into clusters by identifying their **parent-child relationships**. In this context, a **parent-child relationship** is a fundamental connection where one term is a direct 'parent' (a broader category) or a 'child' (a more specific instance or part) of another. This includes:
*   **'is-a' relationships (Type/Subtype):** Where the child term is a specific type of the parent term (e.g., 'River' is-a 'Body of Water').
*   **'part-of' relationships (Whole/Part):** Where the child term is a component or part of the parent term (e.g., 'Sea' is-part-of 'Ocean').

You will cluster terms that are linked directly or indirectly by these fundamental parent-child connections.

**Context (Provided as Input):**
-   **Domain Name:** {domain['domain_name']}
-   **Domain Description:** {domain['domain_description']}

**Instructions:**

1.  **Build the Graph:**
    *   Treat each term from the input list as a point (node) in a graph.
    *   Using your knowledge of the **identified domain** (contextualized by the provided name and description), check every pair of terms to see if a **parent-child relationship** ('is-a' or 'part-of') exists between them.
    *   If such a relationship exists, create a two-way (undirected) link between the parent and child terms for clustering purposes.
2.  **Form Clusters:**
    *   Group terms into clusters where terms are connected either directly or indirectly through the identified **parent-child** links.
    *   Terms with no identifiable parent-child connections to other terms in the list will form their own single-term clusters.
    *   Ensure every term from the input list appears in the output.
3.  **Handle Special Cases:**
    *   If the terms list is empty, return an empty list of clusters: [].
4.  **Format the Output:**
    *   Return a Python list of lists, where each inner list contains the terms in a cluster.
    *   Do not include the relationships in the output, only the grouped terms.
    *   Sort the clusters alphabetically by the first term in each cluster.
    *   Within each cluster, sort the terms alphabetically.

**Task:**
Group the following terms into clusters by identifying all direct and indirect **parent-child (e.g:-'is-a' and 'part-of') relationships** between them, guided by the provided domain context.

-   **List of Terms:** {test_data}

Return the result as a Python list of lists, where each inner list represents a cluster of terms."""

In [10]:
# Size of the clustering prompt in characters.
print(len(prompt_cluster))

21965


In [11]:
# Request a JSON reply so the response body can be parsed with json.loads.
generation_config = {"response_mime_type": "application/json"}

response = client.models.generate_content(
    model=model,
    contents=prompt_cluster,
    config=generation_config,
)

# Fail with a clear message instead of an opaque TypeError from json.loads
# when the reply carries no text.
if not response.text:
    raise ValueError("Clustering returned no text to parse.")
clusters_list = json.loads(response.text)

# Later cells iterate over the clusters and call len() on each one, so make
# sure the reply really is a list of lists before continuing.
if not isinstance(clusters_list, list) or not all(
    isinstance(cluster, list) for cluster in clusters_list
):
    raise ValueError("Clustering response is not a list of lists of terms.")

In [12]:
# Show the clusters parsed from the model's JSON reply.
clusters_list

[['1040 nm Lockwood magnitude',
  'Alfvén number',
  'Alfvén number unit',
  'absolute bolometric magnitude',
  'absolute magnitude',
  'absorbed dose',
  'absorbed dose rate',
  'absorbed dose rate unit',
  'absorbed dose unit',
  'acceleration',
  'acceleration unit',
  'acidity',
  'action',
  'action unit',
  'activity',
  'activity unit',
  'admittance',
  'albedo',
  'altitude',
  'amount of money',
  'amount of money unit',
  'amount of substance',
  'amount of substance concentration',
  'amount of substance concentration unit',
  'amount of substance flow',
  'amount of substance flow unit',
  'amount of substance fraction',
  'amount of substance fraction flow',
  'amount of substance fraction unit',
  'amount of substance unit',
  'amphiphilicity',
  'amplitude',
  'angle',
  'angle unit',
  'angular acceleration',
  'angular acceleration unit',
  'angular displacement',
  'angular momentum',
  'angular momentum unit',
  'angular speed',
  'angular speed unit',
  'angular si

In [13]:
# Report how many terms each cluster contains, one size per line.
for cluster_terms in clusters_list:
    cluster_size = len(cluster_terms)
    print(cluster_size)

773


In [17]:
# Rebind `model`: generate_content calls executed after this cell pass this
# name instead of the one assigned when the client was created.
model= "gemini-2.5-pro"

In [18]:
from tqdm import tqdm

response_final = []

# Inputs expected from earlier cells:
#   clusters_list - list of term clusters (each cluster is a list of terms)
#   domain        - dict with 'domain_name' and 'domain_description'
#   client, model - initialised Gemini client and the model name to call

# Loop-invariant request settings: ask for a JSON reply.
generation_config = {"response_mime_type": "application/json"}

# Wrap the list itself (not the enumerate generator) so tqdm can read its
# length and show a real progress bar instead of "0it [00:00, ?it/s]".
for i, term_cluster in enumerate(tqdm(clusters_list)):
    # This prompt asks the model to discover the relationship itself
    # instead of just checking against a predefined list.
    # Clusters with fewer than two terms contain no pair, so no request is sent.
    if len(term_cluster)>1:
        prompt = f"""You are an expert AI assistant with deep expertise in knowledge graphs and ontology reasoning. Your task will be guided by a specific domain context provided to you.

    **Context (Provided as Input):**
    -   **Domain Name:** {domain['domain_name']}
    -   **Domain Description:** {domain['domain_description']}

    Your task is to take a single cluster of terms and discover all possible **parent-child relationship triples** (head, relation, tail) *between terms within that cluster*. A parent-child relationship can be either:
    *   **'is-a' (Type/Subtype):** Where the child term is a specific type of the parent term.
    *   **'part-of' (Whole/Part):** Where the child term is a component or part of the parent term.

    **Instructions:**

    1.  **Process the Cluster:**
        *   Examine all possible pairs of terms within the given input cluster.
        *   Using your knowledge of the **identified domain**, determine if a direct **parent-child ('is-a' or 'part-of') relationship** exists between them.
        *   If a relationship is found, create a triple `(head, relation, tail)`. The `relation` in the triple should be a string that describes the discovered relationship (e.g., 'is-a', 'part-of').
        *   Only include triples where both the `head` and `tail` are terms from the input cluster.

    2.  **Handle Special Cases:**
        *   If the cluster contains one or zero terms, return an empty list.
        *   If no parent-child relationships can be inferred for any pair in the cluster, return an empty list.

    3.  **Format the Output:**
        *   Return a Python list containing the generated triples, where each triple is a tuple.
        *   Each triple must be in the format `(head, relation, tail)`.
        *   Sort the final list of triples alphabetically by head, then by relation, and then by tail.

    **Task:**
    For the given cluster of terms, and guided by the provided domain context, discover and generate all possible parent-child relationship triples.

    -   **Cluster of Terms:** {term_cluster}

    Return the result as a Python list of triples.
    """
        print(len(prompt), model)

        response = client.models.generate_content(
            model=model,
            contents=prompt,
            config=generation_config,
        )

        # An empty reply cannot be parsed by the json.loads step that consumes
        # response_final, so report it and move on instead of storing it.
        if not response.text:
            print(f"--- No text returned for cluster {i}; skipping ---")
            continue

        response_final.append(response.text)

        # Print the raw reply for every 10th cluster index as a spot check.
        if i % 10 == 0:
            print(f"--- Response for cluster {i} ---")
            print(response.text)

# After the loop, response_final holds one JSON string (a list of discovered
# triples) per cluster that had more than one term. Single-term clusters are
# skipped, so its indices do not line up with clusters_list.

0it [00:00, ?it/s]

20946 gemini-2.5-pro


1it [04:22, 262.18s/it]

--- Response for cluster 0 ---
[
  [
    "1040 nm Lockwood magnitude",
    "is-a",
    "magnitude"
  ],
  [
    "Alfvén number",
    "is-a",
    "quantity of dimension one"
  ],
  [
    "Alfvén number unit",
    "is-a",
    "quantity of dimension one unit"
  ],
  [
    "absolute bolometric magnitude",
    "is-a",
    "absolute magnitude"
  ],
  [
    "absolute bolometric magnitude",
    "is-a",
    "bolometric magnitude"
  ],
  [
    "absolute magnitude",
    "is-a",
    "magnitude"
  ],
  [
    "absorbed dose rate unit",
    "is-a",
    "unit"
  ],
  [
    "absorbed dose unit",
    "is-a",
    "unit"
  ],
  [
    "acceleration unit",
    "is-a",
    "unit"
  ],
  [
    "action unit",
    "is-a",
    "unit"
  ],
  [
    "activity unit",
    "is-a",
    "unit"
  ],
  [
    "albedo",
    "is-a",
    "ratio"
  ],
  [
    "altitude",
    "is-a",
    "angle"
  ],
  [
    "amount of money unit",
    "is-a",
    "unit"
  ],
  [
    "amount of substance concentration unit",
    "is-a",
    "un




In [19]:
# Number of raw model replies collected by the triple-extraction loop.
len(response_final)

1

In [20]:
# Flatten every stored reply into child/parent records: the first element of
# each triple becomes the child and the last element becomes the parent.
final_ans = []
for raw_response in response_final:
    final_ans.extend(
        {"child": triple[0], "parent": triple[-1]}
        for triple in json.loads(raw_response)
    )

In [21]:
# Number of child/parent pairs extracted from the replies.
len(final_ans)

571

In [22]:
# Show the extracted child/parent pairs.
final_ans

[{'child': '1040 nm Lockwood magnitude', 'parent': 'magnitude'},
 {'child': 'Alfvén number', 'parent': 'quantity of dimension one'},
 {'child': 'Alfvén number unit', 'parent': 'quantity of dimension one unit'},
 {'child': 'absolute bolometric magnitude', 'parent': 'absolute magnitude'},
 {'child': 'absolute bolometric magnitude', 'parent': 'bolometric magnitude'},
 {'child': 'absolute magnitude', 'parent': 'magnitude'},
 {'child': 'absorbed dose rate unit', 'parent': 'unit'},
 {'child': 'absorbed dose unit', 'parent': 'unit'},
 {'child': 'acceleration unit', 'parent': 'unit'},
 {'child': 'action unit', 'parent': 'unit'},
 {'child': 'activity unit', 'parent': 'unit'},
 {'child': 'albedo', 'parent': 'ratio'},
 {'child': 'altitude', 'parent': 'angle'},
 {'child': 'amount of money unit', 'parent': 'unit'},
 {'child': 'amount of substance concentration unit', 'parent': 'unit'},
 {'child': 'amount of substance flow unit', 'parent': 'unit'},
 {'child': 'amount of substance fraction', 'parent'

In [23]:
# Persist the child/parent pairs as indented JSON, keeping non-ASCII
# characters unescaped in the UTF-8 output file.
with open(
    "predictions_flash_clustering_pro_final_c_10_model_clustering_pairs.json",
    "w",
    encoding="utf-8",
) as predictions_file:
    json.dump(final_ans, predictions_file, indent=2, ensure_ascii=False)
