In [1]:
import json
def load_json(data_path):
    with open(data_path, "r", encoding="utf-8") as file_1:
        data = json.load(file_1)
    return data

In [2]:
data = load_json("MatOnto/train/matonto_train_pairs.json")

In [3]:
len(data), data[:5]

(840,
 [{'ID': 'TR_7e0265b4', 'parent': 'measured property', 'child': 'area'},
  {'ID': 'TR_75953334', 'parent': 'element', 'child': 'Lanthanum'},
  {'ID': 'TR_ffd73cfe',
   'parent': 'measured property',
   'child': 'molecular mass'},
  {'ID': 'TR_8f425977', 'parent': 'measured property', 'child': 'frequency'},
  {'ID': 'TR_69a9e06b', 'parent': 'element', 'child': 'Europium'}])

In [4]:
from collections import defaultdict

class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        # Path compression
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(x)] = self.find(y)

def build_clusters(relationships):
    uf = UnionFind()
    
    for rel in relationships:
        parent = rel["parent"]
        child = rel["child"]
        uf.union(parent, child)

    clusters = defaultdict(set)
    for term in set([item for rel in relationships for item in (rel["parent"], rel["child"])]):
        root = uf.find(term)
        clusters[root].add(term)

    # Add isolated nodes (no parent/child relationship)
    all_terms = set()
    for rel in relationships:
        all_terms.add(rel["parent"])
        all_terms.add(rel["child"])
    for term in all_terms:
        if term not in uf.parent:
            clusters[term].add(term)

    return list(clusters.values())


In [5]:
clusters = build_clusters(data)

In [6]:
len(clusters)

29

In [7]:
clusters

[{'Actinium',
  'Arsenic',
  'Barium',
  'Beryllium',
  'Boron',
  'Bromine',
  'Cadmium',
  'Caesium',
  'Calcium',
  'Californium',
  'Cerium',
  'Curium',
  'Darmstadtium',
  'Dysprosium',
  'Einsteinium',
  'Erbium',
  'Europium',
  'Fluorine',
  'Gadolinium',
  'Germanium',
  'Gold',
  'Hafnium',
  'Holmium',
  'Hydrogen',
  'Iron',
  'Krypton',
  'Lanthanum',
  'Lawrencium',
  'Lead',
  'Lithium',
  'Lutetium',
  'Magnesium',
  'Meitnerium',
  'Mercury',
  'Molybdenum',
  'Osmium',
  'Oxygen',
  'Palladium',
  'Phosphorus',
  'Plutonium',
  'Praseodymium',
  'Promethium',
  'Radium',
  'Radon',
  'Rubidium',
  'Ruthenium',
  'Rutherfordium',
  'Scandium',
  'Seaborgium',
  'Selenium',
  'Silver',
  'Sodium',
  'Strontium',
  'Sulfur',
  'Tellurium',
  'Terbium',
  'Tin',
  'Titanium',
  'Tungsten',
  'Ununbium',
  'Ununhexium',
  'Ununnilium',
  'Ununoctium',
  'Ununquadium',
  'Ununseptium',
  'Ununtrium',
  'Uranium',
  'Vanadium',
  'Xenon',
  'Yttrium',
  'Zinc',
  'Zirconium

In [8]:
# Load a text file in Python
final_content = []
with open("MatOnto/test/matonto_test_types.txt", "r", encoding="utf-8") as f:
    contents = f.readlines()
    for content in contents:
        final_content.append(content.strip('\n'))


print(contents[:500])  # Print first 500 characters


['carbon allotrope\n', 'Silicon\n', 'specific volume\n', 'Lattice Parameter B\n', 'element\n', 'carbon group\n', 'Carbon\n', 'addition reaction\n', 'Hardness Range Lower Bound\n', 'carboxylic acid\n', 'bose einstein condensate\n', 'energy density unit\n', 'molar energy unit\n', 'Antimony Atom\n', 'parts per notation\n', 'conduction\n', 'organic compound\n', 'displacement reaction\n', 'super fluid phase\n', 'exposure unit\n', 'point group\n', 'object_aggregate\n', 'pH\n', 'flexural strength\n', 'role\n', 'process\n', 'hydrophilic\n', 'electric charge density\n', 'Tantalum\n', 'group 6 element\n', 'Barium Atom\n', 'Chromium Atom\n', 'unit\n', 'Lattice Parameter A\n', 'inorganic reaction\n', 'catalytic activity\n', 'Ununpentium\n', 'radiant intensity\n', 'area density\n', 'Ytterbium\n', 'hydrocarbon\n', 'partition coefficient\n', 'secondary amine\n', 'organic reaction\n', 'elastic modulus\n', 'polarity\n', 'acid-base reaction\n', 'youngs modulus\n', 'Iridium\n', 'electric current\n', 'amo

In [9]:
len(contents), contents[0]

(370, 'carbon allotrope\n')

In [10]:
len(final_content), final_content[0]

(370, 'carbon allotrope')

In [11]:
# Step 3: Assign function
from collections import Counter
from tqdm.notebook import tqdm
def assign_term_to_cluster(new_term, clusters):
    new_words = new_term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for term in cluster:
            for word in term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)
    
    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]
    
    if max_score == 0:
        return None  # No match
    return best_cluster_index

# Step 4: Assign all new terms
final_clusters = [set(cluster) for cluster in clusters]  # copy original clusters

for term in tqdm(final_content):
    assigned_idx = assign_term_to_cluster(term, final_clusters)
    if assigned_idx is not None:
        final_clusters[assigned_idx].add(term)
    else:
        # Create new cluster if no match found
        final_clusters.append(set([term]))

# Step 5: Convert to list of lists and return
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

# Display final clusters
for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")

  0%|          | 0/370 [00:00<?, ?it/s]

Cluster 1: ['Actinium', 'Arsenic', 'Barium', 'Beryllium', 'Boron', 'Bromine', 'Cadmium', 'Caesium', 'Calcium', 'Californium', 'Cerium', 'Curium', 'Darmstadtium', 'Dysprosium', 'Einsteinium', 'Erbium', 'Europium', 'Fluorine', 'Gadolinium', 'Germanium', 'Gold', 'Hafnium', 'Holmium', 'Hydrogen', 'Iron', 'Krypton', 'Lanthanum', 'Lawrencium', 'Lead', 'Lithium', 'Lutetium', 'Magnesium', 'Meitnerium', 'Mercury', 'Molybdenum', 'Osmium', 'Oxygen', 'Palladium', 'Phosphorus', 'Plutonium', 'Praseodymium', 'Promethium', 'Radium', 'Radon', 'Rubidium', 'Ruthenium', 'Rutherfordium', 'Scandium', 'Seaborgium', 'Selenium', 'Silver', 'Sodium', 'Strontium', 'Sulfur', 'Tellurium', 'Terbium', 'Tin', 'Titanium', 'Tungsten', 'Ununbium', 'Ununhexium', 'Ununnilium', 'Ununoctium', 'Ununquadium', 'Ununseptium', 'Ununtrium', 'Uranium', 'Vanadium', 'Xenon', 'Yttrium', 'Zinc', 'Zirconium', 'element']
Cluster 2: ['Actinium Atom', 'Alloy', 'Aluminium', 'Aluminium Atom', 'Americium', 'Americium Atom', 'Antimony', 'Antim

In [12]:
from google import genai
model = "gemini-2.5-pro"
client= genai.Client(="AIzaSyBUn48DZUNrfLO7OsOoU1wEWOOQV3Fx_44")

In [14]:
for i, list_of_elements in tqdm(enumerate(final_clusters_list[24:])):
    if len(list_of_elements)>1:
        prompt = f""""Analyze the following list of biological terms and identify all direct and indirect **parent-child relationships**.

        A **parent-child relationship** exists when one term (the parent) is a broader, more general category or a whole that conceptually encompasses or contains another term (the child). The child term is a more specific instance, a part, or a developmental stage of the parent term.

        **Guidelines for identifying relationships:**
        * **"Is-a" relationship:** e.g., "A 'leaf procambium' **is a** type of 'procambium'." (Parent: procambium, Child: leaf procambium)
        * **"Part-of" relationship:** e.g., "A 'petal' **is part of** a 'flower'." (Parent: flower, Child: petal)
        * **Developmental sequence:** e.g., "'Petal primordium visible stage' **is a stage in** 'petal development'." (Parent: petal development, Child: petal primordium visible stage)
        * **Anatomical hierarchy:** e.g., "'Root endodermis' **is part of** 'root' and **is a type of** 'endodermis'." (Parent: root and endodermis, Child: root endodermis)

        **List of Elements:**
        {list_of_elements}
        **Output Format:**
        Provide the output as a single JSON array. Each element in the array must be a JSON object with two keys: `"parent"` and `"child"`, whose values are the corresponding terms.

        **Example Output (Illustrative, showing the expected JSON structure):**
        {json.dumps({"Answer":[
        {
            "parent": "procambium",
            "child": "branch procambium"
        },
        {
            "parent": "procambium",
            "child": "leaf procambium"
        },
        {
            "parent": "shoot system",
            "child": "axillary shoot system"
        },
        {
            "parent": "flower",
            "child": "petal"
        },
        {
            "parent": "flower development",
            "child": "petal primordium visible stage"
        }
        ]})}
        """
        generation_config = {"response_mime_type": "application/json"}

        response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
        final_ans.append(response.text)
        if i%20==0:
            print(response.text)

0it [00:00, ?it/s]

[
  {
    "parent": "Ion",
    "child": "anion"
  },
  {
    "parent": "Ion",
    "child": "cation"
  }
]


In [16]:
final_ans

['[\n  {\n    "parent": "element",\n    "child": "Actinium"\n  },\n  {\n    "parent": "element",\n    "child": "Arsenic"\n  },\n  {\n    "parent": "element",\n    "child": "Barium"\n  },\n  {\n    "parent": "element",\n    "child": "Beryllium"\n  },\n  {\n    "parent": "element",\n    "child": "Boron"\n  },\n  {\n    "parent": "element",\n    "child": "Bromine"\n  },\n  {\n    "parent": "element",\n    "child": "Cadmium"\n  },\n  {\n    "parent": "element",\n    "child": "Caesium"\n  },\n  {\n    "parent": "element",\n    "child": "Calcium"\n  },\n  {\n    "parent": "element",\n    "child": "Californium"\n  },\n  {\n    "parent": "element",\n    "child": "Cerium"\n  },\n  {\n    "parent": "element",\n    "child": "Curium"\n  },\n  {\n    "parent": "element",\n    "child": "Darmstadtium"\n  },\n  {\n    "parent": "element",\n    "child": "Dysprosium"\n  },\n  {\n    "parent": "element",\n    "child": "Einsteinium"\n  },\n  {\n    "parent": "element",\n    "child": "Erbium"\n  },\n  {\n 

In [17]:
final_result = []
for ans in final_ans:
    ans_1 = json.loads(ans)
    for an in ans_1:
        if an!="Answer":
            if an not in final_result:
                final_result.append(an)
    

In [18]:
len(final_result)

915

In [19]:
final_result

[{'parent': 'element', 'child': 'Actinium'},
 {'parent': 'element', 'child': 'Arsenic'},
 {'parent': 'element', 'child': 'Barium'},
 {'parent': 'element', 'child': 'Beryllium'},
 {'parent': 'element', 'child': 'Boron'},
 {'parent': 'element', 'child': 'Bromine'},
 {'parent': 'element', 'child': 'Cadmium'},
 {'parent': 'element', 'child': 'Caesium'},
 {'parent': 'element', 'child': 'Calcium'},
 {'parent': 'element', 'child': 'Californium'},
 {'parent': 'element', 'child': 'Cerium'},
 {'parent': 'element', 'child': 'Curium'},
 {'parent': 'element', 'child': 'Darmstadtium'},
 {'parent': 'element', 'child': 'Dysprosium'},
 {'parent': 'element', 'child': 'Einsteinium'},
 {'parent': 'element', 'child': 'Erbium'},
 {'parent': 'element', 'child': 'Europium'},
 {'parent': 'element', 'child': 'Fluorine'},
 {'parent': 'element', 'child': 'Gadolinium'},
 {'parent': 'element', 'child': 'Germanium'},
 {'parent': 'element', 'child': 'Gold'},
 {'parent': 'element', 'child': 'Hafnium'},
 {'parent': 'el

In [20]:
with open("predictions_c_Matonto_pairs.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)


In [19]:
import json

# Save final_clusters_list to JSON file
with open("final_result_keywords_c_po.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)

print("Clusters saved to final_clusters.json ✅")

Clusters saved to final_clusters.json ✅


In [26]:
with open("_pairs.jsonl", "w", encoding="utf-8") as f:
    for entry in final_result:
        json.dump(entry, f)
        f.write("\n")
