In [1]:
import json
def load_json(data_path):
    """Open ``data_path`` as UTF-8 text and return the decoded JSON document."""
    with open(data_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

In [2]:
# Training pairs: a list of {"ID", "parent", "child"} records.
# NOTE(review): this path points at the DOID training pairs, but the cluster
# output shown further down contains Plant Ontology terms and the test types
# are read from PO/test -- confirm which ontology's training file belongs here.
data = load_json("DOID/train/doid_train_pairs.json")

In [3]:
len(data), data[:5]

(28924,
 [{'ID': 'TR_2ed4e26c',
   'parent': 'autosomal dominant disease',
   'child': 'autosomal dominant Emery-Dreifuss muscular dystrophy 5'},
  {'ID': 'TR_18d3b799',
   'parent': 'autosomal recessive disease',
   'child': 'congenital disorder of glycosylation Iq'},
  {'ID': 'TR_934e161f',
   'parent': 'squamous cell carcinoma',
   'child': 'tonsil squamous cell carcinoma'},
  {'ID': 'TR_bc701c51',
   'parent': 'retinoschisis',
   'child': 'X-linked juvenile retinoschisis 1'},
  {'ID': 'TR_932695e9',
   'parent': 'lymph node disease',
   'child': 'lymph node tuberculosis'}])

In [4]:
from collections import defaultdict

class UnionFind:
    """Disjoint-set structure over hashable items, registered lazily on first use.

    ``parent`` maps every item seen so far to its parent; a root maps to itself.
    """

    def __init__(self):
        self.parent = {}

    def find(self, x):
        """Return the root of the set containing ``x``, registering ``x`` if unseen.

        Implemented iteratively: ``union`` can build parent chains of arbitrary
        length, and a recursive walk over such a chain raises ``RecursionError``.
        """
        parent = self.parent
        # Pass 1: climb to the root (an unseen item becomes its own root).
        root = parent.setdefault(x, x)
        while parent[root] != root:
            root = parent[root]
        # Pass 2 (path compression): point every node on the walked path at the root.
        while parent[x] != root:
            parent[x], x = root, parent[x]
        return root

    def union(self, x, y):
        """Merge the sets of ``x`` and ``y``; the root of ``x`` is attached under the root of ``y``."""
        root_y = self.find(y)
        self.parent[self.find(x)] = root_y

def build_clusters(relationships):
    """Group terms into connected components of the parent/child graph.

    Args:
        relationships: iterable of mappings with ``"parent"`` and ``"child"``
            keys. It is consumed in a single pass, so a generator is accepted.

    Returns:
        A list of sets, one per connected component. Components are listed in
        the order their first term appears in ``relationships``, so the result
        does not depend on string hash ordering.
    """
    uf = UnionFind()
    # A dict is used as an insertion-ordered set of every term seen.
    seen_terms = {}

    for rel in relationships:
        parent = rel["parent"]
        child = rel["child"]
        uf.union(parent, child)
        seen_terms[parent] = None
        seen_terms[child] = None

    # Every term has been through union(), so each one is already registered in
    # the union-find; there are no isolated terms left to add separately.
    clusters = defaultdict(set)
    for term in seen_terms:
        clusters[uf.find(term)].add(term)

    return list(clusters.values())


In [5]:
clusters = build_clusters(data)

In [6]:
len(clusters)

75

In [7]:
clusters

[{'PO_0000055',
  'aerial tuber axillary vegetative bud',
  'areole bud',
  'axillary bud',
  'axillary flower bud',
  'axillary inflorescence bud',
  'axillary reproductive bud',
  'axillary shoot system',
  'axillary strobilus bud',
  'axillary vegetative bud',
  'bostryx inflorescence',
  'bulb',
  'capitulum inflorescence',
  'catkin inflorescence',
  'cincinnus inflorescence',
  'compound cincinnus inflorescence',
  'compound drepanium inflorescence',
  'compound raceme inflorescence',
  'compound spike inflorescence',
  'compound umbel inflorescence',
  'corm',
  'corymb inflorescence',
  'cyme inflorescence',
  'determinate thyrse inflorescence',
  'disk flower',
  'ear floret',
  'ear inflorescence',
  'ear infructescence',
  'ear sessile spikelet',
  'ear spikelet',
  'flower',
  'flower bud',
  'gametophore',
  'gametophore bud',
  'indeterminate thyrse inflorescence',
  'inflorescence',
  'inflorescence branch crown',
  'inflorescence bud',
  'infructescence',
  'long shoot'

In [8]:
# Load the test term list: one term per line.
with open("PO/test/po_test_types.txt", "r", encoding="utf-8") as f:
    # Raw lines, trailing newlines included.
    contents = f.readlines()

# Cleaned terms with the line terminator removed. Blank lines are skipped so
# that an empty string never becomes a term of its own.
final_content = [line.strip('\n') for line in contents if line.strip()]

# Preview the first 500 raw lines (the slice counts list entries, not characters).
print(contents[:500])


['leaf lamina tooth\n', 'parenchyma cell\n', 'vascular leaf initiation stage\n', 'embryo hypocotyl\n', 'basal endosperm transfer layer\n', 'phloem\n', 'phyllome abaxial meristem\n', 'root cap\n', 'socket cell\n', 'root apical meristem\n', 'anthela inflorescence\n', 'axillary hair basal cell\n', 'thorn\n', 'xylem vessel\n', 'seminal root\n', 'root lateral meristem\n', 'microsporophyll\n', 'papilla cell\n', 'megasporangium endothecium\n', 'modified aleurone\n', 'plant axis differentiation zone\n', 'anther theca\n', 'stem base\n', 'tassel spikelet pair meristem\n', 'rhizoid meristematic apical cell\n', 'monarch protoxylem\n', 'endodermis\n', 'sporangium\n', 'portion of vascular tissue\n', 'intermediary companion cell\n', 'axillary inflorescence bud\n', 'root emergence stage\n', 'leaf substomatal cavity\n', 'plant ovule micropyle\n', 'xylem\n', 'scale leaf margin\n', 'seed chalaza\n', 'phyllome development stage\n', 'persistent sepal\n', 'plant structure\n', 'gynoecium ridge\n', 'apical me

In [9]:
len(contents), contents[0]

(916, 'leaf lamina tooth\n')

In [10]:
len(final_content), final_content[0]

(916, 'leaf lamina tooth')

In [11]:
# Step 3: Assign function
from collections import Counter
from tqdm.notebook import tqdm
def assign_term_to_cluster(new_term, clusters):
    """Pick the cluster whose vocabulary overlaps most with ``new_term``.

    Each cluster is scored by summing, over the lower-cased whitespace-separated
    words of ``new_term``, how many times that word occurs across the cluster's
    terms (a word repeated in ``new_term`` is counted once per repetition).

    Args:
        new_term: the term to place.
        clusters: sequence of iterables of term strings.

    Returns:
        Index of the highest-scoring cluster (the first one on ties), or
        ``None`` when no cluster shares a word with ``new_term`` -- including
        when ``clusters`` is empty.
    """
    new_words = new_term.lower().split()
    best_index = None
    best_score = 0

    for index, cluster in enumerate(clusters):
        word_count = Counter(
            word for term in cluster for word in term.lower().split()
        )
        score = sum(word_count[word] for word in new_words)
        # Strict comparison keeps the earliest cluster on ties and leaves the
        # result at None while every score is zero.
        if score > best_score:
            best_index = index
            best_score = score

    return best_index

# Step 4: Assign all new terms
final_clusters = [set(cluster) for cluster in clusters]  # copy original clusters

for term in tqdm(final_content):
    assigned_idx = assign_term_to_cluster(term, final_clusters)
    if assigned_idx is not None:
        final_clusters[assigned_idx].add(term)
    else:
        # Create new cluster if no match found
        final_clusters.append(set([term]))

# Step 5: Convert to list of lists and return
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

# Display final clusters
for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")

  0%|          | 0/916 [00:00<?, ?it/s]

Cluster 1: ['PO_0000055', 'aerial tuber axillary vegetative bud', 'anthela inflorescence', 'areole bud', 'axillary bud', 'axillary flower bud', 'axillary inflorescence bud', 'axillary reproductive bud', 'axillary shoot system', 'axillary strobilus bud', 'axillary vegetative bud', 'bostryx inflorescence', 'bulb', 'capitulum inflorescence', 'catkin inflorescence', 'cincinnus inflorescence', 'compound capitulum inflorescence', 'compound cincinnus inflorescence', 'compound drepanium inflorescence', 'compound raceme inflorescence', 'compound spike inflorescence', 'compound umbel inflorescence', 'corm', 'corymb inflorescence', 'cyme inflorescence', 'determinate thyrse inflorescence', 'disk flower', 'drepanium inflorescence', 'ear floret', 'ear inflorescence', 'ear infructescence', 'ear sessile spikelet', 'ear spikelet', 'flower', 'flower bud', 'flower fascicle', 'gametophore', 'gametophore bud', 'indeterminate thyrse inflorescence', 'inflorescence', 'inflorescence bract', 'inflorescence brac

In [None]:
import os

from google import genai

# Gemini model used for every request in this notebook.
model = "gemini-2.5-flash"

# The API key is read from the environment so that it is never stored in the
# notebook; fail here with a clear message rather than at the first request.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set GEMINI_API_KEY (or GOOGLE_API_KEY) before running this cell.")
client = genai.Client(api_key=api_key)

In [13]:
# Request settings are the same for every cluster: ask for a JSON response body.
generation_config = {"response_mime_type": "application/json"}

# Example shown to the model. It is a bare JSON array so that it agrees with the
# "single JSON array" instruction in the prompt's Output Format section.
example_output = json.dumps([
    {"parent": "procambium", "child": "branch procambium"},
    {"parent": "procambium", "child": "leaf procambium"},
    {"parent": "shoot system", "child": "axillary shoot system"},
    {"parent": "flower", "child": "petal"},
    {"parent": "flower development", "child": "petal primordium visible stage"},
])

# Raw JSON text returned by the model, one entry per queried cluster.
final_ans = []

# tqdm wraps the list itself (not the enumerate object) so the bar knows its total.
for i, list_of_elements in enumerate(tqdm(final_clusters_list)):
    # A cluster with a single term cannot contain a parent-child pair.
    if len(list_of_elements) <= 1:
        continue

    prompt = f"""Analyze the following list of biological terms and identify all direct and indirect **parent-child relationships**.

        A **parent-child relationship** exists when one term (the parent) is a broader, more general category or a whole that conceptually encompasses or contains another term (the child). The child term is a more specific instance, a part, or a developmental stage of the parent term.

        **Guidelines for identifying relationships:**
        * **"Is-a" relationship:** e.g., "A 'leaf procambium' **is a** type of 'procambium'." (Parent: procambium, Child: leaf procambium)
        * **"Part-of" relationship:** e.g., "A 'petal' **is part of** a 'flower'." (Parent: flower, Child: petal)
        * **Developmental sequence:** e.g., "'Petal primordium visible stage' **is a stage in** 'petal development'." (Parent: petal development, Child: petal primordium visible stage)
        * **Anatomical hierarchy:** e.g., "'Root endodermis' **is part of** 'root' and **is a type of** 'endodermis'." (Parent: root and endodermis, Child: root endodermis)

        **List of Elements:**
        {list_of_elements}
        **Output Format:**
        Provide the output as a single JSON array. Each element in the array must be a JSON object with two keys: `"parent"` and `"child"`, whose values are the corresponding terms.

        **Example Output (Illustrative, showing the expected JSON structure):**
        {example_output}
        """

    response = client.models.generate_content(
        contents=prompt,
        config=generation_config,
        model=model,
    )

    # NOTE(review): presumably `response.text` is None when the reply carries no
    # text part -- confirm against the SDK; such replies are skipped so that the
    # parsing step only ever sees strings.
    if response.text is None:
        print(f"Cluster {i}: no text in response, skipped")
        continue

    final_ans.append(response.text)
    # Print a sample response for clusters whose index is a multiple of 20.
    if i % 20 == 0:
        print(response.text)

0it [00:00, ?it/s]

[
  {
    "parent": "shoot system",
    "child": "axillary shoot system"
  },
  {
    "parent": "shoot system",
    "child": "primary shoot system"
  },
  {
    "parent": "shoot system",
    "child": "reproductive shoot system"
  },
  {
    "parent": "shoot system",
    "child": "vegetative shoot system"
  },
  {
    "parent": "shoot system",
    "child": "shoot-borne shoot system"
  },
  {
    "parent": "axillary bud",
    "child": "axillary flower bud"
  },
  {
    "parent": "flower bud",
    "child": "axillary flower bud"
  },
  {
    "parent": "axillary bud",
    "child": "axillary inflorescence bud"
  },
  {
    "parent": "inflorescence bud",
    "child": "axillary inflorescence bud"
  },
  {
    "parent": "axillary bud",
    "child": "axillary reproductive bud"
  },
  {
    "parent": "reproductive bud",
    "child": "axillary reproductive bud"
  },
  {
    "parent": "axillary bud",
    "child": "axillary strobilus bud"
  },
  {
    "parent": "strobilus bud",
    "child": "axillar

In [14]:
final_ans

['[\n  {\n    "parent": "shoot system",\n    "child": "axillary shoot system"\n  },\n  {\n    "parent": "shoot system",\n    "child": "primary shoot system"\n  },\n  {\n    "parent": "shoot system",\n    "child": "reproductive shoot system"\n  },\n  {\n    "parent": "shoot system",\n    "child": "vegetative shoot system"\n  },\n  {\n    "parent": "shoot system",\n    "child": "shoot-borne shoot system"\n  },\n  {\n    "parent": "axillary bud",\n    "child": "axillary flower bud"\n  },\n  {\n    "parent": "flower bud",\n    "child": "axillary flower bud"\n  },\n  {\n    "parent": "axillary bud",\n    "child": "axillary inflorescence bud"\n  },\n  {\n    "parent": "inflorescence bud",\n    "child": "axillary inflorescence bud"\n  },\n  {\n    "parent": "axillary bud",\n    "child": "axillary reproductive bud"\n  },\n  {\n    "parent": "reproductive bud",\n    "child": "axillary reproductive bud"\n  },\n  {\n    "parent": "axillary bud",\n    "child": "axillary strobilus bud"\n  },\n  {\n

In [28]:
# Flatten every model response into one list of {"parent", "child"} records.
final_result = []
for ans in final_ans:
    parsed = json.loads(ans)

    if isinstance(parsed, dict):
        if "parent" in parsed and "child" in parsed:
            # A lone pair object rather than an array of pairs.
            parsed = [parsed]
        else:
            # Responses shaped like {"Answer": [...]}: take the wrapped list.
            # Iterating the dict itself would only yield its keys and lose
            # every pair in that response.
            parsed = parsed.get("Answer") or []

    for pair in parsed:
        # Keep only well-formed pair objects.
        if isinstance(pair, dict) and "parent" in pair and "child" in pair:
            final_result.append(pair)
    

In [29]:
len(final_result)

849

In [30]:
final_result

[{'parent': 'shoot system', 'child': 'axillary shoot system'},
 {'parent': 'shoot system', 'child': 'primary shoot system'},
 {'parent': 'shoot system', 'child': 'reproductive shoot system'},
 {'parent': 'shoot system', 'child': 'vegetative shoot system'},
 {'parent': 'shoot system', 'child': 'shoot-borne shoot system'},
 {'parent': 'axillary bud', 'child': 'axillary flower bud'},
 {'parent': 'flower bud', 'child': 'axillary flower bud'},
 {'parent': 'axillary bud', 'child': 'axillary inflorescence bud'},
 {'parent': 'inflorescence bud', 'child': 'axillary inflorescence bud'},
 {'parent': 'axillary bud', 'child': 'axillary reproductive bud'},
 {'parent': 'reproductive bud', 'child': 'axillary reproductive bud'},
 {'parent': 'axillary bud', 'child': 'axillary strobilus bud'},
 {'parent': 'strobilus bud', 'child': 'axillary strobilus bud'},
 {'parent': 'axillary bud', 'child': 'axillary vegetative bud'},
 {'parent': 'vegetative bud', 'child': 'axillary vegetative bud'},
 {'parent': 'term

In [31]:
# Store the predicted pairs as pretty-printed UTF-8 JSON, keeping non-ASCII characters as-is.
with open("predictions_pairs.json", "w", encoding="utf-8") as out_file:
    out_file.write(json.dumps(final_result, indent=2, ensure_ascii=False))


In [19]:
import json

# Save the predicted parent/child pairs (`final_result`) to a JSON file.
output_path = "final_result_keywords_c_po.json"
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)

# Report the object and file actually written.
print(f"Pairs saved to {output_path} ✅")

Clusters saved to final_clusters.json ✅


In [26]:
# Write the pairs as JSON Lines: one serialized record per line.
with open("_pairs.jsonl", "w", encoding="utf-8") as sink:
    sink.writelines(json.dumps(entry) + "\n" for entry in final_result)
