In [1]:
import json
def load_json(data_path):
    with open(data_path, "r", encoding="utf-8") as file_1:
        data = json.load(file_1)
    return data

In [3]:
data = load_json("PO/train/po_train_pairs.json")

In [4]:
len(data), data[:5]

(2005,
 [{'ID': 'TR_32576b74',
   'parent': 'leaf lamina vein',
   'child': 'primary leaf vein'},
  {'ID': 'TR_8bdc58cb', 'parent': 'whole plant', 'child': 'thallus'},
  {'ID': 'TR_e4c6e138', 'parent': 'plant organ', 'child': 'coleorhiza'},
  {'ID': 'TR_81b052e4', 'parent': 'native plant cell', 'child': 'brachycyte'},
  {'ID': 'TR_bfbc72dc',
   'parent': 'inflorescence',
   'child': 'compound drepanium inflorescence'}])

In [5]:
from collections import defaultdict

class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        # Path compression
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(x)] = self.find(y)

def build_clusters(relationships):
    uf = UnionFind()
    
    for rel in relationships:
        parent = rel["parent"]
        child = rel["child"]
        uf.union(parent, child)

    clusters = defaultdict(set)
    for term in set([item for rel in relationships for item in (rel["parent"], rel["child"])]):
        root = uf.find(term)
        clusters[root].add(term)

    # Add isolated nodes (no parent/child relationship)
    all_terms = set()
    for rel in relationships:
        all_terms.add(rel["parent"])
        all_terms.add(rel["child"])
    for term in all_terms:
        if term not in uf.parent:
            clusters[term].add(term)

    return list(clusters.values())


In [6]:
clusters = build_clusters(data)

In [7]:
len(clusters)

75

In [8]:
clusters

[{'SE.01 one node or internode visible stage',
  'SE.02 two nodes or internodes visible stage',
  'SE.03 three nodes or internodes visible stage',
  'SE.04 four nodes or internodes visible stage',
  'SE.05 five nodes or internodes visible stage',
  'SE.06 six nodes or internodes visible stage',
  'SE.07 seven nodes or internodes visible stage',
  'SE.09 nine nodes or internodes visible stage',
  'SE.10 ten nodes or internodes visible stage',
  'SE.11 eleven nodes or internodes visible stage',
  'SE.12 twelve nodes or internodes visible stage',
  'SE.14 fourteen nodes or internodes visible stage',
  'SE.15 fifteen nodes or internodes visible stage',
  'SE.97 flag leaf visible stage',
  'stem elongation stage'},
 {'PO_0025349',
  'abaxial nucellar projection',
  'abaxial protoderm',
  'abscission zone',
  'adaxial nucellar projection',
  'adult vascular leaf',
  'adventitious root nodule',
  'aerenchyma',
  'aerial tuber cortex',
  'aerial tuber interfascicular region',
  'aerial tuber p

In [9]:
# Load a text file in Python
final_content = []
with open("PO/test/po_test_types.txt", "r", encoding="utf-8") as f:
    contents = f.readlines()
    for content in contents:
        final_content.append(content.strip('\n'))


print(contents[:500])  # Print first 500 characters


['leaf lamina tooth\n', 'parenchyma cell\n', 'vascular leaf initiation stage\n', 'embryo hypocotyl\n', 'basal endosperm transfer layer\n', 'phloem\n', 'phyllome abaxial meristem\n', 'root cap\n', 'socket cell\n', 'root apical meristem\n', 'anthela inflorescence\n', 'axillary hair basal cell\n', 'thorn\n', 'xylem vessel\n', 'seminal root\n', 'root lateral meristem\n', 'microsporophyll\n', 'papilla cell\n', 'megasporangium endothecium\n', 'modified aleurone\n', 'plant axis differentiation zone\n', 'anther theca\n', 'stem base\n', 'tassel spikelet pair meristem\n', 'rhizoid meristematic apical cell\n', 'monarch protoxylem\n', 'endodermis\n', 'sporangium\n', 'portion of vascular tissue\n', 'intermediary companion cell\n', 'axillary inflorescence bud\n', 'root emergence stage\n', 'leaf substomatal cavity\n', 'plant ovule micropyle\n', 'xylem\n', 'scale leaf margin\n', 'seed chalaza\n', 'phyllome development stage\n', 'persistent sepal\n', 'plant structure\n', 'gynoecium ridge\n', 'apical me

In [10]:
len(contents), contents[0]

(916, 'leaf lamina tooth\n')

In [11]:
len(final_content), final_content[0]

(916, 'leaf lamina tooth')

In [12]:
# Step 3: Assign function
from collections import Counter
from tqdm.notebook import tqdm
def assign_term_to_cluster(new_term, clusters):
    new_words = new_term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for term in cluster:
            for word in term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)
    
    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]
    
    if max_score == 0:
        return None  # No match
    return best_cluster_index

# Step 4: Assign all new terms
final_clusters = [set(cluster) for cluster in clusters]  # copy original clusters

for term in tqdm(final_content):
    assigned_idx = assign_term_to_cluster(term, final_clusters)
    if assigned_idx is not None:
        final_clusters[assigned_idx].add(term)
    else:
        # Create new cluster if no match found
        final_clusters.append(set([term]))

# Step 5: Convert to list of lists and return
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

# Display final clusters
for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")

  0%|          | 0/916 [00:00<?, ?it/s]

Cluster 1: ['SE.01 one node or internode visible stage', 'SE.02 two nodes or internodes visible stage', 'SE.03 three nodes or internodes visible stage', 'SE.04 four nodes or internodes visible stage', 'SE.05 five nodes or internodes visible stage', 'SE.06 six nodes or internodes visible stage', 'SE.07 seven nodes or internodes visible stage', 'SE.09 nine nodes or internodes visible stage', 'SE.10 ten nodes or internodes visible stage', 'SE.11 eleven nodes or internodes visible stage', 'SE.12 twelve nodes or internodes visible stage', 'SE.14 fourteen nodes or internodes visible stage', 'SE.15 fifteen nodes or internodes visible stage', 'SE.97 flag leaf visible stage', 'stem elongation stage']
Cluster 2: ['PO_0025349', 'abaxial nucellar projection', 'abaxial protoderm', 'abscission zone', 'achene fruit', 'adaxial nucellar projection', 'adaxial protoderm', 'adult vascular leaf', 'adventitious root nodule', 'aerenchyma', 'aerial tuber', 'aerial tuber axillary bud meristem', 'aerial tuber a

In [None]:
from google import genai
model = "gemini-2.5-pro"
from together import Together
client = Together(api_key="")

In [16]:
final_ans = []
for i, list_of_elements in tqdm(enumerate(final_clusters_list)):
    if len(list_of_elements)>1:
        prompt = f""""Analyze the following list of biological terms and identify all direct and indirect **parent-child relationships**.

        A **parent-child relationship** exists when one term (the parent) is a broader, more general category or a whole that conceptually encompasses or contains another term (the child). The child term is a more specific instance, a part, or a developmental stage of the parent term.

        **Guidelines for identifying relationships:**
        * **"Is-a" relationship:** e.g., "A 'leaf procambium' **is a** type of 'procambium'." (Parent: procambium, Child: leaf procambium)
        * **"Part-of" relationship:** e.g., "A 'petal' **is part of** a 'flower'." (Parent: flower, Child: petal)
        * **Developmental sequence:** e.g., "'Petal primordium visible stage' **is a stage in** 'petal development'." (Parent: petal development, Child: petal primordium visible stage)
        * **Anatomical hierarchy:** e.g., "'Root endodermis' **is part of** 'root' and **is a type of** 'endodermis'." (Parent: root and endodermis, Child: root endodermis)

        **List of Elements:**
        {list_of_elements}
        **Output Format:**
        Provide the output as a single JSON array. Each element in the array must be a JSON object with two keys: `"parent"` and `"child"`, whose values are the corresponding terms.

        **Example Output (Illustrative, showing the expected JSON structure):**
        {json.dumps({"Answer":[
        {
            "parent": "procambium",
            "child": "branch procambium"
        },
        {
            "parent": "procambium",
            "child": "leaf procambium"
        },
        {
            "parent": "shoot system",
            "child": "axillary shoot system"
        },
        {
            "parent": "flower",
            "child": "petal"
        },
        {
            "parent": "flower development",
            "child": "petal primordium visible stage"
        }
        ]})}
        """
        # generation_config = {"response_mime_type": "application/json"}

        response = client.chat.completions.create(
            model="deepseek-ai/DeepSeek-V3",
            messages=[
            {
                "role": "user",
                "content": prompt
            }
            ]
        )
        final_ans.append(response.choices[0].message.content)
        if i%20==0:
            print(response.choices[0].message.content)

0it [00:00, ?it/s]

```json
{
  "Answer": [
    {
      "parent": "stem elongation stage",
      "child": "SE.01 one node or internode visible stage"
    },
    {
      "parent": "stem elongation stage",
      "child": "SE.02 two nodes or internodes visible stage"
    },
    {
      "parent": "stem elongation stage",
      "child": "SE.03 three nodes or internodes visible stage"
    },
    {
      "parent": "stem elongation stage",
      "child": "SE.04 four nodes or internodes visible stage"
    },
    {
      "parent": "stem elongation stage",
      "child": "SE.05 five nodes or internodes visible stage"
    },
    {
      "parent": "stem elongation stage",
      "child": "SE.06 six nodes or internodes visible stage"
    },
    {
      "parent": "stem elongation stage",
      "child": "SE.07 seven nodes or internodes visible stage"
    },
    {
      "parent": "stem elongation stage",
      "child": "SE.09 nine nodes or internodes visible stage"
    },
    {
      "parent": "stem elongation stage",
    

In [None]:
final_ans = []
for i, list_of_elements in tqdm(enumerate(final_clusters_list)):
    if len(list_of_elements)>1:
        prompt = f""""Analyze the following list of biological terms and identify all direct and indirect **parent-child relationships**.

        A **parent-child relationship** exists when one term (the parent) is a broader, more general category or a whole that conceptually encompasses or contains another term (the child). The child term is a more specific instance, a part, or a developmental stage of the parent term.

        **Guidelines for identifying relationships:**
        * **"Is-a" relationship:** e.g., "A 'leaf procambium' **is a** type of 'procambium'." (Parent: procambium, Child: leaf procambium)
        * **"Part-of" relationship:** e.g., "A 'petal' **is part of** a 'flower'." (Parent: flower, Child: petal)
        * **Developmental sequence:** e.g., "'Petal primordium visible stage' **is a stage in** 'petal development'." (Parent: petal development, Child: petal primordium visible stage)
        * **Anatomical hierarchy:** e.g., "'Root endodermis' **is part of** 'root' and **is a type of** 'endodermis'." (Parent: root and endodermis, Child: root endodermis)

        **List of Elements:**
        {list_of_elements}
        **Output Format:**
        Provide the output as a single JSON array. Each element in the array must be a JSON object with two keys: `"parent"` and `"child"`, whose values are the corresponding terms.

        **Example Output (Illustrative, showing the expected JSON structure):**
        {json.dumps({"Answer":[
        {
            "parent": "procambium",
            "child": "branch procambium"
        },
        {
            "parent": "procambium",
            "child": "leaf procambium"
        },
        {
            "parent": "shoot system",
            "child": "axillary shoot system"
        },
        {
            "parent": "flower",
            "child": "petal"
        },
        {
            "parent": "flower development",
            "child": "petal primordium visible stage"
        }
        ]})}
        """
        generation_config = {"response_mime_type": "application/json"}

        response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
        final_ans.append(response.text)
        if i%20==0:
            print(response.text)

0it [00:00, ?it/s]