In [1]:
import json
def load_json(data_path):
    with open(data_path, "r", encoding="utf-8") as file_1:
        data = json.load(file_1)
    return data

In [2]:
data = load_json("PROCO/train/proco_train_pairs.json")

In [3]:
len(data), data[:5]

(1313,
 [{'ID': 'TR_ad6670cf', 'parent': 'AFE_0001128', 'child': 'Bunsen burner'},
  {'ID': 'TR_a990a3ed',
   'parent': 'document part',
   'child': 'CTA Section 3.2.S.2.4 Controls of Critical Steps and Intermediates'},
  {'ID': 'TR_cca21ca5',
   'parent': 'electron acceptor',
   'child': 'hydrogen acceptor'},
  {'ID': 'TR_515d09c1',
   'parent': 'distillation',
   'child': 'simple distillation'},
  {'ID': 'TR_0696fbe0',
   'parent': 'directive information entity',
   'child': 'plan specification'}])

In [4]:
from collections import defaultdict

class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        # Path compression
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(x)] = self.find(y)

def build_clusters(relationships):
    uf = UnionFind()
    
    for rel in relationships:
        parent = rel["parent"]
        child = rel["child"]
        uf.union(parent, child)

    clusters = defaultdict(set)
    for term in set([item for rel in relationships for item in (rel["parent"], rel["child"])]):
        root = uf.find(term)
        clusters[root].add(term)

    # Add isolated nodes (no parent/child relationship)
    all_terms = set()
    for rel in relationships:
        all_terms.add(rel["parent"])
        all_terms.add(rel["child"])
    for term in all_terms:
        if term not in uf.parent:
            clusters[term].add(term)

    return list(clusters.values())


In [5]:
clusters = build_clusters(data)

In [6]:
len(clusters)

30

In [7]:
clusters

[{'dihexagonal-dipyramidal',
  'dihexagonal-pyramidal',
  'diploidal',
  'ditetragonal-dipyramidal',
  'ditetragonal-pyramidal',
  'ditrigonal-dipyramidal',
  'ditrigonal-pyramidal',
  'ditrigonal-scalenohedral',
  'domatic',
  'gyroidal',
  'hexagonal-dipyrimidal',
  'hexagonal-pyrimidal',
  'hexagonal-trapezohedral',
  'hexoctahedral',
  'hextetrahedral',
  'pedial',
  'pinacoidal',
  'point group symmetry',
  'prismatic',
  'rhombic-dipyramidal',
  'rhombic-disphenoidal',
  'rhombic-pyramidal',
  'rhombohedral',
  'sphenoidal',
  'tetartoidal',
  'tetragonal-dipyramidal',
  'tetragonal-disphenoidal',
  'tetragonal-pyramidal',
  'tetragonal-scalenohedral',
  'tetragonal-trapezohedral',
  'trigonal-dipyramidal',
  'trigonal-pyramidal',
  'trigonal-trapezohedral'},
 {'1,4-dioxane',
  'AFE_0000354',
  'AFE_0000407',
  'AFE_0001128',
  'AFFN_0000014',
  'AFFN_0000055',
  'AFFN_0000111',
  'AFFN_0000127',
  'AFFN_0000128',
  'AFFN_0000137',
  'AFM_0001034',
  'AFM_0001038',
  'AFP_0003306

In [8]:
# Load a text file in Python
final_content = []
with open("PROCO/test/proco_test_types.txt", "r", encoding="utf-8") as f:
    contents = f.readlines()
    for content in contents:
        final_content.append(content.strip('\n'))


print(contents[:500])  # Print first 500 characters


['application\n', 'chromophore\n', 'process chemistry datum\n', 'methanol\n', 'Ostwald ripening\n', 'rhombohedral\n', 'commercial chemical product\n', 'washing\n', 'heterocyclic compound\n', 'macronutrient\n', 'annealing\n', 'carboxylic acid\n', 'AFRL_0000423\n', 'AFFN_0000187\n', 'organic heterocyclic compound\n', 'AFQ_0000113\n', 'Variable Time Normalisation Analysis\n', 'redox reaction property\n', 'ditrigonal-dipyramidal\n', 'mouse metabolite\n', 'elution\n', 'tetartoidal\n', 'rhombic-pyramidal\n', 'factorial design\n', 'eukaryotic metabolite\n', 'organooxygen compound\n', 'generically dependent continuant\n', 'iron(3+) chelator\n', 'quality specification\n', 'mathematical constant\n', 'AFRL_0000364\n', 'propanones\n', 'AFRL_0000249\n', 'obsolescence reason specification\n', 'annulene\n', 'chemical role\n', 'chemical product\n', 'DNA polymerase complex\n', 'sample quenching\n', 'role\n', 'secondary fatty alcohol\n', 'chemical abstracts name\n', 'luciferin\n', 'fatty acid anion\n', 

In [9]:
len(contents), contents[0]

(530, 'application\n')

In [10]:
len(final_content), final_content[0]

(530, 'application')

In [11]:
# Step 3: Assign function
from collections import Counter
from tqdm.notebook import tqdm
def assign_term_to_cluster(new_term, clusters):
    new_words = new_term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for term in cluster:
            for word in term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)
    
    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]
    
    if max_score == 0:
        return None  # No match
    return best_cluster_index

# Step 4: Assign all new terms
final_clusters = [set(cluster) for cluster in clusters]  # copy original clusters

for term in tqdm(final_content):
    assigned_idx = assign_term_to_cluster(term, final_clusters)
    if assigned_idx is not None:
        final_clusters[assigned_idx].add(term)
    else:
        # Create new cluster if no match found
        final_clusters.append(set([term]))

# Step 5: Convert to list of lists and return
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

# Display final clusters
for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")

  0%|          | 0/530 [00:00<?, ?it/s]

Cluster 1: ['dihexagonal-dipyramidal', 'dihexagonal-pyramidal', 'diploidal', 'ditetragonal-dipyramidal', 'ditetragonal-pyramidal', 'ditrigonal-dipyramidal', 'ditrigonal-pyramidal', 'ditrigonal-scalenohedral', 'domatic', 'gyroidal', 'hexagonal-dipyrimidal', 'hexagonal-pyrimidal', 'hexagonal-trapezohedral', 'hexoctahedral', 'hextetrahedral', 'pedial', 'pinacoidal', 'point group symmetry', 'prismatic', 'rhombic-dipyramidal', 'rhombic-disphenoidal', 'rhombic-pyramidal', 'rhombohedral', 'sphenoidal', 'tetartoidal', 'tetragonal-dipyramidal', 'tetragonal-disphenoidal', 'tetragonal-pyramidal', 'tetragonal-scalenohedral', 'tetragonal-trapezohedral', 'trigonal-dipyramidal', 'trigonal-pyramidal', 'trigonal-trapezohedral']
Cluster 2: ['1,4-dioxane', 'AFE_0000354', 'AFE_0000407', 'AFE_0001128', 'AFFN_0000014', 'AFFN_0000055', 'AFFN_0000111', 'AFFN_0000127', 'AFFN_0000128', 'AFFN_0000137', 'AFM_0001034', 'AFM_0001038', 'AFP_0003306', 'AFP_0003312', 'AFP_0003348', 'AFP_0003359', 'AFP_0003486', 'AFP_0

In [None]:
from google import genai
model = "gemini-2.5-pro"
client= genai.Client(api_key="")

In [13]:
final_ans = []
for i, list_of_elements in tqdm(enumerate(final_clusters_list)):
    if len(list_of_elements)>1:
        prompt = f""""Analyze the following list of biological terms and identify all direct and indirect **parent-child relationships**.

        A **parent-child relationship** exists when one term (the parent) is a broader, more general category or a whole that conceptually encompasses or contains another term (the child). The child term is a more specific instance, a part, or a developmental stage of the parent term.

        **Guidelines for identifying relationships:**
        * **"Is-a" relationship:** e.g., "A 'leaf procambium' **is a** type of 'procambium'." (Parent: procambium, Child: leaf procambium)
        * **"Part-of" relationship:** e.g., "A 'petal' **is part of** a 'flower'." (Parent: flower, Child: petal)
        * **Developmental sequence:** e.g., "'Petal primordium visible stage' **is a stage in** 'petal development'." (Parent: petal development, Child: petal primordium visible stage)
        * **Anatomical hierarchy:** e.g., "'Root endodermis' **is part of** 'root' and **is a type of** 'endodermis'." (Parent: root and endodermis, Child: root endodermis)

        **List of Elements:**
        {list_of_elements}
        **Output Format:**
        Provide the output as a single JSON array. Each element in the array must be a JSON object with two keys: `"parent"` and `"child"`, whose values are the corresponding terms.

        **Example Output (Illustrative, showing the expected JSON structure):**
        {json.dumps({"Answer":[
        {
            "parent": "procambium",
            "child": "branch procambium"
        },
        {
            "parent": "procambium",
            "child": "leaf procambium"
        },
        {
            "parent": "shoot system",
            "child": "axillary shoot system"
        },
        {
            "parent": "flower",
            "child": "petal"
        },
        {
            "parent": "flower development",
            "child": "petal primordium visible stage"
        }
        ]})}
        """
        generation_config = {"response_mime_type": "application/json"}

        response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
        final_ans.append(response.text)
        if i%20==0:
            print(response.text)

0it [00:00, ?it/s]

[
  {
    "parent": "point group symmetry",
    "child": "dihexagonal-dipyramidal"
  },
  {
    "parent": "point group symmetry",
    "child": "dihexagonal-pyramidal"
  },
  {
    "parent": "point group symmetry",
    "child": "diploidal"
  },
  {
    "parent": "point group symmetry",
    "child": "ditetragonal-dipyramidal"
  },
  {
    "parent": "point group symmetry",
    "child": "ditetragonal-pyramidal"
  },
  {
    "parent": "point group symmetry",
    "child": "ditrigonal-dipyramidal"
  },
  {
    "parent": "point group symmetry",
    "child": "ditrigonal-pyramidal"
  },
  {
    "parent": "point group symmetry",
    "child": "ditrigonal-scalenohedral"
  },
  {
    "parent": "point group symmetry",
    "child": "domatic"
  },
  {
    "parent": "point group symmetry",
    "child": "gyroidal"
  },
  {
    "parent": "point group symmetry",
    "child": "hexagonal-dipyrimidal"
  },
  {
    "parent": "point group symmetry",
    "child": "hexagonal-pyrimidal"
  },
  {
    "parent": "poi

In [14]:
final_ans

['[\n  {\n    "parent": "point group symmetry",\n    "child": "dihexagonal-dipyramidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "dihexagonal-pyramidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "diploidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "ditetragonal-dipyramidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "ditetragonal-pyramidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "ditrigonal-dipyramidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "ditrigonal-pyramidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "ditrigonal-scalenohedral"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "domatic"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "gyroidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": "hexagonal-dipyrimidal"\n  },\n  {\n    "parent": "point group symmetry",\n    "child": 

In [15]:
final_result = []
for ans in final_ans:
    ans_1 = json.loads(ans)
    for an in ans_1:
        if an!="Answer":
            final_result.append(an)
    

In [16]:
len(final_result)

929

In [17]:
final_result

[{'parent': 'point group symmetry', 'child': 'dihexagonal-dipyramidal'},
 {'parent': 'point group symmetry', 'child': 'dihexagonal-pyramidal'},
 {'parent': 'point group symmetry', 'child': 'diploidal'},
 {'parent': 'point group symmetry', 'child': 'ditetragonal-dipyramidal'},
 {'parent': 'point group symmetry', 'child': 'ditetragonal-pyramidal'},
 {'parent': 'point group symmetry', 'child': 'ditrigonal-dipyramidal'},
 {'parent': 'point group symmetry', 'child': 'ditrigonal-pyramidal'},
 {'parent': 'point group symmetry', 'child': 'ditrigonal-scalenohedral'},
 {'parent': 'point group symmetry', 'child': 'domatic'},
 {'parent': 'point group symmetry', 'child': 'gyroidal'},
 {'parent': 'point group symmetry', 'child': 'hexagonal-dipyrimidal'},
 {'parent': 'point group symmetry', 'child': 'hexagonal-pyrimidal'},
 {'parent': 'point group symmetry', 'child': 'hexagonal-trapezohedral'},
 {'parent': 'point group symmetry', 'child': 'hexoctahedral'},
 {'parent': 'point group symmetry', 'child':

In [18]:
with open("predictions_c_proco_pairs.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)


In [19]:
import json

# Save final_clusters_list to JSON file
with open("final_result_keywords_c_po.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)

print("Clusters saved to final_clusters.json ✅")

Clusters saved to final_clusters.json ✅


In [26]:
with open("_pairs.jsonl", "w", encoding="utf-8") as f:
    for entry in final_result:
        json.dump(entry, f)
        f.write("\n")
