In [46]:
import json
def load_json(data_path):
    with open(data_path, "r", encoding="utf-8") as file_1:
        data = json.load(file_1)
    return data

In [47]:
data = load_json("OBI/train/obi_train_pairs.json")

In [48]:
len(data), data[:5]

(8249,
 [{'ID': 'TR_bb9941a6',
   'parent': 'hemoglobin assay',
   'child': 'cooximitery arterial blood hemoglobin assay'},
  {'ID': 'TR_a5a2af93',
   'parent': 'signal conversion function',
   'child': 'signal amplification function'},
  {'ID': 'TR_e2e1a388',
   'parent': 'exclusion criterion',
   'child': 'chemotherapy treatment exclusion criterion'},
  {'ID': 'TR_5ad85031',
   'parent': 'automatic tissue processor',
   'child': 'Leica Peloris rapid tissue processor'},
  {'ID': 'TR_f2e67ac9',
   'parent': 'cytometry assay',
   'child': 'cerebrospinal fluid mesothelial cell count assay'}])

In [49]:
from collections import defaultdict

class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        # Path compression
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(x)] = self.find(y)

def build_clusters(relationships):
    uf = UnionFind()
    
    for rel in relationships:
        parent = rel["parent"]
        child = rel["child"]
        uf.union(parent, child)

    clusters = defaultdict(set)
    for term in set([item for rel in relationships for item in (rel["parent"], rel["child"])]):
        root = uf.find(term)
        clusters[root].add(term)

    # Add isolated nodes (no parent/child relationship)
    all_terms = set()
    for rel in relationships:
        all_terms.add(rel["parent"])
        all_terms.add(rel["child"])
    for term in all_terms:
        if term not in uf.parent:
            clusters[term].add(term)

    return list(clusters.values())


In [50]:
clusters = build_clusters(data)

In [51]:
len(clusters)

159

In [52]:
for cluster in clusters:
    print(len(cluster))

1906
24
1030
113
14
172
3
2
17
16
15
2
103
3
49
20
3
6
14
6
3
2
36
6
9
2
2
2
6
12
35
3
2
5
25
9
7
9
6
5
5
2
14
82
3
3
2
26
2
9
4
3
11
15
2
5
4
4
4
3
5
5
4
3
15
11
3
6
5
4
2
4
4
4
8
4
4
5
2
4
2
3
2
2
2
2
9
3
2
2
3
2
2
2
2
3
3
5
8
2
2
2
3
4
2
4
2
3
2
3
4
6
2
3
6
3
2
3
2
4
3
2
4
3
2
3
3
3
3
3
9
3
2
3
3
2
3
2
2
2
2
2
3
3
3
2
2
2
2
4
2
3
2
3
2
2
2
2
2


In [42]:
# Load a text file in Python
final_content = []
with open("OBI/test/obi_test_types.txt", "r", encoding="utf-8") as f:
    contents = f.readlines()
    for content in contents:
        final_content.append(content.strip('\n'))


print(contents[:500])  # Print first 500 characters


['GEM premier blood gas venous blood ionized calcium assay\n', 'standard deviation calculation\n', 'insecticide resistance by detecting carboxylic ester hydrolase activity assay\n', 'age measurement datum\n', 'intracellular cytokine staining assay measuring epitope specific interleukin-22 production by T cells\n', 'creatinine clearance urine creatinine assay\n', 'brachioradialis functionality\n', 'ELISA measuring epitope specific macrophage inflammatory protein-1 alpha production by T cells\n', 'hemoglobin oxygen saturation arterial blood oxygen assay\n', 'areola\n', 'disposition to cause an allergic reaction\n', 'direct venous blood bilirubin assay\n', 'spatial region\n', 'POC chem8 arterial blood sodium assay\n', 'assay measuring epitope specific interleukin-27 production by T cells\n', 'material sample\n', 'solid NMR probe\n', 'B cell epitope specific antibody-dependent cellular cytotoxicity\n', 'machine learning\n', 'laser\n', 'umbilical cord blood specimen\n', 'photomultiplier tub

In [43]:
len(contents), contents[0]

(2821, 'GEM premier blood gas venous blood ionized calcium assay\n')

In [44]:
len(final_content), final_content[0]

(2821, 'GEM premier blood gas venous blood ionized calcium assay')

In [45]:
# Step 3: Assign function
from collections import Counter
from tqdm.notebook import tqdm
def assign_term_to_cluster(new_term, clusters):
    new_words = new_term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for term in cluster:
            for word in term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)
    
    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]
    
    if max_score == 0:
        return None  # No match
    return best_cluster_index

# Step 4: Assign all new terms
final_clusters = [set(cluster) for cluster in clusters]  # copy original clusters

for term in tqdm(final_content):
    assigned_idx = assign_term_to_cluster(term, final_clusters)
    if assigned_idx is not None:
        final_clusters[assigned_idx].add(term)
    else:
        # Create new cluster if no match found
        final_clusters.append(set([term]))

# Step 5: Convert to list of lists and return
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

# Display final clusters
for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")

  0%|          | 0/2821 [00:00<?, ?it/s]

Cluster 1: ['16s ribosomal gene sequencing assay', '24 hour total urine volume assay', '24 hour urine creatinine assay', '24 hour urine protein assay', '25-hydroxyvitamin D2', '293-T cell line', "3,3',5'-triiodothyronine", "3,3',5'-triiodothyronine assay", "3,3',5'-triiodothyronine concentration assay", '3-hydroxybutyric acid', '3-hydroxybutyric acid assay', '3D cell structure determination assay', '3D molecular structure determination assay', '3D molecular structure determination assay of a MHC:ligand complex', '3D molecular structure determination assay of a T cell epitope:MHC:TCR complex', '3D molecular structure determination assay of an antigen:antibody complex', '3D neural cell structure determination assay', '3D structure determination assay', '3D structure determination of bound molecular complex assay', '3H-thymidine assay measuring epitope specific proliferation of T cells', "5'-nucleotidase activity level assay", '51 chromium assay measuring epitope specific T cell killing',

In [None]:
from google import genai
model = "gemini-2.5-flash"
client= genai.Client(api_key="")

In [15]:
final_ans = []

In [16]:
for i, list_of_elements in tqdm(enumerate(final_clusters_list[24:])):
    if len(list_of_elements)>1:
        prompt = f""""Analyze the following list of biological terms and identify all direct and indirect **parent-child relationships**.

        A **parent-child relationship** exists when one term (the parent) is a broader, more general category or a whole that conceptually encompasses or contains another term (the child). The child term is a more specific instance, a part, or a developmental stage of the parent term.

        **Guidelines for identifying relationships:**
        * **"Is-a" relationship:** e.g., "A 'leaf procambium' **is a** type of 'procambium'." (Parent: procambium, Child: leaf procambium)
        * **"Part-of" relationship:** e.g., "A 'petal' **is part of** a 'flower'." (Parent: flower, Child: petal)
        * **Developmental sequence:** e.g., "'Petal primordium visible stage' **is a stage in** 'petal development'." (Parent: petal development, Child: petal primordium visible stage)
        * **Anatomical hierarchy:** e.g., "'Root endodermis' **is part of** 'root' and **is a type of** 'endodermis'." (Parent: root and endodermis, Child: root endodermis)

        **List of Elements:**
        {list_of_elements}
        **Output Format:**
        Provide the output as a single JSON array. Each element in the array must be a JSON object with two keys: `"parent"` and `"child"`, whose values are the corresponding terms.

        **Example Output (Illustrative, showing the expected JSON structure):**
        {json.dumps({"Answer":[
        {
            "parent": "procambium",
            "child": "branch procambium"
        },
        {
            "parent": "procambium",
            "child": "leaf procambium"
        },
        {
            "parent": "shoot system",
            "child": "axillary shoot system"
        },
        {
            "parent": "flower",
            "child": "petal"
        },
        {
            "parent": "flower development",
            "child": "petal primordium visible stage"
        }
        ]})}
        """
        generation_config = {"response_mime_type": "application/json"}

        response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
        final_ans.append(response.text)
        if i%20==0:
            print(response.text)

0it [00:00, ?it/s]

[
  {
    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",
    "child": "DNASE 1 structure mapping assay"
  },
  {
    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",
    "child": "RNA ADA I RNA structure mapping assay"
  },
  {
    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",
    "child": "RNASE CL3 structure mapping assay"
  },
  {
    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",
    "child": "RNASE T1 structure mapping assay"
  },
  {
    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",
    "child": "RNASE T2 structure mapping assay"
  },
  {
    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",
    "child": "RNASE U2 structure mapping assay"
  },
  {
    "

In [17]:
final_ans

['[\n  {\n    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",\n    "child": "DNASE 1 structure mapping assay"\n  },\n  {\n    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",\n    "child": "RNA ADA I RNA structure mapping assay"\n  },\n  {\n    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",\n    "child": "RNASE CL3 structure mapping assay"\n  },\n  {\n    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",\n    "child": "RNASE T1 structure mapping assay"\n  },\n  {\n    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",\n    "child": "RNASE T2 structure mapping assay"\n  },\n  {\n    "parent": "single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing",\n    "child": "RNASE U2 structure mapp

In [21]:
final_result = []
for ans in final_ans:
    ans_1 = json.loads(ans)
    if "Answer" not in ans_1:
        for an in ans_1:
            # print(an)
            # if an!="Answer":
                if an not in final_result:
                    final_result.append(an)
    else:
        for an in ans_1['Answer']:
                if an not in final_result:
                    final_result.append(an)
        
            
    

In [22]:
len(final_result)

652

In [23]:
final_result

[{'parent': 'single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing',
  'child': 'DNASE 1 structure mapping assay'},
 {'parent': 'single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing',
  'child': 'RNA ADA I RNA structure mapping assay'},
 {'parent': 'single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing',
  'child': 'RNASE CL3 structure mapping assay'},
 {'parent': 'single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing',
  'child': 'RNASE T1 structure mapping assay'},
 {'parent': 'single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing',
  'child': 'RNASE T2 structure mapping assay'},
 {'parent': 'single-nucleotide-resolution nucleic acid structure mapping assay using enzymatic probing',
  'child': 'RNASE U2 structure mapping assay'},
 {'parent': 'single-nucleotide-resolution nucleic acid structure mapping assa

In [24]:
with open("predictions_c_simple_method_keyword_OBI_pairs.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)


In [19]:
import json

# Save final_clusters_list to JSON file
with open("final_result_keywords_c_po.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)

print("Clusters saved to final_clusters.json ✅")

Clusters saved to final_clusters.json ✅


In [26]:
with open("_pairs.jsonl", "w", encoding="utf-8") as f:
    for entry in final_result:
        json.dump(entry, f)
        f.write("\n")
