In [1]:
import json
def load_json(data_path):
    with open(data_path, "r", encoding="utf-8") as file_1:
        data = json.load(file_1)
    return data

In [2]:
data = load_json("SchemaOrg/train/schemaorg_train_pairs.json")

In [3]:
len(data), data[:5]

(723,
 [{'ID': 'TR_56ac8cd6', 'parent': 'Enumeration', 'child': 'WarrantyScope'},
  {'ID': 'TR_f00fae20', 'parent': 'Text', 'child': 'CssSelectorType'},
  {'ID': 'TR_0d2dca15',
   'parent': 'Intangible',
   'child': 'HealthInsurancePlan'},
  {'ID': 'TR_e6303dd1', 'parent': 'PerformingGroup', 'child': 'TheaterGroup'},
  {'ID': 'TR_5a0a6adc', 'parent': 'Event', 'child': 'TheaterEvent'}])

In [4]:
from collections import defaultdict

class UnionFind:
    def __init__(self):
        self.parent = {}

    def find(self, x):
        # Path compression
        if x != self.parent.setdefault(x, x):
            self.parent[x] = self.find(self.parent[x])
        return self.parent[x]

    def union(self, x, y):
        self.parent[self.find(x)] = self.find(y)

def build_clusters(relationships):
    uf = UnionFind()
    
    for rel in relationships:
        parent = rel["parent"]
        child = rel["child"]
        uf.union(parent, child)

    clusters = defaultdict(set)
    for term in set([item for rel in relationships for item in (rel["parent"], rel["child"])]):
        root = uf.find(term)
        clusters[root].add(term)

    # Add isolated nodes (no parent/child relationship)
    all_terms = set()
    for rel in relationships:
        all_terms.add(rel["parent"])
        all_terms.add(rel["child"])
    for term in all_terms:
        if term not in uf.parent:
            clusters[term].add(term)

    return list(clusters.values())


In [5]:
clusters = build_clusters(data)

In [6]:
len(clusters)

44

In [7]:
clusters

[{'3DModel',
  'APIReference',
  'AboutPage',
  'ActionStatusType',
  'AdultOrientedEnumeration',
  'AdvertiserContentArticle',
  'AggregateRating',
  'AmpStory',
  'AnatomicalStructure',
  'ApprovedIndication',
  'ArchiveComponent',
  'Article',
  'Audience',
  'Audiobook',
  'BedDetails',
  'BlogPosting',
  'BoardingPolicyType',
  'BoatReservation',
  'BoatTrip',
  'Bone',
  'Book',
  'BookFormatType',
  'BookSeries',
  'BrainStructure',
  'BroadcastChannel',
  'BroadcastService',
  'BusTrip',
  'BusinessAudience',
  'BusinessEntityType',
  'BusinessEvent',
  'BusinessFunction',
  'CableOrSatelliteService',
  'CarUsageType',
  'Certification',
  'CertificationStatusEnumeration',
  'Chapter',
  'CheckoutPage',
  'Claim',
  'Code',
  'Collection',
  'CollectionPage',
  'ComedyEvent',
  'ComicCoverArt',
  'ComicIssue',
  'ComicStory',
  'CompoundPriceSpecification',
  'ConstraintNode',
  'ContactPage',
  'ContactPoint',
  'ContactPointOption',
  'Conversation',
  'Course',
  'CourseInst

In [8]:
# Load a text file in Python
final_content = []
with open("SchemaOrg/test/schemaorg_test_types.txt", "r", encoding="utf-8") as f:
    contents = f.readlines()
    for content in contents:
        final_content.append(content.strip('\n'))


print(contents[:500])  # Print first 500 characters


['FollowAction\n', 'PreOrderAction\n', 'InteractionCounter\n', 'SoftwareSourceCode\n', 'MedicalRiskCalculator\n', 'InteractAction\n', 'MedicalAudience\n', 'MotorcycleDealer\n', 'OnlineBusiness\n', 'Mosque\n', 'Language\n', 'Order\n', 'Pharmacy\n', 'HotelRoom\n', 'DryCleaningOrLaundry\n', 'Synagogue\n', 'MemberProgramTier\n', 'Consortium\n', 'PublicationVolume\n', 'MediaReview\n', 'RealEstateListing\n', 'IceCreamShop\n', 'Vessel\n', 'Organization\n', 'ChemicalSubstance\n', 'VideoGallery\n', 'MediaGallery\n', 'DisagreeAction\n', 'InvestmentFund\n', 'PlayGameAction\n', 'Locksmith\n', 'SpecialAnnouncement\n', 'EUEnergyEfficiencyEnumeration\n', 'AutoPartsStore\n', 'Message\n', 'ComputerLanguage\n', 'LendAction\n', 'CommentAction\n', 'TouristInformationCenter\n', 'VisualArtsEvent\n', 'Brand\n', 'StructuredValue\n', 'CreateAction\n', 'HealthPlanNetwork\n', 'VoteAction\n', 'Pond\n', 'LearningResource\n', 'MedicalGuidelineRecommendation\n', 'Course\n', 'MapCategoryType\n', 'Service\n', 'Car\n',

In [9]:
len(contents), contents[0]

(359, 'FollowAction\n')

In [10]:
len(final_content), final_content[0]

(359, 'FollowAction')

In [11]:
# Step 3: Assign function
from collections import Counter
from tqdm.notebook import tqdm
def assign_term_to_cluster(new_term, clusters):
    new_words = new_term.lower().split()
    cluster_scores = []

    for cluster in clusters:
        word_count = Counter()
        for term in cluster:
            for word in term.lower().split():
                word_count[word] += 1
        score = sum(word_count[word] for word in new_words)
        cluster_scores.append(score)
    
    best_cluster_index = max(range(len(cluster_scores)), key=lambda i: cluster_scores[i])
    max_score = cluster_scores[best_cluster_index]
    
    if max_score == 0:
        return None  # No match
    return best_cluster_index

# Step 4: Assign all new terms
final_clusters = [set(cluster) for cluster in clusters]  # copy original clusters

for term in tqdm(final_content):
    assigned_idx = assign_term_to_cluster(term, final_clusters)
    if assigned_idx is not None:
        final_clusters[assigned_idx].add(term)
    else:
        # Create new cluster if no match found
        final_clusters.append(set([term]))

# Step 5: Convert to list of lists and return
final_clusters_list = [sorted(list(cluster)) for cluster in final_clusters]

# Display final clusters
for i, cluster in enumerate(final_clusters_list, 1):
    print(f"Cluster {i}: {cluster}")

  0%|          | 0/359 [00:00<?, ?it/s]

Cluster 1: ['3DModel', 'APIReference', 'AboutPage', 'ActionStatusType', 'AdultOrientedEnumeration', 'AdvertiserContentArticle', 'AggregateRating', 'AmpStory', 'AnatomicalStructure', 'ApprovedIndication', 'ArchiveComponent', 'Article', 'Audience', 'Audiobook', 'BedDetails', 'BlogPosting', 'BoardingPolicyType', 'BoatReservation', 'BoatTrip', 'Bone', 'Book', 'BookFormatType', 'BookSeries', 'BrainStructure', 'BroadcastChannel', 'BroadcastService', 'BusTrip', 'BusinessAudience', 'BusinessEntityType', 'BusinessEvent', 'BusinessFunction', 'CableOrSatelliteService', 'CarUsageType', 'Certification', 'CertificationStatusEnumeration', 'Chapter', 'CheckoutPage', 'Claim', 'Code', 'Collection', 'CollectionPage', 'ComedyEvent', 'ComicCoverArt', 'ComicIssue', 'ComicStory', 'CompoundPriceSpecification', 'ConstraintNode', 'ContactPage', 'ContactPoint', 'ContactPointOption', 'Conversation', 'Course', 'CourseInstance', 'CoverArt', 'CreativeWork', 'CreativeWorkSeason', 'CreativeWorkSeries', 'DanceEvent', '

In [None]:
from google import genai
model = "gemini-2.5-flash"
client= genai.Client(api_key="")

In [13]:
final_ans = []

In [14]:
for i, list_of_elements in tqdm(enumerate(final_clusters_list)):
    if len(list_of_elements)>1:
        prompt = f""""Analyze the following list of biological terms and identify all direct and indirect **parent-child relationships**.

        A **parent-child relationship** exists when one term (the parent) is a broader, more general category or a whole that conceptually encompasses or contains another term (the child). The child term is a more specific instance, a part, or a developmental stage of the parent term.

        **Guidelines for identifying relationships:**
        * **"Is-a" relationship:** e.g., "A 'leaf procambium' **is a** type of 'procambium'." (Parent: procambium, Child: leaf procambium)
        * **"Part-of" relationship:** e.g., "A 'petal' **is part of** a 'flower'." (Parent: flower, Child: petal)
        * **Developmental sequence:** e.g., "'Petal primordium visible stage' **is a stage in** 'petal development'." (Parent: petal development, Child: petal primordium visible stage)
        * **Anatomical hierarchy:** e.g., "'Root endodermis' **is part of** 'root' and **is a type of** 'endodermis'." (Parent: root and endodermis, Child: root endodermis)

        **List of Elements:**
        {list_of_elements}
        **Output Format:**
        Provide the output as a single JSON array. Each element in the array must be a JSON object with two keys: `"parent"` and `"child"`, whose values are the corresponding terms.

        **Example Output (Illustrative, showing the expected JSON structure):**
        {json.dumps({"Answer":[
        {
            "parent": "procambium",
            "child": "branch procambium"
        },
        {
            "parent": "procambium",
            "child": "leaf procambium"
        },
        {
            "parent": "shoot system",
            "child": "axillary shoot system"
        },
        {
            "parent": "flower",
            "child": "petal"
        },
        {
            "parent": "flower development",
            "child": "petal primordium visible stage"
        }
        ]})}
        """
        generation_config = {"response_mime_type": "application/json"}

        response = client.models.generate_content(
                contents= prompt,
                config=generation_config,
                model=model
            )
        final_ans.append(response.text)
        if i%20==0:
            print(response.text)

0it [00:00, ?it/s]

[
  {
    "parent": "AnatomicalStructure",
    "child": "Bone"
  },
  {
    "parent": "AnatomicalStructure",
    "child": "BrainStructure"
  },
  {
    "parent": "AnatomicalStructure",
    "child": "Joint"
  },
  {
    "parent": "AnatomicalStructure",
    "child": "LymphaticVessel"
  },
  {
    "parent": "AnatomicalStructure",
    "child": "SuperficialAnatomy"
  },
  {
    "parent": "AnatomicalStructure",
    "child": "Vein"
  },
  {
    "parent": "AnatomicalStructure",
    "child": "Vessel"
  },
  {
    "parent": "Article",
    "child": "AdvertiserContentArticle"
  },
  {
    "parent": "Article",
    "child": "BlogPosting"
  },
  {
    "parent": "Article",
    "child": "LiveBlogPosting"
  },
  {
    "parent": "Article",
    "child": "MedicalScholarlyArticle"
  },
  {
    "parent": "Article",
    "child": "ScholarlyArticle"
  },
  {
    "parent": "Article",
    "child": "TechArticle"
  },
  {
    "parent": "Audience",
    "child": "BusinessAudience"
  },
  {
    "parent": "Audience",
 

In [15]:
final_ans

['[\n  {\n    "parent": "AnatomicalStructure",\n    "child": "Bone"\n  },\n  {\n    "parent": "AnatomicalStructure",\n    "child": "BrainStructure"\n  },\n  {\n    "parent": "AnatomicalStructure",\n    "child": "Joint"\n  },\n  {\n    "parent": "AnatomicalStructure",\n    "child": "LymphaticVessel"\n  },\n  {\n    "parent": "AnatomicalStructure",\n    "child": "SuperficialAnatomy"\n  },\n  {\n    "parent": "AnatomicalStructure",\n    "child": "Vein"\n  },\n  {\n    "parent": "AnatomicalStructure",\n    "child": "Vessel"\n  },\n  {\n    "parent": "Article",\n    "child": "AdvertiserContentArticle"\n  },\n  {\n    "parent": "Article",\n    "child": "BlogPosting"\n  },\n  {\n    "parent": "Article",\n    "child": "LiveBlogPosting"\n  },\n  {\n    "parent": "Article",\n    "child": "MedicalScholarlyArticle"\n  },\n  {\n    "parent": "Article",\n    "child": "ScholarlyArticle"\n  },\n  {\n    "parent": "Article",\n    "child": "TechArticle"\n  },\n  {\n    "parent": "Audience",\n    "child"

In [16]:
final_result = []
for ans in final_ans:
    ans_1 = json.loads(ans)
    for an in ans_1:
        if an!="Answer":
            if an not in final_result:
                final_result.append(an)
    

In [17]:
len(final_result)

659

In [18]:
final_result

[{'parent': 'AnatomicalStructure', 'child': 'Bone'},
 {'parent': 'AnatomicalStructure', 'child': 'BrainStructure'},
 {'parent': 'AnatomicalStructure', 'child': 'Joint'},
 {'parent': 'AnatomicalStructure', 'child': 'LymphaticVessel'},
 {'parent': 'AnatomicalStructure', 'child': 'SuperficialAnatomy'},
 {'parent': 'AnatomicalStructure', 'child': 'Vein'},
 {'parent': 'AnatomicalStructure', 'child': 'Vessel'},
 {'parent': 'Article', 'child': 'AdvertiserContentArticle'},
 {'parent': 'Article', 'child': 'BlogPosting'},
 {'parent': 'Article', 'child': 'LiveBlogPosting'},
 {'parent': 'Article', 'child': 'MedicalScholarlyArticle'},
 {'parent': 'Article', 'child': 'ScholarlyArticle'},
 {'parent': 'Article', 'child': 'TechArticle'},
 {'parent': 'Audience', 'child': 'BusinessAudience'},
 {'parent': 'Audience', 'child': 'EducationalAudience'},
 {'parent': 'Audience', 'child': 'MedicalAudience'},
 {'parent': 'Audience', 'child': 'ParentAudience'},
 {'parent': 'Audience', 'child': 'PeopleAudience'},
 

In [19]:
with open("predictions_c_SchemaOrg_pairs.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)


In [19]:
import json

# Save final_clusters_list to JSON file
with open("final_result_keywords_c_po.json", "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)

print("Clusters saved to final_clusters.json ✅")

Clusters saved to final_clusters.json ✅


In [26]:
with open("_pairs.jsonl", "w", encoding="utf-8") as f:
    for entry in final_result:
        json.dump(entry, f)
        f.write("\n")
