## Post-Process OntoGPT Results

In [1]:
import os
import yaml
from collections import defaultdict

In [5]:
def load_yaml_files(directory_path):
    """Load every ``.yaml`` file in a directory into a list of article dicts.

    The file name without its extension is stored on each loaded mapping
    under the ``'pubmed_id'`` key.

    Args:
        directory_path: Directory to scan. Sub-directories are not searched.

    Returns:
        A list with one dict per usable YAML file, in sorted file-name order.
        Files that are empty, or whose top-level YAML value is not a mapping,
        are skipped because no ``'pubmed_id'`` key can be attached to them.
    """
    data = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # result (and anything derived from its order) reproducible across runs.
    for file_name in sorted(os.listdir(directory_path)):
        if not file_name.endswith('.yaml'):
            continue
        pubmed_id = os.path.splitext(file_name)[0]
        file_path = os.path.join(directory_path, file_name)
        # Read as UTF-8 explicitly instead of relying on the platform locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = yaml.safe_load(f)
        if not isinstance(content, dict):
            # safe_load() yields None for an empty file and may yield a list
            # or scalar for other documents; item assignment would fail.
            continue
        content['pubmed_id'] = pubmed_id  # Add PubMed ID to the article data
        data.append(content)
    return data

def extract_triplets(data):
    """Flatten the relation sections of each article into a list of tuples.

    For every article, the ``'action_to_disease'`` and ``'action_to_symptom'``
    lists found under ``'extracted_object'`` are read. Each entry produces one
    tuple per value in its ``'object'`` field.

    Args:
        data: Iterable of article dicts. Each must contain ``'pubmed_id'``;
            ``'extracted_object'``, its sections, and ``'named_entities'``
            may be missing or null.

    Returns:
        A ``(triplets, named_entities)`` pair:

        * ``triplets`` is a list of
          ``(subject, predicate, object, object_qualifier, pubmed_id)``
          tuples; absent subject/predicate/qualifier fields become ``None``.
        * ``named_entities`` maps each PubMed ID to the article's
          ``'named_entities'`` value (``[]`` when the key is absent).
    """
    triplets = []
    named_entities = {}

    def add_triplets(article, section, pubmed_id):
        # ``or`` also covers keys that are present but explicitly null.
        extracted = article.get('extracted_object') or {}
        for triplet in extracted.get(section) or []:
            objects = triplet.get('object')
            if objects is None:
                # Nothing to pair the subject with; skip the entry.
                continue
            if isinstance(objects, str):
                # A bare string must be treated as one object, not iterated
                # character by character.
                objects = [objects]
            subject = triplet.get('subject')
            predicate = triplet.get('predicate')
            qualifier = triplet.get('object_qualifier')
            for obj in objects:
                triplets.append((subject, predicate, obj, qualifier, pubmed_id))

    for article in data:
        pubmed_id = article['pubmed_id']
        add_triplets(article, 'action_to_disease', pubmed_id)
        add_triplets(article, 'action_to_symptom', pubmed_id)
        named_entities[pubmed_id] = article.get('named_entities', [])

    return triplets, named_entities



def count_triplets(triplets):
    """Tally how often each triplet occurs and which articles mention it.

    Args:
        triplets: Iterable of sequences whose last element is a PubMed ID;
            everything before the last element forms the grouping key.

    Returns:
        A ``defaultdict`` mapping each key to
        ``{'count': <occurrences>, 'pubmed_ids': <set of PubMed IDs>}``.
    """
    def new_entry():
        # Fresh accumulator for a key that has not been seen yet.
        return {'count': 0, 'pubmed_ids': set()}

    triplet_counts = defaultdict(new_entry)
    for record in triplets:
        # Everything except the trailing PubMed ID identifies the triplet.
        key, source_id = record[:-1], record[-1]
        entry = triplet_counts[key]
        entry['count'] += 1
        entry['pubmed_ids'].add(source_id)
    return triplet_counts

def rank_triplets(triplet_counts):
    """Order counted triplets from most to least frequent.

    Args:
        triplet_counts: Mapping of triplet key to a dict holding a
            ``'count'`` entry.

    Returns:
        A list of ``(key, info)`` pairs sorted by ``info['count']`` in
        descending order; pairs with equal counts keep their mapping order.
    """
    def occurrence_count(pair):
        _, info = pair
        return info['count']

    ranked_triplets = list(triplet_counts.items())
    ranked_triplets.sort(key=occurrence_count, reverse=True)
    return ranked_triplets

# Run the whole pipeline: load the YAML results, flatten them into triplets,
# count duplicates, and rank by frequency. The path is relative, so it is
# resolved against the current working directory.
directory_path = '../test_case/test_ontogpt_result_non_replaced'
data = load_yaml_files(directory_path)
triplets, named_entities = extract_triplets(data)
triplet_counts = count_triplets(triplets)
ranked_triplets = rank_triplets(triplet_counts)

# Report each distinct triplet with its frequency and supporting articles.
for triplet, info in ranked_triplets:
    # The IDs are held in a set, which has no defined order; sort them so
    # repeated runs print identical output.
    print(f'Triplet: {triplet}, Count: {info["count"]}, PubMed IDs: {sorted(info["pubmed_ids"])}')

# Report the named entities recorded for each article.
print("\nNamed Entities:")
for pubmed_id, entities in named_entities.items():
    print(f'PubMed ID: {pubmed_id}, Entities: {entities}')

Triplet: ('disease modifying agents', 'TREATS', 'MONDO:0007374', None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('nutritional supplements', 'TREATS', 'MONDO:0007374', None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('anti-malarials', 'TREATS', 'MONDO:0005136', None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('supplementary arginine', 'TREATS', 'MONDO:0007374', None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('supplementary citrulline', 'TREATS', 'MONDO:0007374', None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('daily chloroquine', 'TREATS', 'MONDO:0007374', None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('hydroxyurea', 'TREATS', 'MONDO:0007374', None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('disease modifying agents', 'IS USED TO TREAT', 'pain crisis', None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('nutritional supplements', 'IS USED TO IMPROVE', "children's growth", None), Count: 1, PubMed IDs: ['33596221']
Triplet: ('anti-malarials', 'IS USED TO REDUCE'