In [1]:
import subprocess
import re
import requests
import pandas as pd
import csv
import sys

# Increase the field size limit
csv.field_size_limit(sys.maxsize)



131072

131072

## Using OAK for Ontology Mapping

In [2]:
def map_mesh_to_mondo_or_hp(mesh_id):
    try:
        # First, try to find a MONDO mapping
        mondo_command = f"runoak -i translator: normalize MESH:{mesh_id} -M MONDO"
        mondo_result = subprocess.run(mondo_command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        
        # Try to extract MONDO ID
        if mondo_result.stdout:
            mondo_match = re.search(r'MONDO:(\d+)', mondo_result.stdout)
            if mondo_match:
                return f"MONDO:{mondo_match.group(1)}"  # Return the MONDO ID
        
        # If no MONDO ID found, try to find an HP mapping
        hp_command = f"runoak -i translator: normalize MESH:{mesh_id} -M HP"
        hp_result = subprocess.run(hp_command, shell=True, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        
        # Try to extract HP ID
        if hp_result.stdout:
            hp_match = re.search(r'HP:(\d+)', hp_result.stdout)
            if hp_match:
                return f"HP:{hp_match.group(1)}"  # Return the HP ID

        # If neither command returns a result
        print(f"No MONDO or HP ID found for MESH:{mesh_id}.")
        return None

    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {e}\nOutput: {e.output}")
        return None



In [3]:
def replace_disease_with_mondo_in_text(text):
    # Regular expression to find "diseaseDxxxxxx"
    pattern = re.compile(r'(disease)(D\d{6})')
    
    # Function to replace each match
    def replace_func(match):
        prefix = match.group(1)  # Extract the "disease" prefix
        mesh_id = match.group(2)  # Extract MESH ID
        mondo_id = map_mesh_to_mondo_or_hp(mesh_id)  
        if mondo_id:
            return f"{mondo_id}"  
        else:
            return match.group(0)  # Return the original text if no MONDO ID is found
    
    # Replace all occurrences in the text
    updated_text = pattern.sub(replace_func, text)
    return updated_text


In [4]:
def process_tsv_and_replace_disease(input_tsv_path, output_tsv_path):
    with open(input_tsv_path, mode='r', encoding='utf-8') as infile, \
         open(output_tsv_path, mode='w', encoding='utf-8', newline='') as outfile:
        
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')
        
        for row in reader:
            if not row:  # Skip empty rows
                continue
            
            pmc_id = row[0]
            original_text = row[1]
            updated_text = replace_disease_with_mondo_in_text(original_text)
            
            writer.writerow([pmc_id, updated_text])
        
        print(f"Processed and saved updated texts to: {output_tsv_path}")

In [5]:
# Example usage
input_tsv_path = 'test_files/replaced_test_file.tsv'
output_tsv_path = 'test_files/oak_replaced_test_file.tsv'
process_tsv_and_replace_disease(input_tsv_path, output_tsv_path)


Error executing command: Command 'runoak -i translator: normalize MESH:D000755 -M MONDO' returned non-zero exit status 126.
Output: 
Error executing command: Command 'runoak -i translator: normalize MESH:D000755 -M MONDO' returned non-zero exit status 126.
Output: 
Error executing command: Command 'runoak -i translator: normalize MESH:D000755 -M MONDO' returned non-zero exit status 126.
Output: 
Error executing command: Command 'runoak -i translator: normalize MESH:D000755 -M MONDO' returned non-zero exit status 126.
Output: 
Error executing command: Command 'runoak -i translator: normalize MESH:D000755 -M MONDO' returned non-zero exit status 126.
Output: 
Error executing command: Command 'runoak -i translator: normalize MESH:D006453 -M MONDO' returned non-zero exit status 126.
Output: 
Error executing command: Command 'runoak -i translator: normalize MESH:D000755 -M MONDO' returned non-zero exit status 126.
Output: 
Error executing command: Command 'runoak -i translator: normalize MES

## Using POET for Ontology Mapping

### 1. Create a MESH to MONDO Dictionary 

In [44]:

api_url = "https://ontology.jax.org/api/mondo/terms/"
response = requests.get(api_url)
data = response.json()

# Initialize an empty dictionary to store the MESH to MONDO mappings
mesh_to_mondo = {}

# Iterate through each item in the response
for item in data:
    # Extract the ID and xrefs for each item
    mondo_id = item.get('id')
    xrefs = item.get('xrefs')

    # Iterate through each xref and add the MESH to MONDO mapping to the dictionary
    for xref in xrefs:
        if xref.startswith(('MESH:', 'MSH:', 'MeSH:')):
            mesh_code = xref.split(':')[1] 
            mesh_to_mondo[mesh_code] = mondo_id
            

print(mesh_to_mondo)


{'D004194': 'MONDO:0000001', 'D000309': 'MONDO:0000004', 'D053206': 'MONDO:0000022', 'D056887': 'MONDO:0000082', 'D065706': 'MONDO:0000087', 'D011629': 'MONDO:0000088', 'C567144': 'MONDO:0000104', 'C538270': 'MONDO:0000107', 'C535441': 'MONDO:0000110', 'D001139': 'MONDO:0000115', 'D056768': 'MONDO:0000128', 'C537362': 'MONDO:0000133', 'C536987': 'MONDO:0000141', 'D014188': 'MONDO:0000153', 'D000082602': 'MONDO:0000158', 'C536572': 'MONDO:0000159', 'C537463': 'MONDO:0000170', 'D058494': 'MONDO:0000171', 'C536405': 'MONDO:0000179', 'D014693': 'MONDO:0000190', 'C536447': 'MONDO:0000193', 'C536725': 'MONDO:0000200', 'C563783': 'MONDO:0000211', 'C562999': 'MONDO:0000212', 'C000656784': 'MONDO:0000239', 'C536166': 'MONDO:0000241', 'C000656825': 'MONDO:0000242', 'D019595': 'MONDO:0000248', 'D010854': 'MONDO:0000253', 'D059249': 'MONDO:0000262', 'D011015': 'MONDO:0002572', 'D055732': 'MONDO:0000266', 'C535275': 'MONDO:0000290', 'D060586': 'MONDO:0000306', 'D000839': 'MONDO:0000309', 'D017674':

### 2. Create a MESH to HP Dictionary 

In [35]:
api_url = "https://ontology.jax.org/api/hp/terms/"
response = requests.get(api_url)
data = response.json()

# Initialize an empty dictionary to store the HP to MONDO mappings
mesh_to_hp = {}

# Iterate through each item in the response
for item in data:
    # Extract the ID and xrefs for each item
    hp_id = item.get('id')
    xrefs = item.get('xrefs')

    # Iterate through each xref and add the MESH to HP mapping to the dictionary
    for xref in xrefs:
        if xref.startswith(('MESH:', 'MSH:', 'MeSH:')):
            mesh_code = xref.split(':')[1] 
            mesh_to_hp[mesh_code] = hp_id

print(mesh_to_hp)


{'D021782': 'HP:0000003', 'D001750': 'HP:0000011', 'C562406': 'HP:0000015', 'D016055': 'HP:0000016', 'D053158': 'HP:0000017', 'D014549': 'HP:0000020', 'C536139': 'HP:0010956', 'D006552': 'HP:0000023', 'D011472': 'HP:0000024', 'D005058': 'HP:0000026', 'D053713': 'HP:0000027', 'D003456': 'HP:0012741', 'D004823': 'HP:0000031', 'D006848': 'HP:0100673', 'D058490': 'HP:0000037', 'D007006': 'HP:0008213', 'D012734': 'HP:0000062', 'D014518': 'HP:0000070', 'C537373': 'HP:0000074', 'D014718': 'HP:0000076', 'D007674': 'HP:0000112', 'D051437': 'HP:0000083', 'D000069337': 'HP:0000085', 'D011507': 'HP:0000093', 'D005923': 'HP:0000097', 'D005921': 'HP:0000099', 'D009404': 'HP:0000100', 'D011141': 'HP:0000103', 'D052177': 'HP:0000107', 'C563261': 'HP:0000110', 'D007690': 'HP:0000113', 'D014564': 'HP:0000119', 'D009397': 'HP:0000121', 'D009393': 'HP:0000123', 'D006869': 'HP:0000126', 'C562565': 'HP:0000130', 'D007889': 'HP:0000131', 'D008595': 'HP:0000132', 'D006059': 'HP:0000133', 'D006060': 'HP:000013

In [36]:
len(mesh_to_hp)

1800

### 3. Create a MESH to MaXO Dictionary 

In [37]:
api_url = "https://ontology.jax.org/api/maxo/terms/"
response = requests.get(api_url)
data = response.json()

# Initialize an empty dictionary to store the MaXO to MONDO mappings
mesh_to_maxo = {}

# Iterate through each item in the response
for item in data:
    # Extract the ID and xrefs for each item
    maxo_id = item.get('id')
    xrefs = item.get('xrefs')

    # Iterate through each xref and add the MESH to MaXO mapping to the dictionary
    for xref in xrefs:
        if xref.startswith(('MESH:', 'MSH:', 'MeSH:')):
            mesh_code = xref.split(':')[1] 
            mesh_to_maxo[mesh_code] = maxo_id

print(mesh_to_maxo)


{'D010102': 'MAXO:0000066', 'D019637': 'MAXO:0000477', 'D001852': 'MAXO:0000484', 'D018889': 'MAXO:0000485', 'D019857': 'MAXO:0000486', 'D058109': 'MAXO:0000500', 'D060666': 'MAXO:0000501', 'D007442': 'MAXO:0000502', 'D014140': 'MAXO:0000504', 'D063087': 'MAXO:0000506', 'D006612': 'MAXO:0000507', 'D021061': 'MAXO:0000508', 'D011175': 'MAXO:0000509', 'D015919': 'MAXO:0000510', 'D045422': 'MAXO:0000512', 'D015916': 'MAXO:0000513', 'D035641': 'MAXO:0000514', 'D015199': 'MAXO:0000515', 'D007440': 'MAXO:0000517', 'D017582': 'MAXO:0000600'}


In [38]:
len(mesh_to_maxo)

20

### 3. Save ontologies using SSSOM

### 4. Replaceme 

In [20]:
# Dictionaries:
# mesh_to_mondo
# mesh_to_hp
# mesh_to_maxo


In [39]:
def map_mesh(mesh_id):
    # Try to extract MONDO ID
    if mesh_id in mesh_to_mondo:
        return mesh_to_mondo.get(mesh_id)
    elif mesh_id in mesh_to_hp:
        return mesh_to_hp.get(mesh_id)
    elif mesh_id in mesh_to_maxo:
        return mesh_to_maxo.get(mesh_id)
    
    # If neither command returns a result
    print(f"No mapping found for MESH:{mesh_id}.")
    return None

    

In [40]:
def replace_disease_in_text(text):
    # Regular expression to find "diseaseDxxxxxx"
    pattern = re.compile(r'(disease)(D\d{6})')
    
    # Function to replace each match
    def replace_func(match):
        prefix = match.group(1)  # Extract the "disease" prefix
        mesh_id = match.group(2)  # Extract MESH ID
        mapped_id = map_mesh(mesh_id)  
        if mapped_id:
            return f"{mapped_id}"  
        else:
            return match.group(0)  # Return the original text if no corresponding mapping found
    
    # Replace all occurrences in the text
    updated_text = pattern.sub(replace_func, text)
    return updated_text


In [45]:
def process_tsv_and_replace_disease(input_tsv_path, output_tsv_path):
    with open(input_tsv_path, mode='r', encoding='utf-8') as infile, \
         open(output_tsv_path, mode='w', encoding='utf-8', newline='') as outfile:
        
        reader = csv.reader(infile, delimiter='\t')
        writer = csv.writer(outfile, delimiter='\t')
        
        for row in reader:
            if not row or len(row) < 2:
                continue
            
            pmc_id = row[0]
            original_text = row[1]
            updated_text = replace_disease_in_text(original_text)
            
            writer.writerow([pmc_id, updated_text])
        
        print(f"Processed and saved updated texts to: {output_tsv_path}")

    

In [53]:
# Example usage
input_tsv_path = 'mesh_replaced.tsv'
output_tsv_path = 'poet_replaced.tsv'
process_tsv_and_replace_disease(input_tsv_path, output_tsv_path)


No mapping found for MESH:D006940.
No mapping found for MESH:D006940.
No mapping found for MESH:D001157.
No mapping found for MESH:D007249.
No mapping found for MESH:D001157.
No mapping found for MESH:D000090.
No mapping found for MESH:D007249.
No mapping found for MESH:D007249.
No mapping found for MESH:D007249.
No mapping found for MESH:D005355.
No mapping found for MESH:D006461.
No mapping found for MESH:D007249.
No mapping found for MESH:D001157.
No mapping found for MESH:D001157.
No mapping found for MESH:D001157.
No mapping found for MESH:D007249.
No mapping found for MESH:D007238.
No mapping found for MESH:D006461.
No mapping found for MESH:D006461.
No mapping found for MESH:D006461.
No mapping found for MESH:D006461.
No mapping found for MESH:D001157.
No mapping found for MESH:D001157.
No mapping found for MESH:D007238.
No mapping found for MESH:D007249.
No mapping found for MESH:D000080.
No mapping found for MESH:D007249.
No mapping found for MESH:D001157.
No mapping found for