In [83]:
import biothings_client as bt
import os
import re
import pandas as pd

In [114]:
path = os.path.join(os.getcwd(),"data", "hmdad")
hmdad_path = os.path.join(path, "hmdad_mid.txt")
mid = []
diseases = []
microbes = []
with open(hmdad_path, "r") as file:
    for idx, line in enumerate(file):
        if idx == 0:
            continue
        cols = line.split("\t")[:2]
        cols_lower = [col.lower().strip() for col in cols]
        cols_clean = [
            re.match(r"(.*)\(", col).group(1).strip() if re.match(r"(.*)\(", col) else col
            for col in cols_lower
        ]
        diseases.append(cols_clean[0])
        microbes.append(cols_clean[1])
        mid.append({cols_clean[0]: cols_clean[1]})

print(f"Number of unique diseases: {len(set(diseases))}")
print(f"Number of unique microbes: {len(set(microbes))}")
print(f"Number of edges: {len(mid)}")

Number of unique diseases: 39
Number of unique microbes: 292
Number of edges: 483


In [78]:
bt_d = bt.get_client("disease")
d_query = bt_d.querymany(set(diseases), scopes="disease_ontology.name", fields=["mondo"])
d_mondo = {}
best_results = {}

for entry in d_query:
    if 'query' in entry:
        query = entry['query']
        if 'notfound' not in entry and '_score' in entry:
            if query not in best_results or entry['_score'] > best_results[query]['_score']:
                best_results[query] = entry
        else:
            d_mondo[query] = None

d_mondo.update({query: result['_id'] for query, result in best_results.items()})

Input sequence provided is already in string format. No operation performed
13 input query terms found dup hits:	[('liver cirrhosis', 2), ('inflammatory bowel disease', 10), ('allergy', 10), ('atopic dermatitis', 
19 input query terms found no hit:	['atopy', "whipple's disease", "ileal crohn's disease", 'kidney stones', 'systemic inflammatory resp


In [79]:
len(d_mondo)

39

In [80]:
d_mondo

{'atopy': None,
 "whipple's disease": None,
 "ileal crohn's disease": None,
 'kidney stones': None,
 'systemic inflammatory response syndrome': None,
 'guttate psoriasis': None,
 'gastric and duodenal ulcer': None,
 'infectious colitis': None,
 'recurrent wheeze': None,
 'atopic sensitisation': None,
 'copd': None,
 'rheumatoid arthrits': None,
 'gastro-oesophageal reflux': None,
 'constipation irritable bowel syndrome': None,
 'new-onset untreated rheumatoid arthrits': None,
 'skin and mucosal infections': None,
 'allergic sensitization': None,
 'clostridium difficile infection': None,
 'diarrhea irritable bowel syndrome': None,
 'liver cirrhosis': 'MONDO:0005155',
 'inflammatory bowel disease': 'MONDO:0005265',
 'allergy': 'MONDO:0000775',
 'colorectal carcinoma': 'MONDO:0024331',
 'irritable bowel syndrome': 'MONDO:0005052',
 'atopic dermatitis': 'MONDO:0004980',
 'type 2 diabetes': 'MONDO:0005148',
 'cystic fibrosis': 'MONDO:0009061',
 "crohn's disease": 'MONDO:0005011',
 'type 1 d

In [87]:
df_d = pd.DataFrame(list(d_mondo.items()), columns=["disease", "mondo"])
out_f = os.path.join(path, "unmapped_diseases.xlsx")
df_d.to_excel(out_f, index=False)

***

In [115]:
microbes = [microbe for microbe in microbes if re.findall(r"\s", microbe)]
print(f"Number of microbes after removing the genus: {len(set(microbes))}")

Number of microbes after removing the genus: 148


In [116]:
microbes

['collinsella aerofaciens',
 'porphyromonas gingivalis',
 'helicobacter pylori',
 'helicobacter pylori',
 'staphylococcus aureus',
 'bifidobacterium catenulatum',
 'clostridium coccoides',
 'ruminococcus productus',
 'bacteroides fragilis',
 'clostridium coccoides',
 'clostridium leptum',
 'atopobium vaginae',
 'gardnerella vaginalis',
 'lactobacillus  crispatus',
 'clostridium coccoides',
 'clostridium leptum',
 'clostridium difficile',
 'clostridium difficile',
 'clostridium difficile',
 'clostridium difficile',
 'clostridium difficile',
 'escherichia coli',
 'helicobacter pylori',
 'helicobacter pylori',
 'clostridium cocleatum',
 'collinsella aerofaciens',
 'dietzia maris',
 'stenotrophomonas maltophilia',
 'staphylococcus aureus',
 'stenotrophomonas maltophilia',
 'streptococcus mitis',
 'staphylococcus aureus',
 'bacteroides ovatus',
 'bacteroides ovatus',
 'bacteroides uniformis',
 'bacteroides uniformis',
 'bacteroides vulgatus',
 'bacteroides vulgatus',
 'candidate division tm

In [117]:
bt_taxon = bt.get_client("taxon")
taxon_query = bt_taxon.querymany(set(microbes), scopes=["scientific_name"], fields=["taxid", "rank"])

56 input query terms found dup hits:	[('candidate division tm7', 10), ('shigella dysenteriae', 10), ('streptococcus anginosus', 10), ('ba
20 input query terms found no hit:	['ruminococcus productus', 'eubacterium rectale', 'leptotrichia amnionii', 'swine manure pit bacteri


In [120]:
taxon_query

[{'query': 'human intestinal firmicute co4',
  '_id': '165133',
  '_score': 37.408928,
  'rank': 'species',
  'taxid': 165133},
 {'query': 'butyrate-producing bacterium ph05yb02',
  '_id': '340475',
  '_score': 36.8601,
  'rank': 'species',
  'taxid': 340475},
 {'query': 'bacterium ic1391',
  '_id': '330059',
  '_score': 21.309988,
  'rank': 'species',
  'taxid': 330059},
 {'query': 'mastodon intestinal bacterium an-22-6',
  '_id': '51840',
  '_score': 35.371857,
  'rank': 'species',
  'taxid': 51840},
 {'query': 'candidate division tm7',
  '_id': '1364860',
  '_score': 22.966686,
  'rank': 'species',
  'taxid': 1364860},
 {'query': 'candidate division tm7',
  '_id': '239137',
  '_score': 22.966686,
  'rank': 'species',
  'taxid': 239137},
 {'query': 'candidate division tm7',
  '_id': '443342',
  '_score': 22.966686,
  'rank': 'species',
  'taxid': 443342},
 {'query': 'candidate division tm7',
  '_id': '1409904',
  '_score': 22.966686,
  'rank': 'species',
  'taxid': 1409904},
 {'query