In [22]:
import os
import re
import biothings_client as bt
import pandas as pd

In [3]:
phenodb_path = os.path.join(os.getcwd(), "core_table.txt")

In [37]:
edges = []
microbes_l = []
diseases_l = []
with open(phenodb_path, 'r') as f:
    next(f)
    for lns in f:
        lns = lns.strip().split("\t")
        if "??" or "\"" in lns[1]:
            microbes = lns[1].replace("??", "").replace("\"", "").lower()
            microbes_l.append(microbes)
        diseases = lns[2].lower()
        diseases_l.append(diseases)
        edges.append((microbes, diseases))
print(f"Number of edges: {len(edges)}")
print(f"Number of unique edges: {len(set(edges))}")
print(f"Number of unique microbes: {len(set(microbes_l))}")
print(f"Number of unique diseases: {len(set(diseases_l))}")

Number of edges: 5529
Number of unique edges: 5511
Number of unique microbes: 1774
Number of unique diseases: 500


In [93]:
unique_edges = set(edges)
unique_microbes = set(microbes_l)
unique_diseases = set(diseases_l)


unique_microbes = [re.match(r"^[^()]+", microbe).group().strip() for microbe in unique_microbes]
print(f"Number of unique microbes after cleaning: {len(unique_microbes)}")

Number of unique microbes after cleaning: 1774


In [94]:
bt_taxon = bt.get_client("taxon")
taxon_query = bt_taxon.querymany(unique_microbes, scopes=["scientific_name"], fields=["taxid", "rank"])

1161 input query terms found dup hits:	[('tannerella forsythia', 4), ('rous sarcoma virus', 9), ('angiostrongylus', 10), ('flavobacterium',
367 input query terms found no hit:	['eubacterium cylindroides', 'caulobacteriaceae', 'prevotella tannerae', 'penicillium marneffei', 'c


In [108]:
taxon_species = {
    queries["query"]: f"NCBITaxon:{queries['taxid']}"
    for queries in taxon_query
    if "notfound" not in queries and queries["rank"] == "species"}
print(f"Number of species: {len(taxon_species)}")

Number of species: 1316


In [109]:
taxon_species

{'tannerella forsythia': 'NCBITaxon:28112',
 'rous sarcoma virus': 'NCBITaxon:11886',
 'fusobacterium naviforme': 'NCBITaxon:77917',
 'angiostrongylus': 'NCBITaxon:334426',
 'flavobacterium': 'NCBITaxon:996',
 'microsporum audouinii': 'NCBITaxon:34393',
 'butyrate-producing bacterium ph07ay02': 'NCBITaxon:340478',
 'firmicutes': 'NCBITaxon:2013779',
 'coprococcus': 'NCBITaxon:3133154',
 'nitrosospira': 'NCBITaxon:136543',
 'butyrate-producing bacterium t2-132': 'NCBITaxon:105837',
 'aeribacillus': 'NCBITaxon:1549886',
 'peptostreptococcus stomatis': 'NCBITaxon:341694',
 'leptotrichia buccalis': 'NCBITaxon:40542',
 'oribacterium sinus': 'NCBITaxon:237576',
 'acetanaerobacterium elongatum': 'NCBITaxon:258515',
 'lachnospiraceae bacterium a2': 'NCBITaxon:397290',
 'lactonifactor': 'NCBITaxon:2584657',
 'veillonella': 'NCBITaxon:1911679',
 'trichinella spiralis': 'NCBITaxon:3157812',
 'corynebacterium ulcerans': 'NCBITaxon:65058',
 'holdemania': 'NCBITaxon:305955',
 'staphylococcus capitis

***

In [97]:
bt_d = bt.get_client("disease")
d_query = bt_d.querymany(unique_diseases, scopes="disease_ontology.name", fields=["mondo"])

Input sequence provided is already in string format. No operation performed
128 input query terms found dup hits:	[('tinea', 10), ('rheumatoid arthritis', 3), ('arthritis', 8), ('hyperlipidemia', 2), ('anemia', 10)
185 input query terms found no hit:	["hashimoto's thyroiditis", 'mild disease course', 'pathogenic', 'oligoarticular juvenile idiopathic


In [98]:
d_mondo = {}
best_results = {}

for entry in d_query:
    if 'query' in entry:
        query = entry['query']
        if 'notfound' not in entry and '_score' in entry:
            if query not in best_results or entry['_score'] > best_results[query]['_score']:
                best_results[query] = entry
        else:
            d_mondo[query] = None

d_mondo.update({query: result['_id'] for query, result in best_results.items()})
print(f"Number of mapped diseases: {len(d_mondo)}")

Number of mapped diseases: 501


In [99]:
d_mondo

{"hashimoto's thyroiditis": None,
 'mild disease course': None,
 'pathogenic': None,
 'oligoarticular juvenile idiopathic arthritis': None,
 'bacteria vaginosis': None,
 'abdominal abscess': None,
 'premature birth': None,
 'non-alcoholic steatohepatitis': None,
 'lymphoproliferative disease': None,
 'dacryocystitis - osteopoikilosis': None,
 'autoimmune polyendocrinopathy type 1': None,
 'probiotic': None,
 'scedosporiosis': None,
 'vincent angina bacteria': None,
 'skin and soft tissue staphylococcus aureus infection': None,
 'major depressive episode': None,
 'congenital microcephaly': None,
 'chlamydiosis': None,
 'atopy': None,
 'pervasive developmental disorder - not otherwise specified': None,
 'complications during parturition': None,
 'virus syndromes': None,
 'type i diabetes mellitus': None,
 'hiv infection': None,
 'cystic fibrosis associated meconium ileum': None,
 'hepatitis b infection': None,
 'bacteriemia': None,
 'causing nosocomial': None,
 'coronary heart disease': 

In [100]:
df_d = pd.DataFrame(list(d_mondo.items()), columns=["disease", "mondo"])
out_f = os.path.join("unmapped_diseases.xlsx")
df_d.to_excel(out_f, index=False)

In [101]:
mapped_diseases = pd.read_csv("unmapped_diseases_filled.csv")

In [102]:
mapped_diseases_d = dict(zip(mapped_diseases["disease"], mapped_diseases["mondo"]))

In [103]:
mapped_diseases_d

{"hashimoto's thyroiditis": 'MONDO:0007699',
 'mild disease course': nan,
 'pathogenic': nan,
 'oligoarticular juvenile idiopathic arthritis': 'MONDO:0011429',
 'bacteria vaginosis': 'MONDO:0005316',
 'abdominal abscess': nan,
 'premature birth': 'MESH:D007234',
 'non-alcoholic steatohepatitis': 'MONDO:0007027',
 'lymphoproliferative disease': 'MONDO:0016537',
 'dacryocystitis - osteopoikilosis': 'MONDO:0008158',
 'autoimmune polyendocrinopathy type 1': 'MONDO:0009411',
 'probiotic': nan,
 'scedosporiosis': 'MONDO:0018668',
 'vincent angina bacteria': nan,
 'skin and soft tissue staphylococcus aureus infection': 'MONDO:0005545',
 'major depressive episode': 'MONDO:0002009',
 'congenital microcephaly': 'MONDO:0016056',
 'chlamydiosis': 'MONDO:0021697',
 'atopy': 'MONDO:0005202',
 'pervasive developmental disorder - not otherwise specified': 'MONDO:0000594',
 'complications during parturition': 'GO:0007567',
 'virus syndromes': 'MONDO:0005108',
 'type i diabetes mellitus': 'MONDO:0005147

***

In [110]:
taxon_species

{'tannerella forsythia': 'NCBITaxon:28112',
 'rous sarcoma virus': 'NCBITaxon:11886',
 'fusobacterium naviforme': 'NCBITaxon:77917',
 'angiostrongylus': 'NCBITaxon:334426',
 'flavobacterium': 'NCBITaxon:996',
 'microsporum audouinii': 'NCBITaxon:34393',
 'butyrate-producing bacterium ph07ay02': 'NCBITaxon:340478',
 'firmicutes': 'NCBITaxon:2013779',
 'coprococcus': 'NCBITaxon:3133154',
 'nitrosospira': 'NCBITaxon:136543',
 'butyrate-producing bacterium t2-132': 'NCBITaxon:105837',
 'aeribacillus': 'NCBITaxon:1549886',
 'peptostreptococcus stomatis': 'NCBITaxon:341694',
 'leptotrichia buccalis': 'NCBITaxon:40542',
 'oribacterium sinus': 'NCBITaxon:237576',
 'acetanaerobacterium elongatum': 'NCBITaxon:258515',
 'lachnospiraceae bacterium a2': 'NCBITaxon:397290',
 'lactonifactor': 'NCBITaxon:2584657',
 'veillonella': 'NCBITaxon:1911679',
 'trichinella spiralis': 'NCBITaxon:3157812',
 'corynebacterium ulcerans': 'NCBITaxon:65058',
 'holdemania': 'NCBITaxon:305955',
 'staphylococcus capitis

In [111]:
mapped_diseases_d

{"hashimoto's thyroiditis": 'MONDO:0007699',
 'mild disease course': nan,
 'pathogenic': nan,
 'oligoarticular juvenile idiopathic arthritis': 'MONDO:0011429',
 'bacteria vaginosis': 'MONDO:0005316',
 'abdominal abscess': nan,
 'premature birth': 'MESH:D007234',
 'non-alcoholic steatohepatitis': 'MONDO:0007027',
 'lymphoproliferative disease': 'MONDO:0016537',
 'dacryocystitis - osteopoikilosis': 'MONDO:0008158',
 'autoimmune polyendocrinopathy type 1': 'MONDO:0009411',
 'probiotic': nan,
 'scedosporiosis': 'MONDO:0018668',
 'vincent angina bacteria': nan,
 'skin and soft tissue staphylococcus aureus infection': 'MONDO:0005545',
 'major depressive episode': 'MONDO:0002009',
 'congenital microcephaly': 'MONDO:0016056',
 'chlamydiosis': 'MONDO:0021697',
 'atopy': 'MONDO:0005202',
 'pervasive developmental disorder - not otherwise specified': 'MONDO:0000594',
 'complications during parturition': 'GO:0007567',
 'virus syndromes': 'MONDO:0005108',
 'type i diabetes mellitus': 'MONDO:0005147

In [112]:
mapped_edges = [(taxon_species.get(m), mapped_diseases_d.get(d)) for m, d in unique_edges if m in taxon_species and d in mapped_diseases_d]
print(f"Number of mapped edges: {len(mapped_edges)}")

Number of mapped edges: 4527


In [113]:
mapped_edges

[('NCBITaxon:2838600', 'MONDO:0008171'),
 ('NCBITaxon:29391', 'MONDO:0021245'),
 ('NCBITaxon:2040332', 'MONDO:0005011'),
 ('NCBITaxon:38285', 'MONDO:0018646'),
 ('NCBITaxon:1200579', 'MONDO:0005011'),
 ('NCBITaxon:1688', 'MONDO:0005147'),
 ('NCBITaxon:2558277', 'MONDO:0005441'),
 ('NCBITaxon:180360', 'MONDO:0005147'),
 ('NCBITaxon:212216', 'MONDO:0015254'),
 ('NCBITaxon:71451', 'MONDO:0005130'),
 ('NCBITaxon:1725', 'EFO:0003033'),
 ('NCBITaxon:747372', 'MONDO:0005002'),
 ('NCBITaxon:53419', 'MONDO:0005884'),
 ('NCBITaxon:1924944', 'MONDO:0005044'),
 ('NCBITaxon:39397', nan),
 ('NCBITaxon:3133154', 'MONDO:0019100'),
 ('NCBITaxon:178', 'MONDO:0005312'),
 ('NCBITaxon:1952272', 'MONDO:0005053'),
 ('NCBITaxon:84112', 'MONDO:0005311'),
 ('NCBITaxon:2702', 'MONDO:0005316'),
 ('NCBITaxon:817', 'MONDO:0019437'),
 ('NCBITaxon:218538', 'MONDO:0016052'),
 ('NCBITaxon:1911679', 'MONDO:0006727'),
 ('NCBITaxon:1311', 'MONDO:0006031'),
 ('NCBITaxon:1804973', 'MONDO:0007027'),
 ('NCBITaxon:134821', 'MO

***

In [138]:
microbe_idx = pd.read_csv("../sampled/unique_microbes_idx.dat", sep="\t", header=None, encoding="utf-8")
microbe_idx_d = dict(zip(microbe_idx[0], microbe_idx[1]))

In [139]:
disease_idx = pd.read_csv("../sampled/unique_diseases_idx.dat", sep="\t", header=None, encoding="utf-8")
disease_idx_d = dict(zip(disease_idx[0], disease_idx[1]))

In [144]:
indexed_edges = [(microbe_idx_d.get(m), disease_idx_d.get(d)) for m, d in mapped_edges if m in microbe_idx_d and d in disease_idx_d]
print(f"Number of indexed edges: {len(indexed_edges)}")

Number of indexed edges: 77


In [145]:
indexed_edges

[(111, 12),
 (161, 10),
 (11, 10),
 (259, 26),
 (81, 29),
 (248, 11),
 (259, 24),
 (214, 24),
 (259, 12),
 (247, 17),
 (219, 29),
 (84, 24),
 (247, 11),
 (259, 31),
 (247, 10),
 (259, 12),
 (71, 12),
 (259, 10),
 (259, 15),
 (214, 10),
 (56, 12),
 (84, 10),
 (85, 29),
 (21, 29),
 (21, 10),
 (257, 26),
 (101, 10),
 (27, 13),
 (106, 12),
 (111, 11),
 (161, 29),
 (151, 29),
 (259, 12),
 (123, 24),
 (29, 29),
 (44, 12),
 (249, 11),
 (257, 12),
 (259, 12),
 (259, 11),
 (219, 8),
 (259, 10),
 (47, 10),
 (64, 11),
 (28, 29),
 (71, 9),
 (259, 17),
 (214, 17),
 (110, 11),
 (248, 11),
 (259, 2),
 (79, 29),
 (259, 11),
 (24, 12),
 (259, 29),
 (214, 29),
 (84, 11),
 (247, 11),
 (245, 29),
 (214, 12),
 (11, 8),
 (199, 10),
 (106, 29),
 (111, 11),
 (106, 10),
 (111, 29),
 (84, 12),
 (244, 12),
 (84, 15),
 (143, 12),
 (259, 17),
 (259, 32),
 (247, 23),
 (251, 12),
 (123, 29),
 (259, 29),
 (47, 29)]

***

In [128]:
mkg_edges = pd.read_csv("../sampled/common_microbe_disease_idx.dat", sep="\t", header=None, encoding="utf-8", usecols=[0, 1])

In [146]:
mkg_edges_set = set(zip(mkg_edges[0], mkg_edges[1]))

In [147]:
mkg_edges_set

{(113, 26),
 (227, 11),
 (247, 26),
 (79, 33),
 (258, 35),
 (71, 29),
 (144, 13),
 (250, 31),
 (91, 7),
 (50, 6),
 (29, 32),
 (94, 12),
 (228, 12),
 (11, 14),
 (158, 33),
 (137, 10),
 (178, 11),
 (170, 7),
 (75, 35),
 (116, 36),
 (95, 13),
 (136, 14),
 (201, 31),
 (254, 6),
 (72, 29),
 (200, 35),
 (45, 12),
 (117, 37),
 (109, 33),
 (129, 11),
 (121, 7),
 (67, 36),
 (46, 13),
 (59, 32),
 (151, 29),
 (79, 10),
 (258, 12),
 (101, 28),
 (208, 11),
 (18, 30),
 (146, 36),
 (125, 13),
 (193, 31),
 (102, 29),
 (143, 30),
 (230, 35),
 (122, 7),
 (135, 26),
 (209, 12),
 (75, 12),
 (60, 32),
 (25, 11),
 (72, 6),
 (159, 11),
 (189, 33),
 (117, 14),
 (222, 30),
 (109, 10),
 (147, 36),
 (139, 32),
 (86, 26),
 (151, 6),
 (97, 35),
 (89, 31),
 (231, 35),
 (223, 31),
 (14, 37),
 (26, 11),
 (132, 29),
 (173, 30),
 (18, 7),
 (165, 26),
 (98, 36),
 (90, 32),
 (110, 10),
 (102, 6),
 (257, 29),
 (48, 35),
 (40, 31),
 (174, 31),
 (177, 36),
 (218, 37),
 (116, 26),
 (189, 10),
 (41, 32),
 (127, 35),
 (74, 29)

In [148]:
filtered_edges = [edge for edge in indexed_edges if edge not in mkg_edges_set]

In [150]:
print(f"Number of filtered edges: {len(filtered_edges)}")

Number of filtered edges: 5
