In [64]:
import os
import pandas as pd
from ete3 import NCBITaxa
import biothings_client as bt
import tarfile
import csv
import gzip

from micro_disease_parser import line_generator_4_midi, get_taxon_info, load_merged_from_tar, get_current_taxid
from micro_meta_parser import get_bigg_metabolite_mapping
from data_utils import line_generator, check_line_fields, print_misalignment_report, check_missing_data

In [2]:
os.getcwd()

'/Users/bailinzhang/Documents/Wu_Lab/Projects/GMMAD2'

## Microbe-Disease

In [3]:
micro_disease_path = os.path.join("downloads", "disease_species.csv")
micro_disease = [line for line in line_generator(micro_disease_path)]

In [4]:
header = micro_disease[0]
print(header)
print(len(header))

['id', 'disease_id', 'disease', 'organism', 'level', 'species_id', 'disease_samples', 'disease_mean', 'disease_median', 'disease_sd', 'health_id', 'health', 'health_samples', 'health_mean', 'health_median', 'health_sd', 'change', 'alteration', 'disease_info', 'phylum', 'class', 'order', 'family', 'genus']
24


In [5]:
fields = check_line_fields(micro_disease)
print_misalignment_report(fields, len(micro_disease))

330467 lines have different numbers of fields out of 508142 total lines.
Example from line 68163: ['68163', 'D012595', 'Scleroderma', ' Systemic', 'Curtobacterium flaccumfaciens', 'species', '2035', '0', '0', '0', '0', 'D006262', 'Health', '18', '0.000776612', '0.00064064', '0.000434879', '-0.00064064', 'Decrease', 'A chronic multi-system disorder of CONNECTIVE TISSUE. It is characterized by SCLEROSIS in the SKIN', ' the LUNGS', ' the HEART', ' the GASTROINTESTINAL TRACT', ' the KIDNEYS', ' and the MUSCULOSKELETAL SYSTEM. Other important features include diseased small BLOOD VESSELS and AUTOANTIBODIES. The disorder is named for its most prominent feature (hard skin)', ' and classified into subsets by the extent of skin thickening: LIMITED SCLERODERMA and DIFFUSE SCLERODERMA.', 'Actinomycetota', 'Actinomycetes', 'Micrococcales', 'Microbacteriaceae', 'Curtobacterium']
Number of fields in this line: 31


In [29]:
micro_disease = [line for line in line_generator_4_midi(micro_disease_path, skip_header=False)]
for i, line in enumerate(micro_disease):
    if i == 68162:
        print(len(line))
        print(line)
        break

24
['68162', 'D015212', 'Inflammatory Bowel Diseases', 'Curtobacterium flaccumfaciens', 'species', '2035', '8', '0.000436402', '0.000380536', '0.000197322', 'D006262', 'Health', '18', '0.000776612', '0.00064064', '0.000434879', '-0.000260104', 'Decrease', 'Chronic,non-specific inflammation of the GASTROINTESTINAL TRACT. Etiology may be genetic or environmental. This term includes CROHN DISEASE and ULCERATIVE COLITIS.', 'Actinomycetota', 'Actinomycetes', 'Micrococcales', 'Microbacteriaceae', 'Curtobacterium']


In [30]:
header = micro_disease[0]
data_rows = micro_disease[1:]
micro_disease_df = pd.DataFrame(data_rows, columns=header)

In [31]:
print(micro_disease_df.columns)

Index(['id', 'disease_id', 'disease', 'organism', 'level', 'species_id',
       'disease_samples', 'disease_mean', 'disease_median', 'disease_sd',
       'health_id', 'health', 'health_samples', 'health_mean', 'health_median',
       'health_sd', 'change', 'alteration', 'disease_info', 'phylum', 'class',
       'order', 'family', 'genus'],
      dtype='object')


In [33]:
check_missing_data(micro_disease_df)

{'phylum': {'not available'},
 'class': {'not available'},
 'order': {'not available'},
 'family': {'not available'},
 'genus': {'not available'}}

In [34]:
taxids = [line[5] for line in line_generator_4_midi(micro_disease_path)]
print(len(set(taxids)))

6966


In [35]:
notfound = [
    taxon["query"]
    for taxon in get_taxon_info(micro_disease_path)
    if "notfound" in taxon.keys()
]

In [36]:
print(len(notfound))
print(len(set(notfound)))

15
15


In [37]:
mapping = load_merged_from_tar("taxdump.tar.gz")
mapped_taxid = get_current_taxid(notfound, mapping)

In [38]:
if "194866" in mapping:
    print("194866 is in the mapping")
    print(f"current taxid for 194866: {mapping['194866']}")
else:
    print("194866 is not in the mapping")

194866 is in the mapping
current taxid for 194866: 46624


In [39]:
len(mapped_taxid)

15

In [40]:
new_taxids = [new for old, new in mapped_taxid.items()]
print(len(new_taxids))
print(len(set(new_taxids)))

15
1


In [41]:
print(set(new_taxids))

{None}


In [42]:
still_notfound = [taxid for taxid in notfound if taxid not in mapped_taxid.keys()]
print(len(still_notfound))

0


In [43]:
new_taxid_mapped = get_taxon_info(new_taxids)
new_taxid_mapped

[{'query': 'None', 'notfound': True}]

In [44]:
_ids = [taxon["query"] for taxon in new_taxid_mapped]
still_notfound = [taxid for taxid in new_taxids if taxid not in _ids]

In [45]:
len(set(_ids))

1

In [46]:
taxids = sorted([line[5] for line in line_generator_4_midi(micro_disease_path)])
print(len(taxids))
print(len(set(taxids)))

508141
6966


## Microbe-Metabolite

In [47]:
micro_meta_df = pd.read_csv(os.path.join("downloads", "micro_metabolic.csv"), low_memory=False)

In [48]:
micro_meta_col_map = dict(enumerate(micro_meta_df.columns))
micro_meta_col_map

{0: 'id',
 1: 'g_micro',
 2: 'organism',
 3: 'g_meta',
 4: 'metabolic',
 5: 'pubchem_compound',
 6: 'pubchem_id',
 7: 'formula',
 8: 'kegg_id',
 9: 'tax_id',
 10: 'phylum',
 11: 'class',
 12: 'order',
 13: 'family',
 14: 'genus',
 15: 'species',
 16: 'species_id',
 17: 'source',
 18: 'smiles_sequence',
 19: 'HMDBID',
 20: 'Origin'}

In [49]:
micro_meta = [line for line in line_generator(os.path.join("downloads", "micro_metabolic.csv"), skip_header=False)]

In [50]:
fields = check_line_fields(micro_meta)
print_misalignment_report(fields, len(micro_meta))

All lines have the same number of fields.


In [51]:
check_missing_data(micro_meta_df)

{'pubchem_id': {'not available'},
 'formula': {'not available'},
 'kegg_id': {'not available'},
 'tax_id': {'not available'},
 'phylum': {'not available'},
 'class': {'not available'},
 'order': {'not available'},
 'family': {'not available'},
 'genus': {'not available'},
 'species': {'not available'},
 'species_id': {'not available'},
 'smiles_sequence': {'not available'},
 'HMDBID': {'not available'},
 'Origin': {'Unknown'}}

In [52]:
no_chem_id = []
for line in micro_meta:
    if "not available" in line[6] and "not available" in line[19]:
        no_chem_id.append(line[4])
print(len(no_chem_id))

304266


In [53]:
print(len(set(no_chem_id)))

1349


In [54]:
pubchem_cids = [line[6] for line in micro_meta if "not available" not in line[6]]
print(len(set(pubchem_cids)))

1572


In [55]:
hmdb_ids = [line[19] for line in micro_meta if "not available" not in line[19] and "not available" in line[6]]
print(len(set(hmdb_ids)))

25


In [56]:
total_metabolites = [line[4] for line in micro_meta]
print(len(set(total_metabolites)))

3201


In [57]:
bigg_mapped = get_bigg_metabolite_mapping(os.path.join("downloads", "bigg_models_metabolites.txt"))

In [58]:
len(bigg_mapped)

8771

In [59]:
bigg_mapped_ids = [name.lower() for name in set(no_chem_id) if name.lower() in bigg_mapped]

In [60]:
len(bigg_mapped_ids)

264

In [61]:
set(no_chem_id)

{'1-tetradec-7-enoyl-sn-glycerol 3-phosphate',
 'mucin-type O-glycan No 77',
 'released mucin-type O-glycan No 87',
 'procollagen type 1 n-terminal propeptide',
 'Diglucosyl-1,2 diisotetradecanoylglycerol',
 'released mucin-type O-glycan No 195',
 'Kestose (2 fru, 1 glc inulin-type fructo-oligosaccharide)',
 '1,2-Diisohexadecanoyl-sn-glycerol',
 'R-3-hydroxypalmitoyl-[acyl-carrier protein]',
 '12-methyl-tridecanoyl-ACP',
 'Apoprotein [acyl carrier protein]',
 'Beta-1,6-N-acetylglucosamine-N-acetylgalactosamine',
 'released mucin-type O-glycan No 88',
 'C6h8n2o2_141.0657p_0n_18.9_rb_biocrustexp',
 'mucin-type O-glycan No 124',
 'Unk_298.1133p_0n_10.1_rb_biocrustexp',
 'Glycogen, Structure 4 (Glycogenin-1,6-{2[1,4-Glc], [1,4-Glc]})',
 'Isotetradecanoyllipoteichoic acid (n=24), linked, glucose substituted',
 '1,2-diisoheptadecanoyl-sn-glycerol 3-phosphate (15-methyl-hexadecanoyl, iso-C17)',
 'released mucin-type O-glycan No 178',
 'released mucin-type O-glycan No 186',
 'Unk_325.2379p_0n_

## Metabolite - Gene

In [62]:
meta_gene_df = pd.read_csv(os.path.join("downloads", "meta_gene_net.csv"), low_memory=False)

In [63]:
dict(enumerate(meta_gene_df.columns))

{0: 'id',
 1: 'g_meta',
 2: 'compound',
 3: 'pubchem_id',
 4: 'formula',
 5: 'kegg_id',
 6: 'HMDBID',
 7: 'drug_id',
 8: 'drug_name',
 9: 'Origin',
 10: 'smiles_sequence',
 11: 'gene_id',
 12: 'gene',
 13: 'ensembl_id',
 14: 'NCBI',
 15: 'HGNC',
 16: 'UniProt',
 17: 'protein_size',
 18: 'annonation',
 19: 'score',
 20: 'alteration',
 21: 'PMID',
 22: 'source'}

In [65]:
meta_gene_data = [line for line in line_generator(os.path.join("downloads", "meta_gene_net.csv"), skip_header=False)]

In [68]:
fields = check_line_fields(meta_gene_data)
print_misalignment_report(fields, len(meta_gene_data))

All lines have the same number of fields.


In [69]:
for i, line in enumerate(meta_gene_data):
    if i == 2:
        print(len(line))
        print(line)
        break

23
['2', 'meta2', '(1,6-alpha-D-Glucosyl)m', '3662', '(C12H20O10)n', 'C00372', 'not available', 'Not available', 'Not available', 'Microbiota; Food related; Drug related', 'CC(C)C(=O)C12C(=O)C(=C(C(C1=O)(CC(C2(C)CCC=C(C)C)CC=C(C)C)CC=C(C)C)O)CC=C(C)C', 'g380', 'ALOX5', 'ENSG00000012779', '240', '435', 'P09917', '674', 'This gene encodes a member of the lipoxygenase gene family and plays a dual role in the synthesis of leukotrienes from arachidonic acid. The encoded protein, which is expressed specifically in bone marrow-derived cells, catalyzes the conversion of arachidonic acid to 5(S)-hydroperoxy-6-trans-8,11,14-cis-eicosatetraenoic acid, and further to the allylic epoxide 5(S)-trans-7,9-trans-11,14-cis-eicosatetrenoic acid (leukotriene A4). Leukotrienes are important mediators of a number of inflammatory and allergic conditions. Mutations in the promoter region of this gene lead to a diminished response to antileukotriene drugs used in the treatment of asthma and may also be associa

In [76]:
print(len(check_missing_data(meta_gene_df)))
check_missing_data(meta_gene_df)

16


{'pubchem_id': {'Not available'},
 'formula': {'not available'},
 'kegg_id': {'not available'},
 'HMDBID': {'not available'},
 'drug_id': {'Not available'},
 'drug_name': {'Not available'},
 'Origin': {'Unknown'},
 'smiles_sequence': {'not available'},
 'ensembl_id': {'Not available'},
 'NCBI': {'Not available'},
 'HGNC': {'Not available'},
 'UniProt': {'Not available'},
 'annonation': {'Not Available'},
 'score': {'Not available'},
 'alteration': {'Unknown'},
 'PMID': {'Not available'}}

In [75]:
meta_gene_df["pubchem_id"].unique()

array(['3662', '339', '11213350', '439194', '99290', '440847', '193305',
       '440878', '222656', '439888', '440854', '439400', '439417',
       '439430', '217', 'Not available', '5280626', '439960', '92135',
       '439350', '225936', '440162', '439982', '5459811', '11966158',
       '11966160', '440873', '11966173', '11966216', '11966179',
       '11966177', '440077', '5462303', '439269', '1172', '3081033',
       '535', '4', '263', '445395', '443201', '27476', '79143', '457',
       '440932', '9547180', '68161', '1031', '440282', '5462190', '428',
       '10442', '1045', '439153', '44140569', '14080393', '24970825',
       '10308378', '79034', '9963391', '91451', '440043', '13730',
       '188966', '12599', '13711', '65091', '13712', '65063', '101812',
       '800', '5287432', '1530', '1000', '5280673', '65275', '363',
       '6657', '1015', '469', '6119', '5281909', '192781', '9378', '524',
       '92779', '5462148', '11266', '7866', '43', '11970', '77',
       '3080745', '237246

In [70]:
meta_gene_df["Origin"].unique()

array(['Microbiota; Food related; Drug related',
       'Host; Microbiota; Food related', 'Unknown', 'Microbiota',
       'Host; Microbiota; Food related; Drug related',
       'Microbiota; Food related', 'Host; Microbiota',
       'Microbiota; Drug related',
       'Microbiota; Food related; Drug related; Environment',
       'Microbiota; Food related; Environment',
       'Host; Microbiota; Food related; Drug related; Environment',
       'Host; Microbiota; Food related; Environment',
       'Host; Microbiota; Drug related'], dtype=object)

In [72]:
meta_gene_df["alteration"].unique()

array(['Unknown', 'elevated', 'reduced', 'target', 'Inhibitor',
       'Activator'], dtype=object)

In [73]:
meta_gene_df["source"].unique()

array(['stitch', 'gutMGene', 'stitch, gutMGene', 'stitch, drugbank',
       'drugbank'], dtype=object)

In [77]:
meta_gene_df["PMID"].unique()

array(['Not available', '31142855', '25913757', '25749343', '26879219',
       '30401435', '19865172', '32795610', '16534848', '28266623',
       '31734354', '21178864', '23300800', '30683619', '25382172',
       '30767231', '21889493', '19966295', '28658542', '26241311',
       '30566883'], dtype=object)