In [1]:
import os
import pandas as pd
from ete3 import NCBITaxa
import biothings_client as bt
import tarfile
import csv
import gzip

from micro_disease_parser import line_generator_4_midi, get_taxon_info, load_merged_from_tar, get_current_taxid
from micro_meta_parser import line_generator, get_bigg_metabolite_mapping
from data_utils import check_missing_data

In [2]:
os.getcwd()

'/Users/bailinzhang/Documents/Wu_Lab/Projects/GMMAD2'

## Microbe-Disease

In [None]:
# Peek at the 4th physical line of the raw CSV: print how many fields a naive
# comma split yields, then the raw line itself.
disease_csv = os.path.join("downloads", "disease_species.csv")
with open(disease_csv, "r") as handle:
    fourth_line = next(
        (text for lineno, text in enumerate(handle, start=1) if lineno == 4),
        None,
    )
# Nothing is printed when the file has fewer than four lines.
if fourth_line is not None:
    print(len(fourth_line.strip().split(",")))
    print(fourth_line)

In [None]:
micro_disease_path = os.path.join("downloads", "disease_species.csv")
micro_disease = line_generator(micro_disease_path)
# Spot-check three parsed records (0-based positions 3, 15 and 16): print the
# field count followed by the record, and stop reading after the last one.
rows_to_show = (3, 15, 16)
for idx, record in enumerate(micro_disease):
    if idx not in rows_to_show:
        continue
    print(len(record))
    print(record)
    if idx == rows_to_show[-1]:
        break

In [None]:
# Gather field 5 of every parsed record into `taxids` and report how many
# distinct values it holds.
taxids = [record[5] for record in line_generator(micro_disease_path)]
unique_taxids = set(taxids)
print(len(unique_taxids))

In [None]:
# Keep the "query" value of every result from get_taxon_info that carries a
# "notfound" key.
notfound = []
for taxon in get_taxon_info(micro_disease_path):
    if "notfound" in taxon.keys():
        notfound.append(taxon["query"])

In [None]:
print(len(notfound))
print(len(set(notfound)))

In [None]:
# Load the lookup produced by load_merged_from_tar from the local taxdump
# archive, then run the not-found taxids through get_current_taxid with it.
# A later cell reads mapping['194866'] as the "current taxid" for that key, so
# the lookup is indexed by taxid strings.
# NOTE(review): presumably old (merged) taxid -> current taxid — confirm in micro_disease_parser.
mapping = load_merged_from_tar("taxdump.tar.gz")
mapped_taxid = get_current_taxid(notfound, mapping)

In [None]:
# Spot-check one taxid key: report whether it is present in the mapping and,
# if so, which value it maps to.
if "194866" not in mapping:
    print("194866 is not in the mapping")
else:
    print("194866 is in the mapping")
    print(f"current taxid for 194866: {mapping['194866']}")

In [None]:
len(mapped_taxid)

In [None]:
# The values of mapped_taxid are the replacement taxids; print the total count
# and the number of distinct ones.
new_taxids = list(mapped_taxid.values())
total_new = len(new_taxids)
distinct_new = len(set(new_taxids))
print(total_new)
print(distinct_new)

In [None]:
print(set(new_taxids))

In [None]:
# Taxids from `notfound` that did not receive an entry in mapped_taxid.
still_notfound = []
for taxid in notfound:
    if taxid in mapped_taxid.keys():
        continue
    still_notfound.append(taxid)
print(len(still_notfound))

In [None]:
# Query taxon info for the remapped taxids; the bare name on the last line
# displays the result as the cell output.
# NOTE(review): get_taxon_info is given a file path in an earlier cell and a
# list of taxids here — confirm the helper accepts both kinds of input.
new_taxid_mapped = get_taxon_info(new_taxids)
new_taxid_mapped

In [None]:
# `_ids` keeps every query echoed back by get_taxon_info (resolved or not), so
# later cells that count it still see the same list.
# A result can carry a "notfound" key — that is how `notfound` was collected
# from get_taxon_info earlier in this notebook — so echoing a query back does
# not mean it resolved. Only results without that key count as resolved;
# previously every echoed query was treated as found, hiding unresolved taxids.
# Single pass, so this also works if new_taxid_mapped is a one-shot iterator.
_ids = []
resolved_ids = set()
for taxon in new_taxid_mapped:
    _ids.append(taxon["query"])
    if "notfound" not in taxon:
        resolved_ids.add(taxon["query"])
# Set membership keeps this linear in the number of taxids.
still_notfound = [taxid for taxid in new_taxids if taxid not in resolved_ids]

In [None]:
len(set(_ids))

In [None]:
# Re-read field 5 of every record, sort in place, then print the total number
# of values and the number of distinct ones.
taxids = [record[5] for record in line_generator(micro_disease_path)]
taxids.sort()
print(len(taxids))
print(len(set(taxids)))

## Microbe-Metabolite

In [None]:
micro_meta_df = pd.read_csv(os.path.join("downloads", "micro_metabolic.csv"), low_memory=False)

In [None]:
micro_meta_col_map = dict(enumerate(micro_meta_df.columns))
micro_meta_col_map

In [None]:
# Materialise every parsed record so later cells can iterate it repeatedly.
micro_meta = list(line_generator(os.path.join("downloads", "micro_metabolic.csv")))
# Spot-check records at 0-based positions 4, 500 and 1480: field count, then
# the record. Positions beyond the end of the data are silently skipped.
for position in (4, 500, 1480):
    if position >= len(micro_meta):
        break
    sample = micro_meta[position]
    print(len(sample))
    print(sample)

In [None]:
# Report every record whose field count differs from the expected 21.
expected_width = 21
for lineno, fields in enumerate(micro_meta):
    if len(fields) == expected_width:
        continue
    print(f"Line {lineno} has {len(fields)} columns: {fields}")

In [None]:
# Collect field 4 of every record where both field 6 and field 19 contain the
# "not available" placeholder, then print how many such records there are.
no_chem_id = [
    record[4]
    for record in micro_meta
    if "not available" in record[6] and "not available" in record[19]
]
print(len(no_chem_id))

In [None]:
print(len(set(no_chem_id)))

In [None]:
# Field 6 of every record that does not contain the "not available"
# placeholder; print the number of distinct values.
pubchem_cids = []
for record in micro_meta:
    if "not available" in record[6]:
        continue
    pubchem_cids.append(record[6])
print(len(set(pubchem_cids)))

In [None]:
# Field 19 for records where field 6 holds the "not available" placeholder but
# field 19 does not; print the number of distinct values.
hmdb_ids = []
for record in micro_meta:
    has_field_19 = "not available" not in record[19]
    lacks_field_6 = "not available" in record[6]
    if has_field_19 and lacks_field_6:
        hmdb_ids.append(record[19])
print(len(set(hmdb_ids)))

In [None]:
total_metabolites = [line[4] for line in micro_meta]
print(len(set(total_metabolites)))

In [None]:
bigg_mapped = get_bigg_metabolite_mapping(os.path.join("downloads", "bigg_models_metabolites.txt"))

In [None]:
len(bigg_mapped)

In [None]:
# Lower-cased names from no_chem_id (deduplicated) that are present in
# bigg_mapped.
bigg_mapped_ids = []
for name in set(no_chem_id):
    lowered = name.lower()
    if lowered in bigg_mapped:
        bigg_mapped_ids.append(lowered)

In [None]:
len(bigg_mapped_ids)

In [None]:
set(no_chem_id)

## Metabolite - Gene

In [3]:
meta_gene_df = pd.read_csv(os.path.join("downloads", "meta_gene_net.csv"), low_memory=False)

In [4]:
dict(enumerate(meta_gene_df.columns))

{0: 'id',
 1: 'g_meta',
 2: 'compound',
 3: 'pubchem_id',
 4: 'formula',
 5: 'kegg_id',
 6: 'HMDBID',
 7: 'drug_id',
 8: 'drug_name',
 9: 'Origin',
 10: 'smiles_sequence',
 11: 'gene_id',
 12: 'gene',
 13: 'ensembl_id',
 14: 'NCBI',
 15: 'HGNC',
 16: 'UniProt',
 17: 'protein_size',
 18: 'annonation',
 19: 'score',
 20: 'alteration',
 21: 'PMID',
 22: 'source'}

In [5]:
# Materialise every parsed record of the metabolite-gene CSV into a list.
meta_gene_csv = os.path.join("downloads", "meta_gene_net.csv")
meta_gene_data = list(line_generator(meta_gene_csv))

In [6]:
# Spot-check records at 0-based positions 2, 1000 and 2000: field count, then
# the record. Positions beyond the end of the data are silently skipped.
for position in (2, 1000, 2000):
    if position >= len(meta_gene_data):
        break
    sample = meta_gene_data[position]
    print(len(sample))
    print(sample)

23
['3', 'meta2', '(1,6-alpha-D-Glucosyl)m', '3662', '(C12H20O10)n', 'C00372', 'not available', 'Not available', 'Not available', 'Microbiota; Food related; Drug related', 'CC(C)C(=O)C12C(=O)C(=C(C(C1=O)(CC(C2(C)CCC=C(C)C)CC=C(C)C)CC=C(C)C)O)CC=C(C)C', 'g1758', 'CXCL8', 'ENSG00000169429', '3576', '6025', 'P10145', '99', 'The protein encoded by this gene is a member of the CXC chemokine family and is a major mediator of the inflammatory response. The encoded protein is commonly referred to as interleukin-8 (IL-8). IL-8 is secreted by mononuclear macrophages, neutrophils, eosinophils, T lymphocytes, epithelial cells, and fibroblasts. It functions as a chemotactic factor by guiding the neutrophils to the site of infection. Bacterial and viral products rapidly induce IL-8 expression. IL-8 also participates with other cytokines in the proinflammatory signaling cascade and plays a role in systemic inflammatory response syndrome (SIRS). This gene is believed to play a role in the pathogenesis

In [7]:
# Positions of records whose field count is not the expected 23 (the number of
# columns in meta_gene_df); print how many there are, then the positions.
misaligned_lines = [
    position
    for position, fields in enumerate(meta_gene_data)
    if len(fields) != 23
]
print(len(misaligned_lines))
print(misaligned_lines)

0
[]


In [10]:
meta_gene_missing = check_missing_data(meta_gene_df)
meta_gene_missing

{'pubchem_id': {'Not available'},
 'formula': {'not available'},
 'kegg_id': {'not available'},
 'HMDBID': {'not available'},
 'drug_id': {'Not available'},
 'drug_name': {'Not available'},
 'Origin': {'Unknown'},
 'smiles_sequence': {'not available'},
 'ensembl_id': {'Not available'},
 'NCBI': {'Not available'},
 'HGNC': {'Not available'},
 'UniProt': {'Not available'},
 'annonation': {'Not Available'},
 'score': {'Not available'},
 'alteration': {'Unknown'},
 'PMID': {'Not available'}}

In [9]:
meta_gene_df["compound"].unique()

array(['(1,6-alpha-D-Glucosyl)m', '(2-Aminoethyl)phosphonic acid',
       '(20S)-Protopanaxadiol', '(2R)-2,3-Dihydroxypropanoic acid',
       '(2R,6S)-2,6-diaminoheptanedioic acid',
       '(2S)-2-(3-Carboxypropanoylamino)pentanedioic acid',
       '(2S)-2-amino-5-oxopentanoic acid',
       '(2S)-2-hydroxy-2-methyl-3-oxobutanoic acid',
       '(2S)-2-Hydroxybutanedioic acid', '(2S,3S)-butane-2,3-diol',
       '(2S,4R)-2-amino-4-hydroxypentanedioic acid',
       '(3R)-3-hydroxy-3-methyl-5-(phosphonooxy)pentanoic acid',
       '(3S)-3,6-diaminohexanoic acid',
       '(3S,5S)-3,5-diaminohexanoic acid',
       '(4-Amino-2-methylpyrimidin-5-yl)methyl trihydrogen diphosphate',
       '(4s)-4-[(2e)-Oct-2-Enoyloxy]-4-(Trimethylazaniumyl)Butanoate',
       '(4S)-4,5-dihydroxy-2-oxopentanoic acid',
       '(R)-2-Hydroxy-4-methylpentanoic acid',
       '(R)-3-Hydroxybutyric acid', '(R)-Lactaldehyde',
       '(R,R)-2,3-butanediol', '(S)-1-pyrroline-5-carboxylate',
       '(S)-2,3-dihydrodipicolini

In [None]:
print(len(meta_gene_df["compound"].unique()))

In [None]:
meta_gene_df["pubchem_id"].unique()

In [None]:
print(len(meta_gene_df["pubchem_id"].unique()))

In [None]:
meta_gene_df["kegg_id"].unique()

In [None]:
meta_gene_df["HMDBID"].unique()

In [None]:
meta_gene_df["drug_id"].unique()

In [None]:
meta_gene_df["drug_name"].unique()

In [None]:
meta_gene_df["Origin"].unique()

In [None]:
meta_gene_df["smiles_sequence"].unique()

In [None]:
# Does the "gene" column contain a missing-value placeholder?
# The previous `"Unknow" or "Not available" in ...` always evaluated to the
# truthy string "Unknow" (`in` binds tighter than `or`) and never looked at
# the data. The comparison is case-insensitive because placeholder spelling
# varies across this table ("Not available", "not available", "Not Available",
# "Unknown").
bool(
    meta_gene_df["gene"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
# Does the "ensembl_id" column contain a missing-value placeholder?
# The previous `"Unknow" and "Not available" in ...` only tested
# "Not available"; "Unknow" was just a constant truthy operand. The comparison
# is case-insensitive because placeholder spelling varies across this table
# ("Not available", "not available", "Not Available", "Unknown").
bool(
    meta_gene_df["ensembl_id"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
# Does the "NCBI" column contain a missing-value placeholder?
# The previous `"Unknow" and "Not available" in ...` only tested
# "Not available"; "Unknow" was just a constant truthy operand. The comparison
# is case-insensitive because placeholder spelling varies across this table
# ("Not available", "not available", "Not Available", "Unknown").
bool(
    meta_gene_df["NCBI"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
# Does the "HGNC" column contain a missing-value placeholder?
# The previous `"Unknow" and "Not available" in ...` only tested
# "Not available"; "Unknow" was just a constant truthy operand. The comparison
# is case-insensitive because placeholder spelling varies across this table
# ("Not available", "not available", "Not Available", "Unknown").
bool(
    meta_gene_df["HGNC"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
# Does the "UniProt" column contain a missing-value placeholder?
# The previous `"Unknow" and "Not available" in ...` only tested
# "Not available"; "Unknow" was just a constant truthy operand. The comparison
# is case-insensitive because placeholder spelling varies across this table
# ("Not available", "not available", "Not Available", "Unknown").
bool(
    meta_gene_df["UniProt"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
# Does the "protein_size" column contain a missing-value placeholder?
# The previous `"Unknow" or "Not available" in ...` always evaluated to the
# truthy string "Unknow" (`in` binds tighter than `or`) and never looked at
# the data. Values are cast to str first so the check also works if the column
# was parsed as numeric; the comparison is case-insensitive because
# placeholder spelling varies across this table.
bool(
    meta_gene_df["protein_size"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
# Does the "annonation" column contain a missing-value placeholder?
# The previous `"Unknow" or "Not available" in ...` always evaluated to the
# truthy string "Unknow" (`in` binds tighter than `or`) and never looked at
# the data. The comparison is case-insensitive: check_missing_data reports
# this column's placeholder as "Not Available" (capital A), which an exact
# match on "Not available" would miss.
bool(
    meta_gene_df["annonation"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
# Does the "score" column contain a missing-value placeholder?
# The previous `"Unknow" and "Not available" in ...` only tested
# "Not available"; "Unknow" was just a constant truthy operand. The comparison
# is case-insensitive because placeholder spelling varies across this table
# ("Not available", "not available", "Not Available", "Unknown").
bool(
    meta_gene_df["score"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
meta_gene_df["score"].unique()

In [None]:
# Does the "alteration" column contain a missing-value placeholder?
# The previous `"Unknow" or "Not available" in ...` always evaluated to the
# truthy string "Unknow" (`in` binds tighter than `or`) and never looked at
# the data. check_missing_data reports this column's placeholder as "Unknown",
# so that spelling is matched too; the comparison is case-insensitive.
bool(
    meta_gene_df["alteration"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
meta_gene_df["alteration"].unique()

In [None]:
# Does the "PMID" column contain a missing-value placeholder?
# The previous `"Unknow" and "Not available" in ...` only tested
# "Not available"; "Unknow" was just a constant truthy operand. The comparison
# is case-insensitive because placeholder spelling varies across this table
# ("Not available", "not available", "Not Available", "Unknown").
bool(
    meta_gene_df["PMID"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
meta_gene_df["PMID"].unique()

In [None]:
# Does the "source" column contain a missing-value placeholder?
# An exact, case-sensitive match on "Unknow" / "Not available" misses spelling
# variants that check_missing_data reports for other columns of this table
# ("Unknown", "Not Available", "not available"), so compare on a stripped,
# lower-cased copy of the distinct values instead.
bool(
    meta_gene_df["source"]
    .drop_duplicates()
    .astype(str)
    .str.strip()
    .str.lower()
    .isin(["unknow", "unknown", "not available"])
    .any()
)

In [None]:
meta_gene_df["source"].unique()