In [1]:
import os
import pandas as pd
from ete3 import NCBITaxa
import biothings_client as bt
import tarfile
import csv
import gzip

from micro_disease_parser import line_generator_4_midi, get_taxon_info, load_merged_from_tar, get_current_taxid
from micro_meta_parser import line_generator, get_bigg_metabolite_mapping
from data_utils import check_missing_data

In [2]:
os.getcwd()

'/Users/bailinzhang/Documents/Wu_Lab/Projects/GMMAD2'

## Microbe-Disease

In [3]:
with open(os.path.join("downloads", "disease_species.csv"), "r") as f:
    for i, line in enumerate(f, start=1):
        if i == 4:
            print(len(line.strip().split(",")))
            print(line)
            break

29
3,D009765,Obesity,Azorhizobium caulinodans,species,7,0,0,0,0,D006262,Health,3,0.029020807,0.0248571,0.031027083,-0.0248571,Decrease,A status with BODY WEIGHT that is grossly above the acceptable or desirable weight, usually due to accumulation of excess FATS in the body. The standards may vary with age, sex, genetic or cultural background. In the BODY MASS INDEX, a BMI greater than 30.0 kg/m2 is considered obese, and a BMI greater than 40.0 kg/m2 is considered morbidly obese (MORBID OBESITY).,Pseudomonadota,Alphaproteobacteria,Hyphomicrobiales,Xanthobacteraceae,Azorhizobium



In [4]:
micro_disease_path = os.path.join("downloads", "disease_species.csv")
micro_disease = line_generator(micro_disease_path)
for i, line in enumerate(micro_disease):
    if i == 3:
        print(len(line))
        print(line)
    elif i == 15:
        print(len(line))
        print(line)
    elif i == 16:
        print(len(line))
        print(line)
        break

26
['4', 'D008103', 'Liver Cirrhosis', 'Azorhizobium caulinodans', 'species', '7', '0', '0', '0', '0', 'D006262', 'Health', '3', '0.029020807', '0.0248571', '0.031027083', '-0.0248571', 'Decrease', 'Liver disease in which the normal microcirculation', ' the gross vascular anatomy', ' and the hepatic architecture have been variably destroyed and altered with fibrous septa surrounding regenerated or regenerating parenchymal nodules.', 'Pseudomonadota', 'Alphaproteobacteria', 'Hyphomicrobiales', 'Xanthobacteraceae', 'Azorhizobium']
30
['16', 'D001172', 'Arthritis', ' Rheumatoid', 'Azorhizobium caulinodans', 'species', '7', '0', '0', '0', '0', 'D006262', 'Health', '3', '0.029020807', '0.0248571', '0.031027083', '-0.0248571', 'Decrease', 'A chronic systemic disease', ' primarily of the joints', ' marked by inflammatory changes in the synovial membranes and articular structures', ' widespread fibrinoid degeneration of the collagen fibers in mesenchymal tissues', ' and by atrophy and rarefact

In [5]:
taxids = [line[5] for line in line_generator(micro_disease_path)]
print(len(set(taxids)))

6902


In [6]:
notfound = [
    taxon["query"]
    for taxon in get_taxon_info(micro_disease_path)
    if "notfound" in taxon.keys()
]

In [7]:
print(len(notfound))
print(len(set(notfound)))

15
15


In [8]:
mapping = load_merged_from_tar("taxdump.tar.gz")
mapped_taxid = get_current_taxid(notfound, mapping)

In [9]:
if "194866" in mapping:
    print("194866 is in the mapping")
    print(f"current taxid for 194866: {mapping['194866']}")
else:
    print("194866 is not in the mapping")

194866 is in the mapping
current taxid for 194866: 46624


In [10]:
len(mapped_taxid)

15

In [11]:
new_taxids = [new for old, new in mapped_taxid.items()]
print(len(new_taxids))
print(len(set(new_taxids)))

15
1


In [14]:
print(set(new_taxids))

{None}


In [15]:
still_notfound = [taxid for taxid in notfound if taxid not in mapped_taxid.keys()]
print(len(still_notfound))

0


In [16]:
new_taxid_mapped = get_taxon_info(new_taxids)
new_taxid_mapped

[{'query': 'None', 'notfound': True}]

In [17]:
_ids = [taxon["query"] for taxon in new_taxid_mapped]
still_notfound = [taxid for taxid in new_taxids if taxid not in _ids]

In [18]:
len(set(_ids))

1

In [19]:
taxids = sorted([line[5] for line in line_generator(micro_disease_path)])
print(len(taxids))
print(len(set(taxids)))

508141
6902


## Microbe-Metabolite

In [20]:
micro_meta_df = pd.read_csv(os.path.join("downloads", "micro_metabolic.csv"), low_memory=False)

In [21]:
micro_meta_col_map = dict(enumerate(micro_meta_df.columns))
micro_meta_col_map

{0: 'id',
 1: 'g_micro',
 2: 'organism',
 3: 'g_meta',
 4: 'metabolic',
 5: 'pubchem_compound',
 6: 'pubchem_id',
 7: 'formula',
 8: 'kegg_id',
 9: 'tax_id',
 10: 'phylum',
 11: 'class',
 12: 'order',
 13: 'family',
 14: 'genus',
 15: 'species',
 16: 'species_id',
 17: 'source',
 18: 'smiles_sequence',
 19: 'HMDBID',
 20: 'Origin'}

In [26]:
micro_meta = [line for line in line_generator(os.path.join("downloads", "micro_metabolic.csv"))]
for i, line in enumerate(micro_meta):
    if i == 4:
        print(len(line))
        print(line)
    elif i == 500:
        print(len(line))
        print(line)
    elif i == 1480:
        print(len(line))
        print(line)
        break

21
['5', 'micro1', 'Abiotrophia defectiva ATCC 49176', 'meta21', 'Meso-2,6-Diaminoheptanedioate', '(2R,6S)-2,6-diaminoheptanedioic acid', '99290', 'C7H14N2O4', 'C00680', '592010', 'Bacillota', 'Bacilli', 'Lactobacillales', 'Aerococcaceae', 'Abiotrophia', 'Abiotrophia defectiva', '46125', 'vmh', 'C(CC(C(=O)O)N)CC(C(=O)O)N', 'not available', 'Microbiota']
21
['501', 'micro1', 'Abiotrophia defectiva ATCC 49176', 'meta1286', 'Fructose 1,6-bisphosphate', 'Fructose 1,6-bisphosphate', '5460765', 'C6H10O12P2-4', 'C00354', '592010', 'Bacillota', 'Bacilli', 'Lactobacillales', 'Aerococcaceae', 'Abiotrophia', 'Abiotrophia defectiva', '46125', 'vmh', 'C(C1C(C(C(O1)(COP(=O)([O-])[O-])O)O)O)OP(=O)([O-])[O-]', 'HMDB0001058', 'Host; Microbiota; Food related; Drug related']
21
['1481', 'micro2', 'Achromobacter xylosoxidans A8', 'meta349', '2-octadec-11-enoyl-sn-glycerol 3-phosphate', '2-Octadec-11-Enoyl-Sn-Glycerol 3-Phosphate', 'not available', 'C21H40O7P1', 'not available', '762376', 'Pseudomonadota',

In [27]:
for i, line in enumerate(micro_meta):
    if len(line) != 21:
        print(f"Line {i} has {len(line)} columns: {line}")

In [28]:
no_chem_id = []
for line in micro_meta:
    if "not available" in line[6] and "not available" in line[19]:
        no_chem_id.append(line[4])
print(len(no_chem_id))

304266


In [29]:
print(len(set(no_chem_id)))

1349


In [30]:
pubchem_cids = [line[6] for line in micro_meta if "not available" not in line[6]]
print(len(set(pubchem_cids)))

1571


In [31]:
hmdb_ids = [line[19] for line in micro_meta if "not available" not in line[19] and "not available" in line[6]]
print(len(set(hmdb_ids)))

25


In [32]:
total_metabolites = [line[4] for line in micro_meta]
print(len(set(total_metabolites)))

3200


In [33]:
bigg_mapped = get_bigg_metabolite_mapping(os.path.join("downloads", "bigg_models_metabolites.txt"))

In [34]:
len(bigg_mapped)

8771

In [35]:
bigg_mapped_ids = [name.lower() for name in set(no_chem_id) if name.lower() in bigg_mapped]

In [36]:
len(bigg_mapped_ids)

264

In [37]:
set(no_chem_id)

{'released mucin-type O-glycan No 162',
 '4-methyl-trans-hex-2-enoyl-ACP',
 'released mucin-type O-glycan No 23',
 'C9h18n2o4_219.134p_0n_24.2_rb_biocrustexp',
 'Unk_298.1294p_0n_7.9_rb_biocrustexp',
 'Arsenate',
 '2,3-dihydroxicinnamic acid',
 'Citrate-Mg',
 'Glycerol teichoic acid (n=45), linked, glucose substituted',
 'released mucin-type O-glycan No 145',
 'released mucin-type O-glycan No 94',
 'mucin-type O-glycan No 65',
 '1-tetradec-7-enoyl-sn-glycerol 3-phosphate',
 'released mucin-type O-glycan No 10',
 'Teichuronic acid (GlcA + GalNac, 45 repeating unit)',
 'mucin-type O-glycan No 66',
 'mucin-type O-glycan No 28',
 'released mucin-type O-glycan No 7',
 '9-methyl-3-oxo-decanoyl-ACP',
 'mucin-type O-glycan No 187',
 'Anteisopentadecanoyllipoteichoic acid (n=24), linked, N-acetyl-D-glucosamine',
 'Unk_746.4142p_0n_17.1_rb_biocrustexp',
 'released mucin-type O-glycan No 168',
 'Cardiolipin (12-methyl-tetradecanoyl, anteiso-C15)',
 '12-methyl-trans-tetra-dec-2-enoyl-ACP',
 '5-met

## Metabolite - Gene

In [2]:
meta_gene_df = pd.read_csv(os.path.join("downloads", "meta_gene_net.csv"), low_memory=False)

In [3]:
dict(enumerate(meta_gene_df.columns))

{0: 'id',
 1: 'g_meta',
 2: 'compound',
 3: 'pubchem_id',
 4: 'formula',
 5: 'kegg_id',
 6: 'HMDBID',
 7: 'drug_id',
 8: 'drug_name',
 9: 'Origin',
 10: 'smiles_sequence',
 11: 'gene_id',
 12: 'gene',
 13: 'ensembl_id',
 14: 'NCBI',
 15: 'HGNC',
 16: 'UniProt',
 17: 'protein_size',
 18: 'annonation',
 19: 'score',
 20: 'alteration',
 21: 'PMID',
 22: 'source'}

In [4]:
meta_gene_data = [line for line in line_generator(os.path.join("downloads", "meta_gene_net.csv"))]

In [5]:
for i, line in enumerate(meta_gene_data):
    if i == 2:
        print(len(line))
        print(line)
    elif i == 1000:
        print(len(line))
        print(line)
    elif i == 2000:
        print(len(line))
        print(line)

23
['3', 'meta2', '(1,6-alpha-D-Glucosyl)m', '3662', '(C12H20O10)n', 'C00372', 'not available', 'Not available', 'Not available', 'Microbiota; Food related; Drug related', 'CC(C)C(=O)C12C(=O)C(=C(C(C1=O)(CC(C2(C)CCC=C(C)C)CC=C(C)C)CC=C(C)C)O)CC=C(C)C', 'g1758', 'CXCL8', 'ENSG00000169429', '3576', '6025', 'P10145', '99', 'The protein encoded by this gene is a member of the CXC chemokine family and is a major mediator of the inflammatory response. The encoded protein is commonly referred to as interleukin-8 (IL-8). IL-8 is secreted by mononuclear macrophages, neutrophils, eosinophils, T lymphocytes, epithelial cells, and fibroblasts. It functions as a chemotactic factor by guiding the neutrophils to the site of infection. Bacterial and viral products rapidly induce IL-8 expression. IL-8 also participates with other cytokines in the proinflammatory signaling cascade and plays a role in systemic inflammatory response syndrome (SIRS). This gene is believed to play a role in the pathogenesis

In [6]:
misaligned_lines = []
for i, line in enumerate(meta_gene_data):
    if len(line) != 23:
        misaligned_lines.append(i)
print(len(misaligned_lines))
print(misaligned_lines)

0
[]


In [7]:
meta_gene_missing = check_missing_data(meta_gene_df)
meta_gene_missing

{'pubchem_id': {'Not available'},
 'formula': {'not available'},
 'kegg_id': {'not available'},
 'HMDBID': {'not available'},
 'drug_id': {'Not available'},
 'drug_name': {'Not available'},
 'Origin': {'Unknown'},
 'smiles_sequence': {'not available'},
 'ensembl_id': {'Not available'},
 'NCBI': {'Not available'},
 'HGNC': {'Not available'},
 'UniProt': {'Not available'},
 'annonation': {'Not Available'},
 'score': {'Not available'},
 'alteration': {'Unknown'},
 'PMID': {'Not available'}}

In [47]:
meta_gene_df["compound"].unique()

array(['(1,6-alpha-D-Glucosyl)m', '(2-Aminoethyl)phosphonic acid',
       '(20S)-Protopanaxadiol', '(2R)-2,3-Dihydroxypropanoic acid',
       '(2R,6S)-2,6-diaminoheptanedioic acid',
       '(2S)-2-(3-Carboxypropanoylamino)pentanedioic acid',
       '(2S)-2-amino-5-oxopentanoic acid',
       '(2S)-2-hydroxy-2-methyl-3-oxobutanoic acid',
       '(2S)-2-Hydroxybutanedioic acid', '(2S,3S)-butane-2,3-diol',
       '(2S,4R)-2-amino-4-hydroxypentanedioic acid',
       '(3R)-3-hydroxy-3-methyl-5-(phosphonooxy)pentanoic acid',
       '(3S)-3,6-diaminohexanoic acid',
       '(3S,5S)-3,5-diaminohexanoic acid',
       '(4-Amino-2-methylpyrimidin-5-yl)methyl trihydrogen diphosphate',
       '(4s)-4-[(2e)-Oct-2-Enoyloxy]-4-(Trimethylazaniumyl)Butanoate',
       '(4S)-4,5-dihydroxy-2-oxopentanoic acid',
       '(R)-2-Hydroxy-4-methylpentanoic acid',
       '(R)-3-Hydroxybutyric acid', '(R)-Lactaldehyde',
       '(R,R)-2,3-butanediol', '(S)-1-pyrroline-5-carboxylate',
       '(S)-2,3-dihydrodipicolini

In [48]:
print(len(meta_gene_df["compound"].unique()))

925


In [49]:
meta_gene_df["pubchem_id"].unique()

array(['3662', '339', '11213350', '439194', '99290', '440847', '193305',
       '440878', '222656', '439888', '440854', '439400', '439417',
       '439430', '217', 'Not available', '5280626', '439960', '92135',
       '439350', '225936', '440162', '439982', '5459811', '11966158',
       '11966160', '440873', '11966173', '11966216', '11966179',
       '11966177', '440077', '5462303', '439269', '1172', '3081033',
       '535', '4', '263', '445395', '443201', '27476', '79143', '457',
       '440932', '9547180', '68161', '1031', '440282', '5462190', '428',
       '10442', '1045', '439153', '44140569', '14080393', '24970825',
       '10308378', '79034', '9963391', '91451', '440043', '13730',
       '188966', '12599', '13711', '65091', '13712', '65063', '101812',
       '800', '5287432', '1530', '1000', '5280673', '65275', '363',
       '6657', '1015', '469', '6119', '5281909', '192781', '9378', '524',
       '92779', '5462148', '11266', '7866', '43', '11970', '77',
       '3080745', '237246

In [50]:
print(len(meta_gene_df["pubchem_id"].unique()))

925


In [52]:
meta_gene_df["kegg_id"].unique()

array(['C00372', 'C03557', 'C20715', 'C00258', 'C00680', 'C05931',
       'C01165', 'C06010', 'C00149', 'C03046', 'C05947', 'C01107',
       'C01142', 'C01186', 'C04752', 'not available', 'C03826', 'C03264',
       'C01089', 'C00937', 'C03044', 'C03912 ', 'C03340', 'C01054',
       'C05264', 'C05268', 'C06001', 'C05262', 'C05266', 'C05258',
       'C05260', 'C03656', 'C06002', 'C00603', 'C01234', 'C05771',
       'C06142', 'C01847', 'C11437', 'C02494', 'C02918', 'C06178',
       'C06181', 'C05979', 'C04282', 'C15606', 'C00986', 'C02457',
       'C00134', 'C00004', 'C15547', 'C01007', 'C08317', 'C05421',
       'C05138', 'C03546', 'C00559', 'C00206', 'C00360', 'C00881',
       'C00458', 'C00526', 'C00365', 'C02353', 'C00637', 'C05817',
       'C16038', 'C02735', 'C04409', 'C06735', 'C00346', 'C00956',
       'C03665', 'C11356', 'C00684', 'C03672', 'C03459', 'C15651',
       'C05984', 'C05123', 'C02630', 'C05852', 'C02504', 'C04442',
       'C15978', 'C15976', 'C00349', 'C03344', 'C18319

In [53]:
meta_gene_df["HMDBID"].unique()

array(['not available', 'HMDB0011747', 'HMDB0000139', 'HMDB0002104',
       'HMDB0006855', 'HMDB0000156', 'HMDB0002273', 'HMDB0001343',
       'HMDB0012114', 'HMDB0012115', 'HMDB0000624', 'HMDB0000011',
       'HMDB0006458', 'HMDB0033007', 'HMDB0001301', 'HMDB0012247',
       'HMDB0001188', 'HMDB0003938', 'HMDB0003942', 'HMDB0000023',
       'HMDB0003936', 'HMDB0003940', 'HMDB0003932', 'HMDB0003934',
       'HMDB0012131', 'HMDB0002217', 'HMDB0001005', 'HMDB0036458',
       'HMDB0012136', 'HMDB0004327', 'HMDB0001142', 'HMDB0001213',
       'HMDB0003331', 'HMDB0000699', 'HMDB0060253', 'HMDB0000820',
       'HMDB0002234', 'HMDB0012134', 'HMDB0000002', 'HMDB0001414',
       'HMDB0001487', 'HMDB0304010', 'HMDB0001557', 'HMDB0002059',
       'HMDB0039093', 'HMDB0000363', 'HMDB0001313', 'HMDB0000101',
       'HMDB0001508', 'HMDB0000905', 'HMDB0000014', 'HMDB0000998',
       'HMDB0000012', 'HMDB0001409', 'HMDB0011616', 'HMDB0001190',
       'HMDB0303951', 'HMDB0041008', 'HMDB0001065', 'HMDB000

In [54]:
meta_gene_df["drug_id"].unique()

array(['Not available', 'DB03175', 'DB02774', 'DB01917', 'DB00157',
       'DB03704', 'DB02594', 'DB02256', 'DB03800', 'DB08398', 'DB01738',
       'DB02952', 'DB01709', 'DB02726', 'DB02714', 'DB03612', 'DB04594',
       'DB07069', 'DB02758', 'DB04074', 'DB02838', 'DB03434', 'DB03560',
       'DB04242', 'DB07718', 'DB04214', 'DB03403', 'DB02282', 'DB02076',
       'DB11145', 'DB04944', 'DB00316', 'DB14511', 'DB01762', 'DB03059',
       'DB03128', 'DB06151', 'DB00173', 'DB00640', 'DB01812', 'DB03708',
       'DB00171', 'DB02527', 'DB00131', 'DB08838', 'DB00160', 'DB11100',
       'DB02835', 'DB01632', 'DB00855', 'DB02289', 'DB11118', 'DB04166',
       'DB07352', 'DB11217', 'DB00125', 'DB00126', 'DB00174', 'DB00128',
       'DB01086', 'DB03793', 'DB06770', 'DB03107', 'DB06755', 'DB02379',
       'DB06756', 'DB02073', 'DB00121', 'DB03568', 'DB03854', 'DB01880',
       'DB01373', 'DB03600', 'DB02232', 'DB06777', 'DB00169', 'DB04540',
       'DB02659', 'DB00122', 'DB04660', 'DB15581', 'DB14

In [55]:
meta_gene_df["drug_name"].unique()

array(['Not available', 'Propyl alcohol', '1,3-Propanediol', 'Putrescine',
       'NADH', '12-Hydroxydodecanoic Acid', "2'-Deoxycytidine",
       "2'-Deoxyuridine", 'Deoxyuridine monophosphate',
       '2-Amino-1-methyl-6-phenylimidazo(4,5-b)pyridine',
       'Phosphorylcolamine', '2-aminoisobutyric acid',
       '2-phospho-D-glyceric acid', '2-Phosphoglycolic Acid',
       "3'-Uridinemonophosphate", '3-Hydroxybutyryl-Coenzyme A',
       '3-hydroxyglutaric acid', 'm-Hydroxyhippuric acid',
       'Indolepropionic acid', 'alpha-Ketoisovalerate',
       '3,4-Dihydro-2h-Pyrrolium-5-Carboxylate',
       '3-(N-morpholino)propanesulfonic acid', 'P-Hydroxybenzaldehyde',
       '4-hydroxybenzoic acid', '4-Hydroxyphenylpyruvic acid',
       '4-Nitrophenyl Phosphate', "Cytidine-5'-Monophosphate",
       "5'-S-methyl-5'-thioadenosine", '6-phospho-D-gluconic acid',
       'Oxyquinoline', 'Acadesine', 'Acetaminophen', 'Acetate',
       'Acetoacetic acid', 'Acetoacetyl-CoA', 'Acetylcholine',
       '

In [56]:
meta_gene_df["Origin"].unique()

array(['Microbiota; Food related; Drug related',
       'Host; Microbiota; Food related', 'Unknown', 'Microbiota',
       'Host; Microbiota; Food related; Drug related',
       'Microbiota; Food related', 'Host; Microbiota',
       'Microbiota; Drug related',
       'Microbiota; Food related; Drug related; Environment',
       'Microbiota; Food related; Environment',
       'Host; Microbiota; Food related; Drug related; Environment',
       'Host; Microbiota; Food related; Environment',
       'Host; Microbiota; Drug related'], dtype=object)

In [57]:
meta_gene_df["smiles_sequence"].unique()

array(['CC(C)C(=O)C12C(=O)C(=C(C(C1=O)(CC(C2(C)CCC=C(C)C)CC=C(C)C)CC=C(C)C)O)CC=C(C)C',
       'C(CP(=O)(O)O)N',
       'CC(=CCCC(C)(C1CCC2(C1C(CC3C2(CCC4C3(CCC(C4(C)C)O)C)C)O)C)O)C',
       'C(C(C(=O)O)O)O', 'C(CC(C(=O)O)N)CC(C(=O)O)N',
       'C(CC(=O)O)C(C(=O)O)NC(=O)CCC(=O)O', 'C(CC(C(=O)O)N)C=O',
       'CC(=O)C(C)(C(=O)O)O', 'C(C(C(=O)O)O)C(=O)O', 'CC(C(C)O)O',
       'C(C(C(=O)O)N)C(C(=O)O)O', 'CC(CCOP(=O)(O)O)(CC(=O)O)O',
       'C(CC(CC(=O)O)N)CN', 'CC(CC(CC(=O)O)N)N',
       'CC1=NC=C(C(=N1)N)COP(=O)(O)OP(=O)(O)O', 'not available',
       'C(C(CO)O)C(=O)C(=O)O', 'CC(C)CC(C(=O)O)O', 'CC(CC(=O)O)O',
       'CC(C=O)O', 'C1CC(N=C1)C(=O)O', 'C1C=CC(=NC1C(=O)O)C(=O)O',
       'CC(=CCCC(=CCCC(=CCCC=C(C)CCC=C(C)CCC1C(O1)(C)C)C)C)C',
       'CCCCCCCC(CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)O',
       'CCCC(CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)O',
       'CC(CO)C(=

In [87]:
"Unknow" or "Not available" in meta_gene_df["gene"].unique()

'Unknow'

In [85]:
"Unknow" and "Not available" in meta_gene_df["ensembl_id"].unique()

True

In [84]:
"Unknow" and "Not available" in meta_gene_df["NCBI"].unique()

True

In [83]:
"Unknow" and "Not available" in meta_gene_df["HGNC"].unique()

True

In [82]:
"Unknow" and "Not available" in meta_gene_df["UniProt"].unique()

True

In [81]:
"Unknow" or "Not available" in meta_gene_df["protein_size"].unique()

'Unknow'

In [79]:
"Unknow" or "Not available" in meta_gene_df["annonation"].unique()

'Unknow'

In [77]:
"Unknow" and "Not available" in meta_gene_df["score"].unique()

True

In [90]:
meta_gene_df["score"].unique()

array(['0.629', '0.879', '0.938', '0.7', '0.8', '0.8955', '0.869', '0.44',
       '0.412', '0.699', '0.828', '0.402', '0.404', '0.478', '0.474',
       '0.934', '0.907', '0.9', '0.618', '0.969', '0.997', '0.471',
       '0.667', '0.741', '0.695', '0.957', '0.503', '0.58', '0.627',
       '0.405', '0.683', '0.849', '0.635', '0.646', '0.63', '0.415',
       '0.413', '0.962', '0.936', '0.453', '0.447', '0.445', '0.444',
       '0.74', '0.826', '0.4', '0.911', '0.979', '0.411', '0.588',
       '0.912', '0.944', '0.939', '0.69', '0.908', '0.644', '0.423',
       '0.429', '0.909', '0.972', '0.975', '0.548', '0.577', '0.443',
       '0.568', '0.527', '0.466', 'Not available', '0.534', '0.421',
       '0.694', '0.708', '0.477', '0.51', '0.53', '0.654', '0.5', '0.795',
       '0.674', '0.676', '0.692', '0.672', '0.597', '0.945', '0.913',
       '0.628', '0.41', '0.932', '0.601', '0.918', '0.982', '0.973',
       '0.458', '0.508', '0.978', '0.976', '0.929', '0.838', '0.719',
       '0.946', '0.7

In [76]:
"Unknow" or "Not available" in meta_gene_df["alteration"].unique()

False

In [72]:
meta_gene_df["alteration"].unique()

array(['Unknown', 'elevated', 'reduced', 'target', 'Inhibitor',
       'Activator'], dtype=object)

In [73]:
"Unknow" and "Not available" in meta_gene_df["PMID"].unique()

True

In [89]:
meta_gene_df["PMID"].unique()

array(['Not available', '31142855', '25913757', '25749343', '26879219',
       '30401435', '19865172', '32795610', '16534848', '28266623',
       '31734354', '21178864', '23300800', '30683619', '25382172',
       '30767231', '21889493', '19966295', '28658542', '26241311',
       '30566883'], dtype=object)

In [92]:
any(val in meta_gene_df["source"].unique() for val in ["Unknow","Not available"])

False

In [88]:
meta_gene_df["source"].unique()

array(['stitch', 'gutMGene', 'stitch, gutMGene', 'stitch, drugbank',
       'drugbank'], dtype=object)