# Metabolism Questions

In [8]:
from collections import defaultdict
import time
import pandas as pd
import numpy as np
from scipy.io import loadmat
# import escher
# from escher import Builder
import cobra
from cobra.io import load_matlab_model, load_json_model, save_json_model, load_model
# import cobramod
import json, copy
import os
from xml.etree import cElementTree as ET
from tqdm import tqdm
import re
import copy
import requests
from cobra.core.metabolite import Metabolite
from bioservices import ChEBI
import warnings

In [9]:
def xml2dict(t):
    """Recursively convert an ElementTree element into nested dicts.

    Child results are grouped by their namespace-stripped tag name
    (everything after the last ``}``). A tag that occurs once maps to a
    single value; a repeated tag maps to a list of values in document order.

    Anything mentioning "reference" is dropped: child keys containing that
    substring are skipped, and so is element text containing it.

    Note that the element's own text is stored under the *unstripped*
    ``t.tag`` (namespace included). A parent strips the namespace when it
    collects the entry, so the raw key is only visible on the element this
    function was called on directly. For an un-namespaced element that has
    both children and non-blank text, the text replaces the children dict.

    Args:
        t: an ``xml.etree`` element.

    Returns:
        dict: ``{}`` for an element with no children and no usable text,
        otherwise a mapping keyed by the element's tag.
    """

    def _local(name):
        # "{namespace}tag" -> "tag"; names without a namespace pass through.
        return name.split("}")[-1]

    result = {}
    kids = list(t)
    if kids:
        grouped = defaultdict(list)
        for kid in kids:
            for key, value in xml2dict(kid).items():
                if key == "" or "reference" in key:
                    continue
                grouped[_local(key)].append(value)
        collapsed = {}
        for key, values in grouped.items():
            collapsed[_local(key)] = values[0] if len(values) == 1 else values
        result = {_local(t.tag): collapsed}

    raw_text = t.text
    if raw_text:
        stripped = raw_text.strip()
        if stripped != "" and "reference" not in stripped:
            result[t.tag] = stripped
    return result

Outstanding questions:
- Experimental val
- Link to drug: drugbank to target id
- Link to disease (kg): Get disease ids from json; API to download individual diseases; biopython kgml to parse
- Link to drug screen db (depmap, cmap): 


for masking:
    data is: mini-pathways 
for gap filling: 
    network -> growth/no growth
for flux:
    network -> edge labels

validate reaction? 

```
Data Structure:
    {
        "metabolites": {},  # id to metadata
        "drugs": {},        # id to metadata
        "proteins": {},     # id to metadata
        "reactions": {}, 
        "pathways/groups" ?
        "network" ? 
    }
```


- Use cobrapy with BiGG models 
- Use cobramod to get KEGG if need be
- BioPython integrate other databases

------------
TODO:

- [ ] Get list of pathways per organism
- [ ] Programmatically make pathways and add to model
- [ ] Save as mat files
- [ ] Make drugbank and hmdb metadata files


# KEGG


- KEGG NETWORK represents a renewed attempt by KEGG to capture knowledge of diseases and drugs in terms of perturbed molecular networks

: Metabolic pathways -> individual pathways and reactions

```
pathway
|
|-- module
|    |
|    |-- reactions
```

In [4]:
from cobramod import create_object
from pathlib import Path
from cobramod.parsing.kegg import retrieve_data
from cobramod import get_data
import cobramod
from cobra import Model

In [13]:
# Using copy of test model
test_model = Model('test_model')

In [48]:
newobj= retrieve_data(
    directory = Path.cwd().resolve().joinpath("tempdir"),
    identifier="hsa05210")  # M00051

In [49]:
newobj

{'ENTRY': ['hsa05210                    Pathway'],
 'NAME': ['Colorectal cancer - Homo sapiens (human)'],
 'DESCRIPTION': ['Colorectal cancer (CRC) is the second largest cause of cancer-related deaths in Western countries. CRC arises from the colorectal epithelium as a result of the accumulation of genetic alterations in defined oncogenes and tumour suppressor genes (TSG). Two major mechanisms of genomic instability have been identified in sporadic CRC progression. The first, known as chromosomal instability (CIN), results from a series of genetic changes that involve the activation of oncogenes such as K-ras and inactivation of TSG such as p53, DCC/Smad4, and APC. The second, known as microsatellite instability (MSI), results from inactivation of the DNA mismatch repair genes MLH1 and/or MSH2 by hypermethylation of their promoter, and secondary mutation of genes with coding microsatellites, such as transforming growth factor receptor II (TGF-RII) and BAX. Hereditary syndromes have ger

# Escher

In [None]:

"""
gen_map() passing the Cobra model, a list of
the reactions you wish to include and a list of metabolites you do
not wish to be used to create links between reactions (e.g. ATP). This
returns an EscherMap object. Use map_obj.dump_json() to get a JSON that
can be passed to escher.Builder()
"""
import sys
sys.path.append("escher_helper")
from escher_helper.generate_escher_map import gen_map

recon3 = load_matlab_model('/Mounts/rbg-storage1/datasets/Metabo/Recon3D/Recon3D_301/Recon3D_301.mat')

# Draw a map for the first subsystem group of the model.
first_group = recon3.groups[0]
reactions_of_interest = list(first_group.members)

# Unique ids of every metabolite taking part in those reactions.
# NOTE(review): per the description above, the third gen_map() argument is the
# list of metabolites that should NOT link reactions, yet every metabolite of
# the group is passed here -- confirm this is intended.
metabolites_of_interest = list(
    {met.id for rxn in first_group.members for met in rxn.metabolites}
)
bldr = gen_map(recon3, reactions_of_interest, metabolites_of_interest)

---------------------------
# HMDB (Done)

In [11]:
# Load the HMDB metabolite metadata. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it to the garbage collector.
with open("/Mounts/rbg-storage1/datasets/Metabo/HMDB/metabolites.json", 'r') as metabolites_file:
    metabolites_db = json.load(metabolites_file)

In [12]:
# Load the HMDB protein metadata. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it to the garbage collector.
with open("/Mounts/rbg-storage1/datasets/Metabo/HMDB/proteins.json", 'r') as proteins_file:
    protein_db = json.load(proteins_file)

# Drug Bank (To Parse)

In [14]:
# enzymes that are targets
targets = pd.read_csv("drugbank_all_target_polypeptide_ids/all.csv")
enzymes = pd.read_csv("drugbank_all_enzyme_polypeptide_ids/all.csv")
approved = pd.read_csv("drugbank_approved_target_polypeptide_ids/all.csv")

# Build each UniProt-ID set once. Previously the enzyme and approved sets were
# rebuilt for every element tested by the comprehensions below.
target_ids = set(targets['UniProt ID'])
enzyme_ids = set(enzymes['UniProt ID'])
approved_ids = set(approved['UniProt ID'])

# Drug targets that also appear in the enzyme table, then the subset of those
# that appear in the approved-drug target table.
targets_as_enzymes = [t for t in target_ids if t in enzyme_ids]
approved_targets_as_enzymes = [t for t in targets_as_enzymes if t in approved_ids]

summary = "Targets that are enzymes: {}, Found in HMDB: {} \
\nTargets that are enzymes for approved drugs: {}, Found in HMDB: {}"

# NOTE(review): `hmdb_to_uniprot` is not defined in any visible cell of this
# notebook; it must exist in the kernel before this cell runs. It is looked up
# by UniProt ID here and a truthy value counts as "found in HMDB".
print(
    summary.format(
        len(targets_as_enzymes),
        len([t for t in targets_as_enzymes if hmdb_to_uniprot.get(t, False)]),
        len(approved_targets_as_enzymes),
        len([t for t in approved_targets_as_enzymes if hmdb_to_uniprot.get(t, False)]),
    )
)

Targets that are enzymes: 324, Found in HMDB: 287 
Targets that are enzymes for approved drugs: 273, Found in HMDB: 249


In [2]:
# Collect the first space-delimited token of every line in protein.fasta
# that starts with "HMDBP".
with open("protein.fasta", "r") as fasta:
    proteins = [row.split(' ')[0] for row in fasta if row.startswith("HMDBP")]

In [3]:
len(proteins)

5629

In [46]:
# Survey the DrugBank XML schema: for each nesting depth ("l1", "l2", ...)
# record the distinct namespace-stripped tag names in first-seen order, and
# count the depth-2 elements as entries.
drugbank_levels = defaultdict(list)

n = 0          # parser events seen so far
level = 0      # current nesting depth (root element is depth 1)
entries = 0    # number of depth-2 elements opened so far
root = None    # document root, kept so finished children can be released
start_time = time.time()
for event, elem in ET.iterparse('../DrugBank/drugbank.xml', events=('start', 'end')):
    if event == 'start':
        if root is None:
            root = elem
        level += 1
        tag = elem.tag.split('}')[-1]
        if tag not in drugbank_levels[f"l{level}"]:
            drugbank_levels[f"l{level}"].append(tag)
        if level == 2:
            entries += 1
    elif event == 'end':
        level -= 1
        # iterparse still builds the full tree; only tag names are needed here,
        # so release each element's content once it is closed.
        elem.clear()
        if level == 1:
            # A depth-2 element just finished: drop it from the root so the
            # tree does not grow with the size of the file.
            root.clear()

    # Report progress every ten million parser events (integer form of 10e6).
    if n % 10_000_000 == 0:
        mins = round((time.time() - start_time)/60, 4)
        print("Processed {} entries in {} minutes".format(entries, mins))

    n += 1

Processed 0 entries in 0.0001 minutes
Processed 528 entries in 0.391 minutes
Processed 1024 entries in 0.757 minutes
Processed 3929 entries in 1.1797 minutes
Processed 8221 entries in 1.5274 minutes
Processed 10415 entries in 1.8946 minutes


In [48]:
drugbank_levels

defaultdict(list,
            {'l1': ['drugbank'],
             'l2': ['drug'],
             'l3': ['drugbank-id',
              'name',
              'description',
              'cas-number',
              'unii',
              'state',
              'groups',
              'general-references',
              'synthesis-reference',
              'indication',
              'pharmacodynamics',
              'mechanism-of-action',
              'toxicity',
              'metabolism',
              'absorption',
              'half-life',
              'protein-binding',
              'route-of-elimination',
              'volume-of-distribution',
              'clearance',
              'classification',
              'salts',
              'synonyms',
              'products',
              'international-brands',
              'mixtures',
              'packagers',
              'manufacturers',
              'prices',
              'categories',
              'affected-organisms',
 

# BiGG Models

In [10]:
model = load_matlab_model(f"/Mounts/rbg-storage1/datasets/Metabo/BiGG/e_coli_core.mat")

This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model e_coli_core. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e


In [11]:
model

0,1
Name,e_coli_core
Memory address,0x07f52396d3190
Number of metabolites,72
Number of reactions,95
Number of groups,11
Objective expression,1.0*BIOMASS_Ecoli_core_w_GAM - 1.0*BIOMASS_Ecoli_core_w_GAM_reverse_712e5
Compartments,"e, c"


In [12]:
for pathway in model.groups:
    print(pathway)

Anaplerotic reactions
Biomass and maintenance functions
Citric Acid Cycle
Extracellular exchange
Glutamate Metabolism
Glycolysis/Gluconeogenesis
Inorganic Ion Transport and Metabolism
Oxidative Phosphorylation
Pentose Phosphate Pathway
Pyruvate Metabolism
Transport, Extracellular


In [51]:
val, = model.groups[0].members[0].genes

In [46]:
# Unique ids of every gene attached to a reaction in any group of the model.
all_genes = {
    gene.id
    for pathway in model.groups
    for reaction in pathway.members
    for gene in reaction.genes
}

In [49]:
len(all_genes)

137

In [None]:
# Map each metabolite id to the set of group ids whose reactions involve it.
metabolite2pathway = {}
for group in model.groups:
    for rxn in group.members:
        for met in rxn.metabolites:
            metabolite2pathway.setdefault(met.id, set()).add(group.id)

In [None]:
sorted({(k, len(v)) for k,v in metabolite2pathway.items()}, key=lambda x: x[1], reverse=True)

72

In [50]:
# Load the e_coli_core dataset. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open.
with open("/Mounts/rbg-storage1/datasets/Metabo/datasets/e_coli_core_dataset.json", "rb") as dataset_file:
    ds = json.load(dataset_file)

In [80]:
# Unique UniProt ids linked from any protein of any sample in the dataset.
# Samples without a 'proteins' key and proteins without a 'UniProt' link are skipped.
all_uniprots = {
    entry['id']
    for sample in ds
    if 'proteins' in sample
    for prot in sample['proteins']
    if 'UniProt' in prot['database_links']
    for entry in prot['database_links']['UniProt']
}

In [81]:
len(all_uniprots)

136

In [85]:
# Read the antibiotics/target table. Passing the path lets pandas open and
# close the file itself; the previous open(...) handle was never closed.
df = pd.read_csv("/Mounts/rbg-storage1/datasets/Metabo/antibiotics/stokes2019antibiotic_deduplicated_chembl_ids_targets_uniprot.csv")

In [91]:
len(set(i for i in df.columns if i != "SMILES" and df[i].sum() >= 1))

965

In [95]:
any([j for j in set(i for i in df.columns if i != "SMILES" and df[i].sum() >= 1) if j in all_uniprots])

False

In [None]:
from bioservices import UniProt

In [6]:
u = UniProt()

In [14]:
u.mapping("P_ENTREZGENEID", "ACC", query='314')

defaultdict(list, {'314': ['O75106']})

In [15]:
df = u.get_df(['O75106'])

  output = output.append(df, ignore_index=True)


In [13]:
pd.set_option('display.max_columns', None)
df[(df['Entry'] == 'Q9UJ70') | (df['Entry'] == 'A0A384N6G7') | (df['Entry'] ==  'C9JEV6')]

Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,Proteomes,Taxonomic lineage (ALL),Taxonomic lineage IDs,Virus hosts,Fragment,Sequence,Length,Mass,Gene encoded by,Alternative products (isoforms),Erroneous gene model prediction,Erroneous initiation,Erroneous termination,Erroneous translation,Frameshift,Mass spectrometry,Polymorphism,RNA editing,Sequence caution,Alternative sequence,Natural variant,Non-adjacent residues,Non-standard residue,Non-terminal residue,Sequence conflict,Sequence uncertainty,Version (sequence),Domains,Domain count,Domain [CC],Sequence similarities,Coiled coil,Compositional bias,Domain [FT],Motif,Region,Repeat,Zinc finger,EC number,Absorption,Catalytic activity,Cofactor,Function [CC],Kinetics,Pathway,Redox potential,Temperature dependence,pH dependence,Active site,Binding site,DNA binding,Metal binding,Nucleotide binding,Site,Gene ontology (GO),Gene ontology (biological process),Gene ontology (molecular function),Gene ontology (cellular component),Gene ontology IDs,InterPro,Interacts with,Subunit structure [CC],PubMed ID,Mapped PubMed ID,Date of creation,Date of last modification,Date of last sequence modification,Version (entry),3D,Beta strand,Helix,Turn,Subcellular location [CC],Intramembrane,Topological domain,Transmembrane,Annotation,Features,Caution,Tissue specificity,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Keyword ID,Pathway.1,Allergenic properties,Biotechnological use,Disruption phenotype,Involvement in disease,Pharmaceutical use,Toxic dose,Post-translational modification,Chain,Cross-link,Disulfide bond,Glycosylation,Initiator methionine,Lipidation,Modified residue,Peptide,Propeptide,Signal peptide,Transit peptide,Taxonomic lineage (all),Taxonomic lineage (SUPERKINGDOM),Taxonomic lineage (KINGDOM),Taxonomic lineage 
(SUBKINGDOM),Taxonomic lineage (SUPERPHYLUM),Taxonomic lineage (PHYLUM),Taxonomic lineage (SUBPHYLUM),Taxonomic lineage (SUPERCLASS),Taxonomic lineage (CLASS),Taxonomic lineage (SUBCLASS),Taxonomic lineage (INFRACLASS),Taxonomic lineage (SUPERORDER),Taxonomic lineage (ORDER),Taxonomic lineage (SUBORDER),Taxonomic lineage (INFRAORDER),Taxonomic lineage (PARVORDER),Taxonomic lineage (SUPERFAMILY),Taxonomic lineage (FAMILY),Taxonomic lineage (SUBFAMILY),Taxonomic lineage (TRIBE),Taxonomic lineage (SUBTRIBE),Taxonomic lineage (GENUS),Taxonomic lineage (SUBGENUS),Taxonomic lineage (SPECIES GROUP),Taxonomic lineage (SPECIES SUBGROUP),Taxonomic lineage (SPECIES),Taxonomic lineage (SUBSPECIES),Taxonomic lineage (VARIETAS),Taxonomic lineage (FORMA),Taxonomic lineage IDs (all),Taxonomic lineage IDs (SUPERKINGDOM),Taxonomic lineage IDs (KINGDOM),Taxonomic lineage IDs (SUBKINGDOM),Taxonomic lineage IDs (SUPERPHYLUM),Taxonomic lineage IDs (PHYLUM),Taxonomic lineage IDs (SUBPHYLUM),Taxonomic lineage IDs (SUPERCLASS),Taxonomic lineage IDs (CLASS),Taxonomic lineage IDs (SUBCLASS),Taxonomic lineage IDs (INFRACLASS),Taxonomic lineage IDs (SUPERORDER),Taxonomic lineage IDs (ORDER),Taxonomic lineage IDs (SUBORDER),Taxonomic lineage IDs (INFRAORDER),Taxonomic lineage IDs (PARVORDER),Taxonomic lineage IDs (SUPERFAMILY),Taxonomic lineage IDs (FAMILY),Taxonomic lineage IDs (SUBFAMILY),Taxonomic lineage IDs (TRIBE),Taxonomic lineage IDs (SUBTRIBE),Taxonomic lineage IDs (GENUS),Taxonomic lineage IDs (SUBGENUS),Taxonomic lineage IDs (SPECIES GROUP),Taxonomic lineage IDs (SPECIES SUBGROUP),Taxonomic lineage IDs (SPECIES),Taxonomic lineage IDs (SUBSPECIES),Taxonomic lineage IDs (VARIETAS),Taxonomic lineage IDs (FORMA),Cross-reference (db_abbrev),Cross-reference (EMBL)
0,Q9UJ70,NAGK_HUMAN,[NAGK],NAGK,,,,Homo sapiens (Human),9606,N-acetyl-D-glucosamine kinase (N-acetylglucosa...,UP000005640: Chromosome 2,"cellular organisms, Eukaryota, Opisthokonta, M...",9606,,,MAAIYGGVEGGGTRSEVLLVSEDGKILAEADGLSTNHWLIGTDKCV...,344,37376,,,,,,,,,,,,"VAR_SEQ 1; /note=""M -> MRTRTGSQLAAREVTGSGAVPR...","VARIANT 38; /note=""W -> R (in dbSNP:rs1785614...",,,,"CONFLICT 70; /note=""S -> I (in Ref. 1; CAB618...",,4,[],0,,SIMILARITY: Belongs to the eukaryotic-type N-a...,,,,,"REGION 129..130; /note=""Substrate binding""; ...",,,2.7.1.59,,CATALYTIC ACTIVITY: Reaction=ATP + N-acetyl-D-...,,FUNCTION: Converts endogenous N-acetylglucosam...,,PATHWAY: Amino-sugar metabolism; N-acetylneura...,,,,,"BINDING 13; /note=""ATP; via amide nitrogen""; ...",,,,,"[cytosol [GO:0005829], extracellular exosome [...",N-acetylglucosamine metabolic process [GO:0006...,ATP binding [GO:0005524]; N-acetylglucosamine ...,cytosol [GO:0005829]; extracellular exosome [G...,"[GO:0005524, GO:0005829, GO:0006044, GO:000605...","[IPR002731, IPR043129, IPR039758, ]","[Q9NZ32, P42773, Q9UI36, Q9UI36-2, Q8TBB1, P50...",SUBUNIT: Homodimer. {ECO:0000269|PubMed:170103...,"[10824116, 14702039, 15815621, 15489334, 12665...",15231747; 15231748; 15987957; 16189514; 182348...,2002-11-01,2022-05-25,2007-01-23,186,X-ray crystallography (2),"STRAND 4..10; /evidence=""ECO:0007829|PDB:2CH5...","HELIX 37..40; /evidence=""ECO:0007829|PDB:2CH5...","TURN 76..79; /evidence=""ECO:0007829|PDB:2CH5""...",,,,,5 out of 5,Alternative sequence (1); Beta strand (14); Bi...,,TISSUE SPECIFICITY: Ubiquitous. 
{ECO:0000269|P...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,[Eukaryotic-type N-acetylglucosamine kinase fa...,186,"[Alternative products (1), Catalytic activity ...",,KW-0002; KW-0007; KW-0025; KW-0067; KW-0418; K...,Amino-sugar metabolism; N-acetylneuraminate de...,,,,,,,,"CHAIN 2..344; /note=""N-acetyl-D-glucosamine k...",,,,"INIT_MET 1; /note=""Removed""; /evidence=""ECO:...",,"MOD_RES 2; /note=""N-acetylalanine""; /evidenc...",,,,,"cellular organisms, Eukaryota, Opisthokonta, M...",Eukaryota,Metazoa,,,Chordata,Craniata,Sarcopterygii,Mammalia,,,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea (apes),Hominidae (great apes),Homininae,,,Homo,,,,Homo sapiens (Human),,,,"131567, 2759, 33154, 33208, 6072, 33213, 33511...",2759,33208,,,7711,89593,8287,40674,,,314146,9443,376913,314293,9526,314295,9604,207598,,,9605,,,,,,,,,AJ242910;AK001812;AK297224;CR457271;AK222645;A...
9,C9JEV6,C9JEV6_HUMAN,[NAGK],NAGK,,,,Homo sapiens (Human),9606,N-acetyl-D-glucosamine kinase (EC 2.7.1.59) (G...,UP000005640: Chromosome 2,"cellular organisms, Eukaryota, Opisthokonta, M...",9606,,,MVNRAKRKAGVDPLVPLRSLGLSLSGGDQEDAGRILIEELRDRFPY...,293,32018,,,,,,,,,,,,,,,,,,,3,[],0,,SIMILARITY: Belongs to the eukaryotic-type N-a...,,,"DOMAIN 50..224; /note=""BcrAD_BadFG""; /eviden...",,,,,2.7.1.59,,,,,,,,,,,,,,,,[N-acetylglucosamine kinase activity [GO:00451...,,N-acetylglucosamine kinase activity [GO:0045127],,[GO:0045127],"[IPR002731, IPR043129, IPR039758, ]",[],,"[15815621, 18669648, 19690332, 20068231, 21269...",15987957; 17010375; 22507750; 25921606; 276466...,2009-11-03,2022-05-25,2011-06-28,83,,,,,,,,,2 out of 5,Domain (1),,,,"[Proteomics identification, Reference proteome]",Evidence at protein level,unreviewed,,[Eukaryotic-type N-acetylglucosamine kinase fa...,83,[Sequence similarities (1)],,KW-1185; KW-1267,,,,,,,,,,,,,,,,,,,,"cellular organisms, Eukaryota, Opisthokonta, M...",Eukaryota,Metazoa,,,Chordata,Craniata,Sarcopterygii,Mammalia,,,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea (apes),Hominidae (great apes),Homininae,,,Homo,,,,Homo sapiens (Human),,,,"131567, 2759, 33154, 33208, 6072, 33213, 33511...",2759,33208,,,7711,89593,8287,40674,,,314146,9443,376913,314293,9526,314295,9604,207598,,,9605,,,,,,,,,


In [398]:
u.get_fasta_sequence('Q9UJ70')

'MAAIYGGVEGGGTRSEVLLVSEDGKILAEADGLSTNHWLIGTDKCVERINEMVNRAKRKAGVDPLVPLRSLGLSLSGGDQEDAGRILIEELRDRFPYLSESYLITTDAAGSIATATPDGGVVLISGTGSNCRLINPDGSESGCGGWGHMMGDEGSAYWIAHQAVKIVFDSIDNLEAAPHDIGYVKQAMFHYFQVPDRLGILTHLYRDFDKCRFAGFCRKIAEGAQQGDPLSRYIFRKAGEMLGRHIVAVLPEIDPVLFQGKIGLPILCVGSVWKSWELLKEGFLLALTQGREIQAQNFFSSFTLMKLRHSSALGGASLGARHIGHLLPMDYSANAIAFYSYTFS'

In [3]:
# Load the BiGG model index and keep the id of every listed model.
# The context manager closes the file handle once parsing finishes.
with open("/Mounts/rbg-storage1/datasets/Metabo/BiGG/bigg_models.json", "rb") as models_file:
    models_json = json.load(models_file)
models = [v["bigg_id"] for v in models_json["results"]]

In [4]:
# Attempt to load every BiGG model listed in `models` from its .mat file.
# As written, the loop only loads each model and reports failures: the
# per-gene BiGG API lookup below is commented out, so the three containers
# defined here are never filled by this cell.
organism2gene2sequence = {}
genes_without_sequence = [] # these in GEMPro
genes_with_sequence = []

for organism_name in tqdm(models, position=0):
    genes = []  # NOTE(review): never read in the active code of this cell
    try:
        model = load_matlab_model(f"/Mounts/rbg-storage1/datasets/Metabo/BiGG/{organism_name}.mat")
    except Exception as e:
        # Report the failure and move on to the next model instead of aborting the sweep.
        print(e)
        print(f"Failed to load model: {organism_name}")
        continue
    # Disabled: per-gene BiGG API lookup that would populate the containers above.
    # for gene in tqdm(list(model.genes), position=0, leave=True):
    #     r = requests.get(f"http://bigg.ucsd.edu/api/v2/models/{organism_name}/genes/{gene.id}")
    #     protein_metadata = r.json()
    #     # if gene exists in BiGG Model then add them to `organism2gene2sequence`
    #     if 'protein_sequence' in protein_metadata:
    #         organism2gene2sequence.setdefault(organism_name, {})
    #         # if this gene doesn't exist in this organism's gene2sequence dict, then add it
    #         if gene.id not in organism2gene2sequence[organism_name]:
    #             organism2gene2sequence[organism_name].update({gene.id: protein_metadata['protein_sequence']})
    #         # print(protein_metadata)
    #         # if BiGG Model does not have protein_sequence, then we store this gene to put into GEMPro pipeline
    #         try:
    #             entrez_id = protein_metadata['database_links']['NCBI Entrez Gene'][0]['id']
    #             genes_with_sequence.append(entrez_id)
    #         except:
    #             print("This gene is missing an Entrez ID: ", gene.id)
    #     else:
    #         try:
    #             print("Missing prot seq")
    #             entrez_id = protein_metadata['database_links']['NCBI Entrez Gene'][0]['id']
    #             genes_without_sequence.append(entrez_id)
    #         except:
    #             print("This gene is missing an Entrez ID: ", protein_metadata)

  0%|                                                                                                   | 0/108 [00:00<?, ?it/s]This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model e_coli_core. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e


Scaling...
 A: min|aij| =  1.000e+00  max|aij| =  1.000e+00  ratio =  1.000e+00
Problem data seem to be well scaled


  1%|▊                                                                                          | 1/108 [00:08<15:56,  8.93s/it]No defined compartments in model iAB_RBC_283. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e
  2%|█▋                                                                                         | 2/108 [00:37<36:21, 20.58s/it]This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model iAF1260. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, p
  3%|██▍                                                                                      | 3/108 [03:36<2:42:40, 92.96s/it]This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in mo

Some problem with the model, causing error Some compartments were empty. Check the model!
No COBRA model found at /Mounts/rbg-storage1/datasets/Metabo/BiGG/iCHOv1.mat.
Failed to load model: iCHOv1


This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model iCHOv1_DG44. Compartments will be deduced heuristically using regular expressions.
 17%|██████████████▋                                                                         | 18/108 [33:16<2:25:03, 96.71s/it]No defined compartments in model iCN718. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, p, r


Some problem with the model, causing error Some compartments were empty. Check the model!
No COBRA model found at /Mounts/rbg-storage1/datasets/Metabo/BiGG/iCHOv1_DG44.mat.
Failed to load model: iCHOv1_DG44


 18%|███████████████▍                                                                        | 19/108 [33:53<1:56:44, 78.71s/it]No defined compartments in model iCN900. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e
 19%|████████████████▎                                                                       | 20/108 [36:09<2:20:47, 96.00s/it]This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model iE2348C_1286. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, p
 19%|████████████████▉                                                                      | 21/108 [39:32<3:05:27, 127.91s/it]This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in mo

KeyboardInterrupt: 

In [11]:
genes_with_sequence

[]

In [12]:
genes_without_sequence

[]

In [4]:
from ssbio.pipeline.gempro import GEMPRO

In [101]:
ROOT_DIR = '../gempro/'
PROJECT = 'gempro_missing_genes'
PDB_FILE_TYPE = 'mmtf'

# Create the GEM-PRO project
my_gempro = GEMPRO(gem_name=PROJECT, 
                   root_dir=ROOT_DIR, 
                   genes_list=genes_without_sequence,
                   pdb_file_type=PDB_FILE_TYPE)

NameError: name 'GEMPRO' is not defined

In [None]:
my_gempro.uniprot_mapping_and_metadata('P_ENTREZGENEID')
print('Missing UniProt mapping: ', my_gempro.missing_uniprot_mapping)

In [None]:
my_gempro.set_representative_sequence(force_rerun=True)

In [None]:
print('Missing a representative sequence: ', my_gempro.missing_representative_sequence)

In [None]:
my_gempro.df_representative_sequences.head()

# Recon3D

## README

Recon3D human metabolism reconstruction


**Metabolites**
- mets: metabolite
- metCharges
- metFormulas
- metSmiles
- metNames
- metHMDBID
- metInChIString
- metKEGGID
- metPubChemID
- metCHEBIID
- metPdMap
- metReconMap

**Reactions**
- rxnConfidenceScores
- rxnNames
- rxns: Reaction names
- rxnNotes
- rxnECNumbers: Enzyme Commission Number (EC Number)
- rxnKEGGID
- rxnCOG: Database of Clusters of Orthologous Genes (COGs)
- rxnKeggOrthology
- rxnReconMap: abbreviation
- rxnReferences

**Genes**
- rxnGeneMat: (13543, 3697) reaction x gene, binary
- genes: List of all genes: A VMH gene ID is composed of the Entrez Gene ID followed by a "." and a number. By default this number is 1. If more than one transcript has been reported in the literature, incremental numbers are given. For the moment, it is not possible to map these transcript identifications to a particular transcript sequence.
- grRules: Gene protein rule
- rules: Gene-reaction association rule in computable form 

**Network**
- S: stoich matrix
- lb: Lower bounds
- ub: Upper bounds
- c: Objective coefficients
- csense: the constraint senses (‘L’ for lower than, ‘G’ for greater than, ‘E’ for equal), one for each metabolite
- osense: the objective sense (‘max’ or ‘min’) * .csense
- subSystems

**Other**
- modelID
- description
- version
- PleaseCite

In [16]:
x = loadmat('/Mounts/rbg-storage1/datasets/Metabo/Recon3D/Recon3D_301/Recon3D_301.mat', struct_as_record=True)

PermissionError: [Errno 13] Permission denied: '/Mounts/rbg-storage1/datasets/Metabo/Recon3D/Recon3D_301/Recon3D_301.mat'

In [6]:
x['Recon3D'][0]

AttributeError: 'numpy.ndarray' object has no attribute 'keys'

In [41]:
x['Recon3D'].dtype.names

('S',
 'mets',
 'b',
 'csense',
 'rxns',
 'lb',
 'ub',
 'c',
 'osense',
 'genes',
 'rules',
 'metCharges',
 'metFormulas',
 'metSmiles',
 'metNames',
 'metHMDBID',
 'metInChIString',
 'metKEGGID',
 'metPubChemID',
 'description',
 'grRules',
 'rxnGeneMat',
 'rxnConfidenceScores',
 'rxnNames',
 'rxnNotes',
 'rxnECNumbers',
 'rxnReferences',
 'rxnKEGGID',
 'subSystems',
 'metCHEBIID',
 'metPdMap',
 'metReconMap',
 'modelID',
 'rxnCOG',
 'rxnKeggOrthology',
 'rxnReconMap',
 'version',
 'PleaseCite')

In [46]:
x['Recon3D'][0,0]['modelID']

array(['Recon3D'], dtype='<U7')

In [38]:
dir(x['Recon3D'][0][0].dtype)

['__bool__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__mul__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rmul__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'alignment',
 'base',
 'byteorder',
 'char',
 'descr',
 'fields',
 'flags',
 'hasobject',
 'isalignedstruct',
 'isbuiltin',
 'isnative',
 'itemsize',
 'kind',
 'metadata',
 'name',
 'names',
 'ndim',
 'newbyteorder',
 'num',
 'shape',
 'str',
 'subdtype',
 'type']

In [40]:
x['Recon3D'].dtype.names

('S',
 'mets',
 'b',
 'csense',
 'rxns',
 'lb',
 'ub',
 'c',
 'osense',
 'genes',
 'rules',
 'metCharges',
 'metFormulas',
 'metSmiles',
 'metNames',
 'metHMDBID',
 'metInChIString',
 'metKEGGID',
 'metPubChemID',
 'description',
 'grRules',
 'rxnGeneMat',
 'rxnConfidenceScores',
 'rxnNames',
 'rxnNotes',
 'rxnECNumbers',
 'rxnReferences',
 'rxnKEGGID',
 'subSystems',
 'metCHEBIID',
 'metPdMap',
 'metReconMap',
 'modelID',
 'rxnCOG',
 'rxnKeggOrthology',
 'rxnReconMap',
 'version',
 'PleaseCite')

In [26]:
x['Recon3D'].keys()

AttributeError: 'numpy.ndarray' object has no attribute 'keys'

In [7]:
recon3 = load_matlab_model('/Mounts/rbg-storage1/datasets/Metabo/Recon3D/Recon3D_301/Recon3D_301.mat')

No defined compartments in model Recon3D. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, g, i, l, m, n, r, x


In [49]:
len(recon3.groups)

111

In [50]:
recon3.groups[0]

<Group Alanine and aspartate metabolism at 0x7f09c3f20af0>

In [51]:
dir(recon3.groups[0])

['KIND_TYPES',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_annotation',
 '_id',
 '_kind',
 '_members',
 '_model',
 '_set_id_with_model',
 'add_members',
 'annotation',
 'id',
 'kind',
 'members',
 'name',
 'notes',
 'remove_members']

In [53]:
recon3.groups[0].members

[<Reaction AGTim at 0x7f09d0543a00>,
 <Reaction AGTix at 0x7f09d0543fa0>,
 <Reaction ARGSS at 0x7f09d051bd30>,
 <Reaction ASNNm at 0x7f09d04aed60>,
 <Reaction ASNS1 at 0x7f09d04aef40>,
 <Reaction ASPNATm at 0x7f09d04b8550>,
 <Reaction ASPTAm at 0x7f09d04b8e80>,
 <Reaction DASPO1p at 0x7f09d03a71f0>,
 <Reaction NACASPAH at 0x7f09cffd8b80>,
 <Reaction RE1473C at 0x7f09cf8e2250>,
 <Reaction RE2031M at 0x7f09cf841a90>,
 <Reaction RE2642C at 0x7f09cb7af700>,
 <Reaction ALAR at 0x7f09cb10e730>,
 <Reaction ASPTA at 0x7f09cb10e850>,
 <Reaction r0127 at 0x7f09cb093c10>,
 <Reaction ARGSL at 0x7f09cb0452b0>]

In [13]:
save_json_model(recon3, "recon3.json")

In [10]:
solution = recon3.optimize()

In [11]:
solution

Unnamed: 0,fluxes,reduced_costs
10FTHF5GLUtl,0.000000,0.0
10FTHF5GLUtm,0.000000,0.0
10FTHF6GLUtl,0.000000,0.0
10FTHF6GLUtm,0.000000,0.0
10FTHF7GLUtl,0.000000,0.0
...,...,...
CYOR_u10mi,666.666667,0.0
Htmi,0.000000,0.0
NADH2_u10mi,0.000000,0.0
CYOOm3i,333.333333,0.0


In [10]:
save_json_model(textbook_model, "test.json")

In [9]:
textbook_model = load_model("textbook")

In [18]:
model = load_matlab_model('e_coli_core.mat')

This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model e_coli_core. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e


In [24]:
model.constraints

<optlang.container.Container at 0x7faf8fa2b0d0>