In [2]:
from collections import defaultdict
import time
import pandas as pd
import numpy as np
from scipy.io import loadmat
# import escher
# from escher import Builder
import cobra
from cobra.io import load_matlab_model, load_json_model, save_json_model, load_model
# import cobramod
import json, copy
import os
from xml.etree import cElementTree as ET
from tqdm import tqdm
import re
import copy
import requests
from cobra.core.metabolite import Metabolite
from bioservices import ChEBI
import warnings
from ssbio.pipeline.gempro import GEMPRO

In [3]:
# Load the locally cached BiGG model listing and collect the BiGG ID of every model in it.
# The context manager closes the file as soon as parsing finishes instead of
# leaving the handle open until the garbage collector reclaims it.
with open("/Mounts/rbg-storage1/datasets/Metabo/BiGG/bigg_models.json", "rb") as models_file:
    models_json = json.load(models_file)
models = [v["bigg_id"] for v in models_json["results"]]

In [69]:
# Walk every BiGG model, fetch each of its genes from the BiGG REST API, and
# sort the genes by whether the gene record carries a 'protein_sequence' key.
#
#   organism2gene2sequence : model id -> {gene BiGG id -> value of 'protein_sequence'}
#   genes_without_sequence : Entrez Gene IDs of genes whose record has no
#                            'protein_sequence' key (the list meant for GEM-PRO)
#   genes_with_sequence    : declared here; this cell does not append to it
organism2gene2sequence = {}
genes_without_sequence = [] # these in GEMPro
genes_with_sequence = []

# Upper bound (seconds) for any single HTTP call, so that one stalled
# connection cannot hang the multi-hour scan indefinitely.
REQUEST_TIMEOUT_S = 60

# One Session for the whole scan: the loop issues thousands of requests to the
# same host, and a Session keeps the underlying connection alive between them.
with requests.Session() as session:
    for organism_name in tqdm(models, position=0):
        print(organism_name)
        model_genes_response = session.get(
            f"http://bigg.ucsd.edu/api/v2/models/{organism_name}/genes/",
            timeout=REQUEST_TIMEOUT_S,
        )
        bigg_ids = {res['bigg_id'] for res in model_genes_response.json()['results']}
        gene2sequence = organism2gene2sequence.setdefault(organism_name, {})
        for gene in tqdm(bigg_ids, position=0):
            gene_response = session.get(
                f"http://bigg.ucsd.edu/api/v2/models/{organism_name}/genes/{gene}",
                timeout=REQUEST_TIMEOUT_S,
            )
            protein_metadata = gene_response.json()
            # If BiGG serves a sequence for this gene, record it under its model.
            # NOTE(review): this only tests that the key is present; if BiGG can
            # return the key with an empty/null value, such genes are stored here
            # with that value rather than being routed to GEM-PRO — confirm.
            if 'protein_sequence' in protein_metadata:
                gene2sequence[gene] = protein_metadata['protein_sequence']
                continue
            try:
                entrez_id = protein_metadata['database_links']['NCBI Entrez Gene'][0]['id']
            except (KeyError, IndexError, TypeError):
                # Only lookup failures mean "no Entrez ID"; anything else
                # (including KeyboardInterrupt) must propagate.
                print("This gene is missing an Entrez ID: ", protein_metadata)
            else:
                print(f"This gene doesnt have a sequence in BiGG! {entrez_id}")
                genes_without_sequence.append(entrez_id)

  0%|                                                                                                   | 0/108 [00:00<?, ?it/s]

e_coli_core


100%|█████████████████████████████████████████████████████████████████████████████████████████| 137/137 [00:29<00:00,  4.66it/s]
  1%|▊                                                                                          | 1/108 [00:29<52:49, 29.62s/it]

iAB_RBC_283


100%|█████████████████████████████████████████████████████████████████████████████████████████| 346/346 [01:13<00:00,  4.68it/s]
  2%|█▋                                                                                       | 2/108 [01:43<1:38:38, 55.83s/it]

iAF1260


100%|███████████████████████████████████████████████████████████████████████████████████████| 1261/1261 [04:31<00:00,  4.64it/s]
  3%|██▍                                                                                     | 3/108 [06:16<4:30:36, 154.64s/it]

iAF1260b


100%|███████████████████████████████████████████████████████████████████████████████████████| 1261/1261 [04:31<00:00,  4.65it/s]
  4%|███▎                                                                                    | 4/108 [10:47<5:48:19, 200.96s/it]

iAF692


100%|█████████████████████████████████████████████████████████████████████████████████████████| 692/692 [02:31<00:00,  4.57it/s]
  5%|████                                                                                    | 5/108 [13:19<5:14:34, 183.25s/it]

iAF987


 60%|█████████████████████████████████████████████████████▋                                   | 596/987 [02:07<01:23,  4.66it/s]
  5%|████                                                                                    | 5/108 [15:28<5:18:48, 185.71s/it]

KeyboardInterrupt



In [28]:
# Inspect the external database cross-references of the gene record that
# `protein_metadata` currently holds (whichever gene was fetched last).
protein_metadata['database_links']

{'Online Mendelian Inheritance in Man': [{'link': 'http://identifiers.org/omim/136850',
   'id': '136850'}],
 'Human Protein Reference Database': [{'link': 'http://identifiers.org/hprd/00652',
   'id': '00652'}],
 'NCBI Entrez Gene': [{'link': 'http://identifiers.org/ncbigene/2271',
   'id': '2271'}],
 'CCDS': [{'link': 'http://identifiers.org/ccds/CCDS1617.1',
   'id': 'CCDS1617.1'}]}

In [15]:
ROOT_DIR = '../gempro/'
PROJECT = 'gempro_missing_genes'
PDB_FILE_TYPE = 'mmtf'

# Create the GEM-PRO project for the genes that BiGG has no sequence for.
# The BiGG scan collects those Entrez IDs in `genes_without_sequence`;
# `genes_with_sequence` is never appended to there, so passing it would build
# an empty project on a fresh run. The same gene can appear in several models,
# so duplicates are dropped here while keeping first-seen order.
my_gempro = GEMPRO(gem_name=PROJECT,
                   root_dir=ROOT_DIR,
                   genes_list=list(dict.fromkeys(genes_without_sequence)))

In [16]:
# Display the repr of the GEM-PRO project object as a quick sanity check.
my_gempro

<GEMPRO gempro_missing_genes at 0x7f99b56fd1f0>

In [17]:
# Ask GEM-PRO to map the project's gene IDs to UniProt, passing 'P_ENTREZGENEID'
# as the source identifier type (presumably "Entrez Gene ID" — confirm against
# the ssbio / UniProt mapping docs), then list the genes left without a mapping.
# NOTE(review): in the saved outputs every gene ends up unmapped — verify that
# this identifier code is still accepted by the mapping service in use.
my_gempro.uniprot_mapping_and_metadata('P_ENTREZGENEID')
print('Missing UniProt mapping: ', my_gempro.missing_uniprot_mapping)



A Jupyter Widget

Missing UniProt mapping:  ['944794', '6541', '4351', '946147', '947415', '3101', '946746', '945008', '2821', '5139', '55276', '4953', '947623', '948457', '947854', '349565', '948130', '1374', '946179', '2876', '56895', '3614', '51181', '5207', '57026', '7358', '8501', '948251', '1312', '946940', '945314', '946802', '948039', '2730', '5236', '948667', '946209', '51084', '4048', '55326', '946738', '947723', '3098', '4881', '23396', '203', '8525', '1608', '191', '60482', '945300', '947547', '946762', '947724', '946761', '8854', '200576', '26873', '5142', '947069', '8395', '162466', '223', '948247', '5052', '8612', '759', '949032', '9942', '3145', '948517', '944864', '948254', '4882', '118881', '2203', '3417', '948412', '946764', '948512', '262', '946886', '1606', '944834', '10555', '945837', '946880', '318', '383', '948453', '9429', '55500', '3635', '1178', '2805', '231', '945540', '945032', '25796', '5973', '5631', '945621', '947635', '948535', '221823', '490', '6888', '5230', '946187', 

In [26]:
# Compare the number of distinct genes without a UniProt mapping against the
# number of distinct entries in `genes_with_sequence`.
# NOTE(review): the second count is only meaningful if it is taken over the
# same list that was handed to GEMPRO as `genes_list` — verify which list that is.
len(set(my_gempro.missing_uniprot_mapping)), len(set(genes_with_sequence))

(383, 383)

In [13]:
# Have GEM-PRO choose a representative sequence for every gene.
# force_rerun=True presumably recomputes instead of reusing earlier results —
# TODO confirm against the ssbio documentation.
my_gempro.set_representative_sequence(force_rerun=True)

A Jupyter Widget

944794: no sequences mapped
6541: no sequences mapped
4351: no sequences mapped
947415: no sequences mapped
946147: no sequences mapped
3101: no sequences mapped
946746: no sequences mapped
945008: no sequences mapped
2821: no sequences mapped
5139: no sequences mapped
55276: no sequences mapped
4953: no sequences mapped
947623: no sequences mapped
948457: no sequences mapped
947854: no sequences mapped
349565: no sequences mapped
948130: no sequences mapped
1374: no sequences mapped
946179: no sequences mapped
2876: no sequences mapped
56895: no sequences mapped
3614: no sequences mapped
51181: no sequences mapped
5207: no sequences mapped
57026: no sequences mapped
7358: no sequences mapped
8501: no sequences mapped
948251: no sequences mapped
1312: no sequences mapped
946940: no sequences mapped
945314: no sequences mapped
946802: no sequences mapped
948039: no sequences mapped
2730: no sequences mapped
5236: no sequences mapped
948667: no sequences mapped
946209: no sequences mappe

In [None]:
# List the genes for which GEM-PRO reports no representative sequence.
print('Missing a representative sequence: ', my_gempro.missing_representative_sequence)

In [14]:
# Preview the first rows of the representative-sequence summary table.
my_gempro.df_representative_sequences.head()

Empty dataframe


Unnamed: 0_level_0,uniprot,kegg,num_pdbs,pdbs,seq_len,sequence_file,metadata_file
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [None]:
# Display the full representative-sequence summary table.
my_gempro.df_representative_sequences

In [44]:
# Convert the representative-sequence table to a list with one dict per row.
my_gempro.df_representative_sequences.to_dict('records')

Empty dataframe


[]

In [19]:
from bioservices import UniProt

In [20]:
# Create a bioservices UniProt client for direct queries.
u = UniProt()

In [None]:
# Spot-check a single ID ('944794') directly against the UniProt mapping
# service, converting from the 'P_ENTREZGENEID' identifier type to 'ACC'.
u.mapping(fr="P_ENTREZGENEID", to="ACC", query='944794')



{}

In [None]:
# Fetch UniProt entries into a DataFrame via the bioservices client.
# NOTE(review): the query list holds a single empty string — confirm this is
# the intended query and not a placeholder left over from editing.
df = u.get_df([''])

  output = output.append(df, ignore_index=True)


In [13]:
# Lift the column cap so that wide frames are rendered with every column.
pd.set_option('display.max_columns', None)
# Keep only the rows whose 'Entry' value is one of the three accessions of interest.
df[df['Entry'].isin(['Q9UJ70', 'A0A384N6G7', 'C9JEV6'])]

Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,Proteomes,Taxonomic lineage (ALL),Taxonomic lineage IDs,Virus hosts,Fragment,Sequence,Length,Mass,Gene encoded by,Alternative products (isoforms),Erroneous gene model prediction,Erroneous initiation,Erroneous termination,Erroneous translation,Frameshift,Mass spectrometry,Polymorphism,RNA editing,Sequence caution,Alternative sequence,Natural variant,Non-adjacent residues,Non-standard residue,Non-terminal residue,Sequence conflict,Sequence uncertainty,Version (sequence),Domains,Domain count,Domain [CC],Sequence similarities,Coiled coil,Compositional bias,Domain [FT],Motif,Region,Repeat,Zinc finger,EC number,Absorption,Catalytic activity,Cofactor,Function [CC],Kinetics,Pathway,Redox potential,Temperature dependence,pH dependence,Active site,Binding site,DNA binding,Metal binding,Nucleotide binding,Site,Gene ontology (GO),Gene ontology (biological process),Gene ontology (molecular function),Gene ontology (cellular component),Gene ontology IDs,InterPro,Interacts with,Subunit structure [CC],PubMed ID,Mapped PubMed ID,Date of creation,Date of last modification,Date of last sequence modification,Version (entry),3D,Beta strand,Helix,Turn,Subcellular location [CC],Intramembrane,Topological domain,Transmembrane,Annotation,Features,Caution,Tissue specificity,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Keyword ID,Pathway.1,Allergenic properties,Biotechnological use,Disruption phenotype,Involvement in disease,Pharmaceutical use,Toxic dose,Post-translational modification,Chain,Cross-link,Disulfide bond,Glycosylation,Initiator methionine,Lipidation,Modified residue,Peptide,Propeptide,Signal peptide,Transit peptide,Taxonomic lineage (all),Taxonomic lineage (SUPERKINGDOM),Taxonomic lineage (KINGDOM),Taxonomic lineage 
(SUBKINGDOM),Taxonomic lineage (SUPERPHYLUM),Taxonomic lineage (PHYLUM),Taxonomic lineage (SUBPHYLUM),Taxonomic lineage (SUPERCLASS),Taxonomic lineage (CLASS),Taxonomic lineage (SUBCLASS),Taxonomic lineage (INFRACLASS),Taxonomic lineage (SUPERORDER),Taxonomic lineage (ORDER),Taxonomic lineage (SUBORDER),Taxonomic lineage (INFRAORDER),Taxonomic lineage (PARVORDER),Taxonomic lineage (SUPERFAMILY),Taxonomic lineage (FAMILY),Taxonomic lineage (SUBFAMILY),Taxonomic lineage (TRIBE),Taxonomic lineage (SUBTRIBE),Taxonomic lineage (GENUS),Taxonomic lineage (SUBGENUS),Taxonomic lineage (SPECIES GROUP),Taxonomic lineage (SPECIES SUBGROUP),Taxonomic lineage (SPECIES),Taxonomic lineage (SUBSPECIES),Taxonomic lineage (VARIETAS),Taxonomic lineage (FORMA),Taxonomic lineage IDs (all),Taxonomic lineage IDs (SUPERKINGDOM),Taxonomic lineage IDs (KINGDOM),Taxonomic lineage IDs (SUBKINGDOM),Taxonomic lineage IDs (SUPERPHYLUM),Taxonomic lineage IDs (PHYLUM),Taxonomic lineage IDs (SUBPHYLUM),Taxonomic lineage IDs (SUPERCLASS),Taxonomic lineage IDs (CLASS),Taxonomic lineage IDs (SUBCLASS),Taxonomic lineage IDs (INFRACLASS),Taxonomic lineage IDs (SUPERORDER),Taxonomic lineage IDs (ORDER),Taxonomic lineage IDs (SUBORDER),Taxonomic lineage IDs (INFRAORDER),Taxonomic lineage IDs (PARVORDER),Taxonomic lineage IDs (SUPERFAMILY),Taxonomic lineage IDs (FAMILY),Taxonomic lineage IDs (SUBFAMILY),Taxonomic lineage IDs (TRIBE),Taxonomic lineage IDs (SUBTRIBE),Taxonomic lineage IDs (GENUS),Taxonomic lineage IDs (SUBGENUS),Taxonomic lineage IDs (SPECIES GROUP),Taxonomic lineage IDs (SPECIES SUBGROUP),Taxonomic lineage IDs (SPECIES),Taxonomic lineage IDs (SUBSPECIES),Taxonomic lineage IDs (VARIETAS),Taxonomic lineage IDs (FORMA),Cross-reference (db_abbrev),Cross-reference (EMBL)
0,Q9UJ70,NAGK_HUMAN,[NAGK],NAGK,,,,Homo sapiens (Human),9606,N-acetyl-D-glucosamine kinase (N-acetylglucosa...,UP000005640: Chromosome 2,"cellular organisms, Eukaryota, Opisthokonta, M...",9606,,,MAAIYGGVEGGGTRSEVLLVSEDGKILAEADGLSTNHWLIGTDKCV...,344,37376,,,,,,,,,,,,"VAR_SEQ 1; /note=""M -> MRTRTGSQLAAREVTGSGAVPR...","VARIANT 38; /note=""W -> R (in dbSNP:rs1785614...",,,,"CONFLICT 70; /note=""S -> I (in Ref. 1; CAB618...",,4,[],0,,SIMILARITY: Belongs to the eukaryotic-type N-a...,,,,,"REGION 129..130; /note=""Substrate binding""; ...",,,2.7.1.59,,CATALYTIC ACTIVITY: Reaction=ATP + N-acetyl-D-...,,FUNCTION: Converts endogenous N-acetylglucosam...,,PATHWAY: Amino-sugar metabolism; N-acetylneura...,,,,,"BINDING 13; /note=""ATP; via amide nitrogen""; ...",,,,,"[cytosol [GO:0005829], extracellular exosome [...",N-acetylglucosamine metabolic process [GO:0006...,ATP binding [GO:0005524]; N-acetylglucosamine ...,cytosol [GO:0005829]; extracellular exosome [G...,"[GO:0005524, GO:0005829, GO:0006044, GO:000605...","[IPR002731, IPR043129, IPR039758, ]","[Q9NZ32, P42773, Q9UI36, Q9UI36-2, Q8TBB1, P50...",SUBUNIT: Homodimer. {ECO:0000269|PubMed:170103...,"[10824116, 14702039, 15815621, 15489334, 12665...",15231747; 15231748; 15987957; 16189514; 182348...,2002-11-01,2022-05-25,2007-01-23,186,X-ray crystallography (2),"STRAND 4..10; /evidence=""ECO:0007829|PDB:2CH5...","HELIX 37..40; /evidence=""ECO:0007829|PDB:2CH5...","TURN 76..79; /evidence=""ECO:0007829|PDB:2CH5""...",,,,,5 out of 5,Alternative sequence (1); Beta strand (14); Bi...,,TISSUE SPECIFICITY: Ubiquitous. 
{ECO:0000269|P...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,[Eukaryotic-type N-acetylglucosamine kinase fa...,186,"[Alternative products (1), Catalytic activity ...",,KW-0002; KW-0007; KW-0025; KW-0067; KW-0418; K...,Amino-sugar metabolism; N-acetylneuraminate de...,,,,,,,,"CHAIN 2..344; /note=""N-acetyl-D-glucosamine k...",,,,"INIT_MET 1; /note=""Removed""; /evidence=""ECO:...",,"MOD_RES 2; /note=""N-acetylalanine""; /evidenc...",,,,,"cellular organisms, Eukaryota, Opisthokonta, M...",Eukaryota,Metazoa,,,Chordata,Craniata,Sarcopterygii,Mammalia,,,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea (apes),Hominidae (great apes),Homininae,,,Homo,,,,Homo sapiens (Human),,,,"131567, 2759, 33154, 33208, 6072, 33213, 33511...",2759,33208,,,7711,89593,8287,40674,,,314146,9443,376913,314293,9526,314295,9604,207598,,,9605,,,,,,,,,AJ242910;AK001812;AK297224;CR457271;AK222645;A...
9,C9JEV6,C9JEV6_HUMAN,[NAGK],NAGK,,,,Homo sapiens (Human),9606,N-acetyl-D-glucosamine kinase (EC 2.7.1.59) (G...,UP000005640: Chromosome 2,"cellular organisms, Eukaryota, Opisthokonta, M...",9606,,,MVNRAKRKAGVDPLVPLRSLGLSLSGGDQEDAGRILIEELRDRFPY...,293,32018,,,,,,,,,,,,,,,,,,,3,[],0,,SIMILARITY: Belongs to the eukaryotic-type N-a...,,,"DOMAIN 50..224; /note=""BcrAD_BadFG""; /eviden...",,,,,2.7.1.59,,,,,,,,,,,,,,,,[N-acetylglucosamine kinase activity [GO:00451...,,N-acetylglucosamine kinase activity [GO:0045127],,[GO:0045127],"[IPR002731, IPR043129, IPR039758, ]",[],,"[15815621, 18669648, 19690332, 20068231, 21269...",15987957; 17010375; 22507750; 25921606; 276466...,2009-11-03,2022-05-25,2011-06-28,83,,,,,,,,,2 out of 5,Domain (1),,,,"[Proteomics identification, Reference proteome]",Evidence at protein level,unreviewed,,[Eukaryotic-type N-acetylglucosamine kinase fa...,83,[Sequence similarities (1)],,KW-1185; KW-1267,,,,,,,,,,,,,,,,,,,,"cellular organisms, Eukaryota, Opisthokonta, M...",Eukaryota,Metazoa,,,Chordata,Craniata,Sarcopterygii,Mammalia,,,Euarchontoglires,Primates,Haplorrhini,Simiiformes,Catarrhini,Hominoidea (apes),Hominidae (great apes),Homininae,,,Homo,,,,Homo sapiens (Human),,,,"131567, 2759, 33154, 33208, 6072, 33213, 33511...",2759,33208,,,7711,89593,8287,40674,,,314146,9443,376913,314293,9526,314295,9604,207598,,,9605,,,,,,,,,


In [19]:
# Read the sheet named 'Supplementary Data File 11' from the Recon3D
# supplementary workbook into a DataFrame.
RECON3_PROTEINS = pd.read_excel(
    io='/Mounts/rbg-storage1/datasets/Metabo/VMH/Recon3D/41587_2018_BFnbt4072_MOESM11_ESM.xlsx',
    sheet_name='Supplementary Data File 11',
)

In [96]:
# Build a lookup from the index of RECON3_PROTEINS to its 'seq_uniprot' value.
# NOTE(review): Series.to_dict() keys by the Series index. The read_excel call
# that creates RECON3_PROTEINS sets no index column, so unless the index was
# changed elsewhere these keys are row labels, not Entrez IDs — confirm.
entrez2uniprot = RECON3_PROTEINS['seq_uniprot'].to_dict()

In [None]:
from bioservices import UniProt

In [106]:
# Create a bioservices UniProt client for direct queries.
# NOTE(review): `u` is also created by an earlier cell; one of the two is redundant.
u = UniProt()

In [101]:
# Look up the UniProt accession stored under float(gene.id) in entrez2uniprot
# and fetch its FASTA sequence from UniProt.
# NOTE(review): `gene` must be an object with an `.id` attribute here; no
# visible cell defines such a `gene`, so this relies on state from elsewhere.
u.get_fasta_sequence(entrez2uniprot[float(gene.id)])

'MNQKTILVLLILAVITIFALVCVLLVGRGGDGGEPSQLPHCPSVSPSAQPWTHPGQSQLFADLSREELTAVMRFLTQRLGPGLVDAAQARPSDNCVFSVELQLPPKAAALAHLDRGSPPPAREALAIVFFGRQPQPNVSELVVGPLPHPSYMRDVTVERHGGPLPYHRRPVLFQEYLDIDQMIFNRELPQASGLLHHCCFYKHRGRNLVTMTTAPRGLQSGDRATWFGLYYNISGAGFFLHHVGLELLVNHKALDPARWTIQKVFYQGRYYDSLAQLEAQFEAGLVNVVLIPDNGTGGSWSLKSPVPPGPAPPLQFYPQGPRFSVQGSRVASSLWTFSFGLGAFSGPRIFDVRFQGERLVYEISLQEALAIYGGNSPAAMTTRYVDGGFGMGKYTTPLTRGVDCPYLATYVDWHFLLESQAPKTIRDAFCVFEQNQGLPLRRHHSDLYSHYFGGLAETVLVVRSMSTLLNYDYVWDTVFHPSGAIEIRFYATGYISSAFLFGATGKYGNQVSEHTLGTVHTHSAHFKVDLDVAGLENWVWAEDMVFVPMAVPWSPEHQLQRLQVTRKLLEMEEQAAFLVGSATPRYLYLASNHSNKWGHPRGYRIQMLSFAGEPLPQNSSMARGFSWERYQLAVTQRKEEEPSSSSVFNQNDPWAPTVDFSDFINNETIAGKDLVAWVTAGFLHIPHAEDIPNTVTVGNGVGFFLRPYNFFDEDPSFYSADSIYFRGDQDAGACEVNPLACLPQAAACAPDLPAFSHGGFSHN'