# 4.7 Update of annotations
Annotations are missing in the model. Annotations are added for:
- Reactions
- Metabolites
- Genes

Additionally, metabolites and genes that are not associated with any reaction are removed.

In [1]:
#import package needed
import cobra
import pandas as pd
import numpy as np

from cobra.io import load_json_model
from cobra import Model, Reaction, Metabolite

In [27]:
# Load the EcN model produced by the previous curation step (4.6)
EcN_ID = 'CP022686.1'
model_in_path = '../data/models/%s_cur_4.6.json' % EcN_ID
EcN_model = cobra.io.load_json_model(model_in_path)

## 1. Update annotations

In [3]:
# Table header -> annotation key used for the metabolite annotations
mtb_column_names = {'Reactome Compound': 'reactome', 'KEGG Compound': 'kegg.compound', 'CHEBI': 'chebi',
                    'InChI Key': 'inchikey', 'Human Metabolome Database': 'Human Metabolome Database',
                    'BioCyc': 'biocyc', 'MetaNetX (MNX) Chemical': 'metanetx.chemical',
                    'SEED Compound': 'seed.compound', 'LipidMaps': 'lipidmaps', 'KEGG Drug': 'kegg.drug',
                    'KEGG Glycan': 'kegg.glycan'}

# Table header -> annotation key used for the reaction annotations
rxn_column_names = {'RHEA': 'rhea', 'BioCyc': 'biocyc', 'MetaNetX (MNX) Equation': 'metanetx.reaction',
                    'KEGG Reaction': 'kegg.reaction', 'SEED Reaction': 'seed.reaction',
                    'EC Number': 'ec-code', 'Reactome Reaction': 'reactome'}

# Load metabolite overview: strip white space from the IDs, index by ID, rename the database columns
mtb_info = (
    pd.read_csv('../tables/metabolites_info.csv')
    .assign(ID=lambda table: table['ID'].str.strip())
    .set_index('ID')
    .rename(columns=mtb_column_names)
)

# Load reaction overview: same clean-up steps as for the metabolites
rxn_info = (
    pd.read_csv('../tables/reactions_info.csv')
    .assign(ID=lambda table: table['ID'].str.strip())
    .set_index('ID')
    .rename(columns=rxn_column_names)
)
rxn_info.head()

Unnamed: 0_level_0,Name,Reaction Formula,Gene Rules,Subsystem,Essentiality,LB,UB,FVA_min,FVA_max,rhea,biocyc,metanetx.reaction,kegg.reaction,seed.reaction,ec-code,reactome
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
ALATA_D2,D-alanine transaminase,ala__D_c + pydx5p_c --> pyam5p_c + pyr_c,CIW80_06560 or CIW80_22360,Cofactor and Prosthetic Group Biosynthesis,Not essential,0.0,1000.0,0.0,-1.698893e-12,28565,META:RXN0-5240,MNXR95697,R01147,rxn00848,,
SHCHD2,Sirohydrochlorin dehydrogenase (NAD),dscl_c + nad_c --> h_c + nadh_c + scl_c,CIW80_11380,Cofactor and Prosthetic Group Biosynthesis,Essential,0.0,1000.0,0.00025,0.0002500454,15616,META:DIMETHUROPORDEHYDROG-RXN,MNXR104373,R03947,rxn02774,1.3.1.76,
CPPPGO,Coproporphyrinogen oxidase (O2 required),cpppg3_c + 2.0 h_c + o2_c --> 2.0 co2_c + 2.0 ...,CIW80_06055,Cofactor and Prosthetic Group Biosynthesis,Not essential,0.0,1000.0,0.00025,0.0002500454,18260,META:RXN0-1461,MNXR96880,R03220,rxn02303,1.3.3.3,
GTHOr,Glutathione oxidoreductase,gthox_c + h_c + nadph_c <=> 2.0 gthrd_c + nadp_c,CIW80_12105,Cofactor and Prosthetic Group Biosynthesis,Not essential,-1000.0,1000.0,0.0,0.2774966,11743,META:GLUTATHIONE-REDUCT-NADPH-RXN,MNXR100098,R00115,rxn00086,1.8.1.7,R-XTR-71682
DHORD5,Dihydroorotic acid (menaquinone-8),dhor__S_c + mqn8_c --> mql8_c + orot_c,CIW80_22710,Purine and Pyrimidine Biosynthesis,Not essential,0.0,1000.0,0.0,0.3709138,29202,,MNXR97421,,rxn08336,,


In [4]:
# Remove the secondary biosynthesis reactions > replaced by detailed reactions in 4.6
# Rows whose ID contains 'MNXR' are the secondary biosynthesis reactions (replaced by detailed reactions in 4.6)
mnxr_ids = [rxn_id for rxn_id in rxn_info.index.values if 'MNXR' in rxn_id]
rxn_info.drop(mnxr_ids, inplace=True)

# The first copy of 2HH24DDH1 is kept under the plain ID
rxn_info.rename(index={'2HH24DDH1_copy1': '2HH24DDH1'}, inplace=True)

# These reactions were replaced in 4.6 and removed in 4.1
replaced_or_removed = ['ENTCS', 'SALASYN', 'YBTSYN', 'PRECLBTNSYN', 'ACGAL6PISO', 'ACGAL6PI',
                       '2HH24DDH1_copy2', 'URIC', 'CELLBpts', 'BUTt2rpp', 'ABUTtex', 'ACACtex',
                       'ETHAtex', 'XANtex']
rxn_info.drop(replaced_or_removed, inplace=True)

In [5]:
# Check
rxn_info.loc['GLTPD', 'ec-code']
# rxn_info.loc['NO3R2pp', 'rhea']
# rxn_info.loc['SHCHD2', 'ec-code']

'1.1.1.M6'

### 1.1 Update reactions

In [6]:
# add annotations reactions
for rxn in rxn_info.index.values:
    try:
        reaction = EcN_model.reactions.get_by_id(rxn)

        # Add database annotations when present
        for annot in ['rhea', 'biocyc', 'metanetx.reaction','kegg.reaction',
                      'seed.reaction', 'ec-code', 'reactome']:
            if pd.isnull(rxn_info.loc[rxn, annot]):
                pass
            else:
                if annot == 'rhea':
                    rxn_info.loc[rxn, annot] = rxn_info.loc[rxn, annot].split('#')[0] # Remove the #1 which is present in some rhea annotations
                reaction.annotation[annot] = [rxn_info.loc[rxn, annot]]
    except:
        print(rxn, 'not in model')

SUCptspp not in model
FFSD not in model
SUCR not in model
SUCtpp not in model


In [7]:
# Check a reaction
EcN_model.reactions.ALATA_D2.annotation  # spot-check: show the annotation dictionary of reaction ALATA_D2

{'bigg.reaction': ['ALATA_D2'],
 'biocyc': ['META:RXN0-5240'],
 'kegg.reaction': ['R01147'],
 'metanetx.reaction': ['MNXR95697'],
 'rhea': ['28565'],
 'sabiork': ['1388'],
 'sbo': 'SBO:0000176',
 'seed.reaction': ['rxn00848']}

In [8]:
# Update subsystem annotation of reactions
for rxn in EcN_model.reactions:
    if 't2pp' in rxn.id: # Correct the subsystem of these reactions to Inner Membrane
        if rxn.subsystem == 'Transport':
            rxn.subsystem = 'Transport, Inner Membrane'
    
    if 'PEP:Pyr' in rxn.name: # Correct the subsystem of these reactions to Inner Membrane
        rxn.subsystem = 'Transport, Inner Membrane'
        
    if 'tpp' in rxn.id: # Correct the subsystem of these reactions to Inner Membrane
        if rxn.subsystem == 'Transport' or rxn.subsystem == 'Exchange':
            rxn.subsystem = 'Transport, Inner Membrane'

    if 'tex' in rxn.id: # Correct the subsystem of these reactions to Outer Membrane
        if rxn.subsystem == 'Transport':
            rxn.subsystem = 'Transport, Outer Membrane'
        
    if 't4rpp' in rxn.id: # Correct the subsystem of these reactions to Outer Membrane
        rxn.subsystem = 'Transport, Outer Membrane'
        
        
### Align all names
    if rxn.subsystem == 'Transport Outer Membrane Porin':
        rxn.subsystem = 'Transport, Outer Membrane'
        
    if rxn.subsystem == 'Transport, Outer Membrane Porin':
        rxn.subsystem = 'Transport, Outer Membrane'
        
    if rxn.subsystem == 'S_Transport_Outer_Membrane_Porin':
        rxn.subsystem = 'Transport, Outer Membrane'
        
    if rxn.subsystem == 'Transport Outer Membrane':
        rxn.subsystem = 'Transport, Outer Membrane'
        
    if rxn.subsystem == 'Transport Inner Membrane':
        rxn.subsystem = 'Transport, Inner Membrane'
        
    if rxn.subsystem == 'S_Transport_Inner_Membrane':
        rxn.subsystem = 'Transport, Inner Membrane'
        
    if rxn.subsystem == 'S_Aromatic_Acid_Breakdown':
        rxn.subsystem = 'Aromatic Acid Breakdown'
        
    if rxn.subsystem == 'S_Alternate_Carbon_Metabolism':
        rxn.subsystem = 'Alternate Carbon Metabolism'
        
    if rxn.subsystem == 'S_Alternate_Carbon_source':
        rxn.subsystem = 'Alternate Carbon Metabolism'
        
    if rxn.subsystem == 'S_Lipopolysaccharide_Biosynthesis___Recycling':
        rxn.subsystem = 'Lipopolysaccharide Biosynthesis / Recycling'
        
    if rxn.subsystem == 'S_Lipopolysaccharide_Biosynthesis_Recycling':
        rxn.subsystem = 'Lipopolysaccharide Biosynthesis / Recycling'    
        
    if rxn.subsystem == 'iron metabolism':
        rxn.subsystem = 'Iron Metabolism'
        
    if rxn.subsystem == 'S_penicillin_breakdown':
        rxn.subsystem = 'Penicillin Breakdown'
        
    if rxn.subsystem == 'purine metabolism':
        rxn.subsystem = 'Purine Metabolism'
        
    if rxn.subsystem == 'siderophore biosynthesis':
        rxn.subsystem = 'Secondary metabolite biosynthesis'
        
    if rxn.subsystem == 'Benzoate degradation':
        rxn.subsystem = 'Benzoate Degradation'   

### 1.2 Update metabolites

In [9]:
# Remove metabolites
# Drop these three rows from the metabolite table before annotating.
# NOTE(review): presumably these IDs are no longer present in the model - the annotation loop
# looks every table ID up in the model without a fallback, so a missing ID would abort it.
mtb_info.drop(['asn__L', 'preclbtn_c', 'preclbtn_p'], inplace=True)

In [10]:
# add annotations metabolites
for mtb in mtb_info.index.values:
    metabolite = EcN_model.metabolites.get_by_id(mtb)

    # Add database annotations when present
    for annot in ['reactome', 'kegg.compound', 'chebi', 'inchikey', 'Human Metabolome Database', 'biocyc',
                  'metanetx.chemical', 'seed.compound', 'lipidmaps', 'kegg.drug', 'kegg.glycan']:
        if pd.isnull(mtb_info.loc[mtb, annot]):
            pass
        else:
            metabolite.annotation[annot] = [mtb_info.loc[mtb, annot]]

In [11]:
# Check a metabolite
dict_1 = EcN_model.metabolites.dms_e.annotation
EcN_model.metabolites.dms_e.annotation

{'bigg.metabolite': ['dms'],
 'biocyc': ['META:CPD-7670'],
 'chebi': ['CHEBI:4611'],
 'envipath': ['32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/compound/26c081a5-2c2d-4e64-bd11-0d80376989ba'],
 'hmdb': ['HMDB02303'],
 'inchi_key': ['QMMFVYPAHWMCMS-UHFFFAOYSA-N'],
 'kegg.compound': ['C00580'],
 'metanetx.chemical': ['MNXM444'],
 'sabiork': ['2104'],
 'sbo': 'SBO:0000247',
 'seed.compound': ['cpd00450'],
 'inchikey': ['QMMFVYPAHWMCMS-UHFFFAOYSA-N'],
 'Human Metabolome Database': ['HMDB02303']}

In [12]:
# Print every annotation value of dms_e together with its Python type
for value in dict_1.values():
    print(value, type(value))

['dms'] <class 'list'>
['META:CPD-7670'] <class 'list'>
['CHEBI:4611'] <class 'list'>
['32de3cf4-e3e6-4168-956e-32fa5ddb0ce1/compound/26c081a5-2c2d-4e64-bd11-0d80376989ba'] <class 'list'>
['HMDB02303'] <class 'list'>
['QMMFVYPAHWMCMS-UHFFFAOYSA-N'] <class 'list'>
['C00580'] <class 'list'>
['MNXM444'] <class 'list'>
['2104'] <class 'list'>
SBO:0000247 <class 'str'>
['cpd00450'] <class 'list'>
['QMMFVYPAHWMCMS-UHFFFAOYSA-N'] <class 'list'>
['HMDB02303'] <class 'list'>


In [13]:
dict_1['seed.compound']  # spot-check: show the SEED compound annotation of dms_e

['cpd00450']

### 1.3 Update genes

In [14]:
#import packages needed
from glob import glob
from Bio import Entrez, SeqIO

In [15]:
# Extract the gene name and protein ID of every CDS feature in the GenBank file,
# keyed by locus tag, into gene_annot_df (index 'locus', columns 'ncbigene' and 'ncbiprotein').
in_file = '../data/genomes_gb/CP022686.1.gb'

# Collect the values per locus first and build the DataFrame once at the end,
# instead of enlarging it one cell at a time inside the loop.
annotations_by_locus = {}

# The context manager closes the GenBank file once parsing has finished
with open(in_file) as handle:
    for record in SeqIO.parse(handle, "genbank"):
        for f in record.features:
            if f.type != 'CDS':
                continue

            # Get locus tag. A CDS without one is skipped: otherwise its qualifiers would be
            # stored under the locus tag of the previously parsed feature.
            if 'locus_tag' not in f.qualifiers:
                continue
            locus = f.qualifiers['locus_tag'][0]

            # Get gene name.
            # NOTE(review): the 'gene' qualifier holds a gene symbol (e.g. 'gap'), not a numeric
            # identifier, yet it is stored in the 'ncbigene' column - confirm this is intended.
            if 'gene' in f.qualifiers:
                annotations_by_locus.setdefault(locus, {})['ncbigene'] = f.qualifiers['gene'][0]

            # Get protein_id
            if 'protein_id' in f.qualifiers:
                annotations_by_locus.setdefault(locus, {})['ncbiprotein'] = f.qualifiers['protein_id'][0]

# One row per locus that has at least one value; missing values become NaN
gene_annot_df = pd.DataFrame(list(annotations_by_locus.values()),
                             index=pd.Index(list(annotations_by_locus.keys()), name='locus'),
                             columns=['ncbigene', 'ncbiprotein'])

gene_annot_df.head()

Unnamed: 0_level_0,ncbigene,ncbiprotein
locus,Unnamed: 1_level_1,Unnamed: 2_level_1
CIW80_00005,,AXY44376.1
CIW80_00010,,AXY44377.1
CIW80_00015,,AXY44378.1
CIW80_00020,gap,AXY44379.1
CIW80_00025,,AXY48986.1


In [16]:
# add annotations genes
for gene in gene_annot_df.index.values:
    try:
        EcN_gene = EcN_model.genes.get_by_id(gene)

        # Add database annotations when present
        for annot in ['ncbigene', 'ncbiprotein']:
            if pd.isnull(gene_annot_df.loc[gene, annot]):
                pass
            else:
                EcN_gene.annotation[annot] = [gene_annot_df.loc[gene, annot]]
    except:
        pass

In [17]:
EcN_model.genes.CIW80_00140.annotation  # spot-check: show the annotation dictionary of gene CIW80_00140

{'asap': ['ABE-0004807'],
 'ecogene': ['EG13764'],
 'ncbigene': ['945976'],
 'ncbigi': ['16129401'],
 'refseq_locus_tag': ['b1442'],
 'refseq_name': ['ydcU'],
 'refseq_synonym': ['ECK1436', 'JW1437'],
 'sbo': 'SBO:0000243',
 'uniprot': ['P77156'],
 'ncbiprotein': ['AXY44398.1']}

## 2. Remove metabolites without reaction

In [18]:
# find all metabolites without a reaction
for mtb in EcN_model.metabolites:
    if EcN_model.metabolites.get_by_id(mtb.id).reactions == frozenset():
        print(mtb)
        EcN_model.remove_metabolites(mtb)
        
# Some empty metabolites were not removed > run second time
print('\nSecond run')
for mtb in EcN_model.metabolites:
    if EcN_model.metabolites.get_by_id(mtb.id).reactions == frozenset():
        print(mtb)
        EcN_model.remove_metabolites(mtb)

frulysp_c
gg4abut_c
dhpppn_c
hkntd_c
dhps_p
2hptcl_c
dhcinnm_c
ggptrc_c
sq_c
cinnm_c
o16a4und_p
2hptcoa_c
dhptdp_c
34dphacoa_c
frulys_c
sqg_c
ragund_c
thcur_c
cechddd_c
o16aund_p
dhptdd_c
garagund_c
o16a3und_p
kphphhlipa_c
udpgalfur_c
3oxdhscoa_c
cenchddd_c
2oxpaccoa_c
hkndd_c
dtdprmn_c
rephaccoa_c
cur_c
op4en_c
dtdp4d6dm_c
sq_p
ggbutal_c
aragund_c
o16a2und_p
dhcur_c
4h2opntn_c
suc6p_c
4abzglu_c
sucr_c
pep_p

Second run


## 3. Remove genes without reaction

In [19]:
# Load gene_origin dataframe
gene_origin_df = pd.read_csv('../tables/gene_origin.csv')
gene_origin_df.set_index('EcN_gene', inplace=True)

In [20]:
print('The original number of genes is:', len(EcN_model.genes), '\n')

# Genes that are not associated to any reaction
orphan_genes = [gene for gene in EcN_model.genes if gene.reactions == frozenset()]
gene_del_list = [gene.id for gene in orphan_genes]

# Report each gene and record its removal in the gene_origin table
for gene in orphan_genes:
    print(gene)
    gene_origin_df.loc[gene.id, 'added'] = 'removed'
    gene_origin_df.loc[gene.id, 'notes'] = 'Empty gene. Removed in 4.7'

# Remove genes
cobra.manipulation.remove_genes(EcN_model, gene_del_list)

print('\nThe new number of genes is:', len(EcN_model.genes))

The original number of genes is: 1536 

CIW80_03935
CIW80_10195
CIW80_03825

The new number of genes is: 1533


- CIW80_03935 > Never had reaction, O16AP1pp in iML1515
- CIW80_10195 > AgaI, corrected in 4.2 to agaA and agaS
- CIW80_03825 > SUCtpp, removed in 4.5

## 4. Save model

In [21]:
# Save the model
# Write the updated model to the 4.7 JSON file
model_out_path = '../data/models/%s_cur_4.7.json' % EcN_ID
cobra.io.json.save_json_model(EcN_model, model_out_path)

In [22]:
# Save as a table
# Write the gene_origin table, including the removal notes, back to the CSV file it was loaded from
gene_origin_df.to_csv('../tables/gene_origin.csv')