In [1]:
import cobra
import sys
sys.path.append('../../utils/')
import importlib
import graph_utils 
importlib.reload(graph_utils)
import networkx as nx
import pandas as pd
import numpy as np
from scipy.optimize import nnls


In [2]:
graph=nx.read_gml('../../Knowledge_graph/ich360_graph.gml')

###  Read Enzyme to pp map

In [3]:
E=pd.read_csv('../../Model/model_tables/enzyme_pp_stoichiometric_matrix_uniprot.csv',index_col=0)
E.head()

Unnamed: 0,P69441,P0A763,P15770,P05020,P08244,P07004,P0ABH7,P08200,P77580,P0A9Q7,...,P77625,P0ABD5,P0A9Q5,P24182,P0ABD8,P16095,P39286,P32662,P0A7A2,P25665
6PGLUCONDEHYDROG-CPLX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
G6903-MONOMER,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPLX0-7912,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPLX0-8158,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PITB-MONOMER,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Add slack enzymes (These were previosulty identified through Ecocyc querying)

In [4]:
out_of_model_enzymes={'CPLX0-2381':{'P0A6P9':2,'P05055':3,'P0A8J8':2,'P21513':4,'P0A7B1':2},
                      'CYSSYNMULTI-CPLX':{'P0ABK5':2,'P0A9D4':6},
                      'CPLX0-8306':{'P0A830':1,'P0AEC8':1},
                    'CPLX0-8307':{'P0AEC8':2},
                      'CPLX-168':{'P69783':1,'P36672':2},
                      'CPLX0-7':{'P69783':1,'P77272':1},
                      'CPLX0-7921':{'P69797':2},
                      'CPLX0-8255':{'P69786 ':1},
                      'CPLX0-8240':{'P0ABU2':2},
                      'RIBULOKIN-CPLX':{'P08204':2}
                      
}
for additional_enzyme, subunit_composition in out_of_model_enzymes.items():
    for uniprot, stoich in subunit_composition.items():
        E.loc[additional_enzyme,uniprot]=stoich
E.fillna(0,inplace=True)

### Identify polypeptides associated with out-of-model enzymes on ecocyc

### Read proteomic Data from Schmidt et al. (2016)

In [5]:
data=pd.read_csv('../data/Schmidt_et_al_2016/parsed_data/Schmidt_et_al_2016_parsed.csv',index_col=0).set_index('uniprot_id')
data=data.filter([c for c in data.columns if 'p_per_cell' in c])
data.head()

Unnamed: 0_level_0,glucose_pp_per_cell,acetate_pp_per_cell,fumarate_pp_per_cell,glycerol_pp_per_cell,pyruvate_pp_per_cell,fructose_pp_per_cell,succinate_pp_per_cell,xylose_pp_per_cell,glucose_chemostat_mu_0_5_pp_per_cell,glucose_chemostat_mu_0_12_pp_per_cell,glucose_chemostat_mu_0_20_pp_per_cell,glucose_chemostat_mu_0_35_pp_per_cell,glucose_MG1655_pp_per_cell
uniprot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
A5A614,14.0,9.0,8.0,8.0,25.0,,8.0,,6.0,3.0,24.0,68.0,
A5A621,2.0,1.0,5.0,9.0,5.0,,0.0,,0.0,0.0,0.0,15.0,
B8LFD5,19.0,14.0,18.0,18.0,20.0,23.0,25.0,27.0,17.0,15.0,13.0,17.0,16.0
C9M2Y6,,,,,,,,,,,,,2241.0
D0EX67,131.0,81.0,93.0,104.0,94.0,110.0,97.0,132.0,80.0,39.0,51.0,63.0,117.0


### read proteomics metadata

In [6]:
metadata=pd.read_csv('../data/Schmidt_et_al_2016/parsed_data/Schmidt_et_all_2016_parsed_metadata.csv')
metadata.head()

Unnamed: 0,condition_id,strain,type,condition,carbon_source,bigg_carbon_source,growth_rate
0,acetate,BW25113,batch,aerobic,acetate,ac_e,0.3
1,fumarate,BW25113,batch,aerobic,fumarate,fum_e,0.42
2,glycerol,BW25113,batch,aerobic,glycerol,glyc_e,0.47
3,pyruvate,BW25113,batch,aerobic,pyruvate,pyr_e,0.4
4,fructose,BW25113,batch,aerobic,fructose,fru_e,0.65


Parametrise the polypeptide nodes in the graph

### Read PaxDB data

In [7]:
paxdb_data=pd.read_csv('../data/PAXDB/paxDB_integrated.tsv',sep='\t').set_index('#string_external_id')
paxdb_data.head(2)

Unnamed: 0_level_0,abundance
#string_external_id,Unnamed: 1_level_1
b3495,45935.0
b3986,12183.0


### Read map between b-numbers and UNIPROT

In [8]:
uniprot_bnum_map=pd.read_csv('../../external_database_data/EcoCyc_all_genes_pps.tsv',sep='\t')
uniprot_bnum_map.head()

Unnamed: 0,Genes,b_number,Product,MW_kDa,ecocyc_accession_id,Product.1,UniProt,biocyc_id
0,3'ETS-<i>leuZ</i>,b4759,small regulatory RNA 3'ETS<sup><i>leuZ</i></sup>,,ECK4616,small regulatory RNA 3'ETS<sup><i>leuZ</i></sup>,,RNA0-388
1,aaeA,b3241,aromatic carboxylic acid efflux pump membrane ...,34.775,ECK3230,aromatic carboxylic acid efflux pump membrane ...,P46482,G7686-MONOMER
2,aaeB,b3240,aromatic carboxylic acid efflux pump subunit AaeB,73.591,ECK3229,aromatic carboxylic acid efflux pump subunit AaeB,P46481,G7685-MONOMER
3,aaeR,b3243,DNA-binding transcriptional activator AaeR,34.516003,ECK3232,DNA-binding transcriptional activator AaeR,P67662,G7688-MONOMER
4,aaeX,b3242,DUF1656 domain-containing protein AaeX,7.8470006,ECK3231,DUF1656 domain-containing protein AaeX,P46478,G7687-MONOMER


### Add paxDB data as a column of the proteomic_data

In [9]:
schmidt_plus_paxdb=data.copy()
schmidt_plus_paxdb['paxDB_pp_per_cell']=pd.NA

for uniprot_id in schmidt_plus_paxdb.index.union(E.columns):
    if uniprot_id in uniprot_bnum_map['UniProt'].tolist():
        bnum=uniprot_bnum_map.query("UniProt==@uniprot_id").iloc[0]['b_number']
    else:
        bnum=None
    if bnum is not None and bnum in paxdb_data.index:
        schmidt_plus_paxdb.loc[uniprot_id,'paxDB_pp_per_cell']=paxdb_data.loc[bnum,'abundance']


In [10]:
# Input unmeasured model pps with PAXDB data
schmidt_plus_paxdb_imputed=schmidt_plus_paxdb.copy()
for uniprot_id in schmidt_plus_paxdb_imputed.index:
    for condition in schmidt_plus_paxdb_imputed.columns.difference(['paxDB_pp_per_cell']):
        if pd.isna(schmidt_plus_paxdb_imputed.loc[uniprot_id,condition]) and (not pd.isna(schmidt_plus_paxdb_imputed.loc[uniprot_id,'paxDB_pp_per_cell'])):
            schmidt_plus_paxdb_imputed.loc[uniprot_id,condition]=schmidt_plus_paxdb_imputed.loc[uniprot_id,'paxDB_pp_per_cell']
                                                                              
                                                                              

# Least Square Regression

we have 

$$p=E^T e$$

where p is a vector of abundance of polypeptides and e is a vector of abundance of enzymes. We recover e given p using non-negative least squares

In [11]:
ET=E.T

fitted_enzyme_abundance={'copies_per_cell':{},'g_gDW':{}}
for cur_condition in schmidt_plus_paxdb_imputed.columns:
    measured_pps_in_condition=schmidt_plus_paxdb_imputed.loc[:,cur_condition].dropna().index.intersection(ET.index)
    cur_ET=ET.loc[measured_pps_in_condition]
    #remove zero columns
    cur_ET=cur_ET.loc[:,(cur_ET!=0).any()]
    e_vector=nnls(A=cur_ET,
                 b=schmidt_plus_paxdb_imputed.loc[measured_pps_in_condition,cur_condition])[0]
    e_series_copies_per_cell=pd.Series(e_vector,cur_ET.columns)
    #Now convert in MW
    avogadro=6.022e23
    gDW_per_cell=2.8e-13
    e_series_g_gDW=pd.Series(index=e_series_copies_per_cell.index,dtype=float)
    for enzyme in e_series_copies_per_cell.index:
        if enzyme not in graph.nodes:
            continue
        mw=graph.nodes[enzyme]['mw'] #in KDa
        e_series_g_gDW[enzyme]=e_series_copies_per_cell[enzyme]*(mw*1000/    avogadro)/gDW_per_cell
    fitted_enzyme_abundance['copies_per_cell'][cur_condition.replace("_pp_per_cell",'')]=e_series_copies_per_cell
    fitted_enzyme_abundance['g_gDW'][cur_condition.replace("_pp_per_cell",'')]=e_series_g_gDW

#No

In [12]:
#Merge the g_gDW estimate in one dataframe, with a column per condition. use nans for missing data

nnls_fits_g_gDW=pd.DataFrame(fitted_enzyme_abundance['g_gDW'])
nnls_fits_g_gDW.columns=[col+'_g_gDW' for col in nnls_fits_g_gDW.columns]
#turn the index into a column named (enzyme)

#If an enzyme is in the graph, but not in the table, add a nan row
for enzyme in E.index:
    if enzyme not in nnls_fits_g_gDW.index:
        print(f"Adding {enzyme} to the table with NaN")
        nnls_fits_g_gDW.loc[enzyme]=pd.NA

#Add a column 'enzyme' on the left side of the table
nnls_fits_g_gDW['enzyme']=nnls_fits_g_gDW.index

nnls_fits_g_gDW=nnls_fits_g_gDW.reset_index(drop=True)

Adding PITB-MONOMER to the table with NaN
Adding CPLX0-7653 to the table with NaN
Adding EG11512-MONOMER to the table with NaN
Adding CPLX0-8255 to the table with NaN


In [13]:
#Merge the copies_per_cell estimate in one dataframe, with a column per condition. use nans for missing data

nnls_fits_copies_per_cell=pd.DataFrame(fitted_enzyme_abundance['copies_per_cell'])
nnls_fits_copies_per_cell.columns=[col+'_copies_per_cell' for col in nnls_fits_copies_per_cell.columns]
#turn the index into a column named (enzyme)

#If an enzyme is in the graph, but not in the table, add a nan row
for enzyme in E.index:
    if enzyme  not in nnls_fits_copies_per_cell.index:
        print(f"Adding {enzyme} to the table with NaN")
        nnls_fits_copies_per_cell.loc[enzyme]=pd.NA

#Add a column 'enzyme' on the left side of the table
nnls_fits_copies_per_cell['enzyme']=nnls_fits_copies_per_cell.index

nnls_fits_copies_per_cell=nnls_fits_copies_per_cell.reset_index(drop=True)
enzyme_column=nnls_fits_copies_per_cell.pop('enzyme')
nnls_fits_copies_per_cell.insert(0,'enzyme',enzyme_column)

Adding PITB-MONOMER to the table with NaN
Adding CPLX0-7653 to the table with NaN
Adding EG11512-MONOMER to the table with NaN
Adding CPLX0-8255 to the table with NaN


In [14]:
#Export to csv
schmidt_plus_paxdb.to_csv('../data/Schmidt_et_al_2016/parsed_data/Schmidt_et_al_2016_plus_paxDB.csv')
schmidt_plus_paxdb_imputed.to_csv('../data/Schmidt_et_al_2016/parsed_data/Schmidt_et_al_2016_paxDB_imputed.csv')
nnls_fits_g_gDW.to_csv('./schmidt_2016_mass_abundance_mapped_NNLS.csv',index=False)
nnls_fits_copies_per_cell.to_csv('./schmidt_2016_copies_per_cell_abundance_mapped_NNLS.csv',index=False)

  values = values.astype(str)
