# Build Excel stoich for GAMS (originally A01_build_excel_stoich_for_GAMS.ipynb)

#### Load data

In [1]:
import cobra,json,os,sys,openpyxl,re,pandas as pd
from collections import OrderedDict
from copy import deepcopy

model_root_path = '../'
sys.path.append(model_root_path)
from pycore.gsm_custom_functions import *
from pycore.cobrapy_custom_extras import *
kribo_default = 22*3600 #(22 aa/ribo/s converted to aa/ribo/h)
mito_compartments = ['m', 'mm'] # IDs for mitochondria compartments

In [2]:
# Metabolic model (COBRApy json)
# If you have to use multiple stoichiometric models (SMs), 
#   it's best to keep 1 as the main copy and then use transaction files to apply differences to the others. Helps improve consistency in annotations, stoich. etc.
chosen_model = 'Batch-Rabinowitz' # c: carbon, n: nitrogen

# List parameters for your model version(s), such as:
	# max_allowed_mito_proteome_allo_fraction: maximum fraction of proteome that can be allocated to mitochondria
	# nonmodeled_proteome_allocation: fraction of proteome not accounted for in the model
	# predicted_but_unused_proteins_path: path to file with predicted proteins that are not used in the model
	# sheet_name: name of sheet in relevant excel files (e.g., biomass, dummy protein) to use
# for 'Batch-Rabinowitz' before including nonmodeled proteins into model: 'nonmodeled_proteome_allocation':0.49,
modelVersions = {
	'Batch-Rabinowitz':{
		'max_allowed_mito_proteome_allo_fraction':0.13953,
		'nonmodeled_proteome_allocation':0,
		'predicted_but_unused_proteins_path':'%model_root_path%prosyn_constraints_predicted_but_unused_Rabinowitz2023_batchGlc_using_MFA_and_scRBA_methods.txt',
		'sheet_name':'batch-Rabinowitz'
	},
	'C':{
		'max_allowed_mito_proteome_allo_fraction':0.13464,
		'nonmodeled_proteome_allocation':0.54,
		'sheet_name':'chemo-clim-Rabinowitz'
	},
	'N':{
		'max_allowed_mito_proteome_allo_fraction':0.13033,
		'nonmodeled_proteome_allocation':0.53,
		'sheet_name':'chemo-nlim-Rabinowitz'
	}
	}
modelname = 'iRhto'+chosen_model
model = load_cobra_model('./input/'+modelname+'.json')
# if not choosing between nutrient-limited models, uncomment the following lines:
# model = cobra.io.load_json_model('./input/iRhtoC.json')
# chosen_model = ''

# Update GAMS settings file to reflect model version, 
#   and add NGAM, mito capacity, etc.
#   If not using multiple models, replace everything in the curly braces with the values you want
with open('./model/model-version-settings.txt', 'w') as f:
	npa = modelVersions[chosen_model].get('nonmodeled_proteome_allocation', 0)
	mampaf = modelVersions[chosen_model]['max_allowed_mito_proteome_allo_fraction'] if 'max_allowed_mito_proteome_allo_fraction' in modelVersions[chosen_model] else 1
	f.write(f"* file with settings updated by build_model scripts, to help with managing multiple SMs\n"
		f"$setGlobal max_allowed_mito_proteome_allo_fraction {mampaf}\n"
		f"$setGlobal nonmodeled_proteome_allocation {npa}\n")
	if 'measured_but_unused_proteins_path' in modelVersions[chosen_model]:
		f.write('* unless specified otherwise (e.g., in test_kapp.gms), include prosyn constraints to reflect production of unused proteins; comment out if using dummy protein instead\n')
		if modelVersions[chosen_model]['predicted_but_unused_proteins_path']:
			if os.path.exists(modelVersions[chosen_model]['predicted_but_unused_proteins_path']):
				f.write(f"$if not %ignore_measured_unused_constraints%==1 $include {modelVersions[chosen_model]['predicted_but_unused_proteins_path']}\n")

# Protein
df_pro = read_spreadsheet('./input/PROTEIN_stoich_curation.xlsx')
## OPTIONAL: add "special cases only" sheet to df_pro. 
##   Good for distinguishing proteins in the organism from ones added for testing purposes.
include_special_case_proteins = False # affects protein and enzyme stoichiometry
if include_special_case_proteins:
	df_pro_special = read_spreadsheet('./input/PROTEIN_stoich_curation.xlsx',sheet_name='special cases only',header=1)
	df_pro = pd.concat([df_pro,df_pro_special],ignore_index=True)

# Amino acid info
df_aamap = pd.read_csv('./input/PROTEIN_amino_acid_map.txt', sep='\t')
df_aamap.index = df_aamap.aa_abbv.to_list()

# Dummy protein
dummy_path = './input/PROTEIN_dummy_prot_calc.xlsx'
# df_aa_dummy = read_spreadsheet(dummy_path)
# df_aa_dummy2 = read_spreadsheet(dummy_path,sheet_name='unidentified')
# 1st entry is default dummy protein; remove 2nd entry unless needed
dummy_list = [{"df":read_spreadsheet(dummy_path,sheet_name=modelVersions[chosen_model]['sheet_name']),"name":"PROSYN-PROTDUMMY"},
			  {"df":read_spreadsheet(dummy_path,sheet_name='unidentified'),"name":"PROSYN-PROTDUMMYUNIDENTIFIED"}]
for prot in dummy_list:
	prot["df"].index = prot["df"].aa_abbv.to_list()
	prot["medianL"] = int(round(prot["df"].loc['A', 'Unnamed: 5'], 0))
	# Add 1e-5 g/mmol to prevent number round lost of protein MW
	#dummy_MW
	prot["MW"] = round(prot["df"].loc['C', 'Unnamed: 5'], 5) + 1e-5
	prot["biosyn"] = prot["name"].replace('PROSYN-','BIOSYN-')
	prot["met"] = prot["name"].lower().replace('prosyn-','BIO-')
	prot["translation_loc"] = 'unknown' if not 'translation_loc' in prot else prot['translation_loc']

# Enzyme
df_enz = read_spreadsheet('./input/ENZYME_stoich_curation.tsv')
# add 'id' column to df_enz, which is 'RXN-' + rxn_src + '_' + dir + '-' + enz
if include_special_case_proteins:
	df_enz_special = read_spreadsheet('./input/ENZYME_stoich_curation_special_cases.tsv')
	df_enz = pd.concat([df_enz,df_enz_special],ignore_index=True)

# RNA
df_rnas = read_spreadsheet('./input/RNA_stoich.xlsx')
df_rnas.index = df_rnas.RNAid.to_list()

# Ribosome ('_c' for nucleus or nucleoid, '_m' for mitochondria)
ribo_dict = {'_c':read_spreadsheet('./input/RIBOSOME_nucleus.xlsx'), 
			 '_m':read_spreadsheet('./input/RIBOSOME_mitochondria.xlsx')}

# Biomass
# heading start at cell X4
df_biom = read_spreadsheet('./input/BIOMASS_RBA.xlsx', sheet_name=modelVersions[chosen_model]['sheet_name'], header=4)
# concat with the other biomass sheet if needed (WARNING: should find a way to avoid having elements be redefined)
# df_biom = pd.concat([df_biom,read_spreadsheet('./input/BIOMASS_RBA.xlsx', sheet_name='batch-Rabinowitz+Pinheiro-macro', header=4)],ignore_index=True)
# df_biom = read_spreadsheet('./input/BIOMASS_RBA.xlsx', sheet_name='batch-Pinheiro', header=4)
# list of biomass rxns to turn off by default and only turn on when needed; any rxns not found will be excluded from list
whole_biomass_rxns = ['BIOSYN-' + i for i in ['BIODILAERO', 'BIODILAERO-NOGAM','BIOXYL-PINHEIRO','BIOXYL-PINHEIRO-NOGAM']] + ['BIOMASS_xyl_hybrid']

# checks which rxns to ignore when seeing if all necessary enzymes are modeled. Ignore blocked rxns (to avoid unnecessary work from adding them)
rxns_not_needing_enzymes = list(dict.fromkeys([r.id for r in model.reactions if r.bounds == (0,0)] + [rxn.id for rxn in find_biomass_reactions(model)] + ['BIOMASS_MFA','BIOMASS_MFA_NO_GAM','BIOMASS_RBA']))
# Gene-Protein Reaction associations (GPRs) representing spontaneous reactions (i.e., no enzyme needed in model)
spont_GPRs = ['SPONT', 'UNKNOWN']
# take IDs of all rxns
print('Reactions to ignore when checking if enzymes must be added:')
print(rxns_not_needing_enzymes)

Reactions to ignore when checking if enzymes must be added:
['ABTD2Dy_c', 'ABTD4Dy_c', 'ABTLD_c', 'ACALDt_c_m', 'ACOAO100_x', 'ACOAO120_x', 'ACOAO40_x', 'ACOAO60_x', 'ACOAO80_x', 'ALCD2i2_c', 'ASNtps_v', 'ASPtps_v', 'BIOMASS_RBA', 'BIOMASS_MFA', 'BIOMASS_MFA_NO_GAM', 'BIOMASS_PinheiroEtAl2020', 'BIOMASS_xyl_hybrid', 'CTPS1_c', 'CYSItps_v', 'DDPA_m', 'EX_fald_e', 'FER_c', 'G3PD1i_m', 'GLNtps_v', 'GLUtps_v', 'GLYAT_c', 'HMGCOAS_m', 'ILEtps_v', 'LEUtps_v', 'NITRy_c', 'PHCHGS_m', 'PKETF_c', 'TYRtps_v', 'XU5PFGT_x', 'XYLI1_c', 'XYLI2_c', 'XYLK_c', 'XYLURx_c', 'XYLURy_c', 'BIOMASS']


In [3]:
# Expanding df_pro to include sublocations as separate proteins
## remove the 'id' column
df_pro = df_pro.drop(columns=['id'])
# if the 'gene_src' column is empty, fill it with the 'name' column
df_pro['gene_src'] = df_pro['gene_src'].fillna(df_pro['name'])
## for each protein, if there are commas in subloc_assigned, split them into separate entries with 'id' modified to reflect sublocation
df_pro['subloc_assigned'] = df_pro['subloc_assigned'].apply(lambda x: x.split(',') if type(x) == str and ',' in x else x)
# add the 'id' column back in, assigning it to "gene_src" plus "subloc_assigned" if the latter is not 'unknown'
# modify IDs to reflect sublocation
# add location only if there are multiple proteins with the same gene source
df_pro = df_pro.explode('subloc_assigned').reset_index(drop=True)
if 'id' not in df_pro.columns:
	df_pro['id'] = df_pro.apply(
		lambda row: row['id_manual_override (clear when using new sheet)']
		if pd.notna(row['id_manual_override (clear when using new sheet)'])
		else (
			str(row['gene_src'])
			+ (
				'_' + row['subloc_assigned']
				if row['subloc_assigned'] != 'unknown'
				and df_pro[df_pro['gene_src'] == row['gene_src']].shape[0] > 1
				else ''
			)
		),
		axis=1
	)
df_pro.index = df_pro.id.to_list()
# save df_pro to model folder
df_pro[['id'] + [col for col in df_pro.columns if col != 'id']].to_csv('./model/PROTEIN_stoich_curation.tsv', sep='\t', index=False)

In [5]:
# calculate MW of enzymes = sum of MW of all proteins in protein_stoich (MW from df_pro) * number of protein_stoich

# if there are any rows in df_enz with 'dir' that's empty, create 2 new rows for each of those, one with 'dir' as 'FWD' and one with 'dir' as 'REV' (for faster writing to model)
if df_enz['dir'].isnull().any():
	print('Enzyme stoichiometry has empty "dir" values. Creating FWD and REV entries for those enzymes.')
	df_enz_fwd = df_enz.copy()
	df_enz_fwd['dir'] = df_enz_fwd['dir'].fillna('FWD')
	df_enz_rev = df_enz.copy()
	df_enz_rev['dir'] = df_enz_rev['dir'].fillna('REV')
	df_enz = pd.concat([df_enz_fwd[df_enz_fwd['dir'] == 'FWD'], df_enz_rev[df_enz_rev['dir'] == 'REV']], ignore_index=True)
	# sort df_enz by 'rxn_src' and 'dir'
	df_enz = df_enz.sort_values(by=['rxn_src', 'dir']).reset_index(drop=True)
	# Now df_enz_new contains all original rows, plus FWD and REV for those missing 'dir'

df_enz['id'] = df_enz.apply(lambda x: 'RXN-' + x['rxn_src'] + '_' + x['dir'] + '-' + x['enz'], axis=1)
df_enz.index = df_enz.id.to_list()
mw = 'MW (g/mmol)'
for enz in df_enz.index:
	enz_mw = 0
	for pro in df_enz.loc[enz, 'protein_stoich'].split(','):
		# find protein stoichiometry in df_pro
		delim = ':'
		if delim in pro:
			prot_id = pro.split(delim)[0].strip()
			prot_stoich = float(pro.split(delim)[1].strip())
			pro_matches = df_pro.loc[df_pro.gene_src == prot_id]
			# if no matches, check if prot_id is in df_pro.index
			if len(pro_matches) == 0:
				if prot_id in df_pro.index:
					pro_matches = df_pro.loc[prot_id]
				else:
					print(f'Warning: {prot_id} not found in protein stoichiometry file')
					continue
			pro_mw = pro_matches[mw]
			if isinstance(pro_matches[mw], pd.Series):
				if pro_matches[mw].nunique() > 1:
					print(f'Warning: {prot_id} has multiple MWs in df_pro')
				pro_mw = pro_matches[mw].iloc[0]
			if float(pro_mw) == 0:
				print(f'Warning: MW for {prot_id} is 0 in protein stoichiometry file')
				continue
			enz_mw += pro_mw * prot_stoich
	# set enz_mw to NaN if it is 0
	if enz_mw == 0:
		enz_mw = float('nan')
	df_enz.loc[enz, mw] = enz_mw
# move 'id' column to the front
df_enz[['id'] + [col for col in df_enz.columns if col != 'id']].to_csv('./model/ENZYME_stoich_curation.tsv', sep='\t', index=False)

Enzyme stoichiometry has empty "dir" values. Creating FWD and REV entries for those enzymes.


In [6]:
# list all genes, to map them to translation (PROSYN) rxns and other rxns making their products where needed
# combine all genes from df_pro and model
# combine all genes from df_pro and model, ensuring all are strings and not NaN
genes_model = [str(gene.id) for gene in model.genes if pd.notna(gene.id)]
genes_pro = [str(g) for g in df_pro.gene_src.to_list() if pd.notna(g)]
genes_rna = [str(r) for r in df_rnas.RNAid.to_list() if pd.notna(r)]
genes = sorted(list(dict.fromkeys(genes_model + genes_pro + genes_rna)))
gene_expression_dict = {}

In [7]:
# testing only
# # tag_c formula weight
# # print(model.metabolites.get_by_id('tag_c').formula_weight-227.000000) # 227.000000 from Ac element accidentally listed in it
# # TAG MW is 89.07 w/o Acyl3, 882.40 w/ Acyl3 according to https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6838544/. Thus, Acyl MW is (882.40-89.07)/3 = 264.4433333333
# # calculate_molecular_weight('C28H43O')
# # (12.010700*28)+(1.007940*43)+15.999400 = 395.64042
# # print(model.metabolites.get_by_id('ergstest_c').formula_weight-227.000000+264.4433333333) # 227.000000 from Ac element accidentally listed in it
# print(model.metabolites.get_by_id('o2_c').formula_weight)

In [8]:
# Make version of model with no generic cofactors, for publication and for MEMOTE testing
model_no_generics = deepcopy(model)
# remove rxns with names beginning with "GENERIC_"
for rxn in model.reactions:
	if rxn.id.startswith("GENERIC_"):
		# print(rxn.id)
		model_no_generics.reactions.get_by_id(rxn.id).remove_from_model()
report_mass_balance(model_no_generics)
# remove metabolites with names beginning with "pseudometabolite "
for met in model.metabolites:
	if met.name.startswith("pseudometabolite "):
		# print(met.name)
		model_no_generics.metabolites.get_by_id(met.id).remove_from_model()
# write JSON version of the model w/o these additions (for future use)
cobra.io.save_json_model(model_no_generics, "./input/"+modelname+"-no-generic-cofactors.json")
# convert to xml file
cobra.io.write_sbml_model(model_no_generics, "./input/"+modelname+"-no-generic-cofactors.xml")

DM_5mta_c
5mta_c --> 
5mta_c	C11H15N5O3S1	0	C00170	5'-S-methyl-5'-thioadenosine	biocyc
C:-11, H:-15, N:-5, O:-3, S:-1, charge:0

DM_gcald_c
gcald_c --> 
gcald_c	C2H4O2	0	C00266	glycolaldehyde	biocyc
C:-2, H:-4, O:-2, charge:0

TORULNOXi_c
o2_c + toruln_c --> 2.0 h_c + torularho_c
h_c	H1	1	C00080	H+	biocyc
o2_c	O2	0	C00007	oxygen	biocyc
torularho_c	C40H52O2	0	      	Torularhodin	manual
toruln_c	C40H54	0	C08613	Torulene	seed.compound
H:0, O:0, C:0, charge:2

compACYLCOA_c
0.01875 docoscoa_c + 0.355664 linocoa_c + 0.087612 linolncoa_c + 0.207129 odecoa_c + 0.1694 pmtcoa_c + 0.112806 stcoa_c + 0.048639 ttccoa_c <=> acylcoa_c
acylcoa_c	C21H31N7O16P3SAcyl	-4	C00040	Acyl-CoA	manual
docoscoa_c	C43H74N7O17P3S1	-4	C16528	docosanoyl-CoA	biocyc
linocoa_c	C39H62N7O17P3S	-4	C02050	linoleoyl-CoA	seed.compound
linolncoa_c	C39H60N7O17P3S	-4	C16162	alpha-linolenoyl-CoA	seed.compound
odecoa_c	C39H64N7O17P3S1	-4	C00510	oleoyl-CoA	biocyc
pmtcoa_c	C37H62N7O17P3S1	-4	C00154	palmitoyl-CoA	biocyc
stcoa_c	C39H6

#### Assemble reactions

In [9]:
import itertools
import cobra.manipulation

def find_true_combinations(expression, use_knockout=False, variables:list=[]):
    """
    Finds all combinations of boolean values that make the given expression true.

    Args:
        expression: The boolean expression as a string.
        use_knockout: If True, the function will test combinations where only one variable is False, and use those to eliminate redundant combinations, saving time. Will return an error if the expression contains 'not' operators unless set to 'ignore warning'.
        variables: A list of strings (variable names in expression) to consider. If none provided, the function will extract variable names from the expression. Added in case consistent RegEx parsing proves too unreliable.
    Returns:
        A list of dictionaries, where each dictionary represents a true combination.

    """

    # Extract variables from the expression
    if variables == []:
        variables = set(re.findall(r'\b\w+\b', expression))
        if use_knockout and any([var in ['not','!'] for var in variables]):
            raise ValueError("The expression contains 'not' operators, which may cause issues with the 'use_knockout' option. If you still want to use this option, set it to 'ignore warning'.")
        # remove 'and', 'or', 'not' from variables
        variables = [var for var in variables if var not in ['and', 'or', 'not']]
    # print(variables)

    # Generate all possible combinations of True/False values for variables
    combinations = itertools.product([True, False], repeat=len(variables))
    # sort combinations from fewest to most True values
    combinations = sorted(combinations, key=lambda x: sum(x))
    combos_to_check = combinations.copy()
    # print(combinations)
    # find all combinations where only one variable is False
    knockout_combos = []
    essential_vars = [] # variables that must be True
    # test if combo where all variables are True makes the expression True
    all_true = dict(zip(variables, [True]*len(variables)))
    expr = expression
    for var in sorted(variables, key=len, reverse=True):
        expr = expr.replace(var, str(all_true[var]))
    if use_knockout != False:
        if eval(expr):
            # for each variable, find the combination where only that variable is False
            for var in variables:
                combo = [True]*len(variables)
                combo[variables.index(var)] = False
                knockout_combos.append(combo)
                expr = expression
                for var2 in sorted(variables, key=len, reverse=True):
                    expr = expr.replace(var2, str(combo[variables.index(var2)]))
                # add variable to essential_vars if it must be True
                if not eval(expr):
                    essential_vars.append(var)
                    # remove all combinations where this variable is False from combos_to_check
                    combos_to_check = [combo for combo in combos_to_check if combo[variables.index(var)]]
    # if len(combinations) != len(combos_to_check):
    #     print(f'Reduced combinations to check from {len(combinations)} to {len(combos_to_check)} by eliminating redundant combinations.')

    true_combinations = []
    for combo in combos_to_check:
        # Create a dictionary mapping variables to their assigned values
        values = dict(zip(variables, combo))
        # print(values)
        # print(combo)
        # replace variables with their values in 'values'
        # cobra.manipulation.knock_out_model_genes

        expr = expression
        # Replace variables with their values in the expression, starting with the longest variable names to avoid the risk of replacing substrings
        for var in sorted(variables, key=len, reverse=True):
            expr = expr.replace(var, str(values[var]))

        # print(expression)
        # print(expr,type(expr))
        # Evaluate the expression
        if eval(expr):
            # check if the combination's True values are a subset of any previous ones; if so, don't add it
            # print('\t',values)
            redundant_combos = False
            true_vals = [var for var in values if values[var]]
            for prev in true_combinations:
                prev_vals = [var for var in prev if prev[var]]
                if set(true_vals).issubset(set(prev_vals)) or set(prev_vals).issubset(set(true_vals)):
                    redundant_combos = True
                    # print('redundant:',true_vals)
                    break
            if not redundant_combos:
                true_combinations.append(values)

    return true_combinations

# Example usage:
expression = "rt05 and ((rt01 and (rt012 or rt0123)) or rt04)"
true_combos = find_true_combinations(expression, use_knockout=True)
print(true_combos)

[{'rt04': True, 'rt012': False, 'rt05': True, 'rt01': False, 'rt0123': False}, {'rt04': False, 'rt012': True, 'rt05': True, 'rt01': True, 'rt0123': False}, {'rt04': False, 'rt012': False, 'rt05': True, 'rt01': True, 'rt0123': True}]


In [10]:
# Automatically create placeholders for enzymes based on GPRs in model.reactions
rxns_to_skip = set(rxns_not_needing_enzymes)
enz_rxn_dict = {}
spont_rxn_dict = {}
rxn_updates = {rxn.id: {} for rxn in model.reactions}
GPR_enz_dict = {}
try:
	df_enz_rxns = read_spreadsheet('./input/ENZYME_stoich.tsv')
	# make set of rxns to skip, from combining the 'rxns' column cells in df_enz_rxns (to avoid reviewing existing rxns) and separating by commas. However, relying on this will prevent updates to their GPRs from being added to the model.
	# rxns_to_skip.update(set(df_enz_rxns.rxns.str.split(',').sum()))
	# make enz_rxn_dict from df_enz_rxns, with 'enz' as keys and 'rxns' and 'protein_stoich' as values
	for index, row in df_enz_rxns.iterrows():
		enz_rxn_dict[row['enz']] = {'rxns': sorted(list(dict.fromkeys(row['rxns'].split(',')))),
										'protein_stoich': {prot.split(':')[0]: float(prot.split(':')[1]) for prot in row['protein_stoich'].split(',')}}
		if not pd.isna(row[mw]):
			enz_rxn_dict[row['enz']][mw] = row[mw]
except:
	df_enz_rxns = pd.DataFrame(columns=['enz','rxns','protein_stoich',mw])
# add dict of protein_stoich to enz_rxn_dict, with each protein as a key and its stoichiometry as the value

# find all unique GPRs in model.reactions, and their associated reactions
for rxn in model.reactions:
	# print(rxn.id, rxn.gene_reaction_rule)
	if rxn.id in rxns_to_skip:
		continue
	if rxn.gene_reaction_rule not in spont_GPRs + ['']:
		if rxn.gene_reaction_rule in GPR_enz_dict.keys():
			# add rxn to corresponding enz in enz_rxn_dict
			enz_ids = GPR_enz_dict[rxn.gene_reaction_rule]
			for enz_id in enz_ids:
				enz_rxn_dict[enz_id]['rxns'].append(rxn.id)
				rxn_updates[rxn.id].setdefault('enz',enz_id)
		else:
			GPR_enz_dict[rxn.gene_reaction_rule] = set()
			enzs = sorted(find_true_combinations(rxn.gene_reaction_rule, use_knockout=True, variables=[g.id for g in rxn.genes]), key=lambda x: sorted(x.keys()))
			for enz in enzs:
				# print('\t',enz)
				enz_found = None
				# default: 1 copy of each protein is used
				# prot_stoich = ','.join([gene+':1' for gene in sorted(enz.keys()) if enz[gene]])
				prot_stoich = {gene:1 for gene in sorted(enz.keys()) if enz[gene]}
				# check if any enzymes with the same stoichiometry are already in enz_rxn_dict
				enz_id = ''.join(sorted(prot_stoich.keys()))

				for old_enz_id in enz_rxn_dict:
					# print(enz_id,enz_rxn_dict[enz_id])
					# automatically assumes enzymes are the same if they have the same proteins, regardless of order/count
					if sorted(enz_rxn_dict[old_enz_id]['protein_stoich'].keys()) == sorted(prot_stoich.keys()):
						enz_rxn_dict[old_enz_id]['rxns'].append(rxn.id)
						enz_found = old_enz_id
						break
				if not enz_found:
					# enz_rxn_dict[rxn.id] = enz
					enz_rxn_dict[enz_id] = {'rxns':[rxn.id],
											'protein_stoich':prot_stoich}
					rxn_updates[rxn.id].setdefault('enz',enz_id)
					GPR_enz_dict[rxn.gene_reaction_rule].add(enz_id)
				else:
					GPR_enz_dict[rxn.gene_reaction_rule].add(enz_found)
	else:
		# add to spont_rxn_dict under its GPR as a list of rxn IDs, if it is not already there
		spont_rxn_dict[rxn.gene_reaction_rule] = spont_rxn_dict.get(rxn.gene_reaction_rule, {'rxns':[],'protein_stoich':{'zeroCost':1}})
		spont_rxn_dict[rxn.gene_reaction_rule]['rxns'].append(rxn.id)
		rxn_updates[rxn.id].setdefault('gene_reaction_rule',rxn.gene_reaction_rule)
		# spont_rxn_dict[rxn.gene_reaction_rule] = spont_rxn_dict.get(rxn.gene_reaction_rule, [rxn.id])
		# spont_rxn_dict[rxn.gene_reaction_rule] = rxn.id
					
# give each enz in enz_rxn_dict a row in df_enz_rxns
new_rows = []
for enz in enz_rxn_dict:
	# only add enzymes that are not already in df_enz_rxns
	if df_enz_rxns[df_enz_rxns.enz == enz].empty:
		# calculate MW of each enzyme based on its protein stoichiometry
		if mw in enz_rxn_dict[enz] and enz_rxn_dict[enz][mw]:
			enz_mw = enz_rxn_dict[enz][mw]
		else:
			enz_mw = 0
			for prot in enz_rxn_dict[enz]['protein_stoich']:
				pro_mw = df_pro.loc[prot,mw] if prot in df_pro.index else 0
				# check in gene_src column
				if pro_mw == 0:
					if prot in df_pro.gene_src.to_list():
						# print(prot,df_pro.loc[df_pro.gene_src == prot,mw])
						# if there are multiple proteins with the same gene_src, check if their MW is the same
						# if not, print a warning
						# pick the first one
						pro_match = df_pro.loc[df_pro.gene_src == prot,mw].to_list()
						if df_pro.loc[df_pro.gene_src == prot,mw].nunique() > 1:
							print(f'Warning: {prot} has multiple MWs in df_pro')
						pro_mw = pro_match[0]
					else:
						print(f'Warning: {prot} not found in df_pro')
				enz_mw += pro_mw*enz_rxn_dict[enz]['protein_stoich'][prot]
		new_rows.append({'enz':enz,'rxns':','.join(enz_rxn_dict[enz]['rxns']),
						'protein_stoich':','.join([prot+':'+str(enz_rxn_dict[enz]['protein_stoich'][prot]) for prot in enz_rxn_dict[enz]['protein_stoich']]),
						mw:enz_mw,
						'status':'automatically added'})
df_enz_rxns = pd.concat([df_enz_rxns, pd.DataFrame(new_rows)], sort=False, ignore_index=True)
# sort by id, MW, and then by rxns
df_enz_rxns = df_enz_rxns.sort_values(by=['rxns',mw,'enz'])
# save automatically updated df_enz_rxns as separate file
df_enz_rxns.to_csv('./model/ENZYME_stoich.tsv', sep='\t', index=False)



  df_enz_rxns = pd.concat([df_enz_rxns, pd.DataFrame(new_rows)], sort=False, ignore_index=True)


In [11]:
# check if all rxns in model are accounted for in df_enz_rxns, and if they have been automatically updated
# make a list of all rxns in df_enz_rxns
enz_rxns = []
for index, row in df_enz_rxns.iterrows():
	enz_rxns += row['rxns'].split(',')
enz_rxns = sorted(list(dict.fromkeys(enz_rxns)))
# check if all rxns in model are in enz_rxns
missing_rxns = []
for rxn in model.reactions:
	if rxn.id not in rxns_not_needing_enzymes and rxn.id not in enz_rxns and rxn.id not in whole_biomass_rxns and rxn.gene_reaction_rule not in spont_rxn_dict.keys():
		missing_rxns.append(rxn.id)
if len(missing_rxns) > 0:
	print(f'Warning: the following reactions are not accounted for in df_enz_rxns:')
	for rxn in missing_rxns:
		print(rxn)
# remove rxns from rxn_updates if their value is {}
for rxn in rxn_updates.copy():
	if not rxn_updates[rxn]:
		del rxn_updates[rxn]
	else:
		print(f'Updating {rxn} with {rxn_updates[rxn]}')
# if rxn_updates isn't empty, print a warning
# if len(rxn_updates) > 0:
# 	raise ValueError(f'Warning: the following reactions have been automatically updated:\n{rxn_updates}')

Updating 12AMANTF_g with {'enz': 'rt2093'}
Updating 13BDGLUCANt_c_en with {'gene_reaction_rule': ''}
Updating 13BGH_e with {'enz': 'rt5185'}
Updating 13GS_c with {'enz': 'rt7616'}
Updating 14BMANTF_c with {'enz': 'rt6298'}
Updating 14DMLANOSTt_c_e with {'gene_reaction_rule': ''}
Updating 16GS_c with {'enz': 'rt0150'}
Updating 1AGPCt_l_rm with {'gene_reaction_rule': ''}
Updating 1AGPEt_l_rm with {'gene_reaction_rule': ''}
Updating 1MLCLAT_mm with {'enz': 'rt8106'}
Updating 2DDA7Pt_c_m with {'gene_reaction_rule': ''}
Updating 2DHPt_c_m with {'gene_reaction_rule': ''}
Updating 2DOXG6PP_c with {'enz': 'rt5045'}
Updating 2MBACt_c_e with {'gene_reaction_rule': ''}
Updating 2MBALDt_c_e with {'gene_reaction_rule': ''}
Updating 2MBALDt_c_m with {'gene_reaction_rule': ''}
Updating 2MBTOHt_c_e with {'gene_reaction_rule': ''}
Updating 2MBTOHt_c_m with {'gene_reaction_rule': ''}
Updating 2OBUTt_c_m with {'gene_reaction_rule': ''}
Updating 2OGMAH_c with {'enz': 'rt6557'}
Updating 2OSUCAH_c with {'en

In [12]:
# convert df_enz_rxns to old ENZYME_stoich_curation file format from 2024 version of scRBA, for compatibility
df_rxn_enz_pairs = pd.DataFrame(columns=['id','rxn_src','enz','protein_stoich'])
catalyst_rxn_dict = {**spont_rxn_dict, **enz_rxn_dict}
for k, v in catalyst_rxn_dict.items():
	if k in spont_rxn_dict:
		catalyst_rxn_dict[k]['rxns'].extend(spont_rxn_dict[k]['rxns']) # adds rxns that were overrided by merger with enz_rxn_dict
		catalyst_rxn_dict[k]['rxns'] = list(dict.fromkeys(catalyst_rxn_dict[k]['rxns']))
for enz in catalyst_rxn_dict:
	enz_id = enz if enz != '' else 'SPONT'
	# don't seem to need MW of enzyme in prior RBA code
	prot_stoich_str = 'zeroCost' if enz in spont_GPRs + [''] else ','.join([prot+':'+str(catalyst_rxn_dict[enz]['protein_stoich'][prot]) for prot in catalyst_rxn_dict[enz]['protein_stoich']])
	for rxn in catalyst_rxn_dict[enz]['rxns']:
		dirs = []
		# add FWD and REV to rxn_src if those directions are possible
		if model.reactions.get_by_id(rxn).lower_bound < 0:
			dirs.append('REV')
		if model.reactions.get_by_id(rxn).upper_bound > 0:
			dirs.append('FWD')
		for dir in dirs:
			# print(enz,rxn)
			# df_rxn_enz_pairs = df_rxn_enz_pairs.append({'id':f'RXN-{rxn}_{dir}-{enz}','enz':enz,'gpr':'','protein_stoich':','.join([prot+':'+str(enz_rxn_dict[enz]['protein_stoich'][prot]) for prot in enz_rxn_dict[enz]['protein_stoich']]),'status':'automatically added'}, ignore_index=True)
			df_rxn_enz_pairs.loc[len(df_rxn_enz_pairs)] = [f'RXN-{rxn}_{dir}-{enz_id}',rxn,enz,prot_stoich_str]
			# df_rxn_enz_pairs = pd.concat([df_rxn_enz_pairs,pd.DataFrame({'id':f'RXN-{rxn}_{dir}-{enz}','enz':enz,'protein_stoich':','.join([prot+':'+str(enz_rxn_dict[enz]['protein_stoich'][prot]) for prot in enz_rxn_dict[enz]['protein_stoich']])})], ignore_index=True)

# sort by ID
df_rxn_enz_pairs = df_rxn_enz_pairs.sort_values(by='id')			
df_rxn_enz_pairs.to_csv('./model/ENZYME_rxn_pairs.tsv', sep='\t', index=False)

In [13]:
df_eqn = pd.DataFrame(columns=['id', 'type', 'coupling_type', 'coupling_species', 'reaction','tag','FBA_name','dir','enz_id'])
c = -1
medium = []

### Metabolic network reaction
# Exchange reactions
for rxn in model.reactions:
	# check if rxn is in df_enz
	# if rxn.id not in df_enz.rxn_src.to_list():
	if rxn.id[:3] == 'EX_':
		enz_id = 'SPONT'
		make_reversible = True
# else:
# 	gpr = rxn.gene_reaction_rule if rxn.gene_reaction_rule != "" else 'UNKNOWN'

		protein_stoich = 'zeroCost' if enz_id in spont_GPRs else ''
		
		met = [i for i in rxn.metabolites.keys()][0]
		c += 1
		tag = 'RXN'
		dir = 'FWD'
		new_id = f'{tag}-{rxn.id}_{dir}-{enz_id}'
		df_eqn.loc[c, 'id'] = new_id
		df_eqn.loc[c, 'type'] = 'metabolic'
		df_eqn.loc[c, 'reaction'] = 'MET-' + met.id + ' -->'
		# df_eqn.loc[c, ['FBA_name','dir']] = [rxn.id,'FWD']
		df_eqn.loc[c, ['tag','FBA_name','dir','enz_id']] = [tag,rxn.id,dir,enz_id]
		
		c += 1
		dir = 'REV'
		new_id = f'{tag}-{rxn.id}_{dir}-{enz_id}'
		df_eqn.loc[c, 'id'] = new_id
		df_eqn.loc[c, 'type'] = 'metabolic'
		df_eqn.loc[c, 'reaction'] = '-->' + 'MET-' + met.id
		# df_eqn.loc[c, ['FBA_name','dir']] = [rxn.id,'REV']
		df_eqn.loc[c, ['tag','FBA_name','dir','enz_id']] = [tag,rxn.id,dir,enz_id]

# Reactions that are not exchange reactions
for i in df_enz.index:
	rxn_id = df_enz.id[i]
	tag,rxn_base_id,rxn_dir,enz_id = extract_details_from_rxnid(rxn_id)
	
	if rxn_base_id[:3] == 'EX_':
		continue
	
	c += 1
	rxn_base = model.reactions.get_by_id(rxn_base_id)
	
	met_dict = metabolites_dict_from_reaction_equation_RBA(rxn_base.reaction)
	met_dict = {k:v for k,v in met_dict.items() if k != ''}
	met_dict = {'MET-' + k:v for k,v in met_dict.items()}
	if rxn_dir == 'REV':
		met_dict = {k:-v for k,v in met_dict.items()}
	elif rxn_dir == 'FWD':
		None
	else:
		print("Unknown ID that indicate reaction direction, only accepting 'FWD' and 'REV'")
	
	if enz_id not in spont_GPRs:
		df_eqn.loc[c, 'coupling_type'] = 'rxn_enz'
		df_eqn.loc[c, 'coupling_species'] = enz_id
	
	df_eqn.loc[c, 'id'] = rxn_id
	df_eqn.loc[c, 'type'] = 'metabolic'
	df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(met_dict, arrow='-->')
	df_eqn.loc[c, ['tag','FBA_name','dir','enz_id']] = [tag,rxn_base_id,rxn_dir,enz_id]
	
### Enzyme synthesis network reaction
enz_stoich = OrderedDict()
for i in df_enz.index:
	enz_stoich[df_enz.enz[i]] = df_enz.protein_stoich[i]

c = df_eqn.shape[0] - 1
for enz_id,prot_str in enz_stoich.items():
	if prot_str == 'zeroCost':
		continue
	
	c += 1
	prot_str = prot_str.split(',')
	coeffs = OrderedDict({'PTMPRO-' + i.split(':')[0]:-int(i.split(':')[1]) for i in prot_str})
	coeffs['ENZ-' + enz_id] = 1
	
	df_eqn.loc[c, 'id'] = 'ENZSYN-' + enz_id
	df_eqn.loc[c, 'type'] = 'enzyme'
	df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(coeffs, arrow='-->')

#these ENZLOAD equations also need to have their MW listed in the enz_mw_g_per_mmol file
#initialize an enzload sting
enzload_str = ""
for i in df_enz.index:
	if df_enz.protein_stoich[i] == 'zeroCost':
		continue
		
	c += 1
	coeffs = OrderedDict()
	coeffs['ENZ-' + df_enz.enz[i]] = -1

	enzload_id = 'ENZLOAD-' + df_enz.id[i][4:]

	#for debugging, comment out if unused
	#print("i: ",i,"\tenzload_id: ", enzload_id)
	
	df_eqn.loc[c, 'id'] = enzload_id
	df_eqn.loc[c, 'type'] = 'enzymeRxnLoad'
	df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(coeffs, arrow='-->')

	#get the associated mw
	enz_mw = getattr(df_enz,"MW (g/mmol)")[i]

	new_line = enzload_id + "\t" + str(enz_mw) + "\n"
	enzload_str = enzload_str + new_line

# write the file resulting from enzload_str
with open('./model/enz_mw_g_per_mmol.txt', 'w') as f:
	f.write(enzload_str)
with open('../input/enz_mw_g_per_mmol.txt', 'w') as f:
	f.write(enzload_str)

### Ribosome
ribo_subunits = {}
c = df_eqn.shape[0] - 1
for rna in df_rnas.index:
	c += 1
	rna_stoich = OrderedDict({i:0 for i in ['MET-'+rna, 'MET-atp_c', 'MET-ctp_c',
											'MET-gtp_c', 'MET-utp_c', 'MET-ppi_c']})
	rna_stoich['RIBO-'+rna] = 1
	rna_stoich['MET-atp_c'] = -int(df_rnas.A[rna])
	rna_stoich['MET-ctp_c'] = -int(df_rnas.C[rna])
	rna_stoich['MET-gtp_c'] = -int(df_rnas.G[rna])
	rna_stoich['MET-utp_c'] = -int(df_rnas.U[rna])
	rna_stoich['MET-ppi_c'] = int(df_rnas.loc[rna, ['A','C','G','U']].sum())
	rna_stoich['BIO-rrna'] = df_rnas.loc[rna, 'MW (g/mmol)']
	
	df_eqn.loc[c, 'id'] = 'RIBOSYN-' + rna
	df_eqn.loc[c, 'type'] = 'ribosome'
	df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(rna_stoich, arrow='-->')
	gene_expression_dict.setdefault(rna, []).append(df_eqn.loc[c, 'id'])
with open('./model/prosyn_ribo_subunits.txt', 'w') as f, open('./model/pro_ribo_subunits.txt', 'w') as f2, open('./model/kribo.txt', 'w') as f4, open('./model/j_ribo.txt', 'w') as f5, open('./model/ribosomes.txt', 'w') as f6:
	for file in [f,f2,f4,f5,f6]: file.write('/')
	for loc, ribo in ribo_dict.items():
		## proteins into ribosome subunits (to account for paralogs)
		ribo_stoich = OrderedDict()
		# if only 1 ribosome is present, all rRNAs are used
		rnas = [item for item in df_rnas.index if item.endswith(loc) or len(ribo_dict.keys()) == 1]
		# assume 1 of each rRNA and protein is used per ribosome
		for i in ribo.index:
			if ribo.id[i] in ribo_subunits:
				ribo_subunits[ribo.id[i]].append(loc[1:])
			else:
				ribo_subunits[ribo.id[i]] = [loc[1:]]
			if ribo.id[i] in rnas:
				ribo_stoich['RIBO-' + ribo.id[i]] = -1
			else:
				f.write("\n'"+loc[1:]+"'.'PROSYN-"+ribo.id[i]+"'")
				f2.write("\n'"+loc[1:]+"'.'"+ribo.id[i]+"'")
				# if it's not a paralog of another protein, all paralogs between it and the next non-paralog are treated as paralogs
				if pd.isnull(ribo.paralog[i]):
					ribosub_name = 'RIBOSUB-' + ribo.id[i]
				# add rxn converting the protein into the ribosome subunit (RIBOSUB prefix)
				c += 1
				ribosub_stoich = OrderedDict()
				ribosub_stoich['PTMPRO-' + ribo.id[i]] = -1
				ribosub_stoich[ribosub_name] = 1
				df_eqn.loc[c, 'id'] = 'RIBOSUBSYN-' + ribosub_name + '-FROM-' + ribo.id[i]
				df_eqn.loc[c, 'type'] = 'ribosome-subunit'
				df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(ribosub_stoich, arrow='-->')
				# add the ribosome subunit to the ribosome
				ribo_stoich[ribosub_name] = -1
		# making the ribosome itself from the subunits
		c += 1
		df_eqn.loc[c, 'id'] = 'RIBOSYN-ribonuc' if loc == '_c' else 'RIBOSYN-ribomito' if loc == '_m' else 'RIBOSYN-ribo'+loc[1:]
		# add to ribosome files
		ribo_id = "'"+loc[1:]+"'"
		f6.write('\n'+ribo_id); f5.write('\n'+ribo_id+".'"+df_eqn.loc[c, 'id']+"'"); f4.write('\n'+ribo_id+' '+str(kribo_default))
		df_eqn.loc[c, 'type'] = 'ribosome'
		df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(ribo_stoich, arrow='-->')
	for file in [f,f2,f4,f5,f6]: file.write('\n/')
# write the ribosome subunit stoichiometry to a json file
with open('./model/ribo_subunits.json', 'w') as f:
	json.dump(ribo_subunits, f)

#### Build files related to MW

In [14]:
data = getattr(df_pro,'MW (g/mmol)').to_dict()
ids = getattr(df_pro,'id').to_list()

#build the string to write
prot_mw_str = ""

#for each id
for id in ids:
	new_line = id + "\t" + str(data[id]) + "\n"
	prot_mw_str = prot_mw_str + new_line
# add dummy proteins to the string
for prot in dummy_list:
	new_line = prot["name"].replace('PROSYN-','') + "\t" + str(prot["MW"]) + "\n"
	prot_mw_str = prot_mw_str + new_line

#write to output
with open('./model/pro_mw_g_per_mmol.txt', 'w') as f:
	f.write(prot_mw_str)
with open('../input/pro_mw_g_per_mmol.txt', 'w') as f:
	f.write(prot_mw_str)

In [15]:
#TEST: this whole block. get MW list
mw_list = getattr(df_enz,"MW (g/mmol)").to_list()

#get enz list
enz_list = getattr(df_enz,"enz").to_list()

#create string for writing output file
enz_mw_str = "enz_id\tMW (g/mmol)\n"

#add to the string for enzyme entries
#need to initialize a counter
c = 0

#list to keep track of existing entries
#predefined values prevent adding unknown
existing_pro = ['nan','unknown']

for enz in enz_list:

	#only add a new line if unique entry
	if not(str(enz) in existing_pro):

		#for debugging, comment out if unused
		#print("enz: |",enz,"|, type: ",type(enz))
	
		new_line = str(enz) + "\t" + str(mw_list[c]) + "\n"

		#for debugging, comment out if unused
		#print(new_line)

		enz_mw_str = enz_mw_str + new_line

		existing_pro.append(enz)

	c = c + 1

#write to output
with open('../input/enz_mw_g_per_mmol_norxnmapped.txt', 'w') as f:
	f.write(enz_mw_str)

#### Protein file

In [16]:
### Protein

pro_lengths = dict()

c = df_eqn.shape[0] - 1
translation_loci = set()
unknown_ribo_prosyn = set()
protein_functions = dict()

def edit_stoich(stoich:OrderedDict,edits):
	if not pd.isna(edits):
		cofs_st = edits.split(',')
		cofs_st = OrderedDict({c.split(':')[0]:c.split(':')[1] for c in cofs_st})
		for k,v in cofs_st.items():
			stoich['MET-'+k] = -int(v)
	return stoich

for i in df_pro.index: 
	c += 1
	pro_id = df_pro.loc[i, 'id'] if 'id' in df_pro.columns else i
	prepro = 'PRO-' + str(pro_id) # preprotein (from translation)
	prot = 'PTMPRO-' + str(pro_id) # protein after post-translational modifications (PTMs)

	trans_st = OrderedDict()
	PTM_st = OrderedDict({prepro:-1,prot:1})
	for met in ['MET-atp_c', 'MET-h2o_c',
				'MET-adp_c', 'MET-pi_c', 'MET-h_c', 'MET-gtp_c',
				'MET-gdp_c']:
		trans_st[met] = 0
	seq = df_pro.sequence[i].replace("*","")

	trans_st = edit_stoich(trans_st,df_pro.trans_stoich_edits[i])
	PTM_st = edit_stoich(PTM_st,df_pro.PTM_stoich_edits[i])

	pro_lengths[i] = len(seq)
	for aa in df_aamap.index:
		trans_st[df_aamap.tRNA_in[aa]] = -seq.count(aa)
		trans_st[df_aamap.tRNA_out[aa]] = seq.count(aa)
				
	trans_st[prepro] = 1
	df_eqn.loc[c, 'coupling_type'] = 'prot_ribo'
	df_eqn.loc[c, 'coupling_species'] = 'ribo'
	
	# protein will occupy cellular space in its specific compartment
	if df_pro.subloc_assigned[i] in mito_compartments:
		trans_st['BIO-protmito'] = df_pro.loc[i, 'MW (g/mmol)']
	else:
		trans_st['BIO-protcyt'] = df_pro.loc[i, 'MW (g/mmol)']
	
	# Cost: Initiation: 1 ATP + 2 GTP (initiate and bind Methionine)
	for met in ['MET-atp_c', 'MET-h2o_c']:
		trans_st[met] -= 1
	for met in ['MET-adp_c', 'MET-pi_c', 'MET-h_c']:
		trans_st[met] += 1
				
	# Elongation: 2 GTP / cycle (2(n-1) cycles for n amino acids + 2 GTP from initiation = 2n GTP consumed so far)
	# (elongation process excludes Methionine since it is already bound in initiation process)
	# Termination and recycling: 1 GTP + 1 ATP: https://doi.org/10.1534/genetics.115.186221
	# 	However, we exclude these costs since it's not always required (e.g., via "leaky scanning"): https://doi.org/10.1093/nar/gkp765
	for met in ['MET-gtp_c', 'MET-h2o_c']:
		trans_st[met] -= 2*len(seq)
	for met in ['MET-gdp_c', 'MET-pi_c', 'MET-h_c']:
		trans_st[met] += 2*len(seq)
	
	df_eqn.loc[c, 'id'] = 'PROSYN-' + df_pro.id[i]

	# add translation rxn to gene_expression_dict list, or create a new list if it doesn't exist
	gene_expression_dict.setdefault(df_pro.gene_src[i], []).append(df_eqn.loc[c, 'id'])

	# add to protein functions dictionary under its gene_src
	protein_functions.setdefault(df_pro.gene_src[i], {})[df_pro.id[i]] = {}
	df_eqn.loc[c, 'type'] = 'protein'
	df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(trans_st, arrow='-->')
	# Set ribosome translating the protein
	if df_pro.loc[i, 'translation_loc'] in ['unknown','any']:
		unknown_ribo_prosyn.add("'"+df_eqn.loc[c, 'id']+"'")
	else:
		translation_loci.add("'"+df_pro.loc[i, 'translation_loc']+"'.'"+df_eqn.loc[c, 'id']+"'")
	# add PTM reaction
	c += 1
	df_eqn.loc[c, 'id'] = 'PTM-' + df_pro.id[i]
	df_eqn.loc[c, 'type'] = 'proteinPTM'
	df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(PTM_st, arrow='-->')
# write the protein length file
len_str = "/\n"
for prot in pro_lengths:
	len_str = len_str + "'PROSYN-" + prot + "' " + str(pro_lengths[prot]) + "\n"
for prot in dummy_list:
	len_str += "'" + prot["name"] + "' " + str(prot["medianL"]) + "\n"
len_str = len_str + "/"
with open('./model/RBA_proteinLength.txt', 'w') as f:
	f.write(len_str)

### Dummy protein
for prot in dummy_list:
	dummy_MW = prot["MW"]
	trans_st = make_dummy_protein_stoich(aa_standards_df=df_aamap, prot_df=prot["df"], length=prot["medianL"], rxn_name=prot["name"], mw=dummy_MW)

	c += 1
	# prot_st["BIO-protdummy"] = prot["MW"]
	df_eqn.loc[c, 'id'] = prot["name"]
	df_eqn.loc[c, 'coupling_type'] = 'prot_ribo'
	df_eqn.loc[c, 'coupling_species'] = 'ribo' #TEST: Update for ribonuc, ribomito, and other ribo types
	df_eqn.loc[c, 'type'] = 'protein'
	df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(trans_st,
															arrow='-->', floatdecimal=6)
	if prot["translation_loc"] in ['unknown','any']:
		unknown_ribo_prosyn.add("'"+df_eqn.loc[c, 'id']+"'")
	else:
		translation_loci.add("'"+df_eqn.loc[c, 'id']+"'.'"+df_eqn.loc[c, 'id']+"'")

# Protein waste reactions
c = df_eqn.shape[0] - 1
with open('./model/pro_syn_waste.txt', 'w') as f:
	f.write('/')
	for i in df_pro.index: 
		for abbv,tag in {'':'PROSYN','PTM':'PTM'}.items():
			c += 1
			pro=abbv+'PRO-'+df_pro.id[i]
			new_st = OrderedDict()
			new_st[pro] = -1
			# add the protein's MW as the coefficient for BIO-protwasted
			new_st['BIO-protwasted'] = df_pro.loc[i, 'MW (g/mmol)']
			
			pw_id = f"{abbv}PROWASTE-{df_pro.id[i]}"
			f.write(f"\n'{tag}-{df_pro.id[i]}'.'{pw_id}'")

			df_eqn.loc[c, 'id'] = pw_id
			df_eqn.loc[c, 'type'] = 'proteinWaste'
			df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA(new_st, arrow='-->')

	f.write('\n/')
# Pseudoreaction representing mass of waste protein (for kapps): PROWASTE-TOTALPROTEIN
c += 1
df_eqn.loc[c, 'id'] = 'PROWASTE-TOTALPROTEIN'
df_eqn.loc[c, 'type'] = 'proteinWaste'
df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA({'BIO-protwasted':-1}, arrow='-->')
for loc in ['modeled','dummy','cyt','mito']: # add PROWASTE rxns for biomass-related byproduct of PROSYN rxns, to support simulations at 0 growth
	c += 1
	df_eqn.loc[c, 'id'] = 'PROWASTE-PROT'+loc.upper()
	df_eqn.loc[c, 'type'] = 'proteinWaste'
	df_eqn.loc[c, 'reaction'] = build_reaction_equation_from_metabolites_dict_RBA({'BIO-prot'+loc:-1}, arrow='-->')

### Write translation localization lists
with open('./model/translation.txt', 'w') as f:
	f.write('\n'.join(['/'] + sorted(list(translation_loci)) + ['/']))
with open('./model/translated_by_any_ribo.txt', 'w') as f:
	f.write('\n'.join(['/'] + sorted(list(unknown_ribo_prosyn)) + ['/']))

### Biomass
for i in df_biom.index:
	c += 1
	if not pd.isna(df_biom.rxn_id[i]):
		df_eqn.loc[c, 'id'] = df_biom.rxn_id[i]
		df_eqn.loc[c, 'type'] = 'biomass'
		df_eqn.loc[c, 'reaction'] = df_biom.rxn_equation[i]
# make into file RBA_rxns_whole_biomass.txt
with open('./model/RBA_rxns_whole_biomass.txt', 'w') as f:
	# find whole_biomass_rxns in df_eqn and write them to file
	f.write('\n'.join(['/'] + [i for i in whole_biomass_rxns if i in df_eqn.id.to_list()] + ['/']))

In [17]:
for k,v in gene_expression_dict.items():
	# check for entries in v starting with "'RIBO"
	if any([i.startswith("'RIBO") for i in v]):
		print(f"Warning: {k} has entries starting with 'RIBO' in gene_expression_dict")

In [18]:
# save gene list to file
with open('./model/genes.txt', 'w') as f:
	f.write('\n'.join(['/'] + [f"'{i}'" for i in genes] + ['/']))
for gene in genes:
	if gene not in gene_expression_dict:
		gene_expression_dict[gene] = []
# remove duplicates from gene_expression_dict, preserve order
gene_expression_dict = {k: list(dict.fromkeys([f"'{i}'" for i in v])) for k, v in gene_expression_dict.items()}
# save gene-protein mapping to file
with open('./model/genes_expression.txt', 'w') as f:
	f.write('\n'.join(['/'] + [f"'{gene}'.({','.join(gene_expression_dict[gene])})" for gene in genes if gene_expression_dict[gene]] + ['/']))

In [19]:
# make extra kapp calculation files
directory = '../GAMS/parameterization/enz_from_proteome'
if not os.path.exists(directory):
	os.makedirs(directory)

with open(directory + '/pro_and_enz.txt', 'w') as f:
	f.write('\n'.join(['/'] + sorted(list(set(["'ENZ-" + i + "'" for i in enz_list] + ["'PTMPRO-" + i + "'" for i in df_pro.index]))) + ['/']))

#### Save excel files

In [20]:
df_eqn.to_csv('./model/RBA_stoichiometry.tsv', sep='\t', index=None)

# Build GAMS Sij and flux bounds (originally A02_build_GAMS_Sij_and_fluxBounds.ipynb)

#### Load excel file

In [21]:
df_eqn = read_spreadsheet('./model/RBA_stoichiometry.tsv')

#### Assemble list of species

In [22]:
met_list_raw = []
for i in df_eqn.index:
	met_dict = metabolites_dict_from_reaction_equation_RBA(df_eqn.reaction[i])
	met_list_raw += met_dict.keys()

met_list = sorted(list(set(met_list_raw)))
met_list = [i for i in met_list if i != '']
with open('./model/RBA_species.txt', 'w') as f:
	f.write('\n'.join(['/'] + ["'" + i + "'" for i in met_list] + ['/']))
# add list of demand rxns for each metabolite (for testing purposes)
demand_list = ["DM-" + i for i in met_list]
# WIP: add to df_eqn
with open('./model/RBA_rxns_demand.txt', 'w') as f:
	f.write('\n'.join(['/'] + ["'DM-" + i + "'" for i in met_list] + ['/']))

#### Assemble list of reactions

In [23]:
rxn_list = df_eqn.id.to_list()
rxn_list = ["'" + i + "'" for i in rxn_list if i != '']
rxn_list = ['/'] + rxn_list + ['/']

In [24]:
with open('./model/RBA_rxns.txt', 'w') as f:
	f.write('\n'.join(rxn_list))

#### Get list of uptake reactions

In [25]:
rxn_list = []
for i in df_eqn.id:
	tag,rxn_base_id,rxn_dir,enz_id = extract_details_from_rxnid(i)
	if tag == 'RXN' and rxn_base_id[:3] == 'EX_' and rxn_dir == 'REV':
		rxn_list.append(i)

rxn_list = ["'" + i + "'" for i in rxn_list if i != '']
rxn_list = ['/'] + rxn_list + ['/']

In [26]:
with open('./model/RBA_rxns_EXREV.txt', 'w') as f:
	f.write('\n'.join(rxn_list))

#### Get list of secretion reactions

In [27]:
rxn_list = []
for i in df_eqn.id:
	tag,rxn_base_id,rxn_dir,enz_id = extract_details_from_rxnid(i)
	if tag == 'RXN' and rxn_base_id[:3] == 'EX_' and rxn_dir == 'FWD':
		rxn_list.append(i)

rxn_list = ["'" + i + "'" for i in rxn_list if i != '']
rxn_list = ['/'] + rxn_list + ['/']

In [28]:
with open('./model/RBA_rxns_EXFWD.txt', 'w') as f:
	f.write('\n'.join(rxn_list))

#### Get list of protein waste reactions

In [29]:
rxn_list = []
for i in df_eqn.id:
	tag = i.split('-')[0]
	if tag in ['PROWASTE','PTMPROWASTE']:
		rxn_list.append(i)

rxn_list = ["'" + i + "'" for i in rxn_list if i != '']
rxn_list = ['/'] + rxn_list + ['/']

In [30]:
with open('./model/RBA_rxns_prowaste.txt', 'w') as f:
	f.write('\n'.join(rxn_list))

#### Get list of enzyme syn reactions

In [31]:
rxn_list = []
for i in df_eqn.id:
	tag = i.split('-')[0]
	if tag == 'ENZSYN':
		rxn_list.append(i)

rxn_list = ["'" + i + "'" for i in rxn_list if i != '']
rxn_list = ['/'] + rxn_list + ['/']

In [32]:
with open('./model/RBA_rxns_enzsyn.txt', 'w') as f:
	f.write('\n'.join(rxn_list))

#### Get list of enzyme load reactions

In [33]:
rxn_list = []
for i in df_eqn.id:
	tag = i.split('-')[0]
	if tag == 'ENZLOAD':
		rxn_list.append(i)

rxn_list = ["'" + i + "'" for i in rxn_list if i != '']
# list created so GAMS knows which ENZLOAD corresponds to which rxn
rxn_enzload_coupling_list = [i + '.' + i.replace('ENZLOAD-','RXN-') + ' 1' for i in rxn_list]
rxn_list = ['/'] + rxn_list + ['/']

In [34]:
with open('./model/RBA_rxns_enzload.txt', 'w') as f:
	f.write('\n'.join(rxn_list))
with open('./model/RBA_rxn_enzload_coupling.txt', 'w') as f:
	f.write('\n'.join(['/'] + rxn_enzload_coupling_list + ['/']))

#### Get list of metabolic network reactions

In [35]:
rxn_list = []
for i in df_eqn.id:
	tag = i.split('-')[0]
	if tag == 'RXN':
		rxn_list.append(i)

rxn_list = ["'" + i + "'" for i in rxn_list if i != '']
rxn_list = ['/'] + rxn_list + ['/']

In [36]:
with open('./model/RBA_rxns_rxnmetabolicnetwork.txt', 'w') as f:
	f.write('\n'.join(rxn_list))

In [37]:
# Make k_app (enzyme turnover rate) file draft using placeholder values
kapp_list = []
kapp_list = [i + " 360000" for i in rxn_list if i != '/']
kapp_list = ['/'] + kapp_list + ['/']
with open('./model/kapp_placeholders.txt', 'w') as f:
	f.write('\n'.join(kapp_list))

#### Assemble stoichiometry

In [38]:
# make extra kapp calculation files
directory = '../GAMS/parameterization/enz_from_proteome/'
if not os.path.exists(directory):
	os.makedirs(directory)

sij = []
rxns_enz = []
rxns_proenz = []
sij_proenz = []

for i in df_eqn.index:
	x = metabolites_dict_from_reaction_equation_RBA(df_eqn.reaction[i])
	met_dict = dict()
	for k,v in x.items():
		if k == '':
			continue
		# note when a protein is used in a reaction other than its PROWASTE and production reactions
		met = k.split('-',1)[1]
		if k.startswith('PTMPRO-') and df_eqn.id[i] not in ['PTMPROWASTE-' + met,'PTM-' + met]:
			for gene,prots in protein_functions.items():
				if met in prots:
					protein_functions[gene][met].update({df_eqn.id[i]:v})
					break
		if v.is_integer():
			met_dict[k] = int(v)
		else:
			met_dict[k] = v
			
	rxn_prefixes = ('ENZ', 'PTM-')
	if df_eqn.id[i].startswith(rxn_prefixes):
		if df_eqn.id[i].startswith("PTM-"):
			rxns_proenz.append("'" + df_eqn.id[i].replace('PTM-','PROIN-') + "'")
		else:
			rxns_proenz.append("'" + df_eqn.id[i] + "'")
			rxns_enz.append("'" + df_eqn.id[i] + "'")
		# sij.append("'" + df_eqn.id[i] + "'.'" + df_eqn.id[i] + "' 1")
		for k,v in met_dict.items():
			sij.append("'" + k + "'.'" + df_eqn.id[i] + "' " + str(v))
			if k.startswith('PTMPRO-') or k.startswith('ENZ-'):
				sij_proenz.append("'" + k + "'.'" + df_eqn.id[i].replace('PTM-','PROIN-') + "' " + str(v))
	else:
		for k,v in met_dict.items():
			sij.append("'" + k + "'.'" + df_eqn.id[i] + "' " + str(v))
	
sij = ['/'] + sij + ['/']
with open('./model/RBA_sij.txt', 'w') as f:
	f.write('\n'.join(sij))
# write protein functions to file
with open('./protein_functions.json', 'w') as f:
	json.dump(protein_functions, f)
# make set of all proteins in protein_functions with no modeled functions
proteins_without_modeled_functions = set()
# identify them by checking if all values within values are empty
for gene,prots in protein_functions.items():
	for prot,funcs in prots.items():
		# if all values are empty, add to set
		if not any(funcs.values()):
			proteins_without_modeled_functions.add(prot)
with open('./proteins_without_modeled_functions.txt', 'w') as f:
	f.write('\n'.join(sorted(list(proteins_without_modeled_functions))))
with open(directory + 'rxns_enz.txt', 'w') as f:
	f.write('\n'.join(['/'] + sorted(rxns_enz) + ['/']))
with open(directory + 'rxns_pro_and_enz.txt', 'w') as f:
	f.write('\n'.join(['/'] + sorted(rxns_proenz) + ['/']))
with open(directory + 'sij_pro_and_enz.txt', 'w') as f:
	f.write('\n'.join(['/'] + sij_proenz + ['/']))

# Build GAMS RBA constraints (originally A03_build_GAMS_RBA_constraints.ipynb)

In [39]:
#### LOAD INPUTS AND PARAMETERS
copy_to_gams = True # if True, copy the model to the gams folder

# Stoichiometry
df_stoich = read_spreadsheet('./model/RBA_stoichiometry.tsv')
df_stoich.index = df_stoich.id.to_list()

In [40]:
### Write prosyn reaction
idx = [i for i in df_stoich.index if i[:7] == 'PROSYN-']
prosyn = ["'" + i + "'" for i in idx]
prosyn = ['/'] + prosyn + ['/']
with open('./model/RBA_rxns_prosyn.txt', 'w') as f:
	f.write('\n'.join(prosyn))
## write ribosyn set
idx = [i for i in df_stoich.index if i[:8] == 'RIBOSYN-']
ribosyn = ["'" + i + "'" for i in idx]
ribosyn = ['/'] + ribosyn + ['/']
with open('./model/RBA_rxns_ribosyn.txt', 'w') as f:
	f.write('\n'.join(ribosyn))

In [41]:
# Make fwd and reversible rxn list for metabolic network and all rxns (not needed but helpful for testing)
# Also, make flux coupling analysis file in case you use that for testing

# Using Patrick's FCA code
rxn_types = {'irrev': 0, 'reversible-fwd-half': 1, 'reversible-rev-half': 2, 'pseudoreaction': 3, 'exchange-fwd-half': 4, 'exchange-rev-half': 5}

rev_rxn_list = []; rev_list = []
fwd_rxn_list = []; fwd_list = []
fca_list = []

for i in df_stoich.id:
	tag,rxn_base_id,rxn_dir,enz_id = extract_details_from_rxnid(i)
	#print("'"+i+"' "+str(rxn_types['irrev']))
	#print(rxn_base_id)
	#print(i,'\t',tag,'\t',rxn_dir)
	if rxn_dir == 'FWD':
		fwd_list.append(i)
		if tag == 'RXN':
			fwd_rxn_list.append(i)
		if rxn_base_id[:3] == 'EX_':
			fca_list.append("'"+i+"' "+str(rxn_types['exchange-fwd-half']))
		# Add as irreversible if no counterpart found in other direction
		elif i.replace("_FWD-","_REV-") not in df_stoich.id:
			fca_list.append("'"+i+"' "+str(rxn_types['irrev'])) 
		else:
			fca_list.append("'"+i+"' "+str(rxn_types['reversible-fwd-half']))
	elif rxn_dir == 'REV':
		rev_list.append(i)
		if tag == 'RXN':
			rev_rxn_list.append(i)
		if rxn_base_id[:3] == 'EX_':
			fca_list.append("'"+i+"' "+str(rxn_types['exchange-rev-half']))
		# Add as irreversible if no counterpart found in other direction
		elif i.replace("_REV-","_FWD-") not in df_stoich.id:
			fca_list.append("'"+i+"' "+str(rxn_types['irrev'])) 
		else:
			fca_list.append("'"+i+"' "+str(rxn_types['reversible-rev-half']))
	else:
		fca_list.append("'"+i+"' "+str(rxn_types['irrev'])) 

fwd_rxn_list = ["'" + i + "'" for i in fwd_rxn_list if i != '/']
fwd_rxn_list = ['/'] + fwd_rxn_list + ['/']
with open('./model/RBA_rxns_rxnmetabolicnetworkFWD.txt', 'w') as f:
	f.write('\n'.join(fwd_rxn_list))
rev_rxn_list = ["'" + i + "'" for i in rev_rxn_list if i != '/']
rev_rxn_list = ['/'] + rev_rxn_list + ['/']
with open('./model/RBA_rxns_rxnmetabolicnetworkREV.txt', 'w') as f:
	f.write('\n'.join(rev_rxn_list))

fwd_list = ["'" + i + "'" for i in fwd_list if i != '/']
fwd_list = ['/'] + fwd_list + ['/']
with open('./model/RBA_rxns_FWD.txt', 'w') as f:
	f.write('\n'.join(fwd_list))
rev_list = ["'" + i + "'" for i in rev_list if i != '/']
rev_list = ['/'] + rev_list + ['/']
with open('./model/RBA_rxns_REV.txt', 'w') as f:
	f.write('\n'.join(rev_list))

fca_list = ['/'] + fca_list + ['/']
with open('./model/rxntype.txt', 'w') as f:
	f.write('\n'.join(fca_list))

In [42]:
# creating set pairing SM values w/ RBA ones, for use in converting flux constraints from SM format to RBA

# filter out rxns with no FBA_name
df_sm = df_stoich[df_stoich.FBA_name.notnull()]
# get SM rxn IDs from FBA_name column
sm_rxn_ids = df_sm.FBA_name.to_list()
# get RBA rxn IDs from index
rba_rxn_ids = df_sm.index.to_list()
rba_rxn_dirs = [extract_details_from_rxnid(i)[2] for i in rba_rxn_ids]
# convert dir to -1 if rev, 1 otherwise
rba_rxn_dirs = [-1 if i == 'REV' else 1 for i in rba_rxn_dirs]
# create list of strings pairing SM rxn IDs with RBA rxn IDs
sm_rba_rxn_pairs_dict = {rba_rxn_ids[i]:sm_rxn_ids[i] for i in range(0, len(sm_rxn_ids))}
# save as JSON file
with open('./model/SM_RBA_rxn_pairs.json', 'w') as f:
	json.dump(sm_rba_rxn_pairs_dict, f)
sm_rba_rxn_pairs = ["'"+sm_rxn_ids[i]+"'.'"+rba_rxn_ids[i]+"'"+' '+str(int(rba_rxn_dirs[i])) for i in range(0, len(sm_rxn_ids))]
# convert SM rxn bounds to RBA format; for each sm rxn, get the corresponding rba rxn and its direction
with open('./model/SM_rxn_bounds.txt', 'w') as f:
	delim = ' '
	sm_rxns_checked = []
	# find longest rxn ID for formatting purposes
	max_sm_rxn_id_len = max([len(i)+2 for i in sm_rxn_ids])+1
	max_lb_len = max([len(str(rxn.lower_bound)) for rxn in model.reactions if rxn.id in sm_rxn_ids])+1
	max_ub_len = max([len(str(rxn.upper_bound)) for rxn in model.reactions if rxn.id in sm_rxn_ids])
	# write headers to file, with ljust
	f.write(f"{'':>{max_sm_rxn_id_len}}{'lo':>{max_lb_len}}{' up':<{max_ub_len}}")
	for rba_rxn,sm_rxn in sm_rba_rxn_pairs_dict.items():
		# find bounds of sm rxn in model
		rxn = model.reactions.get_by_id(sm_rxn)
		ub = rxn.upper_bound
		lb = rxn.lower_bound

		if sm_rxn not in sm_rxns_checked:
			sm_rxns_checked.append(sm_rxn)
			name=f"'{sm_rxn}'"
			# make string with SM rxn ID, lb, and ub, using max lengths for formatting
			f.write(f"\n{name:>{max_sm_rxn_id_len}}{str(lb):>{max_lb_len}} {str(ub):<{max_ub_len}}")
# write to file
with open('./model/SM_RBA_rxn_pairs.txt', 'w') as f:
	f.write('\n'.join(['/'] + sorted(sm_rba_rxn_pairs) + ['/']))
# make file w/ all FBA IDs
with open('./model/SM_rxn_ids.txt', 'w') as f:
	f.write('\n'.join(['/'] + sorted(["'"+i+"'" for i in set(sm_rxn_ids)]) + ['/']))

In [43]:
# make file for all formula weights (some may be inaccurate due to R groups and other non-standard groups)
with open('./model/formula_weights.txt','w') as f:
	f.write('/\n')
	for met in model.metabolites:
		# write met name, formula, and formula weight
		try:
			formula = str(met.formula)
			mw = str(met.formula_weight)
		except:
			mw = 'unknown'
			formula = 'unknown'
		f.write(str(met.id) + '\t' + formula + '\t' + mw + '\n')
	f.write('/')

  warn(f"The element {e} does not appear in the periodic table")
  warn(f"The element {e} does not appear in the periodic table")
  warn(f"The element {e} does not appear in the periodic table")
  warn(f"The element {e} does not appear in the periodic table")


In [44]:
# for AA in df_aamap, find all mets in met_list that match its name (without the location)
with open('./model/aa.txt', 'w') as f, open('./model/tRNAs.txt', 'w') as f2:
	fis = [f,f2]
	for fi in fis: fi.write('/')
	for aa in df_aamap.index:
		aa_mets = [f"'{met}'" for met in met_list if met.rsplit('_',1)[0] == df_aamap.tRNA_in[aa].rsplit('_',1)[0]]
		f.write(f"\n'{aa}'")
		if aa_mets:
			f2.write(f"\n'{aa}'.({','.join(aa_mets)})")
	for fi in fis: fi.write('\n/')

In [45]:
# write combinations of gene_src and protein IDs in form 'gene_src'.('protein1','protein2',...)
with open('./model/proteins_and_locations.txt', 'w') as f, open('./model/proteins.txt', 'w') as f2:
	fis = [f,f2]
	for fi in fis: fi.write('/')
	for gene,prots in protein_functions.items():
		prots_str = ','.join([f"'PROSYN-{prot}'" for prot in prots])
		f.write(f"\n'{gene}'.({prots_str})")
		f2.write(f"\n'{gene}'")
	for fi in fis: fi.write('\n/')

In [46]:
if copy_to_gams: # automatically copies the model folder to the GAMS folder (removing the previous version), if you choose to
	import shutil
	# copy "model" folder to GAMS folder
	# Source folder path
	source_folder = './model'

	# Destination folder path
	destination_folder = '../GAMS/model'

	# Remove the existing destination folder
	shutil.rmtree(destination_folder)

	# Copy the folder
	shutil.copytree(source_folder, destination_folder)

In [90]:
# check if any enzymes likely have cofactors counted multiple times
for enz in set(df_enz['enz']):
	# find 1st row where the enz matches
	prot_stoich_str = df_enz[df_enz['enz'] == enz].iloc[0]['protein_stoich']
	if pd.isna(prot_stoich_str):
		continue
	# parse the protein_stoich string into a dictionary
	# print(enz, prot_stoich_str)
	prot_stoich = {item.split(':')[0]: float(item.split(':')[1]) for item in prot_stoich_str.split(',') if ':' in item and not pd.isna(item)}
	if not prot_stoich:
		continue
	rxns = set(df_enz[df_enz['enz'] == enz]['rxn_src'].to_list())
	# remove blocked reactions
	rxns = [rxn for rxn in rxns if model.reactions.get_by_id(rxn).bounds != (0, 0)]
	# find the trans_stoich_edits and ptm_stoich_edits for all proteins comprising it
	possible_cofactors = dict()
	for pro in prot_stoich.keys():
		if pro in df_pro.index:
			met_dict = dict()
			# Collect edits, filter out NaN, and convert to string
			edits = [df_pro.loc[pro,'trans_stoich_edits'], df_pro.loc[pro,'PTM_stoich_edits']]
			edits = [str(e) for e in edits if pd.notna(e)]
			if not edits:
				continue
			met_items = set(','.join(edits).split(','))
			met_dict = {i.split(':')[0]: float(i.split(':')[1]) for i in met_items if not pd.isna(i) and ':' in i}
			for k,v in met_dict.items():
				if v > 0: # i.e., if it's consumed
					if k in possible_cofactors:
						print(f"Warning: Enzyme {enz} has cofactor {k} counted multiple times, including subunits {possible_cofactors[k]} and {pro}.")
					possible_cofactors.setdefault(k, []).append(pro)
	if possible_cofactors:
		# print(enz,possible_cofactors)
		# find the reactions that have the same cofactors
		cofactors_in_rxns = set()
		for rxn in rxns:
			rxn_cofactors = set()
			# get the stoichiometry of the reaction
			rxn_stoich_dict = model.reactions.get_by_id(rxn).metabolites
			if rxn_stoich_dict is None:
				continue
			for met, coef in rxn_stoich_dict.items():
				# if coef < 0 and met.id in [i for i in possible_cofactors.keys()]:
				if coef < 0 and met.id.rsplit('_',1)[0] in [i.rsplit('_',1)[0] for i in possible_cofactors.keys()]:
					rxn_cofactors.add(met.id)
					cofactors_in_rxns.add(met.id)
		if cofactors_in_rxns:
			print(f"Enzyme {enz} has cofactors {cofactors_in_rxns} in its reactions.")
			# print the reactions that have the same cofactors
			for rxn in rxns:
				for cofactor in cofactors_in_rxns:
					if cofactor in [met.id for met in model.reactions.get_by_id(rxn).metabolites.keys()]:
						print(f"  - {rxn} has cofactor {cofactor}")

Enzyme rt2671 has cofactors {'fad_m'} in its reactions.
  - G3PDf_m has cofactor fad_m
Enzyme rt2040 has cofactors {'fad_x'} in its reactions.
  - ACOAD162f_x has cofactor fad_x
  - ACOAD100f_x has cofactor fad_x
  - ACOAD143f_x has cofactor fad_x
  - ACOAD121af_x has cofactor fad_x
  - ACOAD182f_x has cofactor fad_x
  - ACOAD142f_x has cofactor fad_x
  - ACOAD101df_x has cofactor fad_x
  - ACOAD82ef_x has cofactor fad_x
  - ACOAD140f_x has cofactor fad_x
  - ACOAD181af_x has cofactor fad_x
  - ACOAD60f_x has cofactor fad_x
  - ACOAD160f_x has cofactor fad_x
  - ACOAD161bf_x has cofactor fad_x
  - ACOAD120f_x has cofactor fad_x
  - ACOAD40f_x has cofactor fad_x
  - ACOAD102ef_x has cofactor fad_x
  - ACOAD161af_x has cofactor fad_x
  - ACOAD80f_x has cofactor fad_x
  - ACOAD141af_x has cofactor fad_x
  - ACOAD141bf_x has cofactor fad_x
  - ACOAD180f_x has cofactor fad_x
  - ACOAD183f_x has cofactor fad_x
  - ACOAD163f_x has cofactor fad_x
Enzyme rt1644 has cofactors {'fad_m'} in its re