In [1]:
import json 
from cobra.io import load_matlab_model, save_matlab_model
from collections import Counter
import copy 

In [2]:
# Read the reaction dataset inside a context manager so the file handle is
# closed as soon as parsing finishes (the bare open() call never closed it).
with open("/Mounts/rbg-storage1/datasets/Metabo/datasets/iML1515_dataset.json", "r") as dataset_file:
    json_model = json.load(dataset_file)

In [3]:
# Gather the ids of every metabolite entry whose 'smiles' field is None.
# Duplicates are kept here; an id appears once per reaction side it occurs on.
unknown_mets = []
for rxn_dict in json_model:
    # Reactants are scanned before products, one side at a time.
    for side in ("reactants", "products"):
        entries = rxn_dict.get(side, False)
        if not entries:
            # Side is missing or empty for this reaction: nothing to collect.
            continue
        unknown_mets += [
            entry['metabolite_id'] for entry in entries if entry['smiles'] is None
        ]

In [7]:
# Collapse the collected ids into a set: later cells run membership tests and
# call .intersection() on it, which requires a set.
unknown_mets = {*unknown_mets}

### Remove metabolites with unknown SMILES

In [4]:
# Load the iML1515 model from its .mat file and report its size.
matlab_model = load_matlab_model("/Mounts/rbg-storage1/datasets/Metabo/BiGG/iML1515.mat")
print(f"{len(matlab_model.metabolites)} METABOLITES, {len(matlab_model.reactions)} REACTIONS")

# metabolites without known smiles
# Build the lookup set once up front: calling set(...) inside the
# comprehension's filter would rebuild it for every metabolite in the model.
unknown_met_ids = set(unknown_mets)
unknown_mets_mat = [met for met in matlab_model.metabolites if met.id in unknown_met_ids]

This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model iML1515. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, p


Scaling...
 A: min|aij| =  1.000e+00  max|aij| =  1.000e+00  ratio =  1.000e+00
Problem data seem to be well scaled
1877 METABOLITES, 2712 REACTIONS


In [5]:
# Strip the unknown-SMILES metabolites from the model in place.
# destructive=False removes only the metabolites; the reaction count printed
# afterwards stays the same as before the call.
print("Remove metabolites with unknown smiles")
matlab_model.remove_metabolites(unknown_mets_mat, destructive=False)
print(f"{len(matlab_model.metabolites)} METABOLITES, {len(matlab_model.reactions)} REACTIONS")

Remove metabolites with unknown smiles
1842 METABOLITES, 2712 REACTIONS


In [39]:
# Write the model (unknown-SMILES metabolites removed, reactions kept) to disk.
pruned_mat_path = "/Mounts/rbg-storage1/datasets/Metabo/BiGG/iML1515_unk_mets_removed.mat"
save_matlab_model(matlab_model, pruned_mat_path)

In [8]:
# Filter on a deep copy so the reaction dicts in json_model keep their
# original reactant/product lists.
json_model_unk_mets_removed = copy.deepcopy(json_model)
for rxn_dict in json_model_unk_mets_removed:
    # Same filter on both sides; a side that is absent ends up as an empty list.
    for side in ("reactants", "products"):
        rxn_dict[side] = [
            met for met in rxn_dict.get(side, [])
            if met['metabolite_id'] not in unknown_mets
        ]

In [13]:
# Sanity check: count the distinct metabolite ids still referenced by the
# filtered JSON reactions, alongside the number of reactions.
known_mets_ids = []
for rxn_dict in json_model_unk_mets_removed:
    for side in ("reactants", "products"):
        known_mets_ids += [met['metabolite_id'] for met in rxn_dict.get(side, [])]
print(f"{len(set(known_mets_ids))} METABOLITES, {len(json_model_unk_mets_removed)} REACTIONS")

1842 METABOLITES, 2712 REACTIONS


In [14]:
# Write the filtered dataset through a context manager so the handle is
# flushed and closed deterministically; the bare open(..., "w") was never
# closed explicitly, leaving the flush to garbage collection.
with open("/Mounts/rbg-storage1/datasets/Metabo/datasets/iML1515_unk_mets_removed.json", "w") as out_file:
    json.dump(json_model_unk_mets_removed, out_file)

### Remove reactions involving metabolites with unknown SMILES

In [40]:
# Reload the model from disk: the earlier remove_metabolites call mutated
# matlab_model in place, and this section starts again from the original file.
matlab_model = load_matlab_model("/Mounts/rbg-storage1/datasets/Metabo/BiGG/iML1515.mat")
print(f"{len(matlab_model.metabolites)} METABOLITES, {len(matlab_model.reactions)} REACTIONS")

# metabolites without known smiles
# Build the lookup set once up front: calling set(...) inside the
# comprehension's filter would rebuild it for every metabolite in the model.
unknown_met_ids = set(unknown_mets)
unknown_mets_mat = [met for met in matlab_model.metabolites if met.id in unknown_met_ids]
# reactions affected by metabolites missing smiles
# A set comprehension deduplicates ids of reactions shared by several unknown
# metabolites; the result is still exposed as a list (order is arbitrary).
rxns_affected = list({rxn.id for met in unknown_mets_mat for rxn in met.reactions})

This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model iML1515. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, p


1877 METABOLITES, 2712 REACTIONS


In [41]:
# Strip the unknown-SMILES metabolites from the model in place.
# destructive=True also drops reactions, as the lower reaction count printed
# afterwards shows.
print("Remove reactions with metabolites missing smiles")
matlab_model.remove_metabolites(unknown_mets_mat, destructive=True)
print(f"{len(matlab_model.metabolites)} METABOLITES, {len(matlab_model.reactions)} REACTIONS")

Remove reactions with metabolites missing smiles
1842 METABOLITES, 2642 REACTIONS


In [42]:
# Write the destructively pruned model (metabolites and reactions removed) to disk.
pruned_destructive_mat_path = "/Mounts/rbg-storage1/datasets/Metabo/BiGG/iML1515_unk_mets_removed_destructive.mat"
save_matlab_model(matlab_model, pruned_destructive_mat_path)

In [22]:
# Keep only the reactions in which no reactant or product is an unknown-SMILES
# metabolite. The kept entries are the same dict objects as in json_model
# (no copy is made). unknown_mets must already be a set at this point.
json_model_unk_mets_removed_destructive = [
    rxn_dict
    for rxn_dict in json_model
    if unknown_mets.isdisjoint(
        met['metabolite_id']
        for met in rxn_dict.get("reactants", []) + rxn_dict.get("products", [])
    )
]

In [24]:
# Sanity check: count the distinct metabolite ids still referenced by the
# reactions that survived the destructive filter, alongside the reaction count.
# NOTE(review): the recorded output shows 1840 metabolites here versus 1842 for
# the .mat model after destructive removal - presumably two known metabolites
# occur only in dropped reactions; confirm.
known_mets_ids = []
for rxn_dict in json_model_unk_mets_removed_destructive:
    for side in ("reactants", "products"):
        known_mets_ids += [met['metabolite_id'] for met in rxn_dict.get(side, [])]
print(f"{len(set(known_mets_ids))} METABOLITES, {len(json_model_unk_mets_removed_destructive)} REACTIONS")

1840 METABOLITES, 2642 REACTIONS


In [25]:
# Write the destructively filtered dataset through a context manager so the
# handle is flushed and closed deterministically; the bare open(..., "w") was
# never closed explicitly, leaving the flush to garbage collection.
with open("/Mounts/rbg-storage1/datasets/Metabo/datasets/iML1515_unk_mets_removed_destructive.json", "w") as out_file:
    json.dump(json_model_unk_mets_removed_destructive, out_file)