In [12]:
import pandas as pd
from cobra.io import load_json_model
from collections import Counter

In [4]:
# Read the model from its JSON file (path is relative to the working directory).
model_path = 'iCHO3K.json'
model = load_json_model(model_path)

In [17]:
# Maps each broad category name to the list of upper-case subsystem labels
# grouped under it.
# NOTE(review): presumably consumed by the subsystem -> category lookup used in
# later cells; that helper is not visible here, so confirm how duplicates and
# letter case are handled.
category_mapping = {
    'Amino Acid Metabolism': [
        'AMINO ACID METABOLISM',
        'ALANINE AND ASPARTATE METABOLISM',
        'AMINO ACID DERIVATIVE',
        'ARGININE AND PROLINE METABOLISM',
        'BETA-ALANINE METABOLISM',
        'CYSTEINE METABOLISM',
        'D-ALANINE METABOLISM',
        'GLUTAMATE METABOLISM',
        'GLUTATHIONE METABOLISM',
        'GLYCINE, SERINE, ALANINE AND THREONINE METABOLISM',
        'GLYCINE, SERINE, AND THREONINE METABOLISM',
        'HISTIDINE METABOLISM',
        'LYSINE METABOLISM',
        'METHIONINE AND CYSTEINE METABOLISM',
        'METHIONINE METABOLISM',
        'PEPTIDE METABOLISM',
        'PHENYLALANINE METABOLISM',
        'SELENOAMINO ACID METABOLISM',
        'SPERMINE DEGRADATION',  # Added
        'TAURINE AND HYPOTAURINE METABOLISM',
        'TRYPTOPHAN METABOLISM',
        'TYROSINE METABOLISM',
        'UREA CYCLE',
        'UREA CYCLE/AMINO GROUP METABOLISM',
        'VALINE, LEUCINE, AND ISOLEUCINE METABOLISM',
    ],
    'Carbohydrates Metabolism': [
        'ALKALOID SYNTHESIS',
        'AMINOSUGAR METABOLISM',
        'ASCORBATE AND ALDARATE METABOLISM',
        'BUTANOATE METABOLISM',
        'C5-BRANCHED DIBASIC ACID METABOLISM',
        'FRUCTOSE AND MANNOSE METABOLISM',
        'GALACTOSE METABOLISM',
        'GLYOXYLATE AND DICARBOXYLATE METABOLISM',
        'HEPARAN SULFATE DEGRADATION',
        'HYALURONAN METABOLISM',
        'LIMONENE AND PINENE DEGRADATION',
        'LIPOATE METABOLISM',
        'PENTOSE AND GLUCURONATE INTERCONVERSIONS',
        'PROPANOATE METABOLISM',
        'PYRUVATE METABOLISM',
        'STARCH AND SUCROSE METABOLISM',
        'STILBENE, COUMARINE AND LIGNIN SYNTHESIS',
        'OTHER CARBON METABOLISM',
    ],
    'Energy Metabolism': [
        'BIOMASS',
        'BIOMASS SYNTHESIS',
        'CITRIC ACID CYCLE',
        'GLYCOLYSIS/GLUCONEOGENESIS',
        'OXIDATIVE PHOSPHORYLATION',
        'PENTOSE PHOSPHATE PATHWAY',
        'ROS DETOXIFICATION',
        'CENTRAL CARBON AND ENERGY METABOLISM',
    ],
    'Exchange/Transport': [
        'EXCHANGE/DEMAND/SINK',
        'EXCHANGE/DEMAND REACTION',
        'EXCHANGE/DEMAND/SINK REACTION',
        'EXCHANGE',
        'DEMAND',
        'SINK',
        'TRANSPORT, ENDOPLASMIC RETICULAR',
        'TRANSPORT, EXTRACELLULAR',
        'TRANSPORT, GOLGI APPARATUS',
        'TRANSPORT, LYSOSOMAL',
        'TRANSPORT, MITOCHONDRIAL',
        'TRANSPORT, NUCLEAR',
        'TRANSPORT, PEROXISOMAL',
        'TRANSPORT',
    ],
    'Lipid Metabolism': [
        'ARACHIDONIC ACID METABOLISM',
        'BILE, EICOSANOID AND STEROID METABOLISM',
        'BILE ACID SYNTHESIS',
        'CARNITINE SHUTTLE',
        'CHOLESTEROL METABOLISM',
        'EICOSANOID METABOLISM',
        'FATTY ACID METABOLISM',
        'FATTY ACID OXIDATION',
        'FATTY ACID SYNTHESIS',
        'FATTY ACID ELONGATION',
        # NOTE(review): this label is also listed under 'Protein Product
        # Synthesis'; which category it resolves to depends on the lookup
        # helper's iteration order — confirm the intended category.
        'GLYCAN AND GLYCOSAMINOGLYCAN METABOLISM',
        'GLYCEROLIPID METABOLISM',
        'GLYCEROPHOSPHOLIPID METABOLISM',
        'GLYCOSPHINGOLIPID METABOLISM',
        'GLYCOSYLPHOSPHATIDYLINOSITOL-ANCHOR BIOSYNTHESIS',
        'GLYCOSYLPHOSPHATIDYLINOSITOL (GPI)-ANCHOR BIOSYNTHESIS',
        # The trailing comma below is required: without it Python concatenates
        # this literal with the next one into a single, unmatched label.
        'GLYCEROPHOSPHOLIPID, SPHINGOLIPID AND INOSITOL METABOLISM',
        'INOSITOL PHOSPHATE METABOLISM',
        'LINOLEATE METABOLISM',
        'PHOSPHATIDYLINOSITOL PHOSPHATE METABOLISM',
        'R GROUP SYNTHESIS',
        'SPHINGOLIPID METABOLISM',
        'SQUALENE AND CHOLESTEROL SYNTHESIS',
        'TRIACYLGLYCEROL SYNTHESIS',
    ],
    'Nucleotide Metabolism': [
        'NUCLEOTIDE SALVAGE PATHWAY',
        'NUCLEOTIDE INTERCONVERSION',
        'NUCLEOTIDES',
        'PURINE METABOLISM',
        'PURINE CATABOLISM',
        'PURINE SYNTHESIS',
        'PYRIMIDINE METABOLISM',
        'PYRIMIDINE BIOSYNTHESIS',
        'PYRIMIDINE CATABOLISM',
        'PYRIMIDINE SYNTHESIS',
        'NUCLEOTIDE SUGAR METABOLISM',
        'NUCLEOTIDE METABOLISM',
        # NOTE(review): misspelled variant; presumably kept to match a label
        # that appears this way in the model — verify before correcting.
        'NUCELOTIDE METABOLISM',
        'SALVAGE PATHWAY',
    ],
    'Protein Product Synthesis': [
        'BLOOD GROUP SYNTHESIS',
        'CHONDROITIN SULFATE DEGRADATION',
        'CHONDROITIN SYNTHESIS',
        'KERATAN SULFATE METABOLISM',
        'KERATAN SULFATE SYNTHESIS',
        'N-GLYCAN METABOLISM',
        'N-GLYCAN BIOSYNTHESIS',
        'N-GLYCAN SYNTHESIS',  # Added
        'N-GLYCAN DEGRADATION',
        'O-GLYCAN SYNTHESIS',
        'PROTEIN ASSEMBLY',
        'PROTEIN DEGRADATION',
        'PROTEIN MODIFICATION',
        'PROTEIN PRODUCTION',
        # NOTE(review): duplicate of the entry under 'Lipid Metabolism'.
        'GLYCAN AND GLYCOSAMINOGLYCAN METABOLISM',
    ],
    'Unassigned': [
        'UNASSIGNED',
        'MISCELLANEOUS',
    ],
    'Vitamin & Cofactor Metabolism': [
        'ANDROGEN AND ESTROGEN SYNTHESIS AND METABOLISM',
        'BIOTIN METABOLISM',
        'COA METABOLISM',
        'COA SYNTHESIS',
        'COA CATABOLISM',
        'CYTOCHROME METABOLISM',
        'FOLATE METABOLISM',
        'NAD METABOLISM',
        'PORPHYRIN METABOLISM',
        'TETRAHYDROBIOPTERIN METABOLISM',
        'THIAMINE METABOLISM',
        'UBIQUINONE AND OTHER TERPENOID-QUINONE SYNTHESIS',
        'UBIQUINONE SYNTHESIS',
        'VITAMIN A METABOLISM',
        'VITAMIN B2 METABOLISM',
        'VITAMIN B6 METABOLISM',
        'VITAMIN B12 METABOLISM',
        'VITAMIN C METABOLISM',
        'VITAMIN D METABOLISM',
        'VITAMIN D',  # Adjusted to include both variants
        'VITAMIN E METABOLISM',
        'XENOBIOTICS METABOLISM',
        'HEME SYNTHESIS',
        'HEME DEGRADATION',
        'CYP METABOLISM',
        'VITAMIN AND COFACTOR METABOLISM',
    ],
}


# Function to get the most common category, excluding "Unassigned" and "Exchange/Transport"
def get_most_common_category(categories):
    """Return the category that occurs most often in ``categories``.

    Entries equal to "Unassigned" or "Exchange/Transport" are ignored when
    counting. If nothing remains after ignoring them (including when
    ``categories`` is empty), "Unassigned" is returned.
    """
    ignored = ("Unassigned", "Exchange/Transport")
    tally = Counter(label for label in categories if label not in ignored)

    # An empty tally means every entry was ignored (or there were none).
    if not tally:
        return "Unassigned"

    (winner, _count), = tally.most_common(1)
    return winner

In [24]:
# Map each gene name to the most common category among its reactions
# ("Unassigned" and "Exchange/Transport" are ignored by get_most_common_category).
# NOTE(review): get_category is not defined in any cell visible here — confirm it
# is defined before this cell runs on a fresh kernel.
gene_category_dict = {
    gene.name: get_most_common_category(
        [get_category(reaction.subsystem) for reaction in gene.reactions]
    )
    for gene in model.genes
}

In [None]:
# Debug listing: print every gene name followed by the category of each of its reactions.
for gene in model.genes:
    print(gene.name)
    for reaction in gene.reactions:
        print(get_category(reaction.subsystem))

In [20]:
# Load the gene table; later cells read its 'Gene Symbol' column.
genes_csv_path = '../Data/metabolic_genes.csv'
metabolic_genes = pd.read_csv(genes_csv_path)

In [27]:
# Annotate each row with the category recorded for its gene symbol; symbols
# missing from gene_category_dict fall back to "Unassigned".
def _lookup_system(symbol):
    return gene_category_dict.get(symbol, "Unassigned")


metabolic_genes['system'] = metabolic_genes['Gene Symbol'].apply(_lookup_system)

In [32]:
# Keep only the rows whose 'system' value is something other than 'Unassigned'.
has_system = metabolic_genes['system'] != 'Unassigned'
filtered_df = metabolic_genes.loc[has_system]

In [33]:
# Remove the 'Unnamed: 0' column when present; errors='ignore' makes this a no-op otherwise.
filtered_df = filtered_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [34]:
# Renumber the rows from 0 after filtering; drop=True discards the old index
# instead of inserting it as a new column.
filtered_df = filtered_df.reset_index(drop=True)

In [35]:
# Write the filtered table to CSV, omitting the index column.
output_csv_path = '../Data/filtered_metabolic_genes.csv'
filtered_df.to_csv(output_csv_path, index=False)