In [1]:
import requests
from Bio import SeqIO



In [2]:
from Bio.KEGG import REST

from Bio.KEGG import Enzyme

In [3]:
# Show plots as part of the notebook
%matplotlib inline

# Show images inline
from IPython.display import Image

# Standard library packages
import io
import os

# Import Biopython modules to interact with KEGG
from Bio import SeqIO
from Bio.KEGG import REST
from Bio.KEGG.KGML import KGML_parser
from Bio.Graphics.KGML_vis import KGMLCanvas

# Import Pandas, so we can use dataframes
import pandas as pd

In [7]:
def get_protein_sequence(uniprot_id):
    """
    Fetches the amino acid sequence of a protein from UniProt.

    Args:
        uniprot_id (str): The UniProt ID of the protein.

    Returns:
        str: The amino acid sequence of the protein, or None if not found.
    """

    url = f'https://www.uniprot.org/uniprot/{uniprot_id}.fasta'
    response = requests.get(url)

    if response.status_code == 200:
        # Separate header from sequence
        header, sequence = response.text.split('\n', 1)
        return sequence.replace('\n', '')  # Remove newlines
    else:
        print(f"Protein not found (UniProt ID: {uniprot_id})")
        return None

# Example usage
#uniprot_id = "P01308"  # Example: Human Insulin
uniprot_id = "G1JUH1"
sequence = get_protein_sequence(uniprot_id)

if sequence:
    print(sequence)

MSIFSTRYLVTPFSSFSPPKAFVSKACSLSTGQPLNYSPNISTNIISSSNGIINPIRRSGNYEPTMWNYEYIQSTHNHHVGEKYMKRFNELKAEMKKHLMMMLHEESQELEKLELIDNLQRLGVSYHFKDEIIQILRSIHDQSSSEATSANSLYYTALKFRILRQHGFYISQDILNDFKDEQGHFKQSLCKDTKGLLQLYEASFLSTKSETSTLLESANTFAMSHLKNYLNGGDEENNWMVKLVRHALEVPLHCMMLRVETRWYIDIYENIPNANPLLIELAKLDFNFVQAMHQQELRNLSRWWKKSMLAEKLPFARDRIVEAFQWITGMIFESQENEFCRIMLTKVTAMATVIDDIYDVYGTLDELEIFTHAIQRMEIKAMDELPHYMKLCYLALFNTSSEIAYQVLKEQGINIMPYLTKSWADLSKSYLQEARWYYSGYTPSLDEYMENAWISVGSLVMVVNAFFLVTNPITKEVLEYLFSNKYPDIIRWPATIIRLTDDLATSSNEMKRGDVPKSIQCYMKENGASEEEARKHINLMIKETWKMINTAQHDNSLFCEKFMGCAVNIARTGQTIYQHGDGHGIQNYKIQNRISKLFFEPITISMP


In [38]:
def get_uniprot_ids_from_rhea_id(rhea_id):
    """
    Fetches a list of UniProt IDs associated with a Rhea reaction ID.

    Args:
        rhea_id: The Rhea reaction ID as a string (e.g., "RHEA:10000").

    Returns:
        A list of UniProt IDs, or None if an error occurs.
    """

    url= "https://rest.uniprot.org/uniprotkb/search?"
    rid = rhea_id.split(':')[1]

    parameter = {
      "query":'((cc_catalytic_activity:"rhea:"'+rid+') AND (fragment:false) AND (reviewed:true))',
      "fields":"accession,rhea",
      "format":'tsv',
    }
    response = requests.get(url,params=parameter)
    #print(response.content)

   
    data = response.text.strip()

    # Check if the response has multiple lines (i.e., multiple UniProt IDs)
    if '\n' in data:
        uniprot_ids = data.splitlines()[1:]  # Skip the header line
    else:
        uniprot_ids = [data] if data else []

    return uniprot_ids




In [35]:
reaction_csv = pd.read_csv("./metanetx_new_final/metanetx/data_metanetx_latest/reac_prop_csv.csv", index_col = '#ID',skiprows = [i for i in range(351)])

In [36]:
reaction_csv = reaction_csv.drop(index='EMPTY')

In [37]:
reaction_csv_notNAN = reaction_csv.dropna(subset=['classifs'])
len(reaction_csv_notNAN)

39735

In [47]:
reaction_csv.head()

Unnamed: 0_level_0,mnx_equation,reference,classifs,is_balanced,is_transport
#ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MNXR01,1 MNXM01@MNXD1 = 1 MNXM1@MNXD1,mnx:MNXR01,,B,
MNXR02,1 MNXM1@MNXD1 = 1 MNXM1@MNXD2,mnx:MNXR02,,B,T
MNXR03,1 MNXM01@MNXD1 = 1 MNXM01@MNXD2,mnx:MNXR03,,B,T
MNXR100000,1 MNXM10958@MNXD1 + 1 MNXM1104529@MNXD1 = 1 MN...,biggR:GALNACT5g,,,
MNXR100001,1 MNXM1100890@MNXD1 + 1 MNXM147296@MNXD1 = 1 M...,biggR:GALNTg,,,


In [49]:
import json
rule_map = json.load(open('./rule_mapping_metanetx.json'))

In [55]:
pathway = ['MNXR102311','MNXR114039', 'MNXR151676']

#reaction_id = 'MNXR111780'
all_reactions = [rule_map[rid] for rid in pathway]
all_reactions

[['MNXR102311'],
 ['MNXR114039', 'MNXR131130', 'MNXR132834', 'MNXR132876'],
 ['MNXR151676', 'MNXR169858']]

In [57]:
# Example usage
#rhea_id = "RHEA:25221"

pathway = ['MNXR102311','MNXR114039', 'MNXR151676']

#reaction_id = 'MNXR111780'
all_reactions = [rule_map[rid] for rid in pathway]
#rhea_id = "RHEA:27754"

for rids in all_reactions:
    for reaction_id in rids:
        print(reaction_id)
        print("EC numbers are = " , reaction_csv.loc[reaction_id]['classifs'])

        if 'rhea' in reaction_csv.loc[reaction_id]['reference']:
            rhea_id = 'RHEA'+':'+reaction_csv.loc[reaction_id]['reference'].split(':')[1]



        uniprot_ids = get_uniprot_ids_from_rhea_id(rhea_id)

        #sequence = []

        if uniprot_ids:
            record = []
            print("UniProt IDs for", rhea_id)
            print(len(uniprot_ids))
            for uniprot_id in uniprot_ids:
                record.append([uniprot_id.split('\t')[0],get_protein_sequence(uniprot_id.split('\t')[0])])
                #print(uniprot_id)
                #print(uniprot_id.split('\t')[0])
                #print(get_protein_sequence(uniprot_id.split('\t')[0]))
                #print('--------')
        '''   
        else:
            print("No UniProt IDs found or an error occurred.")
        '''

        print({rhea_id:record})
        print("////")
    print("--------------------")


MNXR102311
EC numbers are =  2.6.1.85
UniProt IDs for RHEA:41432
1
{'RHEA:41432': [['B0CN28', 'MTAPADTVHPAGQPDYVAQVATVPFRLGRPEELPGTLDELRAAVSARAGEAVRGLNRPGARTDLAALLAATERTRAALAPVGAGPVGDDPSESEANRDNDLAFGIVRTRGPVAELLVDAALAALAGILEVAVDRGSDLEDAAWQRFIGGFDALLGWLADPHSAPRPATVPGAGPAGPPVHQDALRRWVRGHHVFMVLAQGCALATACLRDSAARGDLPGAEASAAAAEALMRGCQGALLYAGDANREQYNEQIRPTLMPPVAPPKMSGLHWRDHEVLIKELAGSRDAWEWLSAQGSERPATFRAALAETYDSHIGVCGHFVGDQSPSLLAAQGSTRSAVGVIGQFRKIRLSALPEQPATQQGEPS']]}
////
--------------------
MNXR114039
EC numbers are =  1.11.2.5
UniProt IDs for RHEA:41432
1
{'RHEA:41432': [['B0CN28', 'MTAPADTVHPAGQPDYVAQVATVPFRLGRPEELPGTLDELRAAVSARAGEAVRGLNRPGARTDLAALLAATERTRAALAPVGAGPVGDDPSESEANRDNDLAFGIVRTRGPVAELLVDAALAALAGILEVAVDRGSDLEDAAWQRFIGGFDALLGWLADPHSAPRPATVPGAGPAGPPVHQDALRRWVRGHHVFMVLAQGCALATACLRDSAARGDLPGAEASAAAAEALMRGCQGALLYAGDANREQYNEQIRPTLMPPVAPPKMSGLHWRDHEVLIKELAGSRDAWEWLSAQGSERPATFRAALAETYDSHIGVCGHFVGDQSPSLLAAQGSTRSAVGVIGQFRKIRLSALPEQPATQQGEPS']]}
////
MNXR131130
EC numbers are =  1.11;1.11.

In [48]:
# Example usage
#rhea_id = "RHEA:25221"
reaction_id = 'MNXR102311'
#rhea_id = "RHEA:27754"

print("EC numbers are = " , reaction_csv.loc[reaction_id]['classifs'])

if 'rhea' in reaction_csv.loc[reaction_id]['reference']:
    rhea_id = 'RHEA'+':'+reaction_csv.loc[reaction_id]['reference'].split(':')[1]
    


uniprot_ids = get_uniprot_ids_from_rhea_id(rhea_id)

#sequence = []

if uniprot_ids:
    record = []
    print("UniProt IDs for", rhea_id)
    print(len(uniprot_ids))
    for uniprot_id in uniprot_ids:
        record.append([uniprot_id.split('\t')[0],get_protein_sequence(uniprot_id.split('\t')[0])])
        #print(uniprot_id)
        #print(uniprot_id.split('\t')[0])
        #print(get_protein_sequence(uniprot_id.split('\t')[0]))
        #print('--------')
'''   
else:
    print("No UniProt IDs found or an error occurred.")
'''
    
print({rhea_id:record})


EC numbers are =  2.6.1.85
UniProt IDs for RHEA:11536
2
{'RHEA:11536': [['O09460', 'MSLSESLAKYGITGATNIVHNPSHEELFAAETQASLEGFEKGTVTEMGAVNVMTGVYTGRSPKDKFIVKNEASKEIWWTSDEFKNDNKPVTEEAWAQLKALAGKELSNKPLYVVDLFCGANENTRLKIRFVMEVAWQAHFVTNMFIRPTEEELKGFEPDFVVLNASKAKVENFKELGLNSETAVVFNLAEKMQIILNTWYGGEMKKGMFSMMNFYLPLQGIAAMHCSANTDLEGKNTAIFFGLSGTGKTTLSTDPKRLLIGDDEHGWDDDGVFNFEGGCYAKVINLSKENEPDIWGAIKRNALLENVTVDANGKVDFADKSVTENTRVSYPIFHIKNIVKPVSKAPAAKRVIFLSADAFGVLPPVSILSKEQTKYYFLSGFTAKLAGTERGITEPTPTFSSCFGAAFLTLPPTKYAEVLVKRMEASGAKAYLVNTGWNGTGKRISIKDTRGIIDAILDGSIDTANTATIPYFNFTVPTELKGVDTKILDPRNTYADASEWEVKAKDLAERFQKNFKKFESLGGDLVKAGPQL'], ['Q7X2H8', 'MHIDNIENLSDREFDYIVVGGGSAGAAVAARLSEDPAVSVALVEAGPDDRGVPEVLQLDRWMELLESGYDWDYPIEPQENGNSFMRHARAKVMGGCSSHNSCIAFWAPREDLDEWEAKYGATGWNAEAAWPLYKRLETNEDAGPDAPHHGDSGPVHLMNVPPKDPTGVALLDACEQAGIPRAKFNTGTTVVNGANFFQINRRADGTRSSSSVSYIHPIVEQENFTLLTGLRARQLVFDADRRCTGVDIVDSAFGHTHRLTARNEVVLSTGAIDTPKLLMLSGIGPAAHLAEHGIEVLVDSPGVGEHLQDHPEGVVQFEAKQPMVAESTQWWEIGIFTPTEDGLDRPDLMMHYGSVPFDMNTLRHGYPT