In [1]:
import requests
from Bio import SeqIO
from Bio.KEGG import REST
from Bio.KEGG import Enzyme
%matplotlib inline
from IPython.display import Image
import io
import os
import pandas as pd
from Bio.KEGG.KGML import KGML_parser
from Bio.Graphics.KGML_vis import KGMLCanvas



In [9]:
def get_ncbi_protein_id(kegg_gene_id):
    """
    Look up the NCBI protein ID that KEGG maps to a KEGG gene ID.

    Queries the KEGG REST ``conv/ncbi-proteinid`` endpoint and returns the
    second whitespace-separated column of the first response line.

    Args:
        kegg_gene_id (str): KEGG gene identifier (e.g., "sly:101245212").

    Returns:
        str: The mapped identifier as returned by KEGG
            (e.g., "ncbi-proteinid:NP_001295307"), or the sentinel string
            'Nothing' when the request does not return HTTP 200, the response
            body is empty, or the first line has fewer than two columns.
    """
    base_url = "http://rest.kegg.jp"
    get_conv_url = f"{base_url}/conv/ncbi-proteinid/{kegg_gene_id}"
    response = requests.get(get_conv_url)
    if response.status_code != 200:
        return 'Nothing'

    # Only the first line is consulted, even if the response has several.
    for line in response.text.splitlines():
        parts = line.split()
        if len(parts) < 2:
            # Blank or single-column line: no usable mapping.
            return 'Nothing'
        return parts[1]

    # A completely empty body produces no lines; return the sentinel rather
    # than falling through to an implicit None, which callers comparing
    # against 'Nothing' would mistake for a real ID.
    return 'Nothing'
            
    

In [10]:
def get_uniprot_id(kegg_gene_id):
    """
    Look up the UniProt ID that KEGG maps to a KEGG gene ID.

    Queries the KEGG REST ``conv/uniprot`` endpoint and returns the second
    whitespace-separated column of the first response line.

    Args:
        kegg_gene_id (str): KEGG gene identifier (e.g., "sly:101245212").

    Returns:
        str: The mapped identifier as returned by KEGG (e.g., "up:G1JUH1"),
            or the sentinel string 'Nothing' when the request does not return
            HTTP 200, the response body is empty, or the first line has fewer
            than two columns.
    """
    base_url = "http://rest.kegg.jp"
    get_conv_url = f"{base_url}/conv/uniprot/{kegg_gene_id}"
    response = requests.get(get_conv_url)
    if response.status_code != 200:
        return 'Nothing'

    # Only the first line is consulted, even if the response has several.
    for line in response.text.splitlines():
        parts = line.split()
        if len(parts) < 2:
            # Blank or single-column line: no usable mapping.
            return 'Nothing'
        return parts[1]

    # A completely empty body produces no lines; return the sentinel rather
    # than falling through to an implicit None, which callers comparing
    # against 'Nothing' would mistake for a real ID.
    return 'Nothing'
            

In [20]:
def _kegg_link_targets(response_text):
    """Return the second whitespace-separated column of every line that has one."""
    targets = []
    for line in response_text.splitlines():
        parts = line.split()
        # Blank or single-column lines (e.g. an empty "no hits" body) carry
        # no target and are skipped instead of raising IndexError.
        if len(parts) >= 2:
            targets.append(parts[1])
    return targets


def _aaseq_from_kegg_text(response_text):
    """Concatenate the sequence lines of an ``aaseq`` response, skipping the first (header) line."""
    residues = []
    for line in response_text.splitlines()[1:]:
        parts = line.split()
        if parts:
            residues.append(parts[0])
    return ''.join(residues)


def get_gene_sequences_from_kegg_reaction(reaction_id):
    """
    Retrieves amino-acid sequences of genes associated with a KEGG reaction ID
    using the KEGG REST API.

    The lookup chain is reaction -> EC numbers -> KO entries -> genes. Only the
    FIRST EC number linked to the reaction and the FIRST KO entry linked to
    that EC number are followed; any further ones are ignored.

    Progress (the reaction ID and each link URL) and failures are printed.

    Args:
        reaction_id (str): The KEGG reaction ID (e.g., "R00001").

    Returns:
        list: One list per gene whose sequence could be fetched, laid out as
            ``[gene_id, ncbi_protein_id, uniprot_id, sequence]``. The NCBI and
            UniProt entries are each omitted when no mapping is found, so a
            record has between two and four items; the sequence string is
            always the last item. An empty list is returned when any link
            request fails or yields no entries.
    """
    print(reaction_id)
    base_url = "http://rest.kegg.jp"

    # 1. Reaction -> EC numbers
    get_ec_url = f"{base_url}/link/enzyme/{reaction_id}"
    response = requests.get(get_ec_url)
    print(get_ec_url)
    if response.status_code != 200:
        print(f"KEGG API request failed for EC number (status code: {response.status_code})")
        return []

    ec_numbers = _kegg_link_targets(response.text)
    if not ec_numbers:
        print(f"No EC number linked to reaction {reaction_id}")
        return []

    # 2. First EC number -> KO entries
    get_ko_url = f"{base_url}/link/ko/{ec_numbers[0]}"
    print(get_ko_url)
    response = requests.get(get_ko_url)
    if response.status_code != 200:
        print(f"KEGG API request failed for KO (status code: {response.status_code})")
        return []

    # Targets look like "ko:K21925"; the genes link below uses the bare ID.
    ko_ids = [target.split(':', 1)[-1] for target in _kegg_link_targets(response.text)]
    if not ko_ids:
        print(f"No KO entry linked to {ec_numbers[0]}")
        return []

    # 3. First KO entry -> genes
    get_genes_url = f"{base_url}/link/genes/{ko_ids[0]}"
    print(get_genes_url)
    response = requests.get(get_genes_url)
    if response.status_code != 200:
        print(f"KEGG API request failed for genes (status code: {response.status_code})")
        return []

    gene_ids = _kegg_link_targets(response.text)

    # 4. Fetch each gene's amino-acid sequence and external IDs
    sequences = []
    for gene_id in gene_ids:
        get_gene_url = f"{base_url}/get/{gene_id}/aaseq"
        response = requests.get(get_gene_url)
        if response.status_code != 200:
            print(f"Warning: Failed to retrieve sequence for gene {gene_id}")
            continue

        seq_record = _aaseq_from_kegg_text(response.text)

        record = [gene_id]
        ncbi_protein_id = get_ncbi_protein_id(gene_id)
        uniprot_id = get_uniprot_id(gene_id)
        # Treat both the 'Nothing' sentinel and an empty/None result as
        # "no mapping" so that None never ends up inside a record.
        if ncbi_protein_id and ncbi_protein_id != 'Nothing':
            record.append(ncbi_protein_id)
        if uniprot_id and uniprot_id != 'Nothing':
            record.append(uniprot_id)
        record.append(seq_record)

        sequences.append(record)

    return sequences



In [22]:
import json

In [24]:
# Load the MetaNetX reaction-properties table, indexed by its '#ID' column.
# The first 351 lines of the file are skipped before the header row is read
# (presumably the file's comment preamble -- TODO confirm against the file).
reaction_csv = pd.read_csv(
    "./metanetx_new_final/metanetx/data_metanetx_latest/reac_prop_csv.csv",
    index_col='#ID',
    skiprows=list(range(351)),
)
# Remove the row whose index label is 'EMPTY'.
reaction_csv = reaction_csv.drop(index='EMPTY')
# Keep only the reactions that have a value in the 'classifs' column.
reaction_csv_notNAN = reaction_csv.dropna(subset=['classifs'])

# Use a context manager so the JSON file handle is closed deterministically.
with open('./rule_mapping_metanetx.json') as rule_map_file:
    rule_map = json.load(rule_map_file)

In [25]:
# MetaNetX reaction IDs that make up the pathway under study.
pathway = [
    'MNXR102311',
    'MNXR114039',
    'MNXR151676',
]

# Look up each pathway step in rule_map; per the cell output, every entry is
# a list of MetaNetX reaction IDs.
all_reactions = [rule_map[step_id] for step_id in pathway]
all_reactions

[['MNXR102311'],
 ['MNXR114039', 'MNXR131130', 'MNXR132834', 'MNXR132876'],
 ['MNXR151676', 'MNXR169858']]

In [45]:
# Load the MetaNetX reaction cross-reference table (tab-separated).
# The first 351 lines are skipped before the header row is read, and
# on_bad_lines='skip' silently drops any row pandas cannot parse.
reaction_ref_csv = pd.read_csv(
    "./metanetx_new_final/metanetx/data_metanetx_latest/reac_xref.tsv",
    sep='\t',
    skiprows=list(range(351)),
    on_bad_lines='skip',
)
# Number of cross-reference rows that were loaded.
len(reaction_ref_csv)

384802

In [46]:
#reaction_ref_csv_notNAN = reaction_ref_csv.dropna(subset=['ID'])
#len(reaction_ref_csv)

In [48]:
# Keep only the cross-references whose '#source' value contains 'rhea'
# (this matches both the 'rhea:' and 'rheaR:' prefixes).
reaction_ref_rhea = reaction_ref_csv.loc[
    lambda xrefs: xrefs['#source'].str.contains('rhea')
]

In [49]:
# Display the Rhea cross-references (pandas truncates the middle rows of a long frame).
reaction_ref_rhea

Unnamed: 0,#source,ID,description
82,rhea:54528,EMPTY,1 chebi:58273@rheaC:comp <?> 1 chebi:78346@rhe...
83,rhea:54529,EMPTY,1 chebi:58273@rheaC:comp --> 1 chebi:78346@rhe...
84,rhea:54530,EMPTY,1 chebi:58273@rheaC:comp <-- 1 chebi:78346@rhe...
85,rhea:54531,EMPTY,1 chebi:58273@rheaC:comp <=> 1 chebi:78346@rhe...
86,rheaR:54528,EMPTY,1 chebi:58273@rheaC:comp <?> 1 chebi:78346@rhe...
...,...,...,...
384555,rhea:28826,MNXR99958,1 chebi:15378@rheaC:in + 1 chebi:53071@rheaC:i...
384556,rheaR:28823,MNXR99958,1 chebi:15378@rheaC:in + 1 chebi:53071@rheaC:i...
384557,rheaR:28824,MNXR99958,1 chebi:15378@rheaC:in + 1 chebi:53071@rheaC:i...
384558,rheaR:28825,MNXR99958,1 chebi:15378@rheaC:in + 1 chebi:53071@rheaC:i...


In [37]:
# Preview the first five rows of the cross-reference table to check the columns.
reaction_ref_csv.head()

Unnamed: 0,#source,ID,description
0,EMPTY,EMPTY,Empty equation
1,bigg.reaction:CRBNTD,EMPTY,H2CO3 dissociation||1 biggM:h2co3@biggC:x = 1 ...
2,bigg.reaction:DNADRAIN,EMPTY,Dna sink transport reaction|| =
3,bigg.reaction:H2CO3D2,EMPTY,Carboxylic acid dissociation||1 biggM:h2co3@bi...
4,bigg.reaction:H2CO3D2m,EMPTY,"Carboxylic acid dissociation, mitochondrial||1..."


In [50]:
# Keep only the cross-references whose '#source' value contains 'kegg'
# (this matches both the 'kegg.reaction:' and 'keggR:' prefixes).
reaction_ref_kegg = reaction_ref_csv.loc[
    lambda xrefs: xrefs['#source'].str.contains('kegg')
]
reaction_ref_kegg

Unnamed: 0,#source,ID,description
316,kegg.reaction:R00253,MNXR100024,L-Glutamate:ammonia ligase (ADP-forming)||ATP ...
317,keggR:R00253,MNXR100024,L-Glutamate:ammonia ligase (ADP-forming)||ATP ...
378,kegg.reaction:R00256,MNXR100030,L-glutamine amidohydrolase||L-Glutamine + H2O ...
379,keggR:R00256,MNXR100030,L-glutamine amidohydrolase||L-Glutamine + H2O ...
540,kegg.reaction:R01333,MNXR100060,glycolaldehyde:NAD+ oxidoreductase||Glycolalde...
...,...,...,...
384335,keggR:R03313,MNXR99896,L-glutamate-5-semialdehyde:NADP+ 5-oxidoreduct...
384384,kegg.reaction:R03550,MNXR99920,"gallate:oxygen 4,5-oxidoreductase (decyclizing..."
384385,keggR:R03550,MNXR99920,"gallate:oxygen 4,5-oxidoreductase (decyclizing..."
384530,kegg.reaction:R03033,MNXR99957,D-galactonate hydro-lyase||D-Galactonate <=> 2...


In [21]:
# Fetch the gene records for one KEGG reaction and print them keyed by the
# reaction ID. The call also prints the reaction ID and each KEGG link URL
# it requests along the way.
reaction_id = 'R09961'
gene_sequences = get_gene_sequences_from_kegg_reaction(reaction_id)
print({reaction_id:gene_sequences})

R09961
http://rest.kegg.jp/link/enzyme/R09961
http://rest.kegg.jp/link/ko/ec:4.2.3.105
http://rest.kegg.jp/link/genes/K21925
{'R09961': [['sly:101245212', 'ncbi-proteinid:NP_001295307', 'up:G1JUH1', 'MSIFSTRYLVTPFSSFSPPKAFVSKACSLSTGQPLNYSPNISTNIISSSNGIINPIRRSGNYEPTMWNYEYIQSTHNHHVGEKYMKRFNELKAEMKKHLMMMLHEESQELEKLELIDNLQRLGVSYHFKDEIIQILRSIHDQSSSEATSANSLYYTALKFRILRQHGFYISQDILNDFKDEQGHFKQSLCKDTKGLLQLYEASFLSTKSETSTLLESANTFAMSHLKNYLNGGDEENNWMVKLVRHALEVPLHCMMLRVETRWYIDIYENIPNANPLLIELAKLDFNFVQAMHQQELRNLSRWWKKSMLAEKLPFARDRIVEAFQWITGMIFESQENEFCRIMLTKVTAMATVIDDIYDVYGTLDELEIFTHAIQRMEIKAMDELPHYMKLCYLALFNTSSEIAYQVLKEQGINIMPYLTKSWADLSKSYLQEARWYYSGYTPSLDEYMENAWISVGSLVMVVNAFFLVTNPITKEVLEYLFSNKYPDIIRWPATIIRLTDDLATSSNEMKRGDVPKSIQCYMKENGASEEEARKHINLMIKETWKMINTAQHDNSLFCEKFMGCAVNIARTGQTIYQHGDGHGIQNYKIQNRISKLFFEPITISMP'], ['sly:101249864', 'ncbi-proteinid:XP_004230563', 'MVSILSNIRMIMVTYKRPSLFTSLRRHAANNIIITKHSHPISATRRSGNYKPTLWDFQFIESLHNPYAGDKYMKRLNELKKEVKKMMMTVEGSQDEELEKLELIDNLERLGVSYHFKDEIMQILKSIHEQKITSTDNSL