In [56]:
import pandas as pd
import argparse
import numpy as np
import requests
from concurrent.futures import ThreadPoolExecutor

def read_and_process_file(file):
    sample_name = file.split('/')[-1].split('_')[0]
    df = pd.read_csv(file, sep='\t')
    df = df[df.columns[df.sum() != 0]]
    df.columns = ["Reaction", "Abundance"]
    if df.index.duplicated().any():
        print("Duplicate indices detected. Making indices unique.")
        df.index = df.index + "_" + pd.Series(range(len(df))).astype(str)
        
    # remove UNMAPPED and UNGROUPED rows
    for i, row in df.iterrows():
        if row["Reaction"] == "UNMAPPED" or row["Reaction"] == "UNGROUPED":
            df.drop(i, inplace=True)
        
    # reset index
    df.reset_index(drop=True, inplace=True)
    
    print(f"Finished processing {file}")
    print(df.head())
    return df

def get_reactions_from_ko(ko_id):
    url = f"http://rest.kegg.jp/link/reaction/{ko_id}"
    response = requests.get(url)
    return response.text

def get_compounds_from_reaction(reaction_id):
    url = f"http://rest.kegg.jp/get/{reaction_id}"
    response = requests.get(url)
    return response.text

# KEGG server is configured to consider intermittent accesses as an attack as a countermeasure against DDoS attacks.

# Your IP addres xxx.xxx.xxx.xx is denied access by the server.

# The criteria are as follows:
# 1) When there are 240 accesses within 60 seconds of the last access,
#    the SuspectDoS environment variable is set to "1".
# 2) When there are 240 additional accesses within 3600 seconds,
#        our server denies access from the corresponding IP address.
# 3) When there are no accesses for the next 3600 seconds,
#     the counter will return to 0.
# 4) If there is another access within 3600 seconds of the last access,
#    the counter continues to add.

df = read_and_process_file("SRR7947168_kegg_3_reactions.tsv")

Finished processing SRR7947168_kegg_3_reactions.tsv
  Reaction    Abundance
0   K01520  1198.265562
1   K02428   751.252776
2   K00794   742.036107
3   K00954   613.162434
4   K00981   514.986739


In [57]:
for ko_id in df["Reaction"]:
    print(ko_id)
    print(get_reactions_from_ko(ko_id))

K01520
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
</body></html>

K02428
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
</body></html>

K00794
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
</body></html>

K00954
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access this resource.</p>
</body></html>

K00981
<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">
<html><head>
<title>403 Forbidden</title>
</head><body>
<h1>Forbidden</h1>
<p>You don't have permission to access 

In [None]:
def process_ko_id(df):
    
    ko_id = df.index[0]
    metabolite_abundance = {}
    reaction_mapping = get_reactions_from_ko(f"ko:{ko_id}")
    for line in reaction_mapping.strip().split('\n'):
        if '\t' not in line:
            continue
        _, reaction_id = line.split('\t')
        reaction_details = get_compounds_from_reaction(reaction_id.split(':')[1])
        metabolites = []
        for line in reaction_details.strip().split('\n'):
            if line.startswith("DEFINITION"):
                definition = line.split('DEFINITION')[1].strip()
                metabolites = [s.strip() for s in definition.split('<=>')[0].split('+')] + [s.strip() for s in definition.split('<=>')[1].split('+')]
        for metabolite in metabolites:
            print(metabolite)
            parts = [p for p in metabolite.split() if p]
            if parts and parts[0].isdigit():
                count = int(parts[0])
                metabolite_name = " ".join(parts[1:])
            else:
                count = 1
                metabolite_name = metabolite
            if metabolite_name not in metabolite_abundance:
                metabolite_abundance[metabolite_name] = 0
            metabolite_abundance[metabolite_name] += df.loc[ko_id].values[0] * count
            print(metabolite_abundance[metabolite_name])
    return metabolite_abundance