In [1]:
import pandas as pd  # Import pandas library for data manipulation
import argparse  # Import argparse for command-line argument parsing
import numpy as np  # Import numpy for numerical operations
import requests  # Import requests for making HTTP requests

In [12]:
def read_and_process_file(file):
    """Load a tab-separated table and strip uninformative rows and columns.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts. The first column is used as the
        row index and text following a ``#`` is treated as a comment.

    Returns
    -------
    pandas.DataFrame
        The table without the rows labelled ``UNMAPPED`` / ``UNGROUPED`` and
        without columns whose sum is zero. If any row label is repeated, every
        label gets a ``_<row position>`` suffix so the index is unique.

    Notes
    -----
    No sample name is derived from ``file`` here; callers that want
    per-sample column names have to rename the columns themselves.
    """
    df = pd.read_csv(file, sep='\t', index_col=0, comment='#')
    df = df[~df.index.isin(['UNMAPPED', 'UNGROUPED'])]
    # Keep only the columns that carry some signal (non-zero column sum).
    df = df[df.columns[df.sum() != 0]]
    if df.index.duplicated().any():
        print("Duplicate indices detected. Making indices unique.")
        # Suffix *every* label with its position; str formatting also copes
        # with non-string labels.
        df.index = [f"{label}_{position}" for position, label in enumerate(df.index)]
    return df

In [13]:
def get_reactions_from_ko(ko_id, timeout=30):
    """Fetch the KEGG ``link/reaction`` listing for one KO identifier.

    Parameters
    ----------
    ko_id : str
        Identifier inserted verbatim into the URL (callers in this notebook
        pass it as ``"ko:<id>"``).
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block forever.

    Returns
    -------
    str
        The raw response body. The HTTP status is not checked, so an error
        response body is returned as-is.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    url = f"http://rest.kegg.jp/link/reaction/{ko_id}"
    response = requests.get(url, timeout=timeout)
    return response.text

def get_compounds_from_reaction(reaction_id, timeout=30):
    """Fetch the KEGG ``get`` entry for one reaction identifier.

    Parameters
    ----------
    reaction_id : str
        Identifier inserted verbatim into the URL.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without a timeout a stalled connection
        would block forever.

    Returns
    -------
    str
        The raw response body. The HTTP status is not checked, so an error
        response body is returned as-is.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems or when the timeout expires.
    """
    url = f"http://rest.kegg.jp/get/{reaction_id}"
    response = requests.get(url, timeout=timeout)
    return response.text

In [14]:
# Load the tab-separated table; the first column becomes the row index.
df = pd.read_csv(
    "SRR7947168_kegg_3_reactions.tsv",
    sep='\t',
    header=0,
    index_col=0,
)
# Discard the rows labelled UNMAPPED or UNGROUPED ...
df = df.loc[~df.index.isin(['UNMAPPED', 'UNGROUPED'])]
# ... and every row whose values are all zero.
df = df.loc[df.ne(0).any(axis=1)]

In [8]:
from concurrent.futures import ThreadPoolExecutor

def _parse_definition_terms(definition):
    """Split a reaction DEFINITION string into ``(name, count)`` pairs.

    Sides are separated by ``<=>`` and terms by ``" + "``. Splitting on the
    spaced plus sign (not a bare ``+``) keeps names such as ``NAD+`` or ``H+``
    intact. A leading all-digit token is taken as the term's coefficient;
    otherwise the coefficient is 1. Empty terms are skipped.
    """
    terms = []
    for side in definition.split('<=>'):
        for term in side.split(' + '):
            term = term.strip()
            if not term:
                continue
            parts = term.split()
            if len(parts) > 1 and parts[0].isdigit():
                terms.append((" ".join(parts[1:]), int(parts[0])))
            else:
                terms.append((term, 1))
    return terms


def process_ko_id(ko_id, df):
    """Distribute one KO's abundance over the metabolites of its reactions.

    For every reaction that ``get_reactions_from_ko`` links to ``ko_id``, the
    reaction entry is fetched, its ``DEFINITION`` line is parsed, and
    ``abundance * coefficient`` is added to each metabolite named on either
    side of the reaction.

    Parameters
    ----------
    ko_id : hashable
        Row label in ``df``; also sent to KEGG as ``"ko:<ko_id>"``.
    df : pandas.DataFrame
        Abundance table. The first value of the ``ko_id`` row is used as the
        abundance (assumes row labels are unique -- TODO confirm).

    Returns
    -------
    dict
        Metabolite name -> accumulated abundance. Empty when no reaction is
        linked or no ``DEFINITION`` line is found.

    Raises
    ------
    KeyError
        If ``ko_id`` is not a row label of ``df``; raised before any request
        is made.
    """
    # Loop-invariant, so look it up once, before any network traffic.
    abundance = df.loc[ko_id].values[0]
    metabolite_abundance = {}
    reaction_mapping = get_reactions_from_ko(f"ko:{ko_id}")
    for link_line in reaction_mapping.strip().split('\n'):
        if '\t' not in link_line:  # blank or malformed line
            continue
        _, reaction_id = link_line.split('\t')
        # Drop the prefix before ':' from the linked identifier before fetching.
        reaction_details = get_compounds_from_reaction(reaction_id.split(':')[1])
        terms = []
        for detail_line in reaction_details.strip().split('\n'):
            if detail_line.startswith("DEFINITION"):
                terms = _parse_definition_terms(detail_line[len("DEFINITION"):])
        for metabolite_name, count in terms:
            metabolite_abundance[metabolite_name] = (
                metabolite_abundance.get(metabolite_name, 0) + abundance * count
            )
    return metabolite_abundance

def map_ko_to_metabolites(df, num_threads):
    """Run ``process_ko_id`` for every row label of ``df`` and sum the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index supplies the KO identifiers; it is handed unchanged
        to ``process_ko_id``.
    num_threads : int
        ``max_workers`` for the thread pool that runs the per-KO calls.

    Returns
    -------
    pandas.DataFrame
        One row per metabolite with a single ``Abundance`` column holding the
        total over all KOs.
    """
    def _for_one_ko(ko_id):
        return process_ko_id(ko_id, df)

    with ThreadPoolExecutor(max_workers=num_threads) as pool:
        # pool.map yields results in the order of df.index.
        per_ko_results = list(pool.map(_for_one_ko, df.index))

    totals = {}
    for partial in per_ko_results:
        for name, amount in partial.items():
            totals[name] = totals.get(name, 0) + amount

    return pd.DataFrame.from_dict(totals, orient='index', columns=['Abundance'])

# Worker count for the thread pool; 1 means the per-KO calls run one at a time.
num_threads = 1

# Module-level dict; nothing in this cell reads it.
metabolite_abundance = {}

# Build the metabolite table from the filtered abundance frame and show it.
metabolite_df = map_ko_to_metabolites(df, num_threads=num_threads)
metabolite_df

KeyboardInterrupt: 