In [1]:
import requests
import pandas as pd

# KEGG API endpoint for the drug dataset
url = "http://rest.kegg.jp/list/drug"

# Send a request to the KEGG API.
# requests waits forever by default; an explicit timeout makes an unresponsive
# server raise requests.exceptions.Timeout instead of hanging the kernel.
response = requests.get(url, timeout=60)

# Check if the request was successful
if response.status_code == 200:
    # splitlines() handles both "\n" and "\r\n" endings, so no stray "\r"
    # ends up at the end of a description.
    lines = response.text.splitlines()

    # Each non-empty line is parsed as "<entry>\t<description>".
    # partition() splits on the FIRST tab only: a description that itself
    # contains a tab is kept whole, and a line without a tab yields an empty
    # description instead of raising ValueError on unpacking.
    data = []
    for line in lines:
        if line:
            entry, _tab, description = line.partition('\t')
            data.append({'Entry': entry, 'Description': description})

    # Naming the columns explicitly guarantees the CSV has a header row even
    # when the response contained no entries.
    df = pd.DataFrame(data, columns=['Entry', 'Description'])

    # Save the DataFrame to a CSV file
    df.to_csv('kegg_drug_dataset.csv', index=False)
    print("KEGG drug dataset has been downloaded and saved to 'kegg_drug_dataset.csv'.")
else:
    print("Failed to retrieve the KEGG drug dataset. Status code:", response.status_code)


KEGG drug dataset has been downloaded and saved to 'kegg_drug_dataset.csv'.


In [5]:
import pandas as pd

# Read the CSV file into a DataFrame.
# dtype=str + keep_default_na=False keep every field as the literal text found
# in the file: pandas would otherwise turn values such as "NA" or "null" (and
# empty fields) into NaN, which would silently blank out real names and make
# the string methods below fail on the ID column.
df = pd.read_csv('kegg_drug_dataset.csv', dtype=str, keep_default_na=False)

# Rename columns (returns a new frame rather than mutating in place)
df = df.rename(columns={'Entry': 'KEGG ID', 'Description': 'substrate'})

# Build one output row per ';'-separated name in the 'substrate' column.
# An empty 'substrate' still yields a single row with an empty name, because
# ''.split(';') == [''].
# Iterating over the two columns with zip() avoids the per-row Series that
# DataFrame.iterrows() constructs.
new_rows = [
    {'KEGG ID': kegg_id.rstrip(','), 'substrate': name.strip().rstrip(',')}
    for kegg_id, substrate in zip(df['KEGG ID'], df['substrate'])
    for name in substrate.split(';')
]

# Create a new DataFrame from the new rows; explicit columns keep the header
# present even if the input file had no data rows.
new_df = pd.DataFrame(new_rows, columns=['KEGG ID', 'substrate'])

# Save the new DataFrame to a CSV file
new_df.to_csv('kegg_drug_dataset_split.csv', index=False)

# Save the new DataFrame to a pickle file
new_df.to_pickle('kegg_drug_dataset_split.pkl')

print("The CSV and PKL files have been processed and saved.")


The CSV and PKL files have been processed and saved.


In [7]:
import requests
import pandas as pd

# KEGG API endpoint for the compound (substrate) dataset
url = "http://rest.kegg.jp/list/compound"

# Send a request to the KEGG API.
# requests waits forever by default; an explicit timeout makes an unresponsive
# server raise requests.exceptions.Timeout instead of hanging the kernel.
response = requests.get(url, timeout=60)

# Check if the request was successful
if response.status_code == 200:
    # splitlines() handles both "\n" and "\r\n" endings, so no stray "\r"
    # ends up at the end of a description.
    lines = response.text.splitlines()

    # Each non-empty line is parsed as "<entry>\t<description>".
    # partition() splits on the FIRST tab only: a description that itself
    # contains a tab is kept whole, and a line without a tab yields an empty
    # description instead of raising ValueError on unpacking.
    data = []
    for line in lines:
        if line:
            entry, _tab, description = line.partition('\t')
            data.append({'KEGG ID': entry, 'substrate': description})

    # Unsplit table (one row per KEGG entry), kept under the name `df`.
    df = pd.DataFrame(data, columns=['KEGG ID', 'substrate'])

    # Build one output row per ';'-separated name in the description.
    # The parsed records are already plain strings, so they are expanded
    # directly instead of going through DataFrame.iterrows().
    new_rows = [
        {'KEGG ID': record['KEGG ID'].rstrip(','), 'substrate': name.strip().rstrip(',')}
        for record in data
        for name in record['substrate'].split(';')
    ]

    # Create a new DataFrame from the new rows; explicit columns keep the
    # header present even when the response contained no entries.
    new_df = pd.DataFrame(new_rows, columns=['KEGG ID', 'substrate'])

    # Save the new DataFrame to a CSV file
    new_df.to_csv('kegg_substrate_dataset_split.csv', index=False)

    # Save the new DataFrame to a pickle file
    new_df.to_pickle('kegg_substrate_dataset_split.pkl')

    print("KEGG substrate dataset has been downloaded, processed, and saved to 'kegg_substrate_dataset_split.csv' and 'kegg_substrate_dataset_split.pkl'.")
else:
    print("Failed to retrieve the KEGG substrate dataset. Status code:", response.status_code)


KEGG substrate dataset has been downloaded, processed, and saved to 'kegg_substrate_dataset_split.csv' and 'kegg_substrate_dataset_split.pkl'.


In [9]:
import requests
import pandas as pd

# KEGG API endpoint for the compound mapping to ChEBI
url = "http://rest.kegg.jp/conv/chebi/compound"

# Send a request to the KEGG API.
# requests waits forever by default; an explicit timeout makes an unresponsive
# server raise requests.exceptions.Timeout instead of hanging the kernel.
response = requests.get(url, timeout=60)

# Check if the request was successful
if response.status_code == 200:
    # splitlines() handles both "\n" and "\r\n" endings, so no stray "\r"
    # ends up at the end of an identifier.
    lines = response.text.splitlines()

    # Each non-empty line is parsed as "<kegg entry>\t<chebi entry>", where
    # each entry is expected to look like "<prefix>:<id>".
    data = []
    skipped = 0
    for line in lines:
        if not line:
            continue
        kegg_entry, tab, chebi_entry = line.partition('\t')
        if not tab or not chebi_entry:
            # A line with no ChEBI side cannot form a mapping; count it and
            # move on rather than aborting the whole download.
            skipped += 1
            continue
        # split(':', 1)[-1] drops only the leading "<prefix>:" and also works
        # when an entry has no prefix (plain indexing with [1] would raise
        # IndexError) or contains a further ':' (which [1] would truncate).
        kegg_id = kegg_entry.split(':', 1)[-1]
        chebi_id = chebi_entry.split(':', 1)[-1]
        data.append({
            'Input': kegg_id,
            'Input_source': 'KEGG',
            'Output': f'CHEBI:{chebi_id}',
            'Output_source': 'ChEBI'
        })

    # Naming the columns explicitly guarantees the TSV has a header row even
    # when no mappings were parsed.
    df = pd.DataFrame(data, columns=['Input', 'Input_source', 'Output', 'Output_source'])

    # Save the DataFrame to a TSV file
    df.to_csv('kegg_chebi_mapping.tsv', sep='\t', index=False)

    if skipped:
        print("Skipped malformed mapping lines:", skipped)
    print("KEGG to ChEBI mapping has been downloaded and saved to 'kegg_chebi_mapping.tsv'.")
else:
    print("Failed to retrieve the KEGG to ChEBI mapping. Status code:", response.status_code)


KEGG to ChEBI mapping has been downloaded and saved to 'kegg_chebi_mapping.tsv'.
