/usr/local/bin/python3


In [1]:
import os
from pathlib import Path

# Ask IPython's %pwd magic for the current directory; when that is not possible
# (IPython missing, no active shell, ...) fall back to the process's own cwd.
try:
    from IPython import get_ipython
    reported_dir = get_ipython().run_line_magic('pwd', '')
    ipynb_path = Path(reported_dir).resolve()
except Exception:
    ipynb_path = Path.cwd().resolve()

# Pin the process to the resolved directory and echo it for the reader.
os.chdir(ipynb_path)
print("Working directory set to:", os.getcwd())

Working directory set to: /Users/wynne/Dropbox/BioGeoFormer/scripts/step_9_MAG_application


In [3]:
import os
import json
import pandas as pd
import re

# === 1. Set your working directory ===
# Move into the folder that contains ko00001.json. The path is relative to
# whatever directory the kernel is in when this cell runs; update it if your
# copy of the file lives elsewhere.
os.chdir("../../cold_seep_MAG_application/KEGG_mapping/")

# === 2. Load the JSON file ===
# JSON is UTF-8 by specification, so decode it explicitly instead of relying on
# the platform's default text encoding.
with open("ko00001.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# === 3. Recursive function to extract KO entries ===
def extract_kos(node, hierarchy_path=""):
    """Collect KO records from a hierarchy node and all of its descendants.

    Parameters
    ----------
    node : dict
        A node with an optional "name" string and an optional "children" list
        of further nodes.
    hierarchy_path : str
        " > "-joined names of the ancestors of ``node`` (empty for a top node).

    Returns
    -------
    list[dict]
        One dict per node whose name looks like ``K##### <text>``, in pre-order,
        with keys "KO", "Gene_Symbols", "Description", "EC" and "Hierarchy".
        "Hierarchy" is the ancestor path and does not include the KO node's own
        name; "EC" is None when the name carries no ``[EC:...]`` tag.
    """
    label = node.get("name", "")
    if hierarchy_path:
        path_here = f"{hierarchy_path} > {label}"
    else:
        path_here = label

    collected = []

    # A KO node's name starts with an identifier of the form K + five digits.
    ko_hit = re.match(r"^(K\d{5})\s+(.*)", label)
    if ko_hit is not None:
        ko_id, remainder = ko_hit.groups()

        # Capture the first [EC:...] tag, then remove every such tag from the text.
        ec_hit = re.search(r"\[EC:(.*?)\]", remainder)
        ec_numbers = None if ec_hit is None else ec_hit.group(1).strip()
        remainder = re.sub(r"\[EC:.*?\]", "", remainder).strip()

        # Text before the first ';' holds the gene symbols; the rest is the description.
        head, semicolon, tail = remainder.partition(';')
        if semicolon:
            symbols, description = head.strip(), tail.strip()
        else:
            symbols, description = "", remainder

        record = {
            "KO": ko_id,
            "Gene_Symbols": symbols,
            "Description": description,
            "EC": ec_numbers,
            "Hierarchy": hierarchy_path,
        }
        collected.append(record)

    # Descend into the children, passing this node's full path as their ancestry.
    for sub_node in node.get("children", []):
        collected += extract_kos(sub_node, path_here)

    return collected

# === 4. Parse from the root "children" list ===
# Start below the root node, so the root's own name never appears in a
# Hierarchy path.
root_children = data.get("children", [])
parsed_kos = []
for child in root_children:
    parsed_kos.extend(extract_kos(child))

# === 5. Convert to DataFrame ===
# One row per parsed KO record, in traversal order.
df = pd.DataFrame(parsed_kos)

# === 6. Preview and save ===
print(df.head())
print(f"\n✅ Parsed {len(df)} KO entries.")

# Save next to the input JSON. The working directory was already switched to
# the KEGG_mapping folder before the file was loaded, so a bare filename is
# enough; repeating the "../../cold_seep_MAG_application/KEGG_mapping/" hop
# only resolved correctly because of the project's directory depth.
df.to_csv("parsed_ko00001.csv", index=False)


       KO Gene_Symbols                    Description       EC  \
0  K00844           HK                     hexokinase  2.7.1.1   
1  K12407          GCK                    glucokinase  2.7.1.2   
2  K00845          glk                    glucokinase  2.7.1.2   
3  K25026          glk                    glucokinase  2.7.1.2   
4  K01810     GPI, pgi  glucose-6-phosphate isomerase  5.3.1.9   

                                           Hierarchy  
0  09100 Metabolism > 09101 Carbohydrate metaboli...  
1  09100 Metabolism > 09101 Carbohydrate metaboli...  
2  09100 Metabolism > 09101 Carbohydrate metaboli...  
3  09100 Metabolism > 09101 Carbohydrate metaboli...  
4  09100 Metabolism > 09101 Carbohydrate metaboli...  

✅ Parsed 62599 KO entries.
