In [1]:
print("Ben dep trai")

Ben dep trai


In [2]:
import requests
import json
import os
import time
import pandas as pd
from requests.exceptions import HTTPError, RequestException

site = 'http://classyfire.wishartlab.com'
src_folder = 'data/clean_result'
target_folder = 'data/grouping_result'
def get_classification(inchikey, format = 'json', retries=3, backoff_factor=0.3):
    url = f'{site}/entities/{inchikey}.{format}'
    header = {'Accpet': f'application/{format}'}

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=header, timeout=10)
            response.raise_for_status()
            return response.json()
            #json_res = response.content.decode('utf-8')
            #return json.loads(json_res)
            
        except HTTPError as http_err:
            if response.status_code in [404, 500, 504, 408]:
                print(f"HTTP error occurred: {http_err}")
                return {}
            else:
                print(f"HTTP error occurred: {http_err}")
                return {}
        except RequestException as req_err:
            print(f'Request error occurred: {req_err}')
            if attempt < retries - 1:
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                return {}   


# create output folder if still not exist
if not os.path.exists(target_folder):
    os.makedirs(target_folder)

# read files
src_files = [f for f in os.listdir(src_folder) if f.endswith('.xlsx')]
for file in src_files:
    file_path = os.path.join(src_folder, file)
    df = pd.read_excel(file_path)

    # filter rows where title is not unknown
    filtered_df = df[df['Title'] != 'Unknown']

    # initialize lists to hold data for new table
    titles = []
    inchikeys = []
    kingdoms = []
    superclasses = []
    classes = []
    subclasses = []
    intermediate_nodes = []
    direct_parents = []
    
    for _, row in filtered_df.iterrows():
        title = row['Title']
        inchikey = row['InChIKey']

        res = get_classification(inchikey)

        print(json.dumps(res, indent=4))
        kingdom = res.get('kingdom', {}).get('name', '') if res.get('kingdom') else ''
        superclass = res.get('superclass', {}).get('name', '') if res.get('superclass') else ''
        class_ = res.get('class', {}).get('name', '') if res.get('class') else ''
        subclass = res.get('subclass', {}).get('name', '') if res.get('subclass') else ''
        direct_parent = res.get('direct_parent', {}).get('name', '') if res.get('direct_parent') else ''
        intermediate_node = []
        if 'intermediate_nodes' in res:
            for node in res['intermediate_nodes']:
                intermediate_node.append(node.get('name', ''))
        
        # Append data to the lists
        titles.append(title)
        inchikeys.append(inchikey)
        kingdoms.append(kingdom)
        superclasses.append(superclass)
        classes.append(class_)
        subclasses.append(subclass)
        intermediate_nodes.append('; '.join(intermediate_node))
        direct_parents.append(direct_parent)
        time.sleep(6)
    
    result_df = pd.DataFrame({
        'title': titles,
        'inchikey': inchikeys,
        'Kingdom': kingdoms,
        'Superclass': superclasses,
        'class': classes,
        'subclass': subclasses,
        'intermediate_nodes': intermediate_nodes,
        'direct_parents': direct_parents
    })
    # Save the new DataFrame to a CSV file with the same name as the input file
    output_file_path = os.path.join(target_folder, file.replace('.xlsx', '.csv'))
    result_df.to_csv(output_file_path, index=False)
    
    print(f'Saved processed data to {output_file_path}')

'''
inchikey = 'VSNFQQXVMPSASB-SNVBAGLBSA-N'
res = get_classification(inchikey)
print(json.dumps(res, indent=4))
'''


{
    "smiles": "CC(O)(CC(O)=O)CC(O)=O",
    "inchikey": "InChIKey=NPOAOTPXWNWTSH-UHFFFAOYSA-N",
    "kingdom": {
        "name": "Organic compounds",
        "description": "Compounds that contain at least one carbon atom, excluding isocyanide/cyanide and their non-hydrocarbyl derivatives, thiophosgene, carbon diselenide, carbon monosulfide, carbon disulfide, carbon subsulfide, carbon monoxide, carbon dioxide, Carbon suboxide, and dicarbon monoxide.",
        "chemont_id": "CHEMONTID:0000000",
        "url": "http://classyfire.wishartlab.com/tax_nodes/C0000000"
    },
    "superclass": {
        "name": "Lipids and lipid-like molecules",
        "description": "Fatty acids and their derivatives, and substances related biosynthetically or functionally to these compounds.",
        "chemont_id": "CHEMONTID:0000012",
        "url": "http://classyfire.wishartlab.com/tax_nodes/C0000012"
    },
    "class": {
        "name": "Fatty Acyls",
        "description": "Organic molecules synthesiz

"\ninchikey = 'VSNFQQXVMPSASB-SNVBAGLBSA-N'\nres = get_classification(inchikey)\nprint(json.dumps(res, indent=4))\n"

In [3]:
import pandas as pd
import os
from openpyxl import load_workbook
import datetime

src_file_path = "./data/grouping_result/"
target_file_path = "./data/clean_result/"
export_file_dict = "./data/final_result/"

# read 4.csv

for root, dirs, files in os.walk(src_file_path):
    for file in files:
        file_path = os.path.join(root, file)
        if file.endswith(".csv"):
            src_df = pd.read_csv(file_path)
            # build a dictionary to store the [target value] to certain Inchikey
            inchikey_dict = dict()
            for index, row in src_df.iterrows():
                inchikey = row["inchikey"]
                if inchikey not in inchikey_dict:
                    inchikey_dict[inchikey] = []
                if row["direct_parents"]:
                    inchikey_dict[inchikey].append(row["direct_parents"])
                if row["Kingdom"]:
                    inchikey_dict[inchikey].append(row["Kingdom" ])
                if row["Superclass"]:
                    inchikey_dict[inchikey].append(row["Superclass" ])
                if row["class"]:
                    inchikey_dict[inchikey].append(row["class" ])
                if row["subclass"]:
                    inchikey_dict[inchikey].append(row["subclass" ])

# read 3.csv
for root, dirs, files in os.walk(target_file_path):
    for file in files:
        file_path = os.path.join(root, file)
        if file.endswith(".xlsx"):
            target_df = pd.read_excel(file_path)
            # create a new column called Class
            target_df["Class"] = None
            # go through the INCHIKey column of the dataframe and if not null, find the value in dictionary and store back to []
            for index, row in target_df.iterrows():
                inchikey = row["InChIKey"]

                if inchikey and (inchikey in inchikey_dict):
                    target_df.at[index, "Class"] = inchikey_dict[inchikey]

            # export the file
            # Check if the folder exists
            if not os.path.exists(export_file_dict):
                # Create the folder
                os.makedirs(export_file_dict)
            # random file name
            current_datetime = datetime.datetime.now()
            base_filename = os.path.basename(file)
            if base_filename.endswith(".xlsx"):
                base_filename = base_filename[:-5]
            formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
            export_file_name = f"{base_filename}_{formatted_datetime}.csv"
            export_file_path = os.path.join(export_file_dict, export_file_name)
            target_df.to_csv(export_file_path)
            print(f"The file save at {export_file_path}")



The file save at ./data/final_result/sample_2025-08-14_01-17-17.csv


In [4]:
! pip install CTSgetPy

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [5]:
df = pd.read_csv("data/final_result/sample_2025-07-23_15-45-08.csv")
df['InChIKey'].tolist()

FileNotFoundError: [Errno 2] No such file or directory: 'data/final_result/sample_2025-07-23_15-45-08.csv'

In [6]:
#pip install requests
#pip install tqdm
#pip install bs4
#pip install CTSgetPy
import pandas as pd
import os
import glob
#import pprint
import datetime

from CTSgetPy import CTSgetPy as ct

# convertion
SOURCE = 'InChIKey'
#TARGET = ['Human Metabolome Database','KEGG','PubChem CID' ,'ChEBI']
TARGET = ['Human Metabolome Database', 'KEGG','PubChem CID' ,'ChEBI']
import_folder_path = './data/final_result'
export_folder_dict = './data/convert_result'
def transform_inchikey(identifier, target, source=SOURCE):
    if identifier is not None:
        print(f'Converting {source} to {target} for {identifier} identifiers')
        result = ct.CTSget(source, target, identifier)
    else:
        result = {}
    #pprint.pprint(result)
    return result
    
# read final result file
file_list = glob.glob(os.path.join(import_folder_path, '*'))

if len(file_list) == 0:
    print('[ERROR] no file in the final_result folder to process')
else:
    for i in range(len(file_list)):
        df = pd.read_csv(file_list[i])
        # get the inchikey for each row and do convertion
        print(file_list[i])
        for t in TARGET:
            print(t)
            res = transform_inchikey(df['InChIKey'].tolist(), t)
            df[t] = None
            
            # Iterate through the DataFrame and match values from the dictionary
            for key, value in res[t].items():
                if key != 'nan':
                    # Find the row index where the key value is present
                    row_index = df.index[df['InChIKey'] == key].tolist()
                    # Assign the matched value to the corresponding row in the new column
                    df.at[row_index[0], t] = value
        # save the result back to the file
        if not os.path.exists(export_folder_dict):
            # Create the folder
            os.makedirs(export_folder_dict)
        export_file_path = os.path.join(export_folder_dict, file_list[i].split('/')[-1].split('\\')[-1])
        df.to_csv(export_file_path, index=False)



./data/final_result/sample_2025-08-14_01-14-36.csv
Human Metabolome Database
Converting InChIKey to Human Metabolome Database for ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', nan, 'HDTRYLNUVZCQOY-LIZSDCNHSA-N', nan] identifiers


100%|██████████| 5/5 [00:01<00:00,  2.92it/s]


KEGG
Converting InChIKey to KEGG for ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', nan, 'HDTRYLNUVZCQOY-LIZSDCNHSA-N', nan] identifiers


100%|██████████| 5/5 [00:01<00:00,  2.51it/s]


PubChem CID
Converting InChIKey to PubChem CID for ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', nan, 'HDTRYLNUVZCQOY-LIZSDCNHSA-N', nan] identifiers


100%|██████████| 5/5 [00:01<00:00,  2.65it/s]


ChEBI
Converting InChIKey to ChEBI for ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', nan, 'HDTRYLNUVZCQOY-LIZSDCNHSA-N', nan] identifiers


100%|██████████| 5/5 [00:01<00:00,  3.03it/s]


./data/final_result/sample_2025-08-14_01-17-17.csv
Human Metabolome Database
Converting InChIKey to Human Metabolome Database for ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', nan, 'HDTRYLNUVZCQOY-LIZSDCNHSA-N', nan] identifiers


100%|██████████| 5/5 [00:01<00:00,  2.91it/s]


KEGG
Converting InChIKey to KEGG for ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', nan, 'HDTRYLNUVZCQOY-LIZSDCNHSA-N', nan] identifiers


100%|██████████| 5/5 [00:01<00:00,  2.95it/s]


PubChem CID
Converting InChIKey to PubChem CID for ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', nan, 'HDTRYLNUVZCQOY-LIZSDCNHSA-N', nan] identifiers


100%|██████████| 5/5 [00:01<00:00,  3.01it/s]


ChEBI
Converting InChIKey to ChEBI for ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', nan, 'HDTRYLNUVZCQOY-LIZSDCNHSA-N', nan] identifiers


100%|██████████| 5/5 [00:01<00:00,  2.95it/s]


In [7]:
result = ct.CTSget("InChIKey","Human Metabolome Database", ['NPOAOTPXWNWTSH-UHFFFAOYSA-N', 'PVXPPJIGRGXGCY-UHFFFAOYSA-N', 'HDTRYLNUVZCQOY-LIZSDCNHSA-N'], top_only=True)


100%|██████████| 3/3 [00:01<00:00,  2.75it/s]


In [None]:
! pip install requests
! pip install tqdm
! pip install bs4
! pip install CTSgetPy

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [None]:
result = ct.CTSget('KEGG', ['PubChem CID', 'PubChem SID'], ['C00001', 'C00002']) 


translating from KEGG to PubChem CID


  0%|          | 0/2 [00:20<?, ?it/s]


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [8]:
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 21 17:27:10 2023

@author: User
"""

import os
import pandas as pd

input_folder = './data/convert_result/'  # 輸入的資料夾路徑
export_folder_dict = './data/metaboanalyst_pubchem'# 輸出的檔案資料夾
output_file = 'merge_result.csv'  # 輸出的檔案
csv_files = [f for f in os.listdir(input_folder) if f.endswith(".csv")]
print("start to merge these files")
print(csv_files)
# 建立空的 DataFrame 用於存放結果
df_output = pd.DataFrame(columns=['Title', 'PubChem CID'])

# 遍歷資料夾中的每個檔案
for file_name in os.listdir(input_folder):
    # 組合檔案的完整路徑
    file_path = os.path.join(input_folder, file_name)

    # 讀取 CSV 檔案
    df_input = pd.read_csv(file_path)

    # 取得檔案名稱作為欄位名稱
    column_name = file_name.split('.')[0]
    
    # 將檔案名稱新增為新的欄位
    df_output[column_name] = ''

    # 遍歷輸入資料的每一行
    for index, row in df_input.iterrows():
        title = row['Title']
        area = row['Area']
        cid = row['PubChem CID']

        # 檢查結果 DataFrame 是否已存在相同的 Title
        existing_row = df_output.loc[df_output['Title'] == title]
        if not existing_row.empty:
            # 找到相同的 Title，將 Area 寫入對應的欄位
            df_output.at[existing_row.index[0], column_name] = area
        else:
        # 沒有相同的 Title，新增一個新的列
            new_row = pd.DataFrame({'Title': [title],'PubChem CID':[cid], column_name: [area]})
            df_output = pd.concat([df_output, new_row], ignore_index=True)

# 將結果儲存為 CSV 檔案
# save the result back to the file
if not os.path.exists(export_folder_dict):
    # Create the folder
    os.makedirs(export_folder_dict)

export_file_path = os.path.join(export_folder_dict, output_file)
df_output.to_csv(export_file_path, index=False)
print(f"Merging data done.")

start to merge these files
['sample_2025-08-14_01-14-36.csv', 'sample_2025-08-14_01-17-17.csv']
Merging data done.


In [None]:
import pandas as pd

# Đọc file CSV gốc (phân cách bằng dấu phẩy), bỏ cột "Unnamed: 0"
input_file = 'data/clean_result_tmp/POOL_POS1209_LIU_54_01_2081_2024-12-20_17-08-30.csv'
df = pd.read_csv(input_file)

# Nếu có cột "Unnamed: 0", hãy bỏ đi
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)

# Đổi tên cột "Name" thành "Title"
df.rename(columns={'Name': 'Title'}, inplace=True)

# Lấy 5 dòng đầu tiên
sample_df = df.head(5)

# Lưu ra file Excel mới
output_file = 'data/clean_result/sample.xlsx'
sample_df.to_excel(output_file, index=False)

print(f"✅ Đã tạo file mẫu: {output_file}")


✅ Đã tạo file mẫu: data/clean_result/sample.xlsx


In [None]:
import pandas as pd

# Đọc file CSV gốc (phân cách bằng dấu phẩy), bỏ cột "Unnamed: 0"
input_file = 'data/clean_result_tmp/2-M_0105_1_pos_1_01_49.xlsx'
df = pd.read_excel(input_file)

# Nếu có cột "Unnamed: 0", hãy bỏ đi
if "Unnamed: 0" in df.columns:
    df.drop(columns=["Unnamed: 0"], inplace=True)

# Đổi tên cột "Name" thành "Title"
df.rename(columns={'Name': 'Title'}, inplace=True)

# Lấy 5 dòng đầu tiên
sample_df = df.head(5)

# Lưu ra file Excel mới
output_file = 'data/clean_result/sample.xlsx'
sample_df.to_excel(output_file, index=False)

print(f"✅ Đã tạo file mẫu: {output_file}")

✅ Đã tạo file mẫu: data/clean_result/sample.xlsx


In [None]:
# -*- coding: utf-8 -*-
"""
Chemical Classification Pipeline - OOP Version
Created by: User
Purpose: Process chemical data through classification, conversion, and merging
"""

import requests
import json
import os
import time
import pandas as pd
import glob
import datetime
from requests.exceptions import HTTPError, RequestException
from openpyxl import load_workbook
from CTSgetPy import CTSgetPy as ct

# ================== CONFIGURATION ==================
class Config:
    """Configuration class for all pipeline settings"""
    
    # API Settings
    CLASSYFIRE_SITE = 'http://classyfire.wishartlab.com'
    REQUEST_TIMEOUT = 10
    REQUEST_RETRIES = 3
    BACKOFF_FACTOR = 0.3
    API_DELAY = 6  # seconds between API calls
    
    # Folder Paths
    SOURCE_FOLDER = 'data/clean_result'
    GROUPING_FOLDER = 'data/grouping_result'
    FINAL_RESULT_FOLDER = 'data/final_result'
    CONVERT_RESULT_FOLDER = 'data/convert_result'
    METABOANALYST_FOLDER = 'data/metaboanalyst_pubchem'
    
    # Conversion Settings
    CONVERSION_SOURCE = 'InChIKey'
    CONVERSION_TARGETS = ['Human Metabolome Database', 'KEGG', 'PubChem CID', 'ChEBI']
    
    # Output Settings
    MERGE_OUTPUT_FILE = 'merge_result.csv'


# ================== STEP 1: CHEMICAL CLASSIFICATION ==================
class ChemicalClassifier:
    """Handle chemical classification using ClassyFire API"""
    
    def __init__(self, config: Config):
        self.config = config
        self.site = config.CLASSYFIRE_SITE
        
    def get_classification(self, inchikey: str, format: str = 'json') -> dict:
        """
        Get chemical classification from ClassyFire API
        
        Args:
            inchikey: InChI key for the chemical
            format: Response format (default: json)
            
        Returns:
            Dictionary containing classification data
        """
        url = f'{self.site}/entities/{inchikey}.{format}'
        headers = {'Accept': f'application/{format}'}
        
        for attempt in range(self.config.REQUEST_RETRIES):
            try:
                response = requests.get(
                    url, 
                    headers=headers, 
                    timeout=self.config.REQUEST_TIMEOUT
                )
                response.raise_for_status()
                return response.json()
                
            except HTTPError as http_err:
                if response.status_code in [404, 500, 504, 408]:
                    print(f"HTTP error occurred: {http_err}")
                    return {}
                else:
                    print(f"HTTP error occurred: {http_err}")
                    return {}
                    
            except RequestException as req_err:
                print(f'Request error occurred: {req_err}')
                if attempt < self.config.REQUEST_RETRIES - 1:
                    time.sleep(self.config.BACKOFF_FACTOR * (2 ** attempt))
                else:
                    return {}
    
    def process_classification_files(self):
        """Process all Excel files in source folder for classification"""
        # Create output folder if it doesn't exist
        if not os.path.exists(self.config.GROUPING_FOLDER):
            os.makedirs(self.config.GROUPING_FOLDER)
        
        # Get all Excel files
        src_files = [f for f in os.listdir(self.config.SOURCE_FOLDER) 
                    if f.endswith('.xlsx')]
        
        for file in src_files:
            self._process_single_file(file)
    
    def _process_single_file(self, filename: str):
        """Process a single Excel file for classification"""
        file_path = os.path.join(self.config.SOURCE_FOLDER, filename)
        df = pd.read_excel(file_path)
        
        # Filter rows where title is not unknown
        filtered_df = df[df['Title'] != 'Unknown']
        
        # Initialize data containers
        classification_data = {
            'title': [],
            'inchikey': [],
            'Kingdom': [],
            'Superclass': [],
            'class': [],
            'subclass': [],
            'intermediate_nodes': [],
            'direct_parents': []
        }
        
        # Process each row
        for _, row in filtered_df.iterrows():
            title = row['Title']
            inchikey = row['InChIKey']
            
            # Get classification
            res = self.get_classification(inchikey)
            print(json.dumps(res, indent=4))
            
            # Extract classification data
            classification_info = self._extract_classification_info(res)
            
            # Append data
            classification_data['title'].append(title)
            classification_data['inchikey'].append(inchikey)
            classification_data['Kingdom'].append(classification_info['kingdom'])
            classification_data['Superclass'].append(classification_info['superclass'])
            classification_data['class'].append(classification_info['class'])
            classification_data['subclass'].append(classification_info['subclass'])
            classification_data['intermediate_nodes'].append(classification_info['intermediate_nodes'])
            classification_data['direct_parents'].append(classification_info['direct_parent'])
            
            # Delay between API calls
            time.sleep(self.config.API_DELAY)
        
        # Save results
        self._save_classification_results(classification_data, filename)
    
    def _extract_classification_info(self, res: dict) -> dict:
        """Extract classification information from API response"""
        kingdom = res.get('kingdom', {}).get('name', '') if res.get('kingdom') else ''
        superclass = res.get('superclass', {}).get('name', '') if res.get('superclass') else ''
        class_ = res.get('class', {}).get('name', '') if res.get('class') else ''
        subclass = res.get('subclass', {}).get('name', '') if res.get('subclass') else ''
        direct_parent = res.get('direct_parent', {}).get('name', '') if res.get('direct_parent') else ''
        
        intermediate_nodes = []
        if 'intermediate_nodes' in res:
            for node in res['intermediate_nodes']:
                intermediate_nodes.append(node.get('name', ''))
        
        return {
            'kingdom': kingdom,
            'superclass': superclass,
            'class': class_,
            'subclass': subclass,
            'direct_parent': direct_parent,
            'intermediate_nodes': '; '.join(intermediate_nodes)
        }
    
    def _save_classification_results(self, data: dict, filename: str):
        """Save classification results to CSV file"""
        result_df = pd.DataFrame(data)
        output_file_path = os.path.join(
            self.config.GROUPING_FOLDER, 
            filename.replace('.xlsx', '.csv')
        )
        result_df.to_csv(output_file_path, index=False)
        print(f'Saved processed data to {output_file_path}')


# ================== STEP 2: DATA MERGING ==================
class DataMerger:
    """Handle merging of classification data with original data"""
    
    def __init__(self, config: Config):
        self.config = config
    
    def merge_classification_data(self):
        """Merge classification data with original Excel files"""
        # Build InChIKey dictionary from classification results
        inchikey_dict = self._build_inchikey_dictionary()
        
        # Process original Excel files
        self._process_original_files(inchikey_dict)
    
    def _build_inchikey_dictionary(self) -> dict:
        """Build dictionary mapping InChIKey to classification data"""
        inchikey_dict = {}
        
        for root, dirs, files in os.walk(self.config.GROUPING_FOLDER):
            for file in files:
                if file.endswith(".csv"):
                    file_path = os.path.join(root, file)
                    src_df = pd.read_csv(file_path)
                    
                    for _, row in src_df.iterrows():
                        inchikey = row["inchikey"]
                        if inchikey not in inchikey_dict:
                            inchikey_dict[inchikey] = []
                        
                        # Add classification levels if they exist
                        for level in ["direct_parents", "Kingdom", "Superclass", "class", "subclass"]:
                            if row[level]:
                                inchikey_dict[inchikey].append(row[level])
        
        return inchikey_dict
    
    def _process_original_files(self, inchikey_dict: dict):
        """Process original Excel files and add classification data"""
        # Create output folder
        if not os.path.exists(self.config.FINAL_RESULT_FOLDER):
            os.makedirs(self.config.FINAL_RESULT_FOLDER)
        
        for root, dirs, files in os.walk(self.config.SOURCE_FOLDER):
            for file in files:
                if file.endswith(".xlsx"):
                    file_path = os.path.join(root, file)
                    target_df = pd.read_excel(file_path)
                    
                    # Add Class column
                    target_df["Class"] = None
                    
                    # Fill classification data
                    for index, row in target_df.iterrows():
                        inchikey = row["InChIKey"]
                        if inchikey and (inchikey in inchikey_dict):
                            target_df.at[index, "Class"] = inchikey_dict[inchikey]
                    
                    # Save result
                    self._save_merged_file(target_df, file)
    
    def _save_merged_file(self, df: pd.DataFrame, filename: str):
        """Save merged data to CSV file with timestamp"""
        current_datetime = datetime.datetime.now()
        base_filename = os.path.basename(filename)
        if base_filename.endswith(".xlsx"):
            base_filename = base_filename[:-5]
        
        formatted_datetime = current_datetime.strftime("%Y-%m-%d_%H-%M-%S")
        export_file_name = f"{base_filename}_{formatted_datetime}.csv"
        export_file_path = os.path.join(self.config.FINAL_RESULT_FOLDER, export_file_name)
        
        df.to_csv(export_file_path, index=False)
        print(f"The file saved at {export_file_path}")


# ================== STEP 3: CHEMICAL CONVERSION ==================
class ChemicalConverter:
    """Handle chemical identifier conversion using CTS API"""
    
    def __init__(self, config: Config):
        self.config = config
    
    def convert_identifiers(self):
        """Convert InChIKey to other chemical identifiers"""
        file_list = glob.glob(os.path.join(self.config.FINAL_RESULT_FOLDER, '*'))
        
        if len(file_list) == 0:
            print('[ERROR] No files in the final_result folder to process')
            return
        
        # Create output folder
        if not os.path.exists(self.config.CONVERT_RESULT_FOLDER):
            os.makedirs(self.config.CONVERT_RESULT_FOLDER)
        
        for file_path in file_list:
            self._convert_single_file(file_path)
    
    def _convert_single_file(self, file_path: str):
        """Convert identifiers in a single file"""
        df = pd.read_csv(file_path)
        print(f"Processing {file_path}")
        
        # Convert to each target identifier
        for target in self.config.CONVERSION_TARGETS:
            print(f"Converting to {target}")
            
            # Get conversion results
            res = self._transform_inchikey(df['InChIKey'].tolist(), target)
            df[target] = None
            
            # Map results back to DataFrame
            for key, value in res[target].items():
                if key != 'nan':
                    row_indices = df.index[df['InChIKey'] == key].tolist()
                    if row_indices:
                        df.at[row_indices[0], target] = value
        
        # Save converted data
        output_filename = os.path.basename(file_path)
        export_file_path = os.path.join(self.config.CONVERT_RESULT_FOLDER, output_filename)
        df.to_csv(export_file_path, index=False)
        print(f"Saved converted data to {export_file_path}")
    
    def _transform_inchikey(self, identifiers: list, target: str) -> dict:
        """Transform InChIKey to target identifier using CTS API"""
        if identifiers:
            result = ct.CTSget(self.config.CONVERSION_SOURCE, target, identifiers)
        else:
            result = {}
        return result


# ================== STEP 4: DATA AGGREGATION ==================
class DataAggregator:
    """Handle final data aggregation and merging"""
    
    def __init__(self, config: Config):
        self.config = config
    
    def aggregate_data(self):
        """Aggregate all converted data into final merged file"""
        csv_files = [f for f in os.listdir(self.config.CONVERT_RESULT_FOLDER) 
                    if f.endswith(".csv")]
        
        if not csv_files:
            print("No CSV files found to aggregate")
            return
        
        print("Starting to merge these files:")
        print(csv_files)
        
        # Initialize output DataFrame
        df_output = pd.DataFrame(columns=['Title', 'PubChem CID'])
        
        # Process each file
        for file_name in csv_files:
            file_path = os.path.join(self.config.CONVERT_RESULT_FOLDER, file_name)
            df_input = pd.read_csv(file_path)
            
            column_name = file_name.split('.')[0]
            df_output[column_name] = ''
            
            # Merge data
            df_output = self._merge_file_data(df_input, df_output, column_name)
        
        # Save final result
        self._save_aggregated_data(df_output)
    
    def _merge_file_data(self, df_input: pd.DataFrame, df_output: pd.DataFrame, column_name: str) -> pd.DataFrame:
        """Merge data from a single file into output DataFrame"""
        for _, row in df_input.iterrows():
            title = row['Title']
            area = row['Area']
            cid = row['PubChem CID']
            
            # Check if title already exists
            existing_row = df_output.loc[df_output['Title'] == title]
            
            if not existing_row.empty:
                # Update existing row
                df_output.at[existing_row.index[0], column_name] = area
            else:
                # Create new row
                new_row = pd.DataFrame({
                    'Title': [title],
                    'PubChem CID': [cid],
                    column_name: [area]
                })
                df_output = pd.concat([df_output, new_row], ignore_index=True)
        
        return df_output
    
    def _save_aggregated_data(self, df_output: pd.DataFrame):
        """Save aggregated data to final output file"""
        if not os.path.exists(self.config.METABOANALYST_FOLDER):
            os.makedirs(self.config.METABOANALYST_FOLDER)
        
        export_file_path = os.path.join(
            self.config.METABOANALYST_FOLDER, 
            self.config.MERGE_OUTPUT_FILE
        )
        
        df_output.to_csv(export_file_path, index=False)
        print(f"Merging data completed. Final file saved at: {export_file_path}")


# ================== MAIN PIPELINE ==================
class ChemicalAnalysisPipeline:
    """Main pipeline orchestrating all processing steps"""
    
    def __init__(self, config: Config = None):
        self.config = config or Config()
        self.classifier = ChemicalClassifier(self.config)
        self.merger = DataMerger(self.config)
        self.converter = ChemicalConverter(self.config)
        self.aggregator = DataAggregator(self.config)
    
    def run_full_pipeline(self):
        """Run the complete chemical analysis pipeline"""
        print("=== Starting Chemical Analysis Pipeline ===")
        
        try:
            print("\n1. Running chemical classification...")
            self.classifier.process_classification_files()
            
            print("\n2. Merging classification data...")
            self.merger.merge_classification_data()
            
            print("\n3. Converting chemical identifiers...")
            self.converter.convert_identifiers()
            
            print("\n4. Aggregating final data...")
            self.aggregator.aggregate_data()
            
            print("\n=== Pipeline completed successfully! ===")
            
        except Exception as e:
            print(f"Pipeline failed with error: {e}")
            raise
    
    def run_step(self, step_number: int):
        """Run a specific step of the pipeline"""
        steps = {
            1: self.classifier.process_classification_files,
            2: self.merger.merge_classification_data,
            3: self.converter.convert_identifiers,
            4: self.aggregator.aggregate_data
        }
        
        if step_number in steps:
            print(f"Running step {step_number}...")
            steps[step_number]()
            print(f"Step {step_number} completed.")
        else:
            print(f"Invalid step number: {step_number}")


# ================== USAGE EXAMPLE ==================
def main():
    """Main function to run the pipeline"""
    # Initialize pipeline with default config
    pipeline = ChemicalAnalysisPipeline()
    
    # Run full pipeline
    # pipeline.run_full_pipeline()
    
    
    # Or run individual steps:
    pipeline.run_step(1)  # Classification only
    pipeline.run_step(2)  # Merging only
    pipeline.run_step(3)  # Conversion only
    pipeline.run_step(4)  # Aggregation only


if __name__ == "__main__":
    main()

Running step 1...
{
    "smiles": "CC(O)(CC(O)=O)CC(O)=O",
    "inchikey": "InChIKey=NPOAOTPXWNWTSH-UHFFFAOYSA-N",
    "kingdom": {
        "name": "Organic compounds",
        "description": "Compounds that contain at least one carbon atom, excluding isocyanide/cyanide and their non-hydrocarbyl derivatives, thiophosgene, carbon diselenide, carbon monosulfide, carbon disulfide, carbon subsulfide, carbon monoxide, carbon dioxide, Carbon suboxide, and dicarbon monoxide.",
        "chemont_id": "CHEMONTID:0000000",
        "url": "http://classyfire.wishartlab.com/tax_nodes/C0000000"
    },
    "superclass": {
        "name": "Lipids and lipid-like molecules",
        "description": "Fatty acids and their derivatives, and substances related biosynthetically or functionally to these compounds.",
        "chemont_id": "CHEMONTID:0000012",
        "url": "http://classyfire.wishartlab.com/tax_nodes/C0000012"
    },
    "class": {
        "name": "Fatty Acyls",
        "description": "Organic m

  0%|          | 0/5 [00:20<?, ?it/s]


JSONDecodeError: Expecting value: line 1 column 1 (char 0)