### Example of Enzymatic Reaction Feasibility Classification Using Model_1A_4
#### This notebook uses Model_1A_4 to classify the feasibility of enzymatic reactions. Workflow: first, each reaction is represented as a c_ECFP4 fingerprint; then the model classifies its feasibility and outputs the predicted label and probability.

In [1]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import xgboost as xgb

import warnings
from rdkit import RDLogger
import pandas as pd

# Silence RDKit's warning-level log output.
RDLogger.DisableLog('rdApp.warning')

# Hide FutureWarning and DeprecationWarning messages raised through the warnings module.
warnings.filterwarnings(action="ignore", category=FutureWarning)
warnings.filterwarnings(action="ignore", category=DeprecationWarning)

class ReactionClassifier:
    """Classify reaction feasibility with an XGBoost booster loaded from disk.

    Each reaction SMILES (``reactants>>product``) is encoded as a c_ECFP4
    vector: the bitwise OR of the reactants' Morgan fingerprints followed by
    the product's Morgan fingerprint (``2 * FP_NBITS`` values in total). The
    vectors are then passed to the loaded booster for prediction.
    """

    # Morgan fingerprint settings shared by reactant and product fingerprints.
    # Changing them changes the feature vector fed to the loaded model.
    FP_RADIUS = 2
    FP_NBITS = 1024
    # A reaction is labelled 1 when the model output is strictly above this value.
    DECISION_THRESHOLD = 0.5

    def __init__(self, model_path):
        """Load the booster stored at ``model_path``."""
        self.model_path = model_path
        self.model = xgb.Booster()
        self.model.load_model(model_path)

    def _morgan_bits(self, mol):
        """Return the chirality-aware Morgan fingerprint of ``mol`` as a 0/1 int array."""
        fp = AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=self.FP_RADIUS, nBits=self.FP_NBITS, useChirality=True
        )
        return np.array(list(fp.ToBitString()), dtype=int)

    # 1.Represent the reactions using c_ecfp4
    def c_ecfp4(self, reaction_smiles):
        """Encode one reaction SMILES as a c_ECFP4 vector.

        Args:
            reaction_smiles: String of the form ``reactant1.reactant2>>product``.

        Returns:
            1-D integer array of length ``2 * FP_NBITS``: OR-combined reactant
            bits followed by product bits.

        Raises:
            ValueError: If the input is not a string with exactly one ``>>``
                separator, or if the product or any reactant cannot be parsed.
        """
        try:
            reactants_str, product = reaction_smiles.split('>>')
        except (AttributeError, ValueError) as exc:
            # AttributeError: non-string input (e.g. NaN from an empty CSV cell).
            # ValueError: the split did not yield exactly two parts.
            raise ValueError(f"Invalid reaction SMILES format: {reaction_smiles}") from exc
        reactants = reactants_str.split('.')

        product_mol = Chem.MolFromSmiles(product)
        if product_mol is None:
            raise ValueError(f"Unable to parse the SMILES of the product: {product}")
        product_fp_array = self._morgan_bits(product_mol)

        combined_reactant_fp = np.zeros(self.FP_NBITS, dtype=int)
        for smiles in reactants:
            reactant_mol = Chem.MolFromSmiles(smiles)
            if reactant_mol is None:
                raise ValueError(f"Unable to parse the SMILES of the reactant(s): {smiles}")
            combined_reactant_fp = np.bitwise_or(combined_reactant_fp, self._morgan_bits(reactant_mol))

        return np.concatenate([combined_reactant_fp, product_fp_array])

    # 2.Feasibility classification based on Model_1A_4
    def classifier_from_csv(self, csv_path, reaction_col='Reaction',c_ecfp4_output_path='c_ecfp4.csv',result_path='result.csv'):
        """Fingerprint every reaction in a CSV file, predict, and save the results.

        Reactions whose fingerprint cannot be computed are reported on stdout
        and left out of both output files.

        Args:
            csv_path: Input CSV containing the reaction SMILES.
            reaction_col: Name of the column holding the reaction SMILES.
            c_ecfp4_output_path: Destination CSV for the reactions and their
                fingerprint columns.
            result_path: Destination CSV with the columns ``Reaction``,
                ``pred_label`` and ``pred_prob``.

        NOTE(review): ``pred_prob`` is the raw output of ``Booster.predict``.
        The output captured in this notebook contains values slightly above 1,
        so it may not be a calibrated probability - confirm against the
        objective the model was trained with.
        """
        df_new = pd.read_csv(csv_path)

        # Remember the row position of every successfully encoded reaction
        # instead of storing a scalar NaN placeholder: a list that mixes 1-D
        # arrays with scalars cannot be turned into a rectangular DataFrame.
        fingerprints = []
        valid_positions = []
        for position, rxn in enumerate(df_new[reaction_col]):
            try:
                fp = self.c_ecfp4(rxn)
            except Exception as e:
                print(f"Skip reaction {rxn}: {e}")
            else:
                fingerprints.append(fp)
                valid_positions.append(position)

        # Keeps the original row labels so the printed result can be traced
        # back to the input file.
        reactions_valid = df_new[reaction_col].iloc[valid_positions]

        if not fingerprints:
            # Nothing to predict: write header-only files rather than handing
            # an empty matrix to the model.
            pd.DataFrame(columns=[reaction_col]).to_csv(c_ecfp4_output_path, index=False)
            pd.DataFrame(columns=['Reaction', 'pred_label', 'pred_prob']).to_csv(result_path, index=False)
            print("No valid reactions to classify; empty output files have been written.")
            return

        X_valid = pd.DataFrame(np.vstack(fingerprints))
        c_ecfp4_output = pd.concat([reactions_valid.reset_index(drop=True), X_valid], axis=1)
        c_ecfp4_output.to_csv(c_ecfp4_output_path, index=False)
        print("Example c_ecfp4 (first 3 reactions, first 10 bits):")
        for i in range(min(3, len(X_valid))):
            example_fp = X_valid.iloc[i, :10].to_numpy()  # first 10 fingerprint bits
            print(f"  Reaction {i+1}: [{', '.join(map(str, example_fp))}]")

        dtest = xgb.DMatrix(X_valid)
        probs = self.model.predict(dtest)
        preds = (probs > self.DECISION_THRESHOLD).astype(int)

        results = pd.DataFrame({
            'Reaction': reactions_valid,
            'pred_label': preds,
            'pred_prob': probs
        })
        results.to_csv(result_path, index=False)
        print("Prediction completed.")
        print("Result:")
        print(results)
        print("Results have been saved.")

## Usage Example
from pathlib import Path

# Project root: edit this single line to match the local checkout.
ERFC_ROOT = Path(r"E:\ERFC")
MODEL_PATH = ERFC_ROOT / "models" / "Model_1A_4.model"  # Model_1A_4 path
INPUT_CSV = ERFC_ROOT / "example" / "example.csv"  # Reaction SMILES
RESULTS_DIR = ERFC_ROOT / "results" / "Model_1A_4"  # Output folder

classifier = ReactionClassifier(model_path=str(MODEL_PATH))

# Create the output folder up front so that writing the CSV files cannot fail
# merely because the directory is missing.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

classifier.classifier_from_csv(
    csv_path=str(INPUT_CSV),
    reaction_col='Reaction',  # The SMILES column
    c_ecfp4_output_path=str(RESULTS_DIR / "example_c_ecfp4.csv"),  # c_ecfp4 output path
    result_path=str(RESULTS_DIR / "result.csv"),  # Result path
)

Example c_ecfp4 (first 3 reactions, first 10 bits):
  Reaction 1: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
  Reaction 2: [0, 1, 0, 0, 1, 0, 0, 0, 0, 0]
  Reaction 3: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Prediction completed.
Result:
                                            Reaction  pred_label  pred_prob
0  C[C@@H]1O[C@@H](O[C@H]2[C@H](O[C@H]3[C@H](O[C@...           0   0.146270
1  C[C@H](NC(=O)OC(C)(C)C)C(=O)N[C@@H](C)C(=O)N1C...           1   0.979556
2  C[C@@H]1O[C@H](O)[C@@H](O)[C@H](OC2OC(CO)C(OC3...           0   0.092999
3  CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)(O)O)OC(=O)...           1   0.936089
4  O=c1c(O)c(-c2ccccc2)oc2cc(O)cc(O)c12>>O=C1c2c(...           1   1.057544
5  O=C(CO)N[C@H]1[C@H]([C@@H](O)[C@@H](O)CO)O[C@@...           0   0.018392
6       NC(N)=NOCC[C@H](N)C(=O)O>>N[C@@H](CCO)C(=O)O           1   0.690389
7  CC(=O)N[C@H]1[C@@H](O[C@H]2[C@@H](O)[C@@H](CO)...           0   0.019155
8  CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(=O)CCC(=O)O)C(...           1   1.000844
Results have been saved