### Example of Enzymatic Reaction Feasibility Classification Using Model_2B_2
#### This notebook uses Model_2B_2 to predict whether enzymatic reactions are feasible. Workflow: first, each reaction is encoded as an rxnfp fingerprint; then the model classifies feasibility and outputs the predicted label and probability for every reaction.

In [1]:
# 1.Represent the reactions using rxnfp
from tqdm import TqdmWarning
import warnings

warnings.filterwarnings("ignore", category=TqdmWarning)

import pandas as pd
from rxnfp.transformer_fingerprints import (
    RXNBERTFingerprintGenerator, get_default_model_and_tokenizer
)
# Fetch the default model and tokenizer exposed by rxnfp and wrap them in a
# fingerprint generator. `rxnfp_generator` is module-level state: it is built
# once here and read as a global by generate_rxnfp_in_batches() below.
model, tokenizer = get_default_model_and_tokenizer()
rxnfp_generator = RXNBERTFingerprintGenerator(model, tokenizer)

def generate_rxnfp_in_batches(input_csv_path, output_csv_path, reaction_column='Reaction', batch_size=100,
                              generator=None):
    """Compute rxnfp fingerprints for every reaction in a CSV file and save them.

    The input CSV is read in full, the reactions are converted ``batch_size``
    rows at a time, and the fingerprint values are appended to the original
    columns as ``rxnfp_1`` ... ``rxnfp_N``. The combined table is written to
    ``output_csv_path`` and a short preview is printed.

    Args:
        input_csv_path: CSV file containing the reactions.
        output_csv_path: Destination CSV. Missing parent directories are created.
        reaction_column: Name of the column holding the reaction strings.
        batch_size: Number of reactions passed to the generator per call (>= 1).
        generator: Object exposing ``convert_batch(list_of_reactions)``. Defaults
            to the module-level ``rxnfp_generator``.

    Raises:
        ValueError: If ``batch_size`` is not positive, the reaction column is
            missing, the file has no rows, a reaction cell is empty, or the
            generator returns a different number of fingerprints than reactions.
    """
    import os  # local import: only needed to create the output directory

    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")
    if generator is None:
        generator = rxnfp_generator

    df = pd.read_csv(input_csv_path)

    if reaction_column not in df.columns:
        raise ValueError(f"Column not found in CSV file: {reaction_column}")

    # Without this guard an empty file ends in pandas' opaque
    # "No objects to concatenate" error further down.
    if df.empty:
        raise ValueError(f"No reactions found in CSV file: {input_csv_path}")

    # Blank cells are read as NaN (a float), which is not a reaction string.
    missing_rows = df.index[df[reaction_column].isna()].tolist()
    if missing_rows:
        raise ValueError(
            f"Column '{reaction_column}' has empty values at row(s): {missing_rows[:10]}"
        )

    result_rows = []

    for start in range(0, len(df), batch_size):
        batch_df = df.iloc[start:start + batch_size]
        reactions = batch_df[reaction_column].tolist()

        fingerprints = generator.convert_batch(reactions)
        # A length mismatch would be silently padded with NaN by the column-wise concat below.
        if len(fingerprints) != len(reactions):
            raise ValueError(
                f"Expected {len(reactions)} fingerprints for rows {start}-{start + len(reactions) - 1}, "
                f"got {len(fingerprints)}"
            )

        fp_columns = [f"rxnfp_{bit + 1}" for bit in range(len(fingerprints[0]))]
        fp_df = pd.DataFrame(fingerprints, columns=fp_columns)

        # Reset the batch index so rows align positionally with fp_df.
        batch_result = pd.concat([batch_df.reset_index(drop=True), fp_df], axis=1)
        result_rows.append(batch_result)

    result_df = pd.concat(result_rows, axis=0).reset_index(drop=True)

    # Only touch the filesystem for real paths; to_csv also accepts file-like objects.
    if isinstance(output_csv_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(output_csv_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    result_df.to_csv(output_csv_path, index=False)
    print(f"rxnfp have been successfully generated and saved to ... {output_csv_path}")
    print("Example rxnfp (first 3 reactions, first 10 bits):")
    rxnfp_cols = [col for col in result_df.columns if col.startswith("rxnfp_")]
    for row in range(min(3, len(result_df))):
        example_fp = result_df[rxnfp_cols].iloc[row].values[:10]
        print(f"  Reaction {row+1}: [{', '.join(map(str, example_fp))}]")


## Usage example
# NOTE: these are hard-coded absolute Windows paths; change them to match your environment.
reaction_smiles_path = r"E:\ERFC\example\example.csv"  # Input CSV; must contain a 'Reaction' column (the default reaction_column)
rxnfp_path = r"E:\ERFC\results\Model_2B_2\example_rxnfp.csv"  # Output CSV: the input columns plus the rxnfp_* fingerprint columns

generate_rxnfp_in_batches(reaction_smiles_path, rxnfp_path, batch_size=10)  # Adjust batch_size as needed

rxnfp have been successfully generated and saved to ... E:\ERFC\results\Model_2B_2\example_rxnfp.csv
Example rxnfp (first 3 reactions, first 10 bits):
  Reaction 1: [-1.0258426666259766, -1.5963499546051025, -0.24134944379329681, -1.6523154973983765, 1.3560458421707153, -0.7744007110595703, -1.022594928741455, 0.016551554203033447, -1.3075191974639893, 1.5330270528793335]
  Reaction 2: [-0.931559145450592, -1.858367919921875, -1.426747441291809, -2.4667091369628906, 1.6651504039764404, -0.7247518301010132, -0.0659518837928772, 0.31876450777053833, -0.7222118973731995, 2.202850341796875]
  Reaction 3: [-1.1722087860107422, 0.12641683220863342, -0.06531167775392532, -1.983899474143982, 2.1726584434509277, -1.2793668508529663, -0.5059995055198669, 1.254568338394165, -1.9550400972366333, 2.1997756958007812]


In [1]:
# 2.Feasibility classification based on Model_2B_2

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np

class MyNet(nn.Module):
    """Feed-forward binary classifier.

    Three hidden stages of Linear -> BatchNorm1d -> ReLU -> Dropout(0.25) with
    128, 64 and 32 units, followed by a single-unit Linear layer and a sigmoid.
    The forward pass returns one value in [0, 1] per input row.
    """

    def __init__(self, input_shape):
        super().__init__()
        # Attribute names and creation order are kept as fc_N / bn_N / dropout_N
        # because saved checkpoints refer to the submodules by these names.
        widths = (input_shape, 128, 64, 32)
        for stage in range(1, len(widths)):
            setattr(self, f"fc_{stage}", nn.Linear(widths[stage - 1], widths[stage]))
            setattr(self, f"bn_{stage}", nn.BatchNorm1d(widths[stage]))
            setattr(self, f"dropout_{stage}", nn.Dropout(p=0.25))

        self.fc_4 = nn.Linear(widths[-1], 1)

    def forward(self, input_data):
        hidden_stages = (
            (self.fc_1, self.bn_1, self.dropout_1),
            (self.fc_2, self.bn_2, self.dropout_2),
            (self.fc_3, self.bn_3, self.dropout_3),
        )

        hidden = input_data
        for linear, norm, drop in hidden_stages:
            hidden = drop(F.relu(norm(linear(hidden))))

        # Drop the trailing singleton dimension so the output is one score per row.
        return torch.sigmoid(self.fc_4(hidden)).squeeze(-1)

def prepare_read(data_path, reaction_column='Reaction', feature_prefix='rxnfp_'):
    """Load a fingerprint CSV and split it into a feature tensor and reaction labels.

    Columns whose name starts with ``feature_prefix`` are used as features and
    ``reaction_column`` as the reaction identifier, so extra columns carried
    over from the original input file (IDs, labels, ...) are ignored. When the
    file has no prefixed columns, the positional layout is used instead: the
    reaction column (or column 0 if ``reaction_column`` is absent) holds the
    reactions and every other column is a feature.

    Args:
        data_path: Path to the CSV file.
        reaction_column: Preferred name of the reaction column.
        feature_prefix: Name prefix identifying fingerprint columns.

    Returns:
        tuple: ``(x_tensor, reaction_smiles)`` where ``x_tensor`` is a float32
        tensor of shape (rows, features) and ``reaction_smiles`` is a pandas
        Series with one entry per row.

    Raises:
        ValueError: If no feature columns are found or a feature value is missing.
    """
    data = pd.read_csv(data_path)

    # Position of the reaction column; fall back to column 0 when the name is absent.
    if reaction_column in data.columns:
        reaction_pos = list(data.columns).index(reaction_column)
    else:
        reaction_pos = 0
    reaction_smiles = data.iloc[:, reaction_pos]

    feature_cols = [col for col in data.columns if str(col).startswith(feature_prefix)]
    if feature_cols:
        x_df = data[feature_cols]
    else:
        # Positional layout: everything except the reaction column is a feature.
        keep = [pos for pos in range(data.shape[1]) if pos != reaction_pos]
        x_df = data.iloc[:, keep]

    if x_df.shape[1] == 0:
        raise ValueError(f"No feature columns found in CSV file: {data_path}")

    # NaN inputs give NaN scores, which a `score < threshold` test would label as class 1.
    if x_df.isna().to_numpy().any():
        raise ValueError(f"Feature columns contain missing values in CSV file: {data_path}")

    x_tensor = torch.tensor(x_df.values).float()
    return x_tensor, reaction_smiles

def decode(arr, threshold=0.5):
    """Turn scores into binary labels.

    A score strictly below ``threshold`` maps to 0; every other value maps to 1.
    Returns a list of Python ints in the same order as ``arr``.
    """
    # `not (score < threshold)` rather than `score >= threshold`, so that values
    # that compare false either way (e.g. NaN) still map to 1.
    return [int(not (score < threshold)) for score in arr]


def model_pred(best_model, loader, cuda=True, *device):
    """Run inference over ``loader`` and collect labels and scores.

    Args:
        best_model: Model to evaluate; it is switched to eval mode.
        loader: Iterable of batches. Only the first element of each batch is
            used as the model input.
        cuda: When true, each input batch is moved to the target device before
            the forward pass. When false, inputs are used where they are.
        *device: Optional target device; only ``device[0]`` is read. If omitted
            while ``cuda`` is true, the device of the model's first parameter
            is used instead of failing on ``device[0]``.

    Returns:
        tuple: ``(l_y_pred, l_y_score)`` - the labels produced by ``decode`` and
        the raw model outputs as Python floats, both in loader order.
    """
    target_device = device[0] if device else None
    if cuda and target_device is None:
        # No device given: follow the model so inputs and weights stay together.
        first_param = next(best_model.parameters(), None)
        if first_param is not None:
            target_device = first_param.device

    best_model.eval()
    l_y_pred = []
    l_y_score = []

    with torch.no_grad():
        for batch in loader:
            inputs = batch[0]

            if cuda and target_device is not None:
                inputs = inputs.to(target_device)

            y_pred = best_model(inputs)

            # .cpu() is a no-op for tensors already on the CPU, so it is applied
            # unconditionally; .numpy() requires a CPU tensor.
            y_pred_np = y_pred.cpu().numpy().flatten()
            l_y_score.extend(y_pred_np.tolist())
            l_y_pred.extend(decode(y_pred_np))

    return l_y_pred, l_y_score


def predict_on_unknown(model_path, data_path, output_path, batch_size=256):
    """Classify the reactions in a fingerprint CSV and save the predictions.

    Reads features and reaction strings with ``prepare_read``, loads the model
    from ``model_path``, runs ``model_pred`` over the data in order, and writes
    a CSV with the columns ``Reaction``, ``pred_label`` and ``pred_prob``.

    Args:
        model_path: Checkpoint file. Either a pickled model object or a
            ``state_dict`` for ``MyNet``.
        data_path: CSV file with the reaction column and fingerprint features.
        output_path: Destination CSV for the predictions.
        batch_size: Number of rows per inference batch.

    Raises:
        TypeError: If the checkpoint holds neither a model nor a state_dict.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    print("Loading data...")
    x_tensor, reaction_smiles = prepare_read(data_path)

    dataset = TensorDataset(x_tensor)
    # shuffle=False keeps predictions aligned with reaction_smiles.
    loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=False, num_workers=0)

    print("Loading model...")
    # SECURITY: weights_only=False performs a full unpickle, which can execute
    # arbitrary code embedded in the file. Only load checkpoints you trust.
    loaded = torch.load(model_path, map_location=device, weights_only=False)
    if isinstance(loaded, nn.Module):
        # Whole-model checkpoint: use it directly, no need to build MyNet first.
        model = loaded
    elif isinstance(loaded, dict):
        # state_dict checkpoint: rebuild the architecture, then restore the weights.
        model = MyNet(input_shape=x_tensor.shape[1])
        model.load_state_dict(loaded)
    else:
        raise TypeError(
            f"Unsupported checkpoint content in {model_path}: {type(loaded).__name__}"
        )
    model = model.to(device)
    model.eval()

    print("Running prediction...")
    # The flag only controls whether batches are moved to `device`; moving to a
    # CPU device is harmless, so True is passed regardless of the device type.
    preds, probs = model_pred(model, loader, True, device)

    results = pd.DataFrame({
        'Reaction': reaction_smiles,
        'pred_label': preds,
        'pred_prob': probs
    })
    results.to_csv(output_path, index=False)
    print("Result:")
    print(results)
    print(f"Prediction completed. Results saved to {output_path}")
    

## Usage example
# NOTE: these are hard-coded absolute Windows paths; change them to match your environment.
model_path = r"E:\ERFC\models\Model_2B_2.pth" # Path to the saved Model_2B_2 checkpoint
# Fingerprint CSV to classify (same path that step 1 wrote its output to).
rxnfp_path = r"E:\ERFC\results\Model_2B_2\example_rxnfp.csv"
result_path = r"E:\ERFC\results\Model_2B_2\result.csv"# Output CSV with the predicted labels and probabilities

predict_on_unknown(model_path, rxnfp_path, result_path, batch_size=256)

Using device: cuda:0
Loading data...
Loading model...
Running prediction...
Result:
                                            Reaction  pred_label  pred_prob
0  C[C@@H]1O[C@@H](O[C@H]2[C@H](O[C@H]3[C@H](O[C@...           0   0.015379
1  C[C@H](NC(=O)OC(C)(C)C)C(=O)N[C@@H](C)C(=O)N1C...           1   0.997170
2  C[C@@H]1O[C@H](O)[C@@H](O)[C@H](OC2OC(CO)C(OC3...           0   0.045358
3  CCCCCCCCCCCCCCCC(=O)OC[C@H](COP(=O)(O)O)OC(=O)...           0   0.274085
4  O=c1c(O)c(-c2ccccc2)oc2cc(O)cc(O)c12>>O=C1c2c(...           1   0.951225
5  O=C(CO)N[C@H]1[C@H]([C@@H](O)[C@@H](O)CO)O[C@@...           0   0.210358
6       NC(N)=NOCC[C@H](N)C(=O)O>>N[C@@H](CCO)C(=O)O           0   0.020491
7  CC(=O)N[C@H]1[C@@H](O[C@H]2[C@@H](O)[C@@H](CO)...           0   0.000013
8  CSCC[C@H](NC(=O)[C@H](CC(C)C)NC(=O)CCC(=O)O)C(...           1   0.994622
Prediction completed. Results saved to E:\ERFC\results\Model_2B_2\result.csv
