# Training Regression - Reaction

# Import packages

In [147]:
import os
import sys

# Make the repository root (the parent of this notebook's working directory)
# importable so that `import chemprop` resolves to the local checkout.
current_path = os.getcwd()
parent_path = os.path.dirname(current_path)
print(current_path, parent_path, sep="\n")

already_importable = parent_path in sys.path
if not already_importable:
    sys.path.append(parent_path)

/home/labhhc2/Documents/workspace/D20/Tam/backup/chemprop_1/examples
/home/labhhc2/Documents/workspace/D20/Tam/backup/chemprop_1


In [148]:
from pathlib import Path

import numpy as np
import pandas as pd
from lightning import pytorch as pl

from chemprop import data, featurizers, models, nn

# Change data inputs here

In [149]:
# User-tunable inputs for the whole notebook.
chemprop_dir = Path.cwd().parent  # repository root (this notebook runs from a subdirectory)
input_path = chemprop_dir / "tests" / "data" / "regression" / "rxn" / "e2sn2.csv"
num_workers = 0  # number of workers for dataloader. 0 means using main process for data loading
smiles_column = 'AAM'  # CSV column holding the atom-mapped reaction SMILES
target_columns = ['ea']  # CSV column(s) to regress on

# Seed Python, NumPy and PyTorch before any model or dataloader is created, so that
# weight initialisation and batch shuffling are reproducible on Restart & Run All.
random_seed = 0
pl.seed_everything(random_seed, workers=True)

## Load data

In [150]:
# Read the reaction CSV into a DataFrame; the SMILES and target columns named in the
# configuration cell are extracted from it further down.
df_input = pd.read_csv(input_path)
df_input

Unnamed: 0,AAM,ea
0,[Br-:2].[Br:1][C:5]([C:4]([H:3])([C:31]#[N:32]...,28.812004
1,[Br-:2].[Br:1][C:5]([C:4]([H:3])([C:31]#[N:32]...,33.410338
2,[Br-:2].[Br:1][C:5]([C:4]([H:3])([H:31])[C:41]...,37.645698
3,[Br-:2].[Br:1][C:5]([C:4]([H:3])([H:31])[H:41]...,24.206642
4,[Br-:2].[Br:1][C:5]([C:4]([H:3])([H:31])[H:41]...,18.450753
...,...,...
3620,[F:1][C@:5]([C@:4]([H:3])([N:31]([H:32])[H:33]...,26.844888
3621,[F:1][C@:5]([C@:4]([H:3])([N:31]([H:32])[H:33]...,2.283999
3622,[F:1][C@:5]([C@:4]([H:3])([N:31]([H:32])[H:33]...,18.142052
3623,[F:1][C@:5]([C@:4]([H:3])([N:31]([H:32])[H:33]...,18.127255


In [151]:
# Pre-computed graph arrays (one entry per reaction) stored alongside the package.
npz_path = chemprop_dir / "chemprop" / "data" / "its_Tam" / "super" / "e2sn2_super_processed_data.npz"

# SECURITY NOTE: allow_pickle=True unpickles the object arrays in the archive, which can
# execute arbitrary code. Only load archives produced by a trusted pipeline.
# The context manager closes the underlying file once the arrays are materialised.
with np.load(npz_path, allow_pickle=True) as npz_file:
    v_attr = npz_file['node_attrs']
    e_attr = npz_file['edge_attrs']
    edge_indices = npz_file['edge_indices']
    y = npz_file['ys']

# The split indices are later derived from the CSV rows and applied to these arrays,
# so the two sources must describe the same number of reactions.
if len(y) != len(df_input):
    raise ValueError(
        f"{npz_path} holds {len(y)} targets but {input_path} has {len(df_input)} rows"
    )

print(len(y))
print(np.array(v_attr[0])[0])  # feature vector of the first node of the first graph

3625
[ 4.   1.   0.  -0.5  7.  -1.  -1.   0.   0.   1.   3.   0.   0.   4.
  0.   0.   0.  -1.   1.   0.  -1. ]


In [152]:
# Pull the reaction SMILES and target values out of the DataFrame as arrays.
smis = df_input[smiles_column].values
ys = df_input[target_columns].values

# One ReactionDatapoint per CSV row, pairing each SMILES with its target row.
all_data = [
    data.ReactionDatapoint.from_smi(rxn_smi, rxn_targets)
    for rxn_smi, rxn_targets in zip(smis, ys)
]
len(all_data)



3625

## Perform data splitting for training, validation, and testing

In [153]:
# Split on the reactants (.rct); the products (.pdt) could be used instead.
mols = [datapoint.rct for datapoint in all_data]
train_indices, val_indices, test_indices = data.make_split_indices(mols, "random", (0.8, 0.1, 0.1))

# Apply the same train/val/test index partition to each per-reaction array, in the
# order: node attributes, edge attributes, edge indices, targets.
(
    (train_v, val_v, test_v),
    (train_e, val_e, test_e),
    (train_idx_g, val_idx_g, test_idx_g),
    (train_y, val_y, test_y),
) = [
    data.split_data_by_indices(per_reaction_array, train_indices, val_indices, test_indices)
    for per_reaction_array in (v_attr, e_attr, edge_indices, y)
]

The return type of make_split_indices has changed in v2.1 - see help(make_split_indices)


[3084, 971, 2430, 2551, 2817, 3138, 1632, 2277, 1474, 936, 1772, 402, 196, 776, 2675, 2888, 408, 1636, 2588, 248, 326, 3227, 3126, 2518, 2621, 88, 3609, 3501, 3153, 45, 648, 3292, 1852, 2807, 710, 311, 1652, 688, 2887, 3451, 953, 2650, 1826, 1839, 2132, 2255, 3049, 1161, 1357, 1044, 452, 3598, 1875, 634, 3526, 794, 2072, 3418, 1762, 3616, 3098, 2025, 1151, 1939, 951, 185, 453, 117, 3250, 2170, 3495, 615, 3453, 1047, 871, 253, 1567, 192, 1473, 728, 545, 2866, 1944, 1, 1051, 3446, 868, 302, 2993, 3368, 1917, 712, 1928, 2231, 2160, 1983, 2420, 558, 716, 1313, 1087, 2627, 3011, 2073, 298, 2291, 1952, 1043, 1580, 1559, 1032, 463, 3309, 1895, 562, 725, 2928, 472, 3287, 2416, 3620, 224, 1283, 214, 1907, 489, 2123, 528, 621, 1592, 514, 3210, 1095, 1324, 1397, 643, 526, 2165, 2377, 3527, 1999, 720, 2911, 2618, 33, 481, 1695, 39, 2323, 1992, 1122, 2972, 2287, 599, 2779, 1371, 3457, 134, 3198, 2772, 1780, 2090, 2212, 3427, 1544, 1827, 958, 2147, 565, 346, 3404, 2457, 668, 1110, 938, 378, 2514, 54

## Get ReactionDatasets

In [154]:
# Build the training dataset from the pre-computed node/edge arrays and targets.
train_dset = data.ReactionDataset(train_v, train_e, train_idx_g, train_y)
print(train_dset[0][3])  # target of the first training sample, before scaling
# normalize_targets() rescales the training targets in place (the same element printed
# before and after differs) and returns the fitted scaler, which is reused below.
scaler = train_dset.normalize_targets()
print(train_dset[0][3])  # the same target, after scaling

# Validation targets are scaled with the scaler fitted on the training set.
val_dset = data.ReactionDataset(val_v, val_e, val_idx_g, val_y)
val_dset.normalize_targets(scaler)
# NOTE(review): test targets are left unscaled - presumably because the model's
# UnscaleTransform maps predictions back to the original units; confirm.
test_dset = data.ReactionDataset(test_v, test_e, test_idx_g, test_y)

[40.40658569]
[[40.40658569]
 [30.92371368]
 [22.12643814]
 ...
 [ 6.40574789]
 [51.82247925]
 [ 0.48311821]]
[[ 1.39308672]
 [ 0.68152562]
 [ 0.02140924]
 ...
 [-1.15821563]
 [ 2.24969491]
 [-1.60262877]]
[1.39308672]
[[ 6.44705326e-02]
 [-2.05583165e-01]
 [-9.39194013e-01]
 [-4.48007128e-01]
 [ 1.45138006e+00]
 [-5.82389147e-01]
 [-1.05607395e+00]
 [ 6.56329588e-02]
 [ 3.13761651e-02]
 [-2.48632005e-01]
 [ 1.49816257e-01]
 [ 2.06415956e+00]
 [ 8.66189078e-01]
 [ 2.29998694e+00]
 [-5.24745933e-01]
 [-6.91779933e-01]
 [ 1.80018871e-01]
 [ 2.56026879e+00]
 [-1.74377274e-01]
 [ 6.63287462e-01]
 [-2.87012964e-01]
 [ 2.62627691e+00]
 [ 2.75239636e-01]
 [ 1.26773848e+00]
 [-5.98658390e-01]
 [-1.18320361e+00]
 [-1.04436131e+00]
 [ 7.14188475e-01]
 [ 7.34552196e-02]
 [-9.19600291e-01]
 [ 7.13868886e-01]
 [-8.18989172e-01]
 [ 5.44718704e-01]
 [-1.15222532e+00]
 [-1.44364108e+00]
 [ 1.85033468e+00]
 [ 1.41828549e-01]
 [ 3.81834918e-01]
 [ 4.16764665e-01]
 [ 6.20361848e-01]
 [ 4.98692095e-01]
 [

In [155]:
# Sanity check: target of the first training sample (already scaled at this point).
train_dset[0][3]

array([1.39308672])

In [156]:
# Inspect the last two components of the graph stored for the second training sample.
sample_graph = train_dset[1][0]
edge_index, reverse_index = sample_graph[-2], sample_graph[-1]
print(f'edge_index: {edge_index}')
print(f'reverse_index: {reverse_index}')

edge_index: [[ 0  1  1  2  1 10  1 11  2  3  2  4  2  7  3 15  4  5  4  6  7  8  7  9
  11 12 11 13 11 14 16 17 17 18 18 19 18 20 18 21 19 22 23 16 23 17 23 18
  23 19 23 20 23 21 23 22]
 [ 1  0  2  1 10  1 11  1  3  2  4  2  7  2 15  3  5  4  6  4  8  7  9  7
  12 11 13 11 14 11 17 16 18 17 19 18 20 18 21 18 22 19 16 23 17 23 18 23
  19 23 20 23 21 23 22 23]]
reverse_index: [ 1  0  3  2  5  4  7  6  9  8 11 10 13 12 15 14 17 16 19 18 21 20 23 22
 25 24 27 26 29 28 31 30 33 32 35 34 37 36 39 38 41 40 43 42 45 44 47 46
 49 48 51 50 53 52 55 54]


In [157]:
import numpy as np

# Toy reproduction of the reverse-index pattern shown above: group consecutive
# indices into pairs, swap the two entries of each pair, then flatten again.
index_pairs = np.arange(6).reshape(-1, 2)
index_pairs[:, ::-1].ravel()

array([1, 0, 3, 2, 5, 4])

## Get dataloaders

In [158]:
# Evaluation loaders share the same options and must not shuffle; the training
# loader keeps the library's default shuffling behaviour.
eval_loader_options = dict(num_workers=num_workers, shuffle=False)

train_loader = data.build_dataloader(train_dset, num_workers=num_workers)
val_loader = data.build_dataloader(val_dset, **eval_loader_options)
test_loader = data.build_dataloader(test_dset, **eval_loader_options)

# Change Message-Passing Neural Network (MPNN) inputs here

## Message passing

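Message passing blocks must be given the atom and bond feature dimensions of their inputs. In this notebook they are read from the pre-computed node and edge attribute arrays rather than from a featurizer.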

Options are `mp = nn.BondMessagePassing()` or `mp = nn.AtomMessagePassing()`

In [159]:
# Number of feature columns in the node-attribute array of the first training graph.
train_v[0].shape[1]

21

In [160]:
# Feature dimensions are read off the first training graph: columns of the node
# attribute array (atoms) and of the edge attribute array (bonds).
atom_fdim = train_v[0].shape[1]
bond_fdim = train_e[0].shape[1]
fdims = (atom_fdim, bond_fdim)  # kept as a tuple: (atom_dims, bond_dims)
mp = nn.BondMessagePassing(atom_fdim, bond_fdim)

In [161]:
# Show the (atom, bond) feature dimensions handed to the message-passing block.
print(*fdims)

21 6


## Aggregation

In [162]:
# List the aggregation classes that are registered and available to choose from.
print(nn.agg.AggregationRegistry)

ClassRegistry {
    'mean': <class 'chemprop.nn.agg.MeanAggregation'>,
    'sum': <class 'chemprop.nn.agg.SumAggregation'>,
    'norm': <class 'chemprop.nn.agg.NormAggregation'>
}


In [163]:
# Use the 'mean' aggregation from the registry listed above.
agg = nn.MeanAggregation()

## Feed-Forward Network (FFN)

In [164]:
# List the predictor (feed-forward head) classes that are registered.
print(nn.PredictorRegistry)

ClassRegistry {
    'regression': <class 'chemprop.nn.predictors.RegressionFFN'>,
    'regression-mve': <class 'chemprop.nn.predictors.MveFFN'>,
    'regression-evidential': <class 'chemprop.nn.predictors.EvidentialFFN'>,
    'regression-quantile': <class 'chemprop.nn.predictors.QuantileFFN'>,
    'classification': <class 'chemprop.nn.predictors.BinaryClassificationFFN'>,
    'classification-dirichlet': <class 'chemprop.nn.predictors.BinaryDirichletFFN'>,
    'multiclass': <class 'chemprop.nn.predictors.MulticlassClassificationFFN'>,
    'multiclass-dirichlet': <class 'chemprop.nn.predictors.MulticlassDirichletFFN'>,
    'spectral': <class 'chemprop.nn.predictors.SpectralFFN'>
}


In [165]:
# Build an unscaling transform from the scaler fitted on the training targets.
# NOTE(review): presumably this maps model outputs back to the original target units - confirm.
output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)

In [166]:
# Regression head (the 'regression' entry of the registry above), configured with the
# unscaling transform built from the training-target scaler.
ffn = nn.RegressionFFN(output_transform=output_transform)

## Batch norm

In [167]:
# Flag passed to models.MPNN when the model is constructed.
batch_norm = True

## Metrics

In [168]:
# List the metric classes that are registered and available to choose from.
print(nn.metrics.MetricRegistry)

ClassRegistry {
    'mse': <class 'chemprop.nn.metrics.MSE'>,
    'mae': <class 'chemprop.nn.metrics.MAE'>,
    'rmse': <class 'chemprop.nn.metrics.RMSE'>,
    'bounded-mse': <class 'chemprop.nn.metrics.BoundedMSE'>,
    'bounded-mae': <class 'chemprop.nn.metrics.BoundedMAE'>,
    'bounded-rmse': <class 'chemprop.nn.metrics.BoundedRMSE'>,
    'r2': <class 'chemprop.nn.metrics.R2Score'>,
    'binary-mcc': <class 'chemprop.nn.metrics.BinaryMCCMetric'>,
    'multiclass-mcc': <class 'chemprop.nn.metrics.MulticlassMCCMetric'>,
    'roc': <class 'chemprop.nn.metrics.BinaryAUROC'>,
    'prc': <class 'chemprop.nn.metrics.BinaryAUPRC'>,
    'accuracy': <class 'chemprop.nn.metrics.BinaryAccuracy'>,
    'f1': <class 'chemprop.nn.metrics.BinaryF1Score'>
}


In [169]:
# Metrics to report. Only the first metric (RMSE) is used for training and early stopping.
metric_list = [metric_cls() for metric_cls in (nn.metrics.RMSE, nn.metrics.MAE)]

## Construct MPNN

In [170]:
# Assemble the full model from the message-passing block, aggregation, predictor head,
# batch-norm flag and metric list defined in the preceding cells.
mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)
# Bare expression so the notebook displays the resulting module tree.
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=27, out_features=300, bias=False)
    (W_h): Linear(in_features=300, out_features=300, bias=False)
    (W_o): Linear(in_features=321, out_features=300, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=300, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=300, out_features=1, bias=True)
      )
    )
    (criterion): MSE(task_weights=[[1.0]])
    (output_transform): UnscaleTransform()
  )
  (X_d_transform): Identity()
  (metrics): ModuleList(
    (0): RMSE(task_weights=[[1.0]])
    (1): MAE(task_weigh

# Training and testing

## Set up trainer

In [171]:
# Collect the trainer settings in one place, then build the Lightning trainer from them.
trainer_options = {
    "logger": False,
    # Use `True` if you want to save model checkpoints. The checkpoints will be saved in the `checkpoints` folder.
    "enable_checkpointing": True,
    "enable_progress_bar": True,
    "accelerator": "auto",
    "devices": 1,
    "max_epochs": 100,  # number of epochs to train for
}
trainer = pl.Trainer(**trainer_options)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


## Start training

In [172]:
# Train on the training loader and evaluate on the validation loader after each epoch.
trainer.fit(
    model=mpnn,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

/home/labhhc2/anaconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/labhhc2/Documents/workspace/D20/Tam/backup/chemprop_1/examples/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/labhhc2/anaconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 194 K  | train
1 | agg             | MeanAggregation    | 0      | train
2 | bn              | BatchNorm1d        | 600    | train
3 | predictor     

                                                                            

/home/labhhc2/anaconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Epoch 99: 100%|██████████| 46/46 [00:00<00:00, 194.78it/s, train_loss_step=0.115, val_loss=0.0769, train_loss_epoch=0.0522] 

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 46/46 [00:00<00:00, 185.17it/s, train_loss_step=0.115, val_loss=0.0769, train_loss_epoch=0.0522]


## Test results

In [173]:
# Evaluate the trained model on the held-out test loader; returns one metrics dict per loader.
results = trainer.test(model=mpnn, dataloaders=test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/labhhc2/anaconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 6/6 [00:00<00:00, 370.96it/s]


In [174]:
# Display the test metrics returned by trainer.test (here 'test/rmse' and 'test/mae').
results

[{'test/rmse': 3.325967788696289, 'test/mae': 2.563598871231079}]