# Training Regression - Reaction

# Import packages

In [365]:
import os
import sys

# Resolve the notebook's working directory and the directory that contains it.
current_path = os.getcwd()
parent_path = os.path.dirname(current_path)

# Echo both locations, one per line, as a quick sanity check.
print(current_path, parent_path, sep="\n")

# Register the parent directory with the import system (only once), so that
# packages located there can be imported from this notebook.
if sys.path.count(parent_path) == 0:
    sys.path.append(parent_path)

/home/labhhc2/Documents/workspace/D20/Tam/backup/chemprop_1/examples
/home/labhhc2/Documents/workspace/D20/Tam/backup/chemprop_1


In [366]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from lightning import pytorch as pl

from chemprop import data, featurizers, models, nn

# Change data inputs here

## Load data

In [367]:
import numpy as np
# Fix the global RNG seeds so that anything drawing from them later in the
# notebook (e.g. data shuffling, parameter initialisation) is repeatable
# across "Restart Kernel -> Run All" executions.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Directory one level above the current working directory; the data paths
# used by this notebook are built relative to it.
chemprop_dir = Path.cwd().parent
num_workers = 0  # number of dataloader workers; 0 means data is loaded in the main process

In [368]:
# Dataset / fold selection: edit these two values to point at another split.
DATASET_NAME = "e2sn2"
FOLD_NAME = "fold_4"

# Arrays read from every processed ``.npz`` archive.
NPZ_KEYS = ("node_attrs", "edge_attrs", "edge_indices", "ys")

# Both data locations are anchored on ``chemprop_dir`` so they are built the same way.
csv_dir = chemprop_dir / "tests" / "data" / "regression" / "rxn" / DATASET_NAME / FOLD_NAME
npz_dir = chemprop_dir / "chemprop" / "data" / "super" / DATASET_NAME / FOLD_NAME


def load_processed_split(split):
    """Load the pre-processed arrays of one data split.

    Parameters
    ----------
    split : str
        Split name used in the archive file name (``"train"``, ``"val"`` or ``"test"``).

    Returns
    -------
    dict
        Maps each key in ``NPZ_KEYS`` to the corresponding array read from
        ``<DATASET_NAME>_aam_<split>_processed_data.npz`` inside ``npz_dir``.

    Raises
    ------
    FileNotFoundError
        If the archive for ``split`` does not exist.
    KeyError
        If the archive lacks one of ``NPZ_KEYS``.
    """
    npz_path = npz_dir / f"{DATASET_NAME}_aam_{split}_processed_data.npz"
    # NOTE(security): allow_pickle=True unpickles stored objects, which can run
    # arbitrary code. Only load archives that come from a trusted source.
    # The context manager closes the underlying file once the arrays are read.
    with np.load(npz_path, allow_pickle=True) as archive:
        return {key: archive[key] for key in NPZ_KEYS}


# ``*_npz`` are plain dicts holding the four arrays (the archive file itself is
# already closed); they are indexed with the same keys as before.
train_path = csv_dir / "aam_train.csv"
train_npz = load_processed_split("train")
train_v = train_npz["node_attrs"]
train_e = train_npz["edge_attrs"]
train_idx_g = train_npz["edge_indices"]
train_y = train_npz["ys"]

val_path = csv_dir / "aam_val.csv"
val_npz = load_processed_split("val")
val_v = val_npz["node_attrs"]
val_e = val_npz["edge_attrs"]
val_idx_g = val_npz["edge_indices"]
val_y = val_npz["ys"]

test_path = csv_dir / "aam_test.csv"
test_npz = load_processed_split("test")
test_v = test_npz["node_attrs"]
test_e = test_npz["edge_attrs"]
test_idx_g = test_npz["edge_indices"]
test_y = test_npz["ys"]

In [369]:
# Quick size check: one shape per loaded array, printed on a single line.
print(*(loaded.shape for loaded in (train_idx_g, val_y, test_y)))

(2900,) (362,) (363,)


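## Perform data splitting for training, validation, and testing

The training, validation, and test sets are loaded above from separate, pre-split files, so no additional splitting is performed in this notebook.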

## Get ReactionDatasets

In [370]:
# Training set: element 3 of the first datapoint (its target value, judging by
# the printed output) is shown before and after the targets are normalised.
train_dset = data.ReactionDataset(train_v, train_e, train_idx_g, train_y)
print(train_dset[0][3])
scaler = train_dset.normalize_targets()
print(train_dset[0][3])

# Validation set: normalised by reusing the scaler returned for the training set.
val_dset = data.ReactionDataset(val_v, val_e, val_idx_g, val_y)
val_dset.normalize_targets(scaler)
# Test set: its targets are not normalised here.
test_dset = data.ReactionDataset(test_v, test_e, test_idx_g, test_y)

[3.30673504]
[[ 3.30673504]
 [ 9.73300934]
 [17.01290131]
 ...
 [39.26520157]
 [11.44155598]
 [23.69203377]]
[[-1.394209  ]
 [-0.90810474]
 [-0.35743007]
 ...
 [ 1.32580619]
 [-0.77886473]
 [ 0.1478012 ]]
[-1.394209]
[[-0.85748876]
 [-0.34439181]
 [ 1.24632457]
 [ 0.53286892]
 [-0.51840217]
 [-0.39448976]
 [-0.83812708]
 [ 0.83625463]
 [-1.4263051 ]
 [ 2.42103007]
 [ 0.44325854]
 [ 1.3903106 ]
 [ 2.52036926]
 [-1.26081879]
 [ 0.61788447]
 [ 0.0288376 ]
 [ 0.10520787]
 [ 0.6120324 ]
 [-0.23038214]
 [ 0.54772696]
 [-0.53638931]
 [-1.54342827]
 [ 0.03943007]
 [-0.31003921]
 [ 0.42098418]
 [ 0.24147964]
 [ 0.65841318]
 [-0.99489239]
 [-1.2678341 ]
 [-0.23471351]
 [-0.69976746]
 [ 1.07934063]
 [ 0.18672785]
 [-0.82491662]
 [-0.0245157 ]
 [-0.76853745]
 [-0.92361557]
 [-0.20141371]
 [-0.12472675]
 [-1.29476136]
 [-0.18467183]
 [ 1.00969155]
 [-0.22431063]
 [ 0.54798752]
 [ 0.06268379]
 [-0.56125203]
 [-0.88680649]
 [-0.1855388 ]
 [-0.20739649]
 [-0.49563019]
 [-1.57264587]
 [ 0.03657827]
 [ 

In [371]:
# Display element 3 of the first training datapoint (after target normalisation).
train_dset[0][3]

array([-1.394209])

In [372]:
# Inspect the second training datapoint: the last two fields of its first
# element are, going by their names, the edge list and the reverse-edge lookup.
edge_index = train_dset[1][0][-2]
reverse_index = train_dset[1][0][-1]

print(f'edge_index: {edge_index}')
print(f'reverse_index: {reverse_index}')

edge_index: [[ 0  1  1  2  1 10  1 11  1 13  2  3  2  4  2  6  4  5  6  7  6  8  6  9
  11 12 14 15 15 16 17 14 17 15 17 16]
 [ 1  0  2  1 10  1 11  1 13  1  3  2  4  2  6  2  5  4  7  6  8  6  9  6
  12 11 15 14 16 15 14 17 15 17 16 17]]
reverse_index: [ 1  0  3  2  5  4  7  6  9  8 11 10 13 12 15 14 17 16 19 18 21 20 23 22
 25 24 27 26 29 28 31 30 33 32 35 34]


In [373]:
import numpy as np

# Swap the two members of every consecutive index pair: 0,1,2,3,... -> 1,0,3,2,...
# (the same pattern as the `reverse_index` printed above).
np.flip(np.arange(6).reshape(-1, 2), axis=1).ravel()

array([1, 0, 3, 2, 5, 4])

## Get dataloaders

In [374]:
# Training loader uses build_dataloader's default settings.
train_loader = data.build_dataloader(train_dset, num_workers=num_workers)

# Evaluation loaders are built with shuffling switched off.
val_loader, test_loader = (
    data.build_dataloader(eval_dset, num_workers=num_workers, shuffle=False)
    for eval_dset in (val_dset, test_dset)
)

# Change Message-Passing Neural Network (MPNN) inputs here

## Message passing

Message passing blocks must be given the atom and bond feature dimensions. In this notebook they are read from the precomputed feature arrays rather than from a featurizer.

Options are `mp = nn.BondMessagePassing()` or `mp = nn.AtomMessagePassing()`

In [375]:
# Second dimension of the first entry of `train_v` (used below as the atom feature size).
train_v[0].shape[1]

21

In [376]:
# Feature dimensions, given as (atom_dims, bond_dims), read off the first
# entry of the node and edge attribute arrays.
fdims = tuple(attrs[0].shape[1] for attrs in (train_v, train_e))
mp = nn.BondMessagePassing(*fdims)

In [377]:
# Show the feature dimensions separated by a single space.
print(" ".join(str(dim) for dim in fdims))

21 6


## Aggregation

In [378]:
# List the aggregation classes registered under nn.agg.
print(nn.agg.AggregationRegistry)

ClassRegistry {
    'mean': <class 'chemprop.nn.agg.MeanAggregation'>,
    'sum': <class 'chemprop.nn.agg.SumAggregation'>,
    'norm': <class 'chemprop.nn.agg.NormAggregation'>
}


In [379]:
# Sum aggregation is selected here; nn.MeanAggregation() is the alternative to try.
agg = nn.SumAggregation()

## Feed-Forward Network (FFN)

In [380]:
# List the registered predictor (FFN) classes.
print(nn.PredictorRegistry)

ClassRegistry {
    'regression': <class 'chemprop.nn.predictors.RegressionFFN'>,
    'regression-mve': <class 'chemprop.nn.predictors.MveFFN'>,
    'regression-evidential': <class 'chemprop.nn.predictors.EvidentialFFN'>,
    'regression-quantile': <class 'chemprop.nn.predictors.QuantileFFN'>,
    'classification': <class 'chemprop.nn.predictors.BinaryClassificationFFN'>,
    'classification-dirichlet': <class 'chemprop.nn.predictors.BinaryDirichletFFN'>,
    'multiclass': <class 'chemprop.nn.predictors.MulticlassClassificationFFN'>,
    'multiclass-dirichlet': <class 'chemprop.nn.predictors.MulticlassDirichletFFN'>,
    'spectral': <class 'chemprop.nn.predictors.SpectralFFN'>
}


In [381]:
# Build the output transform from the scaler returned by the training set.
# NOTE(review): per the class name this presumably maps predictions back to
# the original target scale — confirm against the chemprop docs.
output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)

In [382]:
# Regression predictor with the output transform attached; all other settings are left at their defaults.
ffn = nn.RegressionFFN(output_transform=output_transform)

## Batch norm

In [383]:
# Passed to models.MPNN below as its batch-norm argument.
batch_norm = True

## Metrics

In [384]:
# List the registered metric classes.
print(nn.metrics.MetricRegistry)

ClassRegistry {
    'mse': <class 'chemprop.nn.metrics.MSE'>,
    'mae': <class 'chemprop.nn.metrics.MAE'>,
    'rmse': <class 'chemprop.nn.metrics.RMSE'>,
    'bounded-mse': <class 'chemprop.nn.metrics.BoundedMSE'>,
    'bounded-mae': <class 'chemprop.nn.metrics.BoundedMAE'>,
    'bounded-rmse': <class 'chemprop.nn.metrics.BoundedRMSE'>,
    'r2': <class 'chemprop.nn.metrics.R2Score'>,
    'binary-mcc': <class 'chemprop.nn.metrics.BinaryMCCMetric'>,
    'multiclass-mcc': <class 'chemprop.nn.metrics.MulticlassMCCMetric'>,
    'roc': <class 'chemprop.nn.metrics.BinaryAUROC'>,
    'prc': <class 'chemprop.nn.metrics.BinaryAUPRC'>,
    'accuracy': <class 'chemprop.nn.metrics.BinaryAccuracy'>,
    'f1': <class 'chemprop.nn.metrics.BinaryF1Score'>
}


In [385]:
# Metrics handed to the model: RMSE first, then MAE.
# Only the first metric is used for training and early stopping.
metric_list = [nn.metrics.RMSE(), nn.metrics.MAE()]

## Construct MPNN

In [386]:
# Assemble the model from the message-passing block, the aggregation, the
# predictor, the batch-norm flag and the metric list, then display its modules.
mpnn = models.MPNN(mp, agg, ffn, batch_norm, metric_list)
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=27, out_features=300, bias=False)
    (W_h): Linear(in_features=300, out_features=300, bias=False)
    (W_o): Linear(in_features=321, out_features=300, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): SumAggregation()
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=300, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.0, inplace=False)
        (2): Linear(in_features=300, out_features=1, bias=True)
      )
    )
    (criterion): MSE(task_weights=[[1.0]])
    (output_transform): UnscaleTransform()
  )
  (X_d_transform): Identity()
  (metrics): ModuleList(
    (0): RMSE(task_weights=[[1.0]])
    (1): MAE(task_weight

# Training and testing

## Set up trainer

In [387]:
trainer = pl.Trainer(
    max_epochs=100,  # number of epochs to train for
    accelerator="auto",
    devices=1,
    logger=False,
    enable_progress_bar=True,
    # `True` saves model checkpoints; they are written to the `checkpoints` folder.
    enable_checkpointing=True,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


## Start training

In [388]:
# Train the model on the training loader, passing the validation loader as well.
trainer.fit(mpnn, train_loader, val_loader)

/home/labhhc2/anaconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory /home/labhhc2/Documents/workspace/D20/Tam/backup/chemprop_1/examples/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loading `train_dataloader` to estimate number of stepping batches.
/home/labhhc2/anaconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.

  | Name            | Type               | Params | Mode 
---------------------------------------------------------------
0 | message_passing | BondMessagePassing | 194 K  | train
1 | agg             | SumAggregation     | 0      | train
2 | bn              | BatchNorm1d        | 600    | train
3 | predictor     

                                                                            

/home/labhhc2/anaconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Epoch 99: 100%|██████████| 46/46 [00:00<00:00, 203.67it/s, train_loss_step=0.0723, val_loss=0.0722, train_loss_epoch=0.0541]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 46/46 [00:00<00:00, 194.23it/s, train_loss_step=0.0723, val_loss=0.0722, train_loss_epoch=0.0541]


## Test results

In [389]:
# Evaluate the trained model on the test loader and keep the returned metrics.
results = trainer.test(mpnn, test_loader)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
/home/labhhc2/anaconda3/envs/chemprop/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Testing DataLoader 0: 100%|██████████| 6/6 [00:00<00:00, 397.34it/s]


In [390]:
# Display the test metrics returned by trainer.test.
results

[{'test/rmse': 3.728069543838501, 'test/mae': 2.690730333328247}]