### Example script for training MPNN-POM model

In [24]:
!pip install openpom



In [25]:
from datetime import datetime

import deepchem as dc
import torch
from openpom.feat.graph_featurizer import GraphFeaturizer, GraphConvConstants
from openpom.utils.data_utils import get_class_imbalance_ratio
from openpom.models.mpnn_pom import MPNNPOMModel

In [26]:
# Label names handed to the CSV loaders as `tasks`; order is significant.
TASKS = [
    'animal', 'berry', 'caramellic',
    'citrus', 'earthy', 'ethereal',
    'fatty', 'fermented', 'floral',
    'fruity', 'green', 'herbal',
    'sulfurous', 'tropical', 'vanilla',
    'waxy', 'honey', 'aldehydic',
    'amber', 'balsamic', 'creamy',
    'musk', 'spicy', 'woody',
    'minty', 'buttery', 'musty',
    'soapy', 'camphoreous', 'alliaceous',
    'cooling', 'nutty', 'coffee',
]
print("No of tasks: ", len(TASKS))

No of tasks:  33


In [27]:
# CSV file that the loader below featurizes into `dataset`.
input_file = 'new_open_pom_train.csv'

In [30]:
# Build `dataset` by featurizing the CSV named by `input_file`.

smiles_field = 'nonStereoSMILES'  # CSV column passed to the loader as feature_field
featurizer = GraphFeaturizer()
loader = dc.data.CSVLoader(
    tasks=TASKS,
    feature_field=smiles_field,
    featurizer=featurizer,
)
dataset = loader.create_dataset(inputs=[input_file])

# Task count as reported by the loaded dataset.
n_tasks = len(dataset.tasks)

In [10]:
# Load the mixture train split, reusing the `featurizer` created in the
# earlier dataset cell.
smiles_field = 'combined_smiles_non_stereo'
loader = dc.data.CSVLoader(
    tasks=TASKS,
    feature_field=smiles_field,
    featurizer=featurizer,
)
train_mix_dataset = loader.create_dataset(inputs=["cleaned_non_stereo_train.csv"])
n_tasks = len(train_mix_dataset.tasks)

In [31]:
# Load the dataset used for evaluation in the training loop, reusing the
# `featurizer` created in the earlier dataset cell.
smiles_field = 'combined_smiles_non_stereo'
loader = dc.data.CSVLoader(
    tasks=TASKS,
    feature_field=smiles_field,
    featurizer=featurizer,
)
test_dataset = loader.create_dataset(inputs=["cleaned_non_stereo_test.csv"])
n_tasks = len(test_dataset.tasks)

In [32]:
# Number of entries loaded from `input_file`.
len(dataset)

4575

In [33]:
# Number of entries loaded from "cleaned_non_stereo_test.csv".
len(test_dataset)

3374

In [13]:
# Number of entries loaded from "cleaned_non_stereo_train.csv".
len(train_mix_dataset)

107819

In [None]:
# Select the dataset the model is trained on; keep exactly one of the two
# assignments below active.
train_dataset = dataset # if you are training on modified openpom dataset (containing single molecules)
# train_dataset = train_mix_dataset # if you want to train model on mixture train split

In [35]:
# Class-imbalance ratios of the selected training set; passed to the model
# as `class_imbalance_ratio`, so there must be exactly one entry per task.
train_ratios = get_class_imbalance_ratio(train_dataset)
assert n_tasks == len(train_ratios)

In [36]:
# Staircase exponential decay: start at 0.001 and halve the rate every
# 39 * 15 = 585 optimizer steps.
# NOTE(review): 39 is presumably the number of batches per epoch, but with
# 4575 training rows and batch_size=128 an epoch is 36 batches - confirm.
learning_rate = dc.models.optimizers.ExponentialDecay(
    initial_rate=0.001,
    decay_rate=0.5,
    decay_steps=39 * 15,
    staircase=True,
)
# Constant-rate alternative:
# learning_rate = 0.001

In [40]:
# initialize model
#
# Builds the MPNN-POM classifier from the values prepared in earlier cells
# (`n_tasks`, `learning_rate`, `train_ratios`). Checkpoints are written
# under `model_dir`.

model = MPNNPOMModel(
    n_tasks=n_tasks,
    batch_size=128,
    learning_rate=learning_rate,
    class_imbalance_ratio=train_ratios,
    loss_aggr_type='sum',
    # message-passing settings
    node_out_feats=100,
    edge_hidden_feats=75,
    edge_out_feats=100,
    num_step_message_passing=5,
    mpnn_residual=True,
    message_aggregator_type='sum',
    mode='classification',
    # input feature sizes come from the featurizer's constants
    number_atom_features=GraphConvConstants.ATOM_FDIM,
    number_bond_features=GraphConvConstants.BOND_FDIM,
    n_classes=1,
    # readout settings
    readout_type='set2set',
    num_step_set2set=3,
    num_layer_set2set=2,
    # feed-forward settings
    ffn_hidden_list=[392, 392],
    ffn_embeddings=256,
    ffn_activation='relu',
    ffn_dropout_p=0.12,
    ffn_dropout_at_input_no_act=False,
    # optimisation and bookkeeping
    weight_decay=1e-5,
    self_loop=False,
    optimizer_name='adam',
    log_frequency=32,
    model_dir='./examples/experiments',
    # Use the GPU when PyTorch can see one; otherwise fall back to the CPU
    # so the notebook also runs on machines without CUDA.
    device_name='cuda' if torch.cuda.is_available() else 'cpu',
)

In [41]:
# Number of passes of the training loop; each pass calls model.fit with nb_epoch=1.
nb_epoch = 45

In [42]:
# Evaluation metric; the training loop reads its value from the evaluate()
# result under the key 'roc_auc_score'.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)


In [47]:

# Optional: uncomment the lines below to turn on INFO-level logging.
# import logging

# logger = logging.getLogger(__name__)
# logging.basicConfig(level=logging.INFO)

In [43]:
# Train for `nb_epoch` passes, reporting ROC-AUC on the training and test
# datasets after every pass, then save a final checkpoint.
start_time = datetime.now()
for epoch in range(1, nb_epoch + 1):
    # One epoch per fit() call; from the second pass on, fit() is asked to
    # restore before it continues training.
    loss = model.fit(train_dataset,
                     nb_epoch=1,
                     max_checkpoints_to_keep=1,
                     deterministic=False,
                     restore=(epoch > 1))
    train_scores = model.evaluate(train_dataset, [metric])['roc_auc_score']
    # Note: the value printed under "valid_scores" is computed on test_dataset.
    test_scores = model.evaluate(test_dataset, [metric])['roc_auc_score']
    print(f"epoch {epoch}/{nb_epoch} ; loss = {loss}; train_scores = {train_scores}; valid_scores = {test_scores}")
model.save_checkpoint()
end_time = datetime.now()

epoch 1/45 ; loss = 2.3011257648468018; train_scores = 0.7126606689029941; valid_scores = 0.7246028180424208
epoch 2/45 ; loss = 2.132138252258301; train_scores = 0.7617557715281938; valid_scores = 0.7601062307960406
epoch 3/45 ; loss = 2.0171688397725425; train_scores = 0.7755641563982344; valid_scores = 0.7580489580259024
epoch 4/45 ; loss = 2.0053622722625732; train_scores = 0.8286394923301179; valid_scores = 0.8178928055158909
epoch 5/45 ; loss = 1.9081830978393555; train_scores = 0.8348882112235554; valid_scores = 0.8086176633106111
epoch 6/45 ; loss = 1.8842838605244954; train_scores = 0.8386308368403703; valid_scores = 0.8275432756059709
epoch 7/45 ; loss = 1.839540754045759; train_scores = 0.8474386807948509; valid_scores = 0.8469135399880737
epoch 8/45 ; loss = 1.8109596967697144; train_scores = 0.8596095540594431; valid_scores = 0.8464010309742841
epoch 9/45 ; loss = 1.648777723312378; train_scores = 0.8642099855370956; valid_scores = 0.8450410773936735
epoch 10/45 ; loss = 1

In [None]:
# Optional: uncomment the lines below to evaluate the trained model on the
# mixture train split (train_mix_dataset) and print the training time.
# train_mix_scores = model.evaluate(train_mix_dataset, [metric])['roc_auc_score']
# print("time_taken: ", str(end_time-start_time))
# print("train_mix_score: ", train_mix_scores)