# Predicting

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/chemprop/chemprop/blob/main/examples/predicting.ipynb)

# Import packages

In [46]:
import pandas as pd
import numpy as np
import torch
from lightning import pytorch as pl
from pathlib import Path
from joblib import load

from chemprop import data, featurizers, models, utils

## Model Input

In [47]:
# The project root is taken to be the parent of the current working directory,
# so the notebook must be launched from a direct sub-folder of that root.
chemprop_dir = Path.cwd().parent
# Checkpoint to load for prediction: <root>/training/test_run/checkpoints/last.ckpt
checkpoint_path = chemprop_dir.joinpath("training", "test_run", "checkpoints", "last.ckpt")

## Load model

In [48]:
# Restore the trained MPNN from the checkpoint file at `checkpoint_path`.
mpnn = models.MPNN.load_from_checkpoint(checkpoint_path)
# Bare last expression: shows the module architecture as the cell output.
mpnn

MPNN(
  (message_passing): BondMessagePassing(
    (W_i): Linear(in_features=86, out_features=300, bias=False)
    (W_h): Linear(in_features=300, out_features=300, bias=False)
    (W_o): Linear(in_features=372, out_features=300, bias=True)
    (dropout): Dropout(p=0.0, inplace=False)
    (tau): ReLU()
    (V_d_transform): Identity()
    (graph_transform): Identity()
  )
  (agg): MeanAggregation()
  (bn): BatchNorm1d(300, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (predictor): RegressionFFN(
    (ffn): MLP(
      (0): Sequential(
        (0): Linear(in_features=406, out_features=300, bias=True)
      )
      (1): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.5, inplace=False)
        (2): Linear(in_features=300, out_features=300, bias=True)
      )
      (2): Sequential(
        (0): ReLU()
        (1): Dropout(p=0.5, inplace=False)
        (2): Linear(in_features=300, out_features=1, bias=True)
      )
    )
    (criterion): MSE(task_weights=[[1.0]])
 

# OPTION 1: Predict from CSV File

In [4]:
# Re-derive the project root so this cell also works when run on its own.
chemprop_dir = Path.cwd().parent
# CSV holding the SMILES strings to predict on.
test_path = chemprop_dir.joinpath("training", "data", "train_smiles.csv")
# CSV holding the extra descriptors; rows are paired with the SMILES rows by
# position later on, so both files must list molecules in the same order.
descriptors_path = chemprop_dir.joinpath("training", "data", "descriptors.csv")
# Name of the column in the SMILES CSV that contains the SMILES strings.
smiles_column = 'full_smiles'

In [None]:
# Read the CSV of molecules to predict on; predictions are attached to this frame later.
df_test = pd.read_csv(test_path)
# Bare last expression: display the loaded frame as the cell output.
df_test

In [None]:
# Pull out the SMILES column (its name is configured in `smiles_column`).
smis = df_test[smiles_column]
# Bare last expression: display the selected column as the cell output.
smis

In [5]:
# Load the extra per-molecule descriptors. Every column of the CSV is used as
# a descriptor, so the file must not contain index or label columns.
df_descriptors = pd.read_csv(descriptors_path)
# Independent NumPy copy of the table: one row per molecule, one column per descriptor.
extra_mol_descriptors = df_descriptors.to_numpy(copy=True)

In [None]:
# Parse each SMILES string with chemprop's make_mol helper
# (keep_h / add_h are passed through as False).
mols = [utils.make_mol(smi, keep_h=False, add_h=False) for smi in smis]

# zip() stops at the shorter input, so a row-count mismatch between the SMILES
# file and the descriptor file would silently drop molecules and only surface
# later as a confusing length error when predictions are attached to df_test.
# Fail early with a clear message instead.
if len(mols) != len(extra_mol_descriptors):
    raise ValueError(
        f"Got {len(mols)} SMILES but {len(extra_mol_descriptors)} descriptor rows; "
        "each molecule needs exactly one descriptor row."
    )

# Pair every molecule with its descriptor row (matched by position).
datapoints = [
    data.MoleculeDatapoint(mol, x_d=X_d)
    for mol, X_d in zip(mols, extra_mol_descriptors)
]

# OPTION 2: Manually Enter Prediction Inputs
This is mostly for Bayesian optimization, or for when you need to repeat the same SMILES string with different descriptors.

In [49]:
# Number of prediction rows to generate (one per descriptor setting).
num_rows = 50
# The single SMILES string that is repeated on every row.
SMILE = 'CO.N#CC1=CC=C(N)C=C1'
# The same SMILES, repeated num_rows times.
smis = [SMILE] * num_rows

In [50]:
# Column mapping for the results table: a single 'full_smiles' column
# holding the repeated SMILES strings.
dataframe = dict(full_smiles=smis)
# Frame that later receives the predictions in a 'pred' column.
df_test = pd.DataFrame(data=dataframe)

You can enter features here.

Features = [mwco, zeta_potential, contact_angle, pressure, surface_tension, solvent_mw, solvent_diameter, solvent_viscosity, density, solvent_dipole_moment, solvent_dielectric_constant, solvent_hildebrand, solvent_logp, solvent_dt, solvent_dp, solvent_dh, permeance, temperature, ph]

In [51]:
# Categorical (string) inputs; these are one-hot encoded below.
role = 'OSN'
membrane = 'DM150'
process_configuration = 'CF'
# Single-row frame with one column per categorical input.
string_features = pd.DataFrame(
    [[role, membrane, process_configuration]],
    columns=['role', 'membrane', 'process_configuration'],
)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load encoder files that come from a trusted source.
encoder_path = chemprop_dir.joinpath('predictions', 'one_hot_encoder.joblib')
encoder = load(encoder_path)
# Encode the single row of categorical values and show the resulting vector.
one_hot_array = encoder.transform(string_features.to_numpy())
print(one_hot_array[0])

[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]




In [52]:
x = 3  # index (into `features`) of the feature to sweep
hrange = 30  # value of that feature on the last row of the sweep
# Baseline numeric feature values; presumably in the order of the "Features"
# list in the markdown cell above (19 names, 19 values) - confirm before editing.
features = [150, -1.0, 59, 10, 22.5, 32.04, 0.505, 0.55, 0.792, 1.6, 33, 14.5, -0.82, 7.4, 6, 10.9, 0.88, 22, 7]
# Append the one-hot encoded categorical features after the numeric ones.
features.extend(int(flag) for flag in one_hot_array[0])
print(features)

# num_rows evenly spaced values from the baseline value of feature `x` to `hrange`.
variables = np.linspace(features[x], hrange, num_rows)

# Build one descriptor row per swept value. Each row starts from a copy of the
# baseline so `features` itself is never mutated: the list printed above stays
# accurate and the baseline value of feature `x` is still available afterwards.
descriptors = []
for value in variables:
    row = features.copy()
    row[x] = float(value)
    descriptors.append(row)

# 2-D array: one row per prediction, one column per descriptor.
extra_mol_descriptors = np.array(descriptors)

[150, -1.0, 59, 10, 22.5, 32.04, 0.505, 0.55, 0.792, 1.6, 33, 14.5, -0.82, 7.4, 6, 10.9, 0.88, 22, 7, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]


In [53]:
# Parse each SMILES string with chemprop's make_mol helper
# (keep_h / add_h are passed through as False).
mols = [utils.make_mol(smi, keep_h=False, add_h=False) for smi in smis]

# zip() stops at the shorter input. `smis` and `extra_mol_descriptors` are
# also assigned by the OPTION 1 cells, so mixing cells from both options can
# leave them with different lengths; fail early instead of silently dropping rows.
if len(mols) != len(extra_mol_descriptors):
    raise ValueError(
        f"Got {len(mols)} SMILES but {len(extra_mol_descriptors)} descriptor rows; "
        "each molecule needs exactly one descriptor row."
    )

# Pair every molecule with its descriptor row (matched by position).
datapoints = [
    data.MoleculeDatapoint(mol, x_d=X_d)
    for mol, X_d in zip(mols, extra_mol_descriptors)
]

## Get molecule datapoints

In [54]:
# Use the datapoints built by whichever option (1 or 2) was run most recently;
# both options assign to the same `datapoints` name.
test_data = datapoints

## Get molecule dataset

In [55]:
# Featurizer handed to the dataset; presumably turns each molecule into the
# graph representation the model consumes - confirm against the chemprop docs.
featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()
# Wrap the datapoints in a dataset that applies that featurizer.
test_dset = data.MoleculeDataset(test_data, featurizer=featurizer)
# shuffle=False keeps the loader order identical to the row order of df_test,
# which matters because predictions are attached to df_test by position later.
test_loader = data.build_dataloader(test_dset, shuffle=False)

# Set up trainer

In [56]:
# Build the trainer and run prediction inside torch.inference_mode().
with torch.inference_mode():
    # Single-device CPU trainer with logging disabled and the progress bar on.
    trainer = pl.Trainer(
        logger=False,
        enable_progress_bar=True,
        accelerator="cpu",
        devices=1
    )
    # Predictions for the batches of test_loader; they are concatenated into a
    # single array in a later cell.
    test_preds = trainer.predict(mpnn, test_loader)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: True (mps), used: False
TPU available: False, using: 0 TPU cores


/opt/anaconda3/envs/BGNN4NF/lib/python3.11/site-packages/lightning/pytorch/trainer/setup.py:175: GPU available but not used. You can set it by doing `Trainer(accelerator='gpu')`.
/opt/anaconda3/envs/BGNN4NF/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:434: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


In [58]:
# Join the per-batch outputs of trainer.predict into a single array. The result
# is stored under a new name rather than rebinding `test_preds`: overwriting
# the input made this cell non-idempotent (re-running it changed the array's
# shape and eventually raised), whereas this version can be re-run safely.
preds_array = np.concatenate(test_preds, axis=0)
# Attach the predictions by position; the loader was built with shuffle=False,
# so row i of preds_array belongs to row i of df_test.
df_test['pred'] = preds_array
df_test

Unnamed: 0,full_smiles,pred
0,CO.N#CC1=CC=C(N)C=C1,0.364202
1,CO.N#CC1=CC=C(N)C=C1,0.36621
2,CO.N#CC1=CC=C(N)C=C1,0.368219
3,CO.N#CC1=CC=C(N)C=C1,0.370224
4,CO.N#CC1=CC=C(N)C=C1,0.372199
5,CO.N#CC1=CC=C(N)C=C1,0.374168
6,CO.N#CC1=CC=C(N)C=C1,0.376079
7,CO.N#CC1=CC=C(N)C=C1,0.37798
8,CO.N#CC1=CC=C(N)C=C1,0.37988
9,CO.N#CC1=CC=C(N)C=C1,0.38178


# Save predictions

In [None]:
# Destination for this run's predictions.
save_path = chemprop_dir / "predictions" / "previous_predictions" / "prediction_test_2.csv"
# to_csv does not create missing folders; make sure the target directory exists
# so a fresh checkout does not fail at the very last step.
save_path.parent.mkdir(parents=True, exist_ok=True)
# With the default index=True the row index is written as an unnamed first column.
df_test.to_csv(save_path)