In [1]:
import mordred as md
import numpy as np
import optuna
import pandas as pd
import torch
import torch.nn as nn
import torch_geometric.nn as gnn
from catboost import CatBoostRegressor, Pool
from rdkit import Chem
from rdkit.Chem import MolFromSmiles, MolToSmiles
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torch_geometric.loader import DataLoader as GDataLoader
from tqdm.auto import tqdm
from tqdm.contrib.concurrent import thread_map

from project_filtering import (BASE_DIR, MODEL_NAMES, OUTPUT_DIR,
                               PROCESSED_DIR, SEED)
from project_filtering.datasets import (FCD_Dataset, FCFP_Dataset,
                                        GNNIMDataset, get_dict_gnn_dataset,
                                        prepare_gnn_dataset, CNN_Dataset)
from project_filtering.preprocessing import (get_clean_dataset,
                                             load_processed_data)
from project_filtering.utils import (encode_smiles, generate_fingerprints,
                                     generate_rdkit_descriptors)
from project_filtering.models import CNN, GNN, FCD, FCFP

In [2]:
# Parameters identifying which synthetic dataset this run uses. Both values are
# interpolated into the input CSV name and into every log / checkpoint /
# prediction file name written further down.
MOD_RATE = 2.5
MU = 0.25 # presumably the other MU values used in sibling runs: [0.001,0.0025,0.005,0.01,0.025,0.05,0.1] — TODO confirm

## Loading Data

In [None]:
# Read the synthetic dataset that matches the configured MU / MOD_RATE.
# The file is ';'-separated and its first column holds the row index.
synth_df = pd.read_csv(
    BASE_DIR / f"data/processed/synthetic-variable-mu-{MU}-{MOD_RATE}.csv",
    sep=";",
    index_col=0,
)
synth_df

In [4]:
# InChI string of every molecule; carried along so that each prediction row
# written at the end of the notebook can be matched back to its molecule.
inchi = synth_df["inchi"].to_numpy()

In [5]:
# SMILES string of every molecule. The displayed pair is
# (number of rows, number of distinct SMILES); equal numbers mean no duplicates.
smiles = synth_df["smiles"].to_numpy()
smiles.size, np.unique(smiles).size

(79890, 79890)

In [6]:
# Regression target for every model below: the "values" column of the dataset.
molecular_properties = synth_df["values"].to_numpy()
# Displayed so the row count can be compared with the SMILES count above.
len(molecular_properties)

79890

In [7]:
# Parse each SMILES string into an RDKit molecule, showing a progress bar.
molecules = [Chem.MolFromSmiles(smi) for smi in tqdm(smiles)]

  0%|          | 0/79890 [00:00<?, ?it/s]

In [8]:
# Project helper that returns two per-molecule fingerprint collections
# (Morgan and RDKit). Both are later indexed by the fold indices and fed to
# the FCFP network and to CatBoost.
morgan_fingerprints, rdkit_fingerprints = generate_fingerprints(molecules)

Generating Morgan fingerprints


  0%|          | 0/79890 [00:00<?, ?it/s]

Generating RDKit fingerprints


  0%|          | 0/79890 [00:02<?, ?it/s]

In [9]:
# Project helper that loads or generates the RDKit descriptor matrix
# (one row per molecule); used by the FCD network and by CatBoost.
# NOTE(review): the helper logs that it scales descriptors to zero mean and
# unit variance. That happens here, before the K-fold split, so the scaling
# statistics presumably include the held-out molecules — confirm this is intended.
rdkit_descriptors = generate_rdkit_descriptors(molecules)

Loading / Generating RDKit descriptors


  0%|          | 0/79890 [00:00<?, ?it/s]

(79890, 217)
Scaling RDKit descriptors to zero mean and unit variance


In [8]:
# Project helper returning a SMILES vocabulary (`smiles_dict`) together with
# the encoded form of every SMILES string; the encoded array is the CNN input.
smiles_dict, encoded_smiles = encode_smiles(smiles)

  0%|          | 0/79890 [00:00<?, ?it/s]

In [9]:
# Same expression that is passed as the first argument of the CNN constructor
# in the training loop. The +1 presumably reserves an extra index (e.g. for
# padding) — TODO confirm against encode_smiles.
len(smiles_dict)+1

36

In [11]:
# Project helper that prepares the graph inputs for the GNN:
#   gnn_num_fingerprints - passed as the first argument of the GNN constructor,
#   gnn_fingerprints     - per-molecule sequence, indexed by fold indices below,
#   gnn_mol_bonds        - per-molecule sequence, indexed by fold indices below.
gnn_num_fingerprints, gnn_fingerprints, gnn_mol_bonds = prepare_gnn_dataset(molecules)

  0%|          | 0/79890 [00:00<?, ?it/s]

Atom Dict 0
Hybdn Dict 0
FPs Dict 1193


## 5-Fold Cross-Validation

In [12]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [13]:
# Seed used for the K-fold shuffle and embedded in every artefact name below.
# NOTE(review): this rebinds SEED, which is also imported from
# project_filtering in the first cell; from here on the notebook uses 42
# regardless of the value defined by the package.
SEED = 42

In [14]:
def train_fn(model, optim, loss_fn, epochs, train_dl, eval_dl, name):
    """Train ``model`` for ``epochs`` epochs and checkpoint its best evaluation epoch.

    Each epoch runs ``model.train_fn`` followed by ``model.eval_fn``, logs both
    losses to TensorBoard under ``BASE_DIR / "logs" / name`` and, whenever the
    evaluation loss is the best seen so far (ties included), overwrites
    ``BASE_DIR / "models/{name}.pth"`` with the current ``state_dict``.

    Parameters
    ----------
    model
        Object exposing ``train_fn(optim, loss_fn, train_dl)`` and
        ``eval_fn(loss_fn, eval_dl)`` (each returning that epoch's loss) and
        ``state_dict()``.
    optim
        Optimizer forwarded unchanged to ``model.train_fn``.
    loss_fn
        Loss forwarded unchanged to ``model.train_fn`` / ``model.eval_fn``.
    epochs : int
        Number of epochs to run.
    train_dl, eval_dl
        Data loaders forwarded unchanged to the model.
    name : str
        Run name; used for both the log directory and the checkpoint file.

    Returns
    -------
    tuple
        ``(best_train_loss, best_eval_loss)``. Both are ``inf`` when no epoch
        produced a comparable (non-NaN) loss, e.g. ``epochs == 0``.
    """
    checkpoint_path = BASE_DIR / f"models/{name}.pth"
    # Make sure the first checkpoint write cannot fail on a fresh checkout.
    checkpoint_path.parent.mkdir(parents=True, exist_ok=True)

    writer = SummaryWriter(log_dir=BASE_DIR / "logs" / name, flush_secs=15)

    torch.cuda.empty_cache()
    # Start from +inf rather than a finite sentinel so that the first finite
    # epoch is always recorded as the best and therefore always checkpointed.
    b_train = float("inf")
    b_eval = float("inf")

    bar = tqdm(range(epochs), leave=False, position=1)
    try:
        for epoch in bar:
            epoch_train_loss = model.train_fn(optim, loss_fn, train_dl)
            b_train = min(b_train, epoch_train_loss)

            epoch_eval_loss = model.eval_fn(loss_fn, eval_dl)
            b_eval = min(b_eval, epoch_eval_loss)

            bar.set_postfix_str(f"{epoch_train_loss:.3f}({b_train:.3f}) | {epoch_eval_loss:.3f}({b_eval:.3f})")

            writer.add_scalar("loss/train", epoch_train_loss, epoch)
            writer.add_scalar("loss/eval", epoch_eval_loss, epoch)
            # b_eval already includes this epoch, so equality here means the
            # epoch is the new best (or ties it).
            if epoch_eval_loss <= b_eval:
                torch.save(model.state_dict(), checkpoint_path)
    finally:
        # Release the progress bar and flush/close the event file even when an
        # epoch raises; this function is called once per model per fold.
        bar.close()
        writer.close()
    return b_train, b_eval

In [15]:
def _run_torch_model(label, model, train_ds, eval_ds, loader_cls, batch_size, fold, report):
    """Train one neural model on the current fold and return its held-out predictions.

    Builds the train/eval loaders, trains with ``train_fn`` (Adam, lr=1e-4,
    summed L1 loss, 100 epochs), reports the best losses, reloads the
    checkpoint written by ``train_fn`` and evaluates it on the eval loader.

    Parameters
    ----------
    label : str
        Name printed in the report line; its lower-case form is the tag used
        in the run / checkpoint name.
    model
        Model already moved to ``device``.
    train_ds, eval_ds
        Datasets for the training and held-out part of the fold.
    loader_cls
        Loader class to instantiate (``DataLoader`` or ``GDataLoader``).
    batch_size : int
        Batch size for both loaders.
    fold : int
        Fold index, embedded in the run name.
    report : callable
        Called with the one-line summary of the best train/eval losses.

    Returns
    -------
    Whatever ``model.eval_fn(..., return_predictions=True)`` returns for the
    eval loader.
    """
    # Single source of truth for the name used by both train_fn and the reload.
    run_name = f"5fold-variable-mu-{MU}-{MOD_RATE}-{label.lower()}-{SEED}-{fold}"
    loader_kwargs = dict(batch_size=batch_size,
                         num_workers=4,
                         persistent_workers=True)
    eval_dl = loader_cls(eval_ds, shuffle=False, **loader_kwargs)
    train_dl = loader_cls(train_ds, shuffle=True, **loader_kwargs)

    b_train, b_eval = train_fn(model=model,
                               optim=torch.optim.Adam(
                                   model.parameters(),
                                   lr=1e-4),
                               loss_fn=nn.L1Loss(reduction='sum'),
                               epochs=100,
                               train_dl=train_dl,
                               eval_dl=eval_dl,
                               name=run_name)
    report(f"5Fold\t {label}\t {fold}\t {b_train:.4f}\t\t {b_eval:.4f}")

    # Predict with the best-eval checkpoint rather than the last-epoch weights.
    model.load_state_dict(torch.load(
        BASE_DIR / f"models/{run_name}.pth",
        map_location=device, weights_only=True))
    return model.eval_fn(
        nn.L1Loss(reduction='sum'), eval_dl, return_predictions=True)


# Five-fold cross-validation: every molecule is held out exactly once and its
# prediction from each of the five models is collected into `predicted_values`.
# NOTE(review): both the neural checkpoints (best eval loss) and CatBoost
# (use_best_model with eval_set) are selected on the same fold they are then
# scored / predicted on, so the reported eval numbers are optimistic.
kf = KFold(5, shuffle=True, random_state=SEED)
predicted_values = {
    "smiles": [],
    "inchi": [],
    "values": [],
    "cnn": [],
    "gnn": [],
    "fcfp": [],
    "fcd": [],
    "cb": []
}
pbar = tqdm(kf.split(np.arange(len(smiles))), position=0, total=kf.get_n_splits())
for k, (unique_train_idx, unique_eval_idx) in enumerate(pbar):
    # Seed torch per fold so weight initialisation and loader shuffling are
    # repeatable; artefact names already claim to be keyed by SEED.
    torch.manual_seed(SEED + k)

    # --- split every representation with the same indices -------------------
    train_gnn_fingerprints = [gnn_fingerprints[i] for i in unique_train_idx]
    eval_gnn_fingerprints = [gnn_fingerprints[i] for i in unique_eval_idx]
    train_gnn_mol_bonds = [gnn_mol_bonds[i] for i in unique_train_idx]
    eval_gnn_mol_bonds = [gnn_mol_bonds[i] for i in unique_eval_idx]
    train_encoded_smiles, eval_encoded_smiles = encoded_smiles[
        unique_train_idx], encoded_smiles[unique_eval_idx]
    train_rdkit_descriptors, eval_rdkit_descriptors = rdkit_descriptors[
        unique_train_idx], rdkit_descriptors[unique_eval_idx]
    train_morgan_fingerprints, eval_morgan_fingerprints = morgan_fingerprints[
        unique_train_idx], morgan_fingerprints[unique_eval_idx]
    train_rdkit_fingerprints, eval_rdkit_fingerprints = rdkit_fingerprints[
        unique_train_idx], rdkit_fingerprints[unique_eval_idx]
    train_molecular_properties, eval_molecular_properties = molecular_properties[
        unique_train_idx], molecular_properties[unique_eval_idx]

    predicted_values["smiles"].extend(smiles[unique_eval_idx])
    predicted_values["inchi"].extend(inchi[unique_eval_idx])
    predicted_values["values"].extend(eval_molecular_properties)

    # --- CNN on encoded SMILES ----------------------------------------------
    predicted_values["cnn"].extend(_run_torch_model(
        "CNN",
        CNN(len(smiles_dict)+1,
            n_conv_layers=4,
            kernel_size=5,
            conv_channels=512,
            n_lin_layers=8).to(device),
        train_ds=CNN_Dataset(train_encoded_smiles,
                             train_molecular_properties),
        eval_ds=CNN_Dataset(eval_encoded_smiles, eval_molecular_properties),
        loader_cls=DataLoader,
        batch_size=512,
        fold=k,
        report=pbar.write))

    # --- GNN on molecular graphs --------------------------------------------
    train_gnn_dataset = GNNIMDataset(get_dict_gnn_dataset(train_gnn_fingerprints,
                                                          train_gnn_mol_bonds,
                                                          train_molecular_properties
                                                          ))
    eval_gnn_dataset = GNNIMDataset(get_dict_gnn_dataset(eval_gnn_fingerprints,
                                                         eval_gnn_mol_bonds,
                                                         eval_molecular_properties
                                                         ))
    predicted_values["gnn"].extend(_run_torch_model(
        "GNN",
        GNN(gnn_num_fingerprints,
            embed_fingerprints=128,
            n_conv_layers=7,
            conv_channels=512,
            n_lin_layers=4).to(device),
        train_ds=train_gnn_dataset,
        eval_ds=eval_gnn_dataset,
        loader_cls=GDataLoader,
        batch_size=128,
        fold=k,
        report=pbar.write))

    # --- fully connected net on fingerprints --------------------------------
    predicted_values["fcfp"].extend(_run_torch_model(
        "FCFP",
        FCFP(n_layers=7,
             hidden_wts=512).to(device),
        train_ds=FCFP_Dataset(train_morgan_fingerprints,
                              train_rdkit_fingerprints, train_molecular_properties),
        eval_ds=FCFP_Dataset(eval_morgan_fingerprints,
                             eval_rdkit_fingerprints, eval_molecular_properties),
        loader_cls=DataLoader,
        batch_size=512,
        fold=k,
        report=pbar.write))

    # --- fully connected net on descriptors ---------------------------------
    predicted_values["fcd"].extend(_run_torch_model(
        "FCD",
        FCD(n_layers=2,
            hidden_wts=2048).to(device),
        train_ds=FCD_Dataset(train_rdkit_descriptors,
                             train_molecular_properties),
        eval_ds=FCD_Dataset(eval_rdkit_descriptors, eval_molecular_properties),
        loader_cls=DataLoader,
        batch_size=512,
        fold=k,
        report=pbar.write))

    # --- CatBoost on fingerprints + descriptors -----------------------------
    cb_model_path = BASE_DIR / f"models/5fold-variable-mu-{MU}-{MOD_RATE}-cb-{SEED}-{k}.cb"
    eval_ds = Pool(np.hstack([
        eval_morgan_fingerprints,
        eval_rdkit_fingerprints,
        eval_rdkit_descriptors,
    ]),
        eval_molecular_properties)
    trn_ds = Pool(np.hstack([
        train_morgan_fingerprints,
        train_rdkit_fingerprints,
        train_rdkit_descriptors,
    ]),
        train_molecular_properties)
    model = CatBoostRegressor(
        loss_function="MAE",
        task_type="GPU",
        devices="0",
        metric_period=20,
        learning_rate=6e-3,
        depth=7,
        iterations=5000,
        use_best_model=True,
        # Tie the booster to the notebook seed, like the artefact name does.
        random_seed=SEED,
        silent=True,
        allow_writing_files=False
    )
    model.fit(
        trn_ds, eval_set=eval_ds, plot=False)
    model.save_model(cb_model_path)
    best_scores = model.get_best_score()
    b_train = best_scores.get("learn", {"MAE": np.nan})["MAE"]
    b_eval = best_scores.get("validation", {"MAE": np.nan})["MAE"]
    pbar.write(f"5Fold\t CB\t {k}\t {b_train:.4f}\t\t {b_eval:.4f}")
    # Predict with the model as reloaded from disk, which also checks the file.
    model = CatBoostRegressor().load_model(cb_model_path)
    predicted_values["cb"].extend(model.predict(eval_ds))

# Every column must describe the same molecules in the same order; fail here,
# with the lengths in the message, rather than later when the frame is built.
column_lengths = {column: len(rows) for column, rows in predicted_values.items()}
assert len(set(column_lengths.values())) == 1, f"prediction columns are misaligned: {column_lengths}"

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 CNN	 0	 0.0206		 0.0261


  0%|          | 0/63912 [00:00<?, ?it/s]

  0%|          | 0/15978 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 GNN	 0	 0.0162		 0.0241


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCFP	 0	 0.0139		 0.0297


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCD	 0	 0.0097		 0.0284
5Fold	 CB	 0	 0.0146		 0.0264


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 CNN	 1	 0.0206		 0.0258


  0%|          | 0/63912 [00:00<?, ?it/s]

  0%|          | 0/15978 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 GNN	 1	 0.0160		 0.0239


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCFP	 1	 0.0139		 0.0293


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCD	 1	 0.0101		 0.0277
5Fold	 CB	 1	 0.0147		 0.0257


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 CNN	 2	 0.0215		 0.0260


  0%|          | 0/63912 [00:00<?, ?it/s]

  0%|          | 0/15978 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 GNN	 2	 0.0164		 0.0240


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCFP	 2	 0.0137		 0.0293


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCD	 2	 0.0100		 0.0281
5Fold	 CB	 2	 0.0147		 0.0260


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 CNN	 3	 0.0216		 0.0256


  0%|          | 0/63912 [00:00<?, ?it/s]

  0%|          | 0/15978 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 GNN	 3	 0.0164		 0.0240


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCFP	 3	 0.0140		 0.0294


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCD	 3	 0.0100		 0.0281
5Fold	 CB	 3	 0.0147		 0.0260


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 CNN	 4	 0.0210		 0.0258


  0%|          | 0/63912 [00:00<?, ?it/s]

  0%|          | 0/15978 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 GNN	 4	 0.0161		 0.0239


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCFP	 4	 0.0138		 0.0297


  0%|          | 0/100 [00:00<?, ?it/s]

5Fold	 FCD	 4	 0.0100		 0.0280
5Fold	 CB	 4	 0.0147		 0.0262


In [16]:
# Assemble the out-of-fold predictions (one column per model, next to the
# SMILES, InChI and true value of each molecule) and write them as a
# ';'-separated CSV named after the dataset parameters and the seed.
predicted_df = pd.DataFrame.from_dict(predicted_values)
predicted_df.to_csv(
    BASE_DIR / f"data/processed/predicted-variable-mu-{MU}-{MOD_RATE}-{SEED}.csv",
    sep=";",
)