In [1]:
import logging
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import torch
import torch.nn as nn
from catboost import CatBoostRegressor
from rdkit import Chem, DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold, train_test_split
from sklearn.svm import LinearSVR
from torch.utils.data import DataLoader, Dataset
from torchinfo import summary
from tqdm.contrib import tmap
from tqdm.contrib.concurrent import process_map, thread_map
from tqdm.notebook import tqdm

from preprocessing_pipeline import get_train_data
from models import *
from datasets import *
from utils import *

# np.set_printoptions(threshold=sys.maxsize)
%matplotlib inline

# Enable cuDNN benchmark mode and report the cuDNN / CUDA versions that this
# PyTorch build was compiled against (both print None on a CPU-only build).
torch.backends.cudnn.benchmark = True
print(torch.backends.cudnn.version())
print(torch.version.cuda)
# Use the GPU when one is visible; otherwise fall back to the CPU instead of
# failing at the first `.to(device)` call further down the notebook.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    print("CUDA is not available - falling back to CPU")
    device = torch.device("cpu")


7605
10.2


## Preparing dataset for training

Preprocess the NIST RI database and calculate the descriptors and fingerprints.

In [None]:
# Run the project's preprocessing pipeline on the raw NIST RI file.
# NOTE(review): presumably this writes the processed CSV / .npy files that the
# cells below load from ../Data - confirm in preprocessing_pipeline.
get_train_data("../Data/nist.ri")

Loading processed NIST dataset

In [2]:
# Load the processed NIST table from disk.
df_nist = pd.read_csv("../Data/valid_nist.csv")
# Replace the header with the column names used throughout this notebook;
# the assignment fails unless the CSV has exactly three columns.
df_nist.columns = ["Formula", "RI", "ColType"]

In [None]:
# Preview the first rows to sanity-check the loaded table.
df_nist.head()

Preparing database for CatBoost and SVR

In [None]:
# Unique formulas, in order of first appearance in the table.
nist_unique = pd.unique(df_nist["Formula"])
# Formula list stored on disk alongside the precomputed feature arrays.
formulas = np.load("../Data/unique_nist.npy", allow_pickle=True)
same_order = np.all(formulas == nist_unique)
print("Saved arrays have the same order as generated formulas:", same_order)

# Min-max scale each column of the array loaded from md_nist.npy; the small
# constant keeps the division finite for constant columns.
md_nist = np.load("../Data/md_nist.npy")
md_low = md_nist.min(axis=0)
md_span = md_nist.max(axis=0) - md_low
descriptors = (md_nist - md_low) / (md_span + 1e-8)

fingerprints = np.load("../Data/fp_nist.npy")
maccs = np.load("../Data/maccs_nist.npy")

# Per-formula feature rows: scaled descriptors followed by the maccs array.
all_nist = np.concatenate((descriptors, maccs), axis=1)
# Row number of every formula inside the saved arrays.
dict_nist = {formula: row for row, formula in enumerate(formulas)}
# One row per table record: formula features plus the 37 slots that
# encode_column() produces for the column type.
n_features = all_nist.shape[-1] + 37
db_nist = np.zeros(shape=(len(df_nist), n_features))


def encode_column(col_type):
    """Encode a column-type index as a vector of 37 floats.

    The slot at position ``col_type`` is set to 1. When ``col_type`` is
    greater than 14, the last slot is set to 1 as well.

    Args:
        col_type: Integer index of the column type.

    Returns:
        A float ``numpy`` array of length 37.
    """
    encoded = np.zeros(37)
    encoded[col_type] = 1
    if not col_type > 14:
        return encoded
    # Column types above 14 additionally raise the shared flag in the last slot.
    encoded[-1] = 1
    return encoded


# Fill db_nist one record at a time: the feature row of the record's formula
# followed by the encoded column type.
# Rows are addressed by position (enumerate) rather than by the DataFrame
# index label, and the two needed columns are iterated directly instead of
# building a Series per row with iterrows().
formula_values = df_nist["Formula"].to_numpy()
col_type_values = df_nist["ColType"].to_numpy()
for i, (formula, col_type) in enumerate(
        tqdm(zip(formula_values, col_type_values), total=len(df_nist))):
    db_nist[i] = np.concatenate(
        [all_nist[dict_nist[formula]],
         encode_column(col_type)])


## 5-fold training

### CNN1D and MLP training

In [None]:
# Reload the formula list and the precomputed feature arrays for this section.
nist_unique = pd.unique(df_nist["Formula"])
formulas = np.load("../Data/unique_nist.npy", allow_pickle=True)
print("Saved arrays have the same order as generated formulas:",
      np.all(formulas == nist_unique))
md_nist = np.load("../Data/md_nist.npy")
# Min-max scaling per column; the epsilon guards against constant columns.
descriptors = (md_nist - md_nist.min(axis=0)) / (
    (md_nist.max(axis=0) - md_nist.min(axis=0)) + 1e-8)
fingerprints = np.load("../Data/fp_nist.npy")
maccs = np.load("../Data/maccs_nist.npy")
fold = 0

# The split is made over unique formulas, so every record of a given formula
# ends up on the same side of the split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for trn_indx, tst_indx in kf.split(nist_unique):
    tst_unique = set(nist_unique[tst_indx])
    formula_column = df_nist["Formula"].values
    # Positional row numbers of the records belonging to each side.
    tst_mask = [i for i, val in enumerate(formula_column) if val in tst_unique]
    trn_mask = [
        i for i, val in enumerate(formula_column) if val not in tst_unique
    ]

    # Persist the held-out records of this fold.
    tst_df = df_nist.iloc[tst_mask]
    tst_df.to_csv(f"TST_{fold}.csv", index=False)

    # ---- MLP ----
    mlp_inputs = (df_nist, descriptors, fingerprints, formulas, maccs)
    trn_ds = MLP_Dataset(*mlp_inputs, trn_mask)
    tst_ds = MLP_Dataset(*mlp_inputs, tst_mask)
    loader_opts = dict(pin_memory=True, num_workers=8)
    trn_dl = DataLoader(trn_ds, batch_size=512, shuffle=True, **loader_opts)
    tst_dl = DataLoader(tst_ds, batch_size=512, **loader_opts)
    mlp = MLP().to(device)
    optim = torch.optim.Adam(mlp.parameters(), lr=1e-4)
    crit = nn.L1Loss()
    # NOTE(review): 0 and 300 are presumably the start epoch and the number of
    # epochs - confirm against utils.train.
    res = train(device, mlp, optim, crit, 0, 300, trn_dl, tst_dl,
                f"MLP_{fold}")
    print(f"MLP_{fold}:", res)

    # ---- CNN1D ----
    trn_ds = CNN1D_Dataset(df_nist, trn_mask)
    tst_ds = CNN1D_Dataset(df_nist, tst_mask)
    trn_dl = DataLoader(trn_ds, batch_size=128, shuffle=True, **loader_opts)
    tst_dl = DataLoader(tst_ds, batch_size=128, **loader_opts)
    cnn1d = CNN1D().to(device)
    optim = torch.optim.Adam(cnn1d.parameters(), lr=1e-4)
    crit = nn.L1Loss()
    res = train(device, cnn1d, optim, crit, 0, 150, trn_dl, tst_dl,
                f"CNN1D_{fold}")
    print(f"CNN1D_{fold}:", res)

    fold += 1


### CNN2D training

In [None]:
nist_unique = pd.unique(df_nist["Formula"])
formulas = np.load("../Data/unique_nist.npy", allow_pickle=True)
# nist_2d is indexed below with the same positions as nist_unique, so a
# mismatch in formula order would silently pair coordinates with the wrong
# formulas. Report the check as before, but refuse to continue if it fails.
same_order = bool(np.all(formulas == nist_unique))
print("Saved arrays have the same order as generated formulas:", same_order)
if not same_order:
    raise ValueError(
        "../Data/unique_nist.npy does not match the formula order of df_nist; "
        "rows of ../Data/2d_nist.npy would be paired with the wrong formulas")
nist_2d = np.load("../Data/2d_nist.npy")
fold = 0

# The split is made over unique formulas, so every record of a given formula
# ends up on the same side of the split.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for trn_indx, tst_indx in kf.split(nist_unique):
    trn_mask, tst_mask = [], []
    trn_unique = nist_unique[trn_indx]
    tst_unique = set(nist_unique[tst_indx])
    # Positional row numbers of the records belonging to each side.
    for i, val in enumerate(tqdm(df_nist["Formula"].values)):
        if val in tst_unique:
            tst_mask.append(i)
        else:
            trn_mask.append(i)
    # Spot check: recompute the coordinates of the first training formula and
    # compare them with the stored row. Indexing nist_2d with trn_indx[0]
    # reads that single row without first copying the whole training subset.
    print(
        "Check if 2d coords are the same:",
        np.all(nist_2d[trn_indx[0]] == get_2d_coordinates(
            Chem.MolFromSmiles(trn_unique[0]))))
    trn_ds = CNN2D_Dataset(df_nist, nist_2d[trn_indx], nist_unique[trn_indx],
                           trn_mask)
    tst_ds = CNN2D_Dataset(df_nist, nist_2d[tst_indx], nist_unique[tst_indx],
                           tst_mask)
    trn_dl = DataLoader(trn_ds,
                        batch_size=64,
                        shuffle=True,
                        pin_memory=True,
                        num_workers=8)
    tst_dl = DataLoader(tst_ds, batch_size=64, pin_memory=True, num_workers=4)
    cnn2d = CNN2D().to(device)
    optim = torch.optim.Adam(cnn2d.parameters(), lr=1e-4)
    crit = nn.L1Loss()
    # NOTE(review): 0 and 70 are presumably the start epoch and the number of
    # epochs - confirm against utils.train.
    res = train(device, cnn2d, optim, crit, 0, 70, trn_dl, tst_dl,
                f"CNN2D_{fold}")
    print(f"CNN2D_{fold}:", res)

    fold += 1


### Training CatBoost
Splits are exactly the same in NNs and CatBoost

In [None]:
fold = 0

# Same KFold configuration as the neural-network cells. This cell relies on
# nist_unique and db_nist created by earlier cells.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for trn_indx, tst_indx in kf.split(nist_unique):
    tst_unique = set(nist_unique[tst_indx])
    formula_column = df_nist["Formula"].values
    # Positional row numbers of the records belonging to each side.
    tst_mask = [i for i, val in enumerate(formula_column) if val in tst_unique]
    trn_mask = [
        i for i, val in enumerate(formula_column) if val not in tst_unique
    ]
    trn_db, tst_db = db_nist[trn_mask], db_nist[tst_mask]
    # Targets are divided by 1000 for fitting; the prediction cells multiply
    # the model output by 1000 again.
    ri_values = df_nist["RI"].values
    ri_nist_trn = ri_values[trn_mask] / 1000
    ri_nist_tst = ri_values[tst_mask] / 1000

    catboost_params = dict(iterations=2000,
                           learning_rate=0.10,
                           max_depth=10,
                           loss_function="RMSE",
                           thread_count=10,
                           custom_metric="MAE",
                           verbose=True)
    bst = CatBoostRegressor(**catboost_params)
    # NOTE(review): the held-out fold doubles as eval_set with
    # use_best_model=True, so the retained iteration is chosen on the same
    # records that are later predicted - confirm this is intended.
    fit_options = dict(eval_set=(tst_db, ri_nist_tst),
                       use_best_model=True,
                       verbose=False,
                       plot=True)
    bst.fit(X=trn_db, y=ri_nist_trn, **fit_options)
    bst.save_model(f"catboost_{fold}.model")

    fold += 1


### Training SVR

In [None]:
fold = 0

# Same KFold configuration as the other training cells. This cell relies on
# nist_unique and db_nist created by earlier cells.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for trn_indx, tst_indx in kf.split(nist_unique):
    tst_unique = set(nist_unique[tst_indx])
    formula_column = df_nist["Formula"].values
    # Positional row numbers of the records belonging to each side.
    tst_mask = [i for i, val in enumerate(formula_column) if val in tst_unique]
    trn_mask = [
        i for i, val in enumerate(formula_column) if val not in tst_unique
    ]
    tst_db = db_nist[tst_mask]
    trn_db = db_nist[trn_mask]
    # Targets are divided by 1000 for fitting.
    ri_nist_trn = df_nist["RI"].values[trn_mask] / 1000
    ri_nist_tst = df_nist["RI"].values[tst_mask] / 1000

    # LinearSVR shuffles the data internally; fixing the seed (same value as
    # the KFold above) makes the fitted model and the printed MAE reproducible.
    svr = LinearSVR(random_state=42)
    svr.fit(trn_db, ri_nist_trn)
    preds = svr.predict(tst_db)
    print(mae(ri_nist_tst, preds))
    # Saving is left disabled; note that the SVR prediction cell loads
    # ../Models/SVR_{fold}.model, so enable this line to regenerate the models.
    # joblib.dump(svr,f"../Models/SVR_{fold}.model")
    fold += 1


## Predicting values in the dataset
Each of the five per-fold models predicts values only for its own pre-defined test set, so there are no data leaks here.

### SVR Prediction

In [None]:
nist_unique = pd.unique(df_nist["Formula"])
formulas = np.load("../Data/unique_nist.npy", allow_pickle=True)
print("Saved arrays have the same order as generated formulas:",
      np.all(formulas == nist_unique))
md_nist = np.load("../Data/md_nist.npy")
# Min-max scaling per column; the epsilon guards against constant columns.
descriptors = (md_nist - md_nist.min(axis=0)) / (
    (md_nist.max(axis=0) - md_nist.min(axis=0)) + 1e-8)
fingerprints = np.load("../Data/fp_nist.npy")
maccs = np.load("../Data/maccs_nist.npy")
fold = 0

# Same KFold configuration as in training, so each model is applied to the
# fold it did not train on. This cell relies on db_nist from an earlier cell.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# total=5 lets the progress bar show completion, as in the other prediction
# cells (kf.split is a generator without a length).
for trn_indx, tst_indx in tqdm(kf.split(nist_unique), total=5):
    tst_unique = set(nist_unique[tst_indx])
    # Positional row numbers of the held-out records; only these are needed.
    tst_mask = [
        i for i, val in enumerate(df_nist["Formula"].values)
        if val in tst_unique
    ]
    tst_db = db_nist[tst_mask]
    df_pred = df_nist.iloc[tst_mask].copy()

    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load model files you produced yourself.
    svr = joblib.load(f"../Models/SVR_{fold}.model")
    preds = svr.predict(tst_db)
    # Undo the /1000 target scaling used during training.
    df_pred["RI"] = preds * 1000
    # df_pred.to_csv(f"../Data/SVR_pred_{fold}.csv")
    fold += 1


### CNN1D prediction

In [None]:
nist_unique = pd.unique(df_nist["Formula"])
formulas = np.load("../Data/unique_nist.npy", allow_pickle=True)
print("Saved arrays have the same order as generated formulas:",
      np.all(formulas == nist_unique))
md_nist = np.load("../Data/md_nist.npy")
# Min-max scaling per column; the epsilon guards against constant columns.
descriptors = (md_nist - md_nist.min(axis=0)) / (
    (md_nist.max(axis=0) - md_nist.min(axis=0)) + 1e-8)
fingerprints = np.load("../Data/fp_nist.npy")
maccs = np.load("../Data/maccs_nist.npy")
fold = 0

# Same KFold configuration as in training, so each model is applied to the
# fold it did not train on.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for trn_indx, tst_indx in tqdm(kf.split(nist_unique), total=5):
    tst_unique = set(nist_unique[tst_indx])
    # Positional row numbers of the held-out records.
    tst_mask = [
        i for i, val in enumerate(df_nist["Formula"].values)
        if val in tst_unique
    ]
    tst_df = df_nist.iloc[tst_mask].copy()

    tst_ds = CNN1D_Dataset(df_nist, tst_mask)
    tst_dl = DataLoader(tst_ds, batch_size=128, pin_memory=True, num_workers=8)

    res = []
    cnn1d = CNN1D().to(device)
    cnn1d.load_state_dict(torch.load(f"CNN1D_{fold}_model.pth"))
    # Inference mode and disabled autograd are loop-invariant, so they are set
    # once here rather than on every batch.
    cnn1d.eval()
    with torch.no_grad():
        for data in tst_dl:
            # The third element of each batch (ris) is not needed here.
            form, col, ris = data
            res.append(
                cnn1d(form.to(device), col.to(device)).detach().cpu().numpy())
    res = np.vstack(res)
    # Undo the /1000 scaling: model outputs are multiplied back by 1000.
    tst_df["RI"] = res * 1000
    # tst_df.to_csv(f"CNN1D_pred_{fold}.csv")

    fold += 1


### MLP prediction

In [None]:
nist_unique = pd.unique(df_nist["Formula"])
formulas = np.load("../Data/unique_nist.npy", allow_pickle=True)
print("Saved arrays have the same order as generated formulas:",
      np.all(formulas == nist_unique))
md_nist = np.load("../Data/md_nist.npy")
# Min-max scaling per column; the epsilon guards against constant columns.
descriptors = (md_nist - md_nist.min(axis=0)) / (
    (md_nist.max(axis=0) - md_nist.min(axis=0)) + 1e-8)
fingerprints = np.load("../Data/fp_nist.npy")
maccs = np.load("../Data/maccs_nist.npy")
fold = 0

# Same KFold configuration as in training, so each model is applied to the
# fold it did not train on.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for trn_indx, tst_indx in tqdm(kf.split(nist_unique), total=5):
    tst_unique = set(nist_unique[tst_indx])
    # Positional row numbers of the held-out records.
    tst_mask = [
        i for i, val in enumerate(df_nist["Formula"].values)
        if val in tst_unique
    ]
    tst_df = df_nist.iloc[tst_mask].copy()

    tst_ds = MLP_Dataset(df_nist, descriptors, fingerprints, formulas, maccs,
                         tst_mask)
    tst_dl = DataLoader(tst_ds, batch_size=512, pin_memory=True, num_workers=8)
    mlp = MLP().to(device)
    mlp.load_state_dict(torch.load(f"MLP_{fold}_model.pth"))
    # A freshly constructed module is in training mode. Switch to inference
    # mode before predicting, as the CNN1D and CNN2D prediction cells do;
    # otherwise any dropout / batch-norm layers inside MLP would run with
    # their training-time behaviour.
    mlp.eval()
    res = []
    with torch.no_grad():
        for data in tst_dl:
            # The last element of each batch (ris) is not needed here.
            md, fp, ms, col, ris = data
            res.append(
                mlp(md.to(device), fp.to(device), ms.to(device),
                    col.to(device)).detach().cpu().numpy())
    res = np.vstack(res)
    # Undo the /1000 scaling: model outputs are multiplied back by 1000.
    tst_df["RI"] = res * 1000
    # tst_df.to_csv(f"MLP_pred_{fold}.csv")

    fold += 1


### CNN2D Prediction

In [None]:
nist_unique = pd.unique(df_nist["Formula"])
formulas = np.load("../Data/unique_nist.npy", allow_pickle=True)
# nist_2d is indexed below with the same positions as nist_unique, so a
# mismatch in formula order would silently pair coordinates with the wrong
# formulas. Report the check as before, but refuse to continue if it fails.
same_order = bool(np.all(formulas == nist_unique))
print("Saved arrays have the same order as generated formulas:", same_order)
if not same_order:
    raise ValueError(
        "../Data/unique_nist.npy does not match the formula order of df_nist; "
        "rows of ../Data/2d_nist.npy would be paired with the wrong formulas")
nist_2d = np.load("../Data/2d_nist.npy")
fold = 0

# Same KFold configuration as in training, so each model is applied to the
# fold it did not train on.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for trn_indx, tst_indx in tqdm(kf.split(nist_unique), total=5):
    tst_unique = set(nist_unique[tst_indx])
    # Positional row numbers of the held-out records; only these are needed.
    tst_mask = [
        i for i, val in enumerate(df_nist["Formula"].values)
        if val in tst_unique
    ]
    tst_df = df_nist.iloc[tst_mask].copy()
    tst_ds = CNN2D_Dataset(df_nist, nist_2d[tst_indx], nist_unique[tst_indx],
                           tst_mask)
    tst_dl = DataLoader(tst_ds, batch_size=64, pin_memory=True, num_workers=4)
    res = []
    cnn2d = CNN2D().to(device)
    cnn2d.load_state_dict(torch.load(f"../Models/CNN2D_{fold}_model.pth"))
    # Inference mode and disabled autograd are loop-invariant, so they are set
    # once here rather than on every batch.
    cnn2d.eval()
    with torch.no_grad():
        for data in tst_dl:
            # The third element of each batch (ris) is not needed here.
            form, col, ris = data
            res.append(
                cnn2d(form.to(device), col.to(device)).detach().cpu().numpy())
    res = np.vstack(res)
    # Undo the /1000 scaling: model outputs are multiplied back by 1000.
    tst_df["RI"] = res * 1000
    # tst_df.to_csv(f"CNN2D_pred_{fold}.csv")

    fold += 1


### CatBoost prediction

In [None]:
fold = 0

# Same KFold configuration as in training, so each model is applied to the
# fold it did not train on. This cell relies on nist_unique and db_nist
# created by earlier cells.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
# total=5 lets the progress bar show completion, as in the other prediction
# cells (kf.split is a generator without a length).
for trn_indx, tst_indx in tqdm(kf.split(nist_unique), total=5):
    tst_unique = set(nist_unique[tst_indx])
    # Positional row numbers of the held-out records.
    tst_mask = [
        i for i, val in enumerate(df_nist["Formula"].values)
        if val in tst_unique
    ]
    tst_db = db_nist[tst_mask]

    bst = CatBoostRegressor()
    bst.load_model(f"catboost_{fold}.model")
    df_pred = df_nist.iloc[tst_mask].copy()
    res = bst.predict(data=tst_db)
    # Replace the column, as the other prediction cells do, instead of writing
    # into the existing one with .loc[:, "RI"]: if RI was read as an integer
    # column, the in-place form has to coerce dtypes (and warns on recent
    # pandas). The values are the model outputs scaled back by 1000.
    df_pred["RI"] = res * 1000
    df_pred.to_csv(f"catboost_pred_{fold}.csv", index=False)
    fold += 1


### Merging 5-fold datasets into one

In [None]:
# For every result family, stack its five per-fold CSVs row-wise, then join
# the families column-wise. Each family contributes three columns
# (Formula, ColType, RI), so the final array has 18 columns.
ress = []
for n in [
        "TST", "CNN1D_pred", "MLP_pred", "catboost_pred", "CNN2D_pred",
        "SVR_pred"
]:
    res = []
    for i in range(5):
        df = pd.read_csv(f"../Data/{n}_{i}.csv")[["Formula", "ColType",
                                                  "RI"]].to_numpy()
        res.append(df)
    stacked = np.vstack(res)
    # The column-wise join below is purely positional: row k of every family
    # must describe the same record. Verify that against the first family
    # (TST) instead of silently mixing records.
    if ress and (stacked.shape != ress[0].shape
                 or not np.all(stacked[:, :2] == ress[0][:, :2])):
        raise ValueError(
            f"Rows of ../Data/{n}_*.csv are not aligned with "
            "../Data/TST_*.csv (Formula/ColType differ)")
    ress.append(stacked)
ress = np.hstack(ress)


In [None]:
# ress holds six (Formula, ColType, RI) triples side by side. Keep Formula and
# ColType from the first triple plus the RI column of every triple.
selected_columns = [0, 1, 2, 5, 8, 11, 14, 17]
column_names = [
    "Formula", "ColType", "RI_X", "RI_1D", "RI_MLP", "RI_CB", "RI_2D", "RI_SVR"
]
df_pred = pd.DataFrame(ress[:, selected_columns], columns=column_names)
# df_pred.to_csv("../Data/preds_nist.csv",index=False)
df_pred.head()