In [2]:
import os
import pandas as pd
import numpy as np
import deepchem as dc
from pubchempy import get_cids, get_compounds

from rdkit import Chem

import tensorflow as tf
# Let TensorFlow allocate GPU memory on demand instead of reserving it all up front.
# The setting is applied to every visible GPU; on a CPU-only machine the list is
# empty and the loop does nothing (indexing element [0] would raise IndexError there).
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [3]:
def display_df(df):
    """Render the whole of ``df`` as an HTML table in the cell output.

    :param df: DataFrame to show; every row and column passed in is rendered,
               so callers typically pass ``df.head()``.
    :return: None
    """
    html_table = df.to_html()
    display(HTML(html_table))

In [3]:
class CollinearColumnRemover(BaseEstimator, TransformerMixin):
    """Drop numeric/bool columns that are strongly correlated with an earlier column.

    ``fit`` scans the candidate columns in frame order and marks a column for
    removal when its absolute correlation with an earlier, still-kept column
    exceeds ``threshold``. ``transform`` drops the marked columns from any frame.
    """

    def __init__(self, threshold, col_regex=None, exclude_cols=None):
        """
        :param threshold: float in [0, 1], if two columns have correlation greater than threshold
                          one of them will be removed
        :param col_regex: str, regular expression to select columns
        :param exclude_cols: list of column names that are never candidates for removal
        """
        # Public mirrors of the constructor arguments: scikit-learn's get_params(),
        # clone() and repr() look up attributes named exactly like the __init__
        # parameters. NOTE: the methods below read the private copies, so values
        # changed through set_params() are not picked up.
        self.threshold = threshold
        self.col_regex = col_regex
        self.exclude_cols = exclude_cols

        self._threshold = threshold
        self._col_regex = col_regex
        self._exclude_cols = [] if exclude_cols is None else exclude_cols

    def _collinear_columns(self, df, threshold):
        """Return the columns to drop, in the order they were detected.

        Also stores the absolute correlation matrix of the candidate columns in
        ``self._corr_mat`` for later inspection.
        """
        if self._col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self._col_regex)
            df_sel = df_sel.select_dtypes(["number", "bool"])

        df_sel = df_sel.astype("float32")

        all_cols = [col for col in df_sel.columns.to_list() if col not in self._exclude_cols]
        df_sel = df_sel[all_cols]
        ncols = len(all_cols)

        corr_mat = df_sel.corr().abs()
        self._corr_mat = corr_mat
        # Rows/columns of corr_mat follow all_cols, so positional lookups are
        # equivalent to label lookups and avoid per-pair .loc overhead.
        corr_values = corr_mat.to_numpy()

        collin_cols = []   # detection order, returned to the caller
        dropped = set()    # same content, for O(1) membership tests
        for i in range(ncols - 1):
            if all_cols[i] in dropped:
                continue

            for j in range(i + 1, ncols):
                col_j = all_cols[j]
                if col_j in dropped:
                    continue

                # A NaN correlation (e.g. constant column) compares False and is kept.
                if corr_values[i, j] > threshold:
                    dropped.add(col_j)
                    collin_cols.append(col_j)

        return collin_cols

    def fit(self, df, y=None):
        """Learn the columns to drop from ``df``; ``y`` is ignored (sklearn API compatibility)."""
        self._collin_cols = self._collinear_columns(df, self._threshold)
        return self

    def transform(self, df):
        """Return ``df`` without the collinear columns found by ``fit``.

        Learned columns that are absent from ``df`` are reported with a warning
        and skipped instead of raising.
        """
        all_cols = df.columns.to_list()
        nonexist_cols = [col for col in self._collin_cols if col not in all_cols]
        if len(nonexist_cols) > 0:
            print("WARNING: These collinear cols to be droped do not exist in df:", nonexist_cols)

        droped_col = [col for col in self._collin_cols if col in all_cols]
        print("Number of columns droped due to collinearity:", len(droped_col))
        return df.drop(droped_col, axis="columns")

In [4]:
class NumImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in numeric columns with a statistic learned on the training frame."""

    def __init__(self, method="mean", exclude_cols=None):
        """
        :param method: aggregation passed to ``Series.agg`` to compute the fill value
                       (e.g. "mean", "median")
        :param exclude_cols: list of numeric column names that must not be imputed
        """
        # Public mirrors of the constructor arguments: scikit-learn's get_params(),
        # clone() and repr() look up attributes named exactly like the __init__
        # parameters. NOTE: the methods below read the private copies, so values
        # changed through set_params() are not picked up.
        self.method = method
        self.exclude_cols = exclude_cols

        self._method = method
        self._exclude_cols = [] if exclude_cols is None else exclude_cols

    def fit(self, df_train, y=None):
        """Compute one fill value per numeric column of ``df_train``; ``y`` is ignored."""
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        num_cols = [col for col in num_cols if col not in self._exclude_cols]

        # Remembered so transform() can check and restore the training column order.
        self._train_cols = df_train.columns.to_list()

        self._impute_values = {col: df_train[col].agg(self._method) for col in num_cols}
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing numeric values filled.

        ``df`` must have exactly the training columns (in any order); the result
        is returned in the training column order.
        """
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "Do not have the same set of cols as train"

        for col, val in self._impute_values.items():
            if df[col].isnull().any():
                df[col] = df[col].fillna(val)

        # align columns
        df = df[self._train_cols]
        return df
    

class CatImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in object/category/bool columns with a fixed placeholder."""

    def __init__(self, val="MISSING"):
        """
        :param val: placeholder written into missing cells of categorical columns
        """
        # Public mirror of the constructor argument: scikit-learn's get_params(),
        # clone() and repr() look up attributes named exactly like the __init__
        # parameters. NOTE: the methods below read the private copy, so a value
        # changed through set_params() is not picked up.
        self.val = val

        self._val = val

    def fit(self, df_train, y=None):
        """Record the categorical columns of ``df_train``; ``y`` is ignored."""
        cat_cols = df_train.select_dtypes(["object", "category", "bool"]).columns.to_list()
        # Remembered so transform() can check and restore the training column order.
        self._train_cols = df_train.columns.to_list()

        self._impute_values = {col: self._val for col in cat_cols}
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing categorical values replaced.

        ``df`` must have exactly the training columns (in any order); the result
        is returned in the training column order. Only columns that actually
        contain missing values are touched, and those come back as ``category``.
        """
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "Do not have the same set of cols as train"

        for col, val in self._impute_values.items():
            if df[col].isnull().any():
                # Go through object dtype so the placeholder need not be an existing category.
                df[col] = df[col].astype("object").fillna(val).astype("category")

        # align columns
        df = df[self._train_cols]
        return df

In [5]:
class Standardizer(BaseEstimator, TransformerMixin):
    """Z-score numeric columns using the mean/std learned on the training frame."""

    def __init__(self, exclude_cols=None, to_array=False):
        """
        :param exclude_cols: list of numeric column names that must not be standardized
        :param to_array: if True, ``transform`` returns a float32 numpy array instead of a DataFrame
        """
        # Public mirrors of the constructor arguments: scikit-learn's get_params(),
        # clone() and repr() look up attributes named exactly like the __init__
        # parameters. NOTE: the methods below read the private copies, so values
        # changed through set_params() are not picked up.
        self.exclude_cols = exclude_cols
        self.to_array = to_array

        self._exclude_cols = [] if exclude_cols is None else exclude_cols
        self._to_array = to_array

    def fit(self, df_train, y=None):
        """Compute per-column mean and std on ``df_train``; ``y`` is ignored."""
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        num_cols = [col for col in num_cols if col not in self._exclude_cols]

        self._mean = {col: df_train[col].mean() for col in num_cols}
        self._std = {col: df_train[col].std() for col in num_cols}
        return self

    def transform(self, df):
        """Return a standardized copy of ``df`` (or a float32 array when ``to_array`` is set).

        The input frame is left untouched. Columns whose training std is not
        positive (zero, or NaN) are only mean-centred and a warning is printed.
        """
        # Work on a copy so the caller's frame is not modified in place.
        df = df.copy()

        for col in self._mean:
            if self._std[col] > 0:
                df[col] = (df[col] - self._mean[col]) / self._std[col]
            else:
                print("WARNING: " + col + " has zero std.")
                df[col] = df[col] - self._mean[col]
            df[col] = df[col].astype("float32")

        if self._to_array:
            return df.values.astype(np.float32)
        else:
            return df

# pdY

## Clean data

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so Data1_clean.csv is not regenerated — confirm).
assert False

df_neu = pd.read_excel("data/raw/Data1.xlsx", sheet_name="FullData")
# Synthetic row id of the form "_<row number>".
df_neu["new_id"] = np.arange(df_neu.shape[0])
df_neu["new_id"] = "_" + df_neu["new_id"].astype(str)
df_neu = df_neu.rename(columns={"Ligand SMILES": "smiles"})

print("df_neu", df_neu.shape)
display_df(df_neu.head())

# Flag values written as ">x" / "<x" (bounds rather than exact measurements).
ic50_text = df_neu["IC50 (nM)"].astype(str)
df_neu["is_lower_bound"] = ic50_text.str.startswith(">").astype(int)
df_neu["is_upper_bound"] = ic50_text.str.startswith("<").astype(int)

ii = (df_neu["is_lower_bound"] == 1) | (df_neu["is_upper_bound"] == 1)
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not trigger pandas' SettingWithCopyWarning.
df_neu = df_neu[~ii].copy()
print("df_neu", df_neu.shape)

df_neu["code"] = "labeled"

df_neu["ic50_clean"] = df_neu["IC50 (nM)"].astype(float).copy()

# dG = RT * ln(IC50 in mol/L); the 1e-09 factor converts nM to M.
# NOTE(review): 0.593 looks like RT in kcal/mol near 298 K — confirm.
RT = 0.593
df_neu["dG"] = RT * np.log(df_neu["ic50_clean"]*1e-09)

df_neu["smiles_len"] = df_neu["smiles"].apply(len)

print("df_neu", df_neu.shape)
display_df(df_neu[["new_id", "smiles", "dG", "code", "smiles_len"]].head())

# remove too long smiles
print("remove too long smiles")
print("df_neu", df_neu.shape)
df_neu = df_neu[df_neu["smiles_len"] <= 200]
print("df_neu", df_neu.shape)


# remove samples which deviate too much from mean
# Per-SMILES statistics over repeated measurements of the same molecule.
df01 = df_neu.groupby(["smiles"], as_index=False).agg({"new_id": "count", "dG": ["min", "max", "mean"]})
df01.columns = ["smiles", "count", "dG_min", "dG_max", "dG_mean"]
df01["diff"] = df01["dG_max"] - df01["dG_min"]
print("df01", df01.shape)
display_df(df01.head())

df_neu = df_neu.merge(df01[["smiles", "dG_mean", "count"]], how="left", on="smiles")
df_neu["abs_dG_diff"] = (df_neu["dG"] - df_neu["dG_mean"]).abs()
print("df_neu", df_neu.shape)
display_df(df_neu.head())

# Keep measurements within 2.5 (same units as dG) of their molecule's mean.
df_neu = df_neu[df_neu["abs_dG_diff"] <= 2.5]
print("df_neu", df_neu.shape)
display_df(df_neu.head())


df_neu.to_csv("data/process/Data1_clean.csv", index=False)

In [None]:
# Preview the first rows of the cleaned dataset stored on disk.
display_df(pd.read_csv("data/process/Data1_clean.csv").head())

# Train/test/val

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the stored split in tvt.csv is not overwritten — confirm).
assert False

df_neu = pd.read_csv("data/process/Data1_clean.csv")
df_neu = df_neu[["new_id", "smiles", "dG", "abs_dG_diff", "count", "smiles_len"]]
print("df_neu", df_neu.shape)
display_df(df_neu.head())

# One row per distinct (smiles, count) pair: the split is done per molecule, not per measurement.
df_tvt = df_neu[["smiles", "count"]].drop_duplicates()

# Shuffle with a fixed seed so the split can be reproduced, then sort by count with a
# stable sort so ties keep their shuffled order on every platform/version.
SPLIT_SEED = 42
df_tvt = df_tvt.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
df_tvt = df_tvt.sort_values(by="count", kind="mergesort")
print("df_tvt", df_tvt.shape)
display_df(df_tvt.head())

# Labels are assigned in sorted order: the lowest-count molecules become test, then val,
# and everything else is train.
ntest = 165
nval = 100
ntrain = df_tvt.shape[0] - ntest - nval
assert ntrain >= 0, "Not enough distinct smiles for the requested test/val sizes"
df_tvt["train_test"] = ["test"]*ntest + ["val"]*nval + ["train"]*ntrain

print("df_tvt", df_tvt.shape)
display_df(df_tvt.head())

df_tvt[["smiles", "train_test"]].to_csv("data/process/tvt.csv", index=False)

## labeled pdY for regression

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so pdY_labeled_reg.csv is not regenerated — confirm).
assert False

# Label table: identifiers, SMILES and the dG target from the cleaned dataset.
pdY = pd.read_csv("data/process/Data1_clean.csv")
pdY = pdY[["new_id", "smiles", "code", "smiles_len", "dG"]]
print("pdY", pdY.shape)
display_df(pdY.head())

# Per-SMILES train/val/test assignment.
df_tvt = pd.read_csv("data/process/tvt.csv")
print("df_tvt", df_tvt.shape)
display_df(df_tvt.head())

# Left join on smiles: every measurement row receives the split label of its molecule.
pdY = pdY.merge(df_tvt, how="left", on="smiles")
print("pdY", pdY.shape)
display_df(pdY.head())

pdY.to_csv("data/process/pdY_labeled_reg.csv", index=False)

## pdY for chembl_27 for prediction

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so pdY_chembl_27.csv is not regenerated — confirm).
assert False

# Source table comes from a sibling project directory (../AchE_ML).
pdY_chembl_27 = pd.read_csv("../AchE_ML/data/process/pdY_chembl_27.csv")
# Align the id column name with the other pdY tables and drop the old "id" column.
pdY_chembl_27 = pdY_chembl_27.rename(columns={"chemid": "new_id"})
pdY_chembl_27 = pdY_chembl_27.drop(["id"], axis=1)
print("pdY_chembl_27", pdY_chembl_27.shape)
# Keep one row per SMILES string (the first occurrence).
pdY_chembl_27 = pdY_chembl_27.drop_duplicates(subset=["smiles"])
print("pdY_chembl_27", pdY_chembl_27.shape)

display_df(pdY_chembl_27.head())

pdY_chembl_27.to_csv("data/process/pdY_chembl_27.csv", index=False)

In [None]:
# Preview the first rows of the stored chembl_27 label table.
pd.read_csv("data/process/pdY_chembl_27.csv").head()

In [None]:
# List the column names of the stored chembl_27 label table.
a = pd.read_csv("data/process/pdY_chembl_27.csv").columns.tolist()
a

## pdY for MCE library

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so pdY_mce.csv is not regenerated — confirm).
assert False

# Spreadsheets to combine; all are read from data/raw.
files = [
    "MCE Library-Detailed Information-HY-LD-000003385-1-Jun 09, 2022.xlsx",
    "MCE Library-Detailed Information-HY-LD-000003385-2-Jun 09, 2022.xlsx",
    "MCE Library-Detailed Information-HY-LD-000004022-Feb 16, 2023.xlsx",
    "MCE Library-Detailed Information-HY-LD-000004023-HY-L0004-Feb 16, 2023.xlsx",
]

# Collect only the "Smiles" column of each sheet; header=6 means the column
# names are taken from the 7th row of the sheet.
pdY_mce = []
for f in files:
    p = os.path.join("data/raw", f)
    print(p)
    df = pd.read_excel(p, sheet_name="Library Detailed Information", header=6)
    print(df.shape)
    pdY_mce.append(df[["Smiles"]])
pdY_mce = pd.concat(pdY_mce, axis=0, ignore_index=True)
print("pdY_mce", pdY_mce.shape)
# Remove SMILES strings that appear more than once across the files.
pdY_mce = pdY_mce.drop_duplicates()

print("pdY_mce", pdY_mce.shape)
display_df(pdY_mce.head())

pdY_mce = pdY_mce.rename(columns={"Smiles": "smiles"})

# Add the same columns as the labeled pdY table; dG is NaN here and
# train_test is set to "pred" for every row.
pdY_mce["new_id"] = list(range(pdY_mce.shape[0]))
pdY_mce["new_id"] = "_" + pdY_mce["new_id"].astype(str)
pdY_mce["dG"] = np.nan
pdY_mce["code"] = "mce"
pdY_mce["train_test"] = "pred"
pdY_mce["smiles_len"] = pdY_mce["smiles"].apply(len)
print("pdY_mce", pdY_mce.shape)
display_df(pdY_mce.head())

pdY_mce.to_csv("data/process/pdY_mce.csv", index=False)

# pdX

## Extract RDKitDescriptors

### labeled set

In [None]:
pdY_labeled = pd.read_csv("data/process/pdY_labeled_reg.csv")
print("pdY_labeled", pdY_labeled.shape)
display_df(pdY_labeled.head())

# Compute RDKit descriptors for every SMILES string.
rdkit_featurizer = dc.feat.RDKitDescriptors()
X = rdkit_featurizer(pdY_labeled["smiles"])

# Rows that came back empty are replaced by an all-NaN row. Its width follows the
# featurizer's descriptor list (instead of a hard-coded 200) so it always matches
# the column names used below.
n_descriptors = len(rdkit_featurizer.descriptors)
X1 = []
for y in X:
    if y.shape[0] > 0:
        X1.append(y.tolist())
    else:
        X1.append([np.nan] * n_descriptors)
X1 = np.array(X1)

# Descriptor columns first, then the label/metadata columns (aligned on the row index).
X2 = pd.DataFrame(X1, columns=rdkit_featurizer.descriptors)
X2["new_id"] = pdY_labeled["new_id"]
X2["smiles"] = pdY_labeled["smiles"]
X2["dG"] = pdY_labeled["dG"]
X2["code"] = pdY_labeled["code"]
X2["train_test"] = pdY_labeled["train_test"]
X2["smiles_len"] = pdY_labeled["smiles_len"]
# Writing is switched off; change the condition to True to (re)write the file.
if False:
    X2.to_csv("data/process/pdXY_labeled_rdkit_descriptors_200ft.csv", index=False)

## chembl_27

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

# Descriptor table from the sibling project (../AchE_ML); keep only chembl_27 rows.
pdX_chembl_27 = pd.read_csv("../AchE_ML/data/process/pdXY_rdkit_descriptors_200ft.csv")
print("pdX_chembl_27", pdX_chembl_27.shape)
ii = pdX_chembl_27["code"] == "chembl_27"
pdX_chembl_27 = pdX_chembl_27[ii]
print("pdX_chembl_27", pdX_chembl_27.shape)

# One row per SMILES so the merge below cannot multiply rows.
pdX_chembl_27 = pdX_chembl_27.drop_duplicates(subset=["smiles"])
print("pdX_chembl_27", pdX_chembl_27.shape)

# Drop the non-descriptor columns; "smiles" is kept as the join key.
pdX_chembl_27 = pdX_chembl_27.drop(["id", "dG", "code", "train_test", "smiles_len"], axis=1)
print("pdX_chembl_27", pdX_chembl_27.shape)

pdY_chembl_27 = pd.read_csv("data/process/pdY_chembl_27.csv")
print("pdY_chembl_27", pdY_chembl_27.shape)

# Left join: every row of the label table is kept, descriptors are attached by smiles.
pdXY_chembl_27 = pdY_chembl_27.merge(pdX_chembl_27, how="left", on="smiles")
print("pdXY_chembl_27", pdXY_chembl_27.shape)
display_df(pdXY_chembl_27.head())
pdXY_chembl_27.to_csv("data/process/pdXY_chembl_27_rdkit_descriptors_200ft.csv", index=False)

## mce

In [None]:
pdY_mce = pd.read_csv("data/process/pdY_mce.csv")
print("pdY_mce", pdY_mce.shape)
display_df(pdY_mce.head())

# Compute RDKit descriptors for every SMILES string.
rdkit_featurizer = dc.feat.RDKitDescriptors()
X = rdkit_featurizer(pdY_mce["smiles"])

# Rows that came back empty are replaced by an all-NaN row. Its width follows the
# featurizer's descriptor list (instead of a hard-coded 200) so it always matches
# the column names used below.
n_descriptors = len(rdkit_featurizer.descriptors)
X1 = []
for y in X:
    if y.shape[0] > 0:
        X1.append(y.tolist())
    else:
        X1.append([np.nan] * n_descriptors)
X1 = np.array(X1)

# Descriptor columns first, then the label/metadata columns (aligned on the row index).
X2 = pd.DataFrame(X1, columns=rdkit_featurizer.descriptors)
X2["new_id"] = pdY_mce["new_id"]
X2["smiles"] = pdY_mce["smiles"]
X2["dG"] = pdY_mce["dG"]
X2["code"] = pdY_mce["code"]
X2["train_test"] = pdY_mce["train_test"]
X2["smiles_len"] = pdY_mce["smiles_len"]
# Writing is switched off; change the condition to True to (re)write the file.
if False:
    X2.to_csv("data/process/pdXY_mce_rdkit_descriptors_200ft.csv", index=False)

## Remove mostly zero columns

### labeled set

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_200ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]

# The zero rate is measured on training rows only.
pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()

# A descriptor is "mostly zero" when more than 95% of its training values equal 0
# (NaN == 0 is False, so missing values count as non-zero).
mostly_zero_cols = []
for col in PDX_COLS:
    zero_rate = (pdXY_train[col] == 0).mean()
    if zero_rate > 0.95:
        print("{}    {}".format(col, zero_rate))
        mostly_zero_cols.append(col)

print("mostly_zero_cols", len(mostly_zero_cols))
print("there remain {}".format(len(PDX_COLS) - len(mostly_zero_cols)))

# Drop those columns from the full labeled table (all splits).
pdXY_labeled = pdXY_labeled.drop(mostly_zero_cols, axis=1)
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

pdXY_labeled.to_csv("data/process/pdXY_labeled_rdkit_descriptors_132ft.csv", index=False)

### chembl_27

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

# The list of mostly-zero columns is derived from the labeled training rows and
# then applied to the chembl_27 table.
pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_200ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]

pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()

# A descriptor is "mostly zero" when more than 95% of its training values equal 0.
mostly_zero_cols = []
for col in PDX_COLS:
    zero_rate = (pdXY_train[col] == 0).mean()
    if zero_rate > 0.95:
        print("{}    {}".format(col, zero_rate))
        mostly_zero_cols.append(col)

print("mostly_zero_cols", len(mostly_zero_cols))
print("there remain {}".format(len(PDX_COLS) - len(mostly_zero_cols)))

# Drop the same columns from the chembl_27 table.
pdXY_chembl_27 = pd.read_csv("data/process/pdXY_chembl_27_rdkit_descriptors_200ft.csv")
print(pdXY_chembl_27.shape)
pdXY_chembl_27 = pdXY_chembl_27.drop(mostly_zero_cols, axis=1)
print(pdXY_chembl_27.shape)
display_df(pdXY_chembl_27.head())

pdXY_chembl_27.to_csv("data/process/pdXY_chembl_27_rdkit_descriptors_132ft.csv", index=False)

## mce

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

# The list of mostly-zero columns is derived from the labeled training rows and
# then applied to the mce table.
pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_200ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]

pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()

# A descriptor is "mostly zero" when more than 95% of its training values equal 0.
mostly_zero_cols = []
for col in PDX_COLS:
    zero_rate = (pdXY_train[col] == 0).mean()
    if zero_rate > 0.95:
        print("{}    {}".format(col, zero_rate))
        mostly_zero_cols.append(col)

print("mostly_zero_cols", len(mostly_zero_cols))
print("there remain {}".format(len(PDX_COLS) - len(mostly_zero_cols)))

# Drop the same columns from the mce table.
pdXY_mce = pd.read_csv("data/process/pdXY_mce_rdkit_descriptors_200ft.csv")
print(pdXY_mce.shape)
pdXY_mce = pdXY_mce.drop(mostly_zero_cols, axis=1)
print(pdXY_mce.shape)
display_df(pdXY_mce.head())

pdXY_mce.to_csv("data/process/pdXY_mce_rdkit_descriptors_132ft.csv", index=False)

## Remove correlated columns

### labeled set

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_132ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

# Correlations are measured on training rows only.
pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Threshold 0.95 on the absolute correlation; label/metadata columns are never dropped.
remover = CollinearColumnRemover(0.95, exclude_cols=PDY_COLS)
remover.fit(pdXY_train)

# Apply the learned drop list to the full labeled table (all splits).
pdXY_labeled = remover.transform(pdXY_labeled)
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

pdXY_labeled.to_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft.csv", index=False)

### chembl_27

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

# The remover is fitted on the labeled training rows and then applied to chembl_27.
pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_132ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Threshold 0.95 on the absolute correlation; label/metadata columns are never dropped.
remover = CollinearColumnRemover(0.95, exclude_cols=PDY_COLS)
remover.fit(pdXY_train)

# Drop the same columns from the chembl_27 table.
pdXY_chembl_27 = pd.read_csv("data/process/pdXY_chembl_27_rdkit_descriptors_132ft.csv")
print(pdXY_chembl_27.shape)
pdXY_chembl_27 = remover.transform(pdXY_chembl_27)
print(pdXY_chembl_27.shape)
display_df(pdXY_chembl_27.head())

pdXY_chembl_27.to_csv("data/process/pdXY_chembl_27_rdkit_descriptors_104ft.csv", index=False)

## mce

In [None]:
# `assert True` is a no-op: unlike the sibling cells that start with `assert False`,
# this cell runs in full.
assert True

# The remover is fitted on the labeled training rows and then applied to mce.
pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_132ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Threshold 0.95 on the absolute correlation; label/metadata columns are never dropped.
remover = CollinearColumnRemover(0.95, exclude_cols=PDY_COLS)
remover.fit(pdXY_train)

# Drop the same columns from the mce table.
pdXY_mce = pd.read_csv("data/process/pdXY_mce_rdkit_descriptors_132ft.csv")
print(pdXY_mce.shape)
pdXY_mce = remover.transform(pdXY_mce)
print(pdXY_mce.shape)
display_df(pdXY_mce.head())

pdXY_mce.to_csv("data/process/pdXY_mce_rdkit_descriptors_104ft.csv", index=False)

## Impute missing values

### labeled set

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

# Fill values are computed from training rows only.
pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Median imputation of numeric descriptors; label/metadata columns are left as they are.
imputer = NumImputer(method="median", exclude_cols=PDY_COLS)
imputer.fit(pdXY_train)

# Apply the training medians to the full labeled table (all splits).
pdXY_labeled = imputer.transform(pdXY_labeled)
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

pdXY_labeled.to_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft_imputed.csv", index=False)

### chembl_27

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

# The imputer is fitted on the labeled training rows and then applied to chembl_27.
pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Median imputation of numeric descriptors; label/metadata columns are left as they are.
imputer = NumImputer(method="median", exclude_cols=PDY_COLS)
imputer.fit(pdXY_train)

# transform() asserts that the table has exactly the same set of columns as the training frame.
pdXY_chembl_27 = pd.read_csv("data/process/pdXY_chembl_27_rdkit_descriptors_104ft.csv")
print(pdXY_chembl_27.shape)
pdXY_chembl_27 = imputer.transform(pdXY_chembl_27)
print(pdXY_chembl_27.shape)
display_df(pdXY_chembl_27.head())

pdXY_chembl_27.to_csv("data/process/pdXY_chembl_27_rdkit_descriptors_104ft_imputed.csv", index=False)

## mce

In [None]:
# `assert True` is a no-op: unlike the sibling cells that start with `assert False`,
# this cell runs in full.
assert True

# The imputer is fitted on the labeled training rows and then applied to mce.
pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Median imputation of numeric descriptors; label/metadata columns are left as they are.
imputer = NumImputer(method="median", exclude_cols=PDY_COLS)
imputer.fit(pdXY_train)

# transform() asserts that the table has exactly the same set of columns as the training frame.
pdXY_mce = pd.read_csv("data/process/pdXY_mce_rdkit_descriptors_104ft.csv")
print(pdXY_mce.shape)
pdXY_mce = imputer.transform(pdXY_mce)
print(pdXY_mce.shape)
display_df(pdXY_mce.head())

pdXY_mce.to_csv("data/process/pdXY_mce_rdkit_descriptors_104ft_imputed.csv", index=False)

## Standardize

### labeled set

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft_imputed.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

# Mean and std are computed from training rows only.
pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Label/metadata columns (including dG) are excluded from scaling.
std = Standardizer(exclude_cols=PDY_COLS)
std.fit(pdXY_train)

# Apply the training statistics to the full labeled table (all splits).
pdXY_labeled = std.transform(pdXY_labeled)
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

pdXY_labeled.to_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft_imputed_std.csv", index=False)

### chembl_27

In [None]:
# Guard: `assert False` always raises, so the rest of this cell never runs as written
# (presumably disabled on purpose so the output file is not regenerated — confirm).
assert False

# The standardizer is fitted on the labeled training rows and then applied to chembl_27.
pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft_imputed.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Label/metadata columns (including dG) are excluded from scaling.
std = Standardizer(exclude_cols=PDY_COLS)
std.fit(pdXY_train)

# Scale chembl_27 with the training mean/std.
pdXY_chembl_27 = pd.read_csv("data/process/pdXY_chembl_27_rdkit_descriptors_104ft_imputed.csv")
print(pdXY_chembl_27.shape)
pdXY_chembl_27 = std.transform(pdXY_chembl_27)
print(pdXY_chembl_27.shape)
display_df(pdXY_chembl_27.head())

pdXY_chembl_27.to_csv("data/process/pdXY_chembl_27_rdkit_descriptors_104ft_imputed_std.csv", index=False)

## mce

In [7]:
# `assert True` is a no-op: unlike the sibling cells that start with `assert False`,
# this cell runs in full.
assert True

# The standardizer is fitted on the labeled training rows and then applied to mce.
pdXY_labeled = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft_imputed.csv")
print(pdXY_labeled.shape)
display_df(pdXY_labeled.head())

# Label/metadata columns; every other column is treated as a descriptor (feature).
PDY_COLS = ["new_id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [col for col in pdXY_labeled.columns if col not in PDY_COLS]
print("PDX_COLS", len(PDX_COLS))

pdXY_train = pdXY_labeled[pdXY_labeled["train_test"] == "train"].copy()
print("pdXY_train", pdXY_train.shape)

# Label/metadata columns (including dG) are excluded from scaling.
std = Standardizer(exclude_cols=PDY_COLS)
std.fit(pdXY_train)

# Scale mce with the training mean/std.
pdXY_mce = pd.read_csv("data/process/pdXY_mce_rdkit_descriptors_104ft_imputed.csv")
print(pdXY_mce.shape)
pdXY_mce = std.transform(pdXY_mce)
print(pdXY_mce.shape)
display_df(pdXY_mce.head())

pdXY_mce.to_csv("data/process/pdXY_mce_rdkit_descriptors_104ft_imputed_std.csv", index=False)

(2745, 110)


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BalabanJ,BertzCT,Chi1n,Chi3v,Chi4n,HallKierAlpha,Ipc,Kappa2,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA8,TPSA,EState_VSA1,EState_VSA10,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState10,VSA_EState8,VSA_EState9,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,fr_Al_COO,fr_Al_OH,fr_Ar_N,fr_Ar_OH,fr_C_O,fr_C_O_noCOO,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_Ndealkylation2,fr_alkyl_halide,fr_allylic_oxid,fr_amide,fr_aniline,fr_bicyclic,fr_ester,fr_ether,fr_guanido,fr_halogen,fr_ketone,fr_methoxy,fr_unbrch_alkane,new_id,smiles,dG,code,train_test,smiles_len
0,9.204259,-0.512778,0.281759,0.629239,137.182,0.091173,-0.387092,0.387092,0.091173,1.3,1.9,2.4,2.746978,185.708896,3.274054,1.407606,0.811271,-0.86,2.048167,3.343878,10.840195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.331835,5.563451,6.544756,6.103966,5.106527,0.0,0.0,0.0,5.733667,6.103966,6.544756,35.895287,0.0,5.733667,0.0,0.0,0.0,11.651284,0.0,0.0,11.667418,0.0,46.25,6.103966,5.106527,6.544756,0.0,5.563451,0.0,0.0,30.331835,0.0,5.733667,0.0,0.0,24.5,0.25,0.0,0.0,0.0,1.0,0.0,1.0,2.0,2.0,2.0,0.0,0.0,0.0,1.0,0.6787,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,_0,NCC(O)c1ccccc1,-4.178881,labeled,test,14
1,12.122645,-1.094993,0.045323,0.750823,292.335,0.335259,-0.477638,0.477638,0.335259,1.095238,1.714286,2.190476,3.102089,550.604641,6.843398,3.191477,1.973608,-2.37,2.408776,7.422319,15.740105,0.0,0.0,11.814359,0.0,5.969305,9.589074,4.794537,0.0,0.0,13.847474,31.040744,12.841643,16.938224,19.490139,29.158437,0.0,0.0,5.917906,33.612855,10.633577,23.762553,0.0,10.633577,11.374773,0.0,0.0,22.890192,9.589074,5.917906,43.970844,0.0,95.5,5.969305,14.383612,23.295717,24.216416,0.0,25.122838,0.0,13.847474,10.633577,5.106527,0.0,0.0,58.333333,0.4,0.0,0.0,0.0,1.0,0.0,1.0,3.0,3.0,6.0,0.0,0.0,0.0,1.0,2.7179,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,_4,CCC(CC)C(=O)Nc1cc(ccc1NC(C)=O)C(O)=O,-6.238166,labeled,train,36
2,12.122645,-1.094993,0.045323,0.750823,292.335,0.335259,-0.477638,0.477638,0.335259,1.095238,1.714286,2.190476,3.102089,550.604641,6.843398,3.191477,1.973608,-2.37,2.408776,7.422319,15.740105,0.0,0.0,11.814359,0.0,5.969305,9.589074,4.794537,0.0,0.0,13.847474,31.040744,12.841643,16.938224,19.490139,29.158437,0.0,0.0,5.917906,33.612855,10.633577,23.762553,0.0,10.633577,11.374773,0.0,0.0,22.890192,9.589074,5.917906,43.970844,0.0,95.5,5.969305,14.383612,23.295717,24.216416,0.0,25.122838,0.0,13.847474,10.633577,5.106527,0.0,0.0,58.333333,0.4,0.0,0.0,0.0,1.0,0.0,1.0,3.0,3.0,6.0,0.0,0.0,0.0,1.0,2.7179,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,_5,CCC(CC)C(=O)Nc1cc(ccc1NC(C)=O)C(O)=O,-4.333782,labeled,train,36
3,10.744376,-3.738215,0.020741,0.704946,201.203,0.33518,-0.477639,0.477639,0.33518,1.230769,1.692308,2.0,3.155269,420.310694,3.272258,2.359031,0.787191,-1.4,2.028259,3.153719,5.106527,0.0,0.0,10.023291,0.0,5.969305,0.0,18.351308,0.0,0.0,0.0,24.265468,0.0,10.458935,18.318862,15.992596,0.0,0.0,5.138974,4.895483,0.0,29.82892,0.0,5.138974,0.0,0.0,0.0,19.49363,10.023291,0.0,10.357989,0.0,97.46,15.992596,13.212334,10.458935,0.0,12.132734,12.132734,0.0,0.0,0.0,10.245501,-3.738215,21.488752,27.16613,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3.0,2.0,6.0,0.0,0.0,0.0,1.0,0.0322,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,_9,NS(=O)(=O)c1ccc(cc1)C(O)=O,-3.552938,labeled,train,26
4,5.595635,0.298552,0.298552,0.674491,336.367,0.230801,-0.492841,0.492841,0.230801,1.0,1.84,2.64,1.752407,1012.484166,8.29334,5.000725,3.768374,-2.95,3.107281,5.275648,18.947452,0.0,35.739647,12.48687,0.0,0.0,0.0,0.0,4.5671,0.0,0.0,35.215144,12.487189,25.169271,23.514551,10.772448,0.0,0.0,0.0,12.965578,21.012537,42.09213,34.255427,23.514551,0.0,22.998047,0.0,21.012537,12.965578,0.0,5.563451,22.029828,40.8,0.0,0.0,0.0,6.792942,46.736074,16.820831,14.219595,6.066367,35.029411,18.947452,0.0,2.272647,49.227353,0.25,0.0,2.0,2.0,2.0,1.0,3.0,4.0,0.0,5.0,0.0,0.0,0.0,5.0,3.0963,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,2.0,0.0,_13,COc1ccc2cc3-c4cc5OCOc5cc4CC[n+]3cc2c1OC,-6.133722,labeled,train,39


PDX_COLS 104
pdXY_train (2480, 110)
(10665, 110)
(10665, 110)


Unnamed: 0,MaxEStateIndex,MinEStateIndex,MinAbsEStateIndex,qed,MolWt,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge,FpDensityMorgan1,FpDensityMorgan2,FpDensityMorgan3,BalabanJ,BertzCT,Chi1n,Chi3v,Chi4n,HallKierAlpha,Ipc,Kappa2,PEOE_VSA1,PEOE_VSA10,PEOE_VSA11,PEOE_VSA12,PEOE_VSA13,PEOE_VSA14,PEOE_VSA2,PEOE_VSA3,PEOE_VSA4,PEOE_VSA5,PEOE_VSA6,PEOE_VSA7,PEOE_VSA8,PEOE_VSA9,SMR_VSA1,SMR_VSA10,SMR_VSA2,SMR_VSA3,SMR_VSA4,SMR_VSA5,SMR_VSA6,SMR_VSA7,SMR_VSA9,SlogP_VSA1,SlogP_VSA10,SlogP_VSA11,SlogP_VSA12,SlogP_VSA2,SlogP_VSA3,SlogP_VSA4,SlogP_VSA5,SlogP_VSA8,TPSA,EState_VSA1,EState_VSA10,EState_VSA2,EState_VSA3,EState_VSA4,EState_VSA5,EState_VSA6,EState_VSA7,EState_VSA8,EState_VSA9,VSA_EState10,VSA_EState8,VSA_EState9,FractionCSP3,NumAliphaticCarbocycles,NumAliphaticHeterocycles,NumAliphaticRings,NumAromaticCarbocycles,NumAromaticHeterocycles,NumAromaticRings,NumHAcceptors,NumHDonors,NumHeteroatoms,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,fr_Al_COO,fr_Al_OH,fr_Ar_N,fr_Ar_OH,fr_C_O,fr_C_O_noCOO,fr_Imine,fr_NH0,fr_NH1,fr_NH2,fr_Ndealkylation2,fr_alkyl_halide,fr_allylic_oxid,fr_amide,fr_aniline,fr_bicyclic,fr_ester,fr_ether,fr_guanido,fr_halogen,fr_ketone,fr_methoxy,fr_unbrch_alkane,new_id,smiles,dG,code,train_test,smiles_len
0,1.138666,-2.918967,-0.647733,0.541493,1.247466,0.892267,3.838644,-0.976803,0.106505,-1.22576,-1.082271,-0.850802,-1.171334,2.317622,0.642358,2.072645,0.772009,-0.538859,1.869141,-0.084655,-1.793263,-0.885006,-0.832943,0.361869,2.116025,-0.053778,-0.777608,2.021004,7.05108,-0.304406,-0.364771,1.76519,2.334737,0.394254,-0.12493,0.733258,-0.41827,1.719054,-0.910249,-1.247859,1.774614,1.997559,0.140462,-1.899153,2.342764,-0.457739,1.455515,0.1331,0.026615,0.733749,-0.488287,0.501495,-1.087064,0.011053,0.646648,0.724218,0.209956,1.399674,-0.771664,3.036183,0.024248,-0.660625,-0.312239,0.401637,6.953226,-1.91788,-1.142437,-0.82788,0.763083,0.078144,1.660003,1.904394,2.118734,-0.349121,-1.88724,0.888452,-0.288132,1.69871,1.346951,2.298739,1.357571,-1.263338,-0.599917,2.776975,-0.394079,-1.196629,-0.554257,-0.412337,4.097916,-1.418708,-1.027247,-0.245475,4.18749,-0.199046,-0.350927,-0.378899,-0.264844,-0.286409,-1.182143,-0.637499,4.924836,-0.189413,-0.305589,-0.210872,_0,O=S(C1=CC(C(F)(F)F)=C(Cl)C=C1)(N2CCN(C(C3=CC=C(N4C(C)=CC=N4)C=C3)=O)CC2)=O,,mce,pred,74
1,0.935286,0.95076,1.091449,0.276643,0.298401,-0.808311,3.153329,-1.879857,-1.05102,-1.851758,-1.523975,-0.364818,-0.861641,2.468217,0.713967,0.860485,1.468903,-1.407935,2.44513,-0.525335,-1.070505,-0.885006,-0.832943,-1.843024,4.650688,-1.08449,1.437216,-1.421093,-0.336859,-0.304406,1.099502,0.781063,3.116891,-0.550441,-1.32329,2.472108,-0.41827,1.84749,-0.910249,-1.442996,1.216949,1.809861,-0.483425,-1.166097,-0.591283,-0.457739,-0.378225,-0.31007,0.018419,-0.601355,-0.701536,5.047383,-1.180418,-1.264159,-0.973872,-0.712505,0.158401,3.676657,-1.219659,-0.853025,2.724495,2.321457,-1.647747,-0.210422,-0.208823,-0.137056,-1.463476,-0.82788,0.763083,0.078144,1.660003,4.226369,3.0954,-0.751453,-1.038427,-0.848073,-0.288132,-0.456175,-0.552104,3.337338,1.217006,-1.263338,-0.599917,2.776975,-0.394079,-0.139401,0.824437,-0.412337,1.707505,0.350664,-1.027247,-0.245475,-0.284937,-0.199046,0.940318,-0.378899,3.435515,-0.286409,-1.182143,-0.637499,-0.38831,-0.189413,-0.305589,-0.210872,_1,O=C(C(C1=CN(CCCN(C)C)C2=C1C=CC=C2)=C3C4=CNC5=C4C=CC=C5)NC3=O,,mce,pred,60
2,0.812681,-2.644796,-0.497358,0.717278,0.401803,0.876519,-0.117063,-0.019637,1.310088,-0.359424,-0.141831,0.225305,-0.751158,1.085159,-0.283879,0.521865,-0.405675,-0.260755,0.712402,-0.464184,-1.777627,-0.502066,-0.832943,-1.843024,2.116025,0.942391,0.295687,-0.161263,3.973561,-0.304406,1.799066,0.918731,-1.808031,0.52695,-0.019093,2.62207,-0.41827,-0.59388,-0.910249,-1.452482,-0.861119,1.960957,-0.483425,-1.899153,2.342764,-0.457739,3.412213,-1.03912,0.19064,-0.601355,-0.20346,0.566293,-1.397664,-0.313572,0.297036,-0.385471,0.679254,1.036759,1.446911,1.162039,-0.813537,-1.097155,0.34665,2.491323,3.775962,-1.346725,-1.881671,-0.82788,0.763083,0.078144,1.660003,-0.417581,1.142069,-0.751453,-1.462834,0.020189,-0.288132,1.69871,1.346951,1.260139,1.67369,-1.263338,-0.599917,-0.264269,-0.394079,-0.139401,-0.554257,-0.412337,0.5123,-1.418708,-1.027247,-0.245475,4.18749,-0.199046,-0.350927,-0.378899,-0.264844,-0.286409,-1.182143,-0.637499,3.596549,-0.189413,-0.305589,-0.210872,_2,OC(C1=CC=C(/C=C2SC(N(CC3=CC=CC(C(F)(F)F)=C3)C\2=O)=S)C=C1)=O,,mce,pred,60
3,1.722703,1.05848,0.431264,1.766542,-1.168176,-1.81744,2.856057,-1.710766,-2.465206,-0.204395,0.733274,1.798716,-0.437415,0.107421,-1.083489,-0.933592,-0.947738,-0.086939,0.560268,-1.389713,-1.41611,0.680356,0.892671,-1.843024,-0.418638,-1.08449,-1.827822,3.661371,-0.336859,-0.304406,0.024849,-1.223821,-0.610665,-0.387769,-1.694236,-0.331104,-0.41827,1.806645,-0.910249,-1.692091,0.383748,0.823522,-0.483425,-1.166097,1.682729,-0.457739,-0.378225,-1.026328,-1.176758,0.520379,-1.329818,1.4146,-1.43202,-1.264159,-1.475496,-1.137539,2.07129,-1.172795,-0.086513,2.475192,-0.422292,0.638177,-1.647747,-0.210422,1.233667,-1.851302,-1.667212,-0.82788,-0.868336,-1.699817,0.429636,4.226369,2.118734,-0.349121,-1.462834,-0.848073,-0.288132,-0.456175,-0.552104,1.260139,0.594448,-1.263338,-0.599917,5.81822,-0.394079,-2.253858,-1.932951,-0.412337,4.097916,-0.534022,-1.027247,-0.245475,-0.284937,-0.199046,-1.642171,1.377494,1.585336,-0.286409,-1.182143,-0.637499,0.939977,-0.189413,-0.305589,-0.210872,_3,CNC1=C2N=CN(CC3=CC=CC=C3F)C2=NC=N1,,mce,pred,34
4,1.454458,1.160468,-0.616672,0.308298,1.134927,-0.799627,2.849859,-1.707241,-1.038851,-1.899577,-1.491885,-0.551398,-1.593903,2.234755,1.589723,2.533277,3.007638,-1.106655,1.982638,-0.02482,-1.051682,-0.885006,-0.832943,-1.843024,4.650688,-1.08449,1.364316,-1.421093,-0.336859,-0.304406,0.709882,2.830671,3.462467,0.013641,-1.32329,1.465589,-0.41827,0.224512,-0.174897,-0.69571,2.76127,2.501416,-0.483425,-0.547996,1.942579,-0.457739,1.455515,-0.275845,-1.176758,0.442817,1.102691,-0.446773,-1.412586,-1.264159,-0.973872,-0.712505,1.966746,4.663326,-0.699666,0.360553,2.649266,0.787825,-0.312239,2.50982,-0.446378,0.0968,-0.824165,-0.82788,4.025922,3.634067,1.660003,1.904394,2.118734,-0.751453,-1.88724,-0.558652,-0.288132,3.853596,3.246007,4.375937,1.914376,-1.263338,-0.599917,1.256353,-0.394079,-0.139401,0.824437,-0.412337,4.097916,-1.418708,-1.027247,4.072096,-0.284937,-0.199046,0.940318,3.133888,1.585336,-0.286409,-1.182143,-0.637499,0.939977,-0.189413,-0.305589,-0.210872,_4,ClC1=C2C(CN(C3=CC=CC(C(N4CCC5(CCN(C6=CC=NC=C6)CC5)CC4)=O)=C3)C2=O)=CC=C1,,mce,pred,72


# Generate pdXY for ranker

In [None]:
# No-op guard: `assert True` always passes.
# NOTE(review): presumably flipped to `assert False` to keep this cell from
# running during "Run All" -- confirm with the author.
assert True

# Load the regression table, relabel every "val" row of the split column as
# "train", then rename that column from "train_test" to "tvt".
pdXY_reg = pd.read_csv("data/process/pdXY_labeled_rdkit_descriptors_104ft_imputed_std.csv")
is_val = pdXY_reg["train_test"].eq("val")
pdXY_reg.loc[is_val, "train_test"] = "train"
pdXY_reg = pdXY_reg.rename(columns={"train_test": "tvt"})
print("pdXY_reg", pdXY_reg.shape)
display_df(pdXY_reg.head())

# Cut-offs applied to the "dG" column, one labelled copy of the table per value.
# NOTE(review): units and provenance of these numbers are not visible here.
thresholds = [-12.1, -11.7, -10.9, -10.0, -8.9, -8.2, -7.4, -6.7, -6.0, -5.4]

# For each cut-off, take a fresh copy of the table and add:
#   "bad"      -> 1 where dG is strictly greater than the cut-off, else 0
#   "data_set" -> "thres_<position of the cut-off in `thresholds`>"
frames_per_threshold = [
    pdXY_reg.assign(
        bad=(pdXY_reg["dG"] > cutoff).astype(int),
        data_set=f"thres_{idx}",
    )
    for idx, cutoff in enumerate(thresholds)
]

# Stack the per-threshold copies row-wise with a fresh 0..n-1 index.
pdXY_ranker = pd.concat(frames_per_threshold, axis=0, ignore_index=True)
print("pdXY_ranker", pdXY_ranker.shape)
display_df(pdXY_ranker.head())

# Persist the stacked table (index column omitted).
pdXY_ranker.to_csv("data/process/pdXY_labeled_ranker_rdkit_descriptors_104ft_imputed_std.csv", index=False)