In [None]:
import pandas as pd
import numpy as np
import deepchem as dc
from pubchempy import get_cids, get_compounds

import tensorflow as tf
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all
# up front. Every visible GPU is configured, and the loop is simply skipped on
# a CPU-only machine (indexing physical_devices[0] there would raise IndexError).
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, enable=True)

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML
# Inject a CSS rule that forces elements with class "container" to 100% width
# (presumably to widen the notebook page in the browser — confirm for the UI in use).
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [None]:
def display_df(df):
    """Render ``df`` in the notebook output via its ``to_html()`` markup.

    :param df: object exposing ``to_html()`` (e.g. a pandas DataFrame)
    :return: None
    """
    table_markup = df.to_html()
    display(HTML(table_markup))

In [None]:
class CollinearColumnRemover(BaseEstimator, TransformerMixin):
    """Drop columns that are strongly correlated with an earlier column.

    ``fit`` computes the absolute pairwise correlation (``DataFrame.corr``) of the
    selected numeric/bool columns and records, for every pair whose correlation
    exceeds ``threshold``, the later column of the pair. ``transform`` drops the
    recorded columns from the frame it is given.
    """

    def __init__(self, threshold, col_regex=None, exclude_cols=None):
        """
        :param threshold: float in [0, 1], if two columns have correlation greater than threshold
                          one of them will be removed
        :param col_regex: str, regular expression to select columns
        :param exclude_cols: list of column names that are never considered for removal
                             (None means no exclusions)
        """
        # scikit-learn's get_params()/repr()/clone() look up every __init__
        # argument as an attribute of the same name, so store them unmodified.
        self.threshold = threshold
        self.col_regex = col_regex
        self.exclude_cols = exclude_cols

        # Underscore aliases kept for code that reads the previous attribute names.
        self._threshold = threshold
        self._col_regex = col_regex
        if exclude_cols is None:
            self._exclude_cols = []
        else:
            self._exclude_cols = exclude_cols

    def _collinear_columns(self, df, threshold):
        """Return the names of the columns to drop for the given ``threshold``.

        :param df: DataFrame to analyse
        :param threshold: correlation above which the later column of a pair is flagged
        :return: list of column names, in the order they were flagged
        """
        if self.col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self.col_regex)
            df_sel = df_sel.select_dtypes(["number", "bool"])

        df_sel = df_sel.astype("float32")

        excluded = [] if self.exclude_cols is None else self.exclude_cols
        all_cols = [col for col in df_sel.columns.to_list() if col not in excluded]
        df_sel = df_sel[all_cols]
        ncols = len(all_cols)

        corr_mat = df_sel.corr().abs()
        # Kept on the instance so the matrix can be inspected after fitting.
        self._corr_mat = corr_mat

        collin_cols = []
        flagged = set()  # same content as collin_cols, for O(1) membership tests
        for i in range(ncols - 1):
            col_i = all_cols[i]
            # A column already marked for removal is not used as a reference.
            if col_i in flagged:
                continue

            for j in range(i + 1, ncols):
                col_j = all_cols[j]
                if col_j in flagged:
                    continue

                # A NaN correlation compares False here, so such pairs are kept.
                corr = corr_mat.loc[col_i, col_j]
                if corr > threshold:
                    flagged.add(col_j)
                    collin_cols.append(col_j)

        return collin_cols

    def fit(self, df, y=None):
        """Determine which columns of ``df`` are collinear.

        :param df: DataFrame used to compute the correlations
        :param y: ignored; accepted so the transformer can be fitted with labels
        :return: self
        """
        self._collin_cols = self._collinear_columns(df, self.threshold)
        return self

    def transform(self, df):
        """Return ``df`` without the columns flagged during ``fit``.

        Flagged columns missing from ``df`` are reported with a printed warning
        and otherwise ignored.

        :param df: DataFrame to reduce
        :return: new DataFrame with the flagged columns removed
        """
        all_cols = df.columns.to_list()
        nonexist_cols = [col for col in self._collin_cols if col not in all_cols]
        if len(nonexist_cols) > 0:
            print("WARNING: These collinear cols to be droped do not exist in df:", nonexist_cols)

        droped_col = [col for col in self._collin_cols if col in all_cols]
        print("Number of columns droped due to collinearity:", len(droped_col))
        return df.drop(droped_col, axis="columns")

In [None]:
class NumImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in numeric columns with a statistic learned in ``fit``."""

    def __init__(self, method="mean", exclude_cols=None):
        """
        :param method: aggregation passed to ``Series.agg`` to compute each column's
                       fill value (e.g. "mean", "median")
        :param exclude_cols: list of column names that are never imputed
                             (None means no exclusions)
        """
        # scikit-learn's get_params()/repr()/clone() look up every __init__
        # argument as an attribute of the same name, so store them unmodified.
        self.method = method
        self.exclude_cols = exclude_cols

        # Underscore aliases kept for code that reads the previous attribute names.
        self._method = method
        if exclude_cols is None:
            self._exclude_cols = []
        else:
            self._exclude_cols = exclude_cols

    def fit(self, df_train, y=None):
        """Compute one fill value per numeric, non-excluded column of ``df_train``.

        :param df_train: DataFrame the fill values are computed from
        :param y: ignored; accepted so the transformer can be fitted with labels
        :return: self
        """
        excluded = [] if self.exclude_cols is None else self.exclude_cols
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        num_cols = [col for col in num_cols if col not in excluded]

        # Remembered so transform can check and restore the column layout.
        self._train_cols = df_train.columns.to_list()

        self._impute_values = {
            col: df_train[col].agg(self.method) for col in num_cols
        }
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing numeric values filled in.

        :param df: DataFrame with the same set of columns as the frame given to ``fit``
        :return: new DataFrame, columns ordered as in the ``fit`` frame
        :raises AssertionError: if the column set differs from the ``fit`` frame
        """
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "Do not have the same set of cols as train"

        for col, val in self._impute_values.items():
            if df[col].isnull().any():
                df[col] = df[col].fillna(val)

        # align columns
        df = df[self._train_cols]
        return df
    

class CatImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in object/category/bool columns with a constant label."""

    def __init__(self, val="MISSING"):
        """
        :param val: value written into missing cells of the categorical columns
        """
        # scikit-learn's get_params()/repr()/clone() look up every __init__
        # argument as an attribute of the same name, so store it unmodified.
        self.val = val
        # Underscore alias kept for code that reads the previous attribute name.
        self._val = val

    def fit(self, df_train, y=None):
        """Record which columns of ``df_train`` are categorical.

        :param df_train: DataFrame whose object/category/bool columns will be imputed
        :param y: ignored; accepted so the transformer can be fitted with labels
        :return: self
        """
        cat_cols = df_train.select_dtypes(["object", "category", "bool"]).columns.to_list()
        # Remembered so transform can check and restore the column layout.
        self._train_cols = df_train.columns.to_list()

        self._impute_values = {col: self.val for col in cat_cols}
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing categorical values replaced.

        Only columns that actually contain missing values are rewritten; those
        columns come back with ``category`` dtype.

        :param df: DataFrame with the same set of columns as the frame given to ``fit``
        :return: new DataFrame, columns ordered as in the ``fit`` frame
        :raises AssertionError: if the column set differs from the ``fit`` frame
        """
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "Do not have the same set of cols as train"

        for col, val in self._impute_values.items():
            if df[col].isnull().any():
                # Go through object dtype so the fill value does not have to be
                # an existing category, then convert back to category.
                df[col] = df[col].astype("object").fillna(val).astype("category")

        # align columns
        df = df[self._train_cols]
        return df

In [None]:
class Standardizer(BaseEstimator, TransformerMixin):
    """Centre and scale numeric columns using the mean/std learned in ``fit``."""

    def __init__(self, exclude_cols=None, to_array=False):
        """
        :param exclude_cols: list of column names that are never standardized
                             (None means no exclusions)
        :param to_array: if True, ``transform`` returns a float32 numpy array
                         instead of a DataFrame
        """
        # scikit-learn's get_params()/repr()/clone() look up every __init__
        # argument as an attribute of the same name, so store them unmodified.
        self.exclude_cols = exclude_cols
        self.to_array = to_array

        # Underscore aliases kept for code that reads the previous attribute names.
        if exclude_cols is None:
            self._exclude_cols = []
        else:
            self._exclude_cols = exclude_cols

        self._to_array = to_array

    def fit(self, df_train, y=None):
        """Compute the mean and standard deviation of each numeric, non-excluded column.

        :param df_train: DataFrame the statistics are computed from
        :param y: ignored; accepted so the transformer can be fitted with labels
        :return: self
        """
        excluded = [] if self.exclude_cols is None else self.exclude_cols
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        num_cols = [col for col in num_cols if col not in excluded]

        self._mean = {col: df_train[col].mean() for col in num_cols}
        self._std = {col: df_train[col].std() for col in num_cols}
        return self

    def transform(self, df):
        """Return a standardized copy of ``df``.

        Each fitted column becomes ``(x - mean) / std`` cast to float32. Columns
        whose fitted std is not greater than zero are only centred, and a warning
        is printed for them.

        :param df: DataFrame containing the columns seen in ``fit``
        :return: DataFrame, or a float32 numpy array when ``to_array`` is True
        """
        # Work on a copy so the caller's frame is left untouched, in line with
        # NumImputer.transform and CatImputer.transform.
        df = df.copy()

        for col in self._mean:
            centered = df[col] - self._mean[col]
            if self._std[col] > 0:
                df[col] = (centered / self._std[col]).astype("float32")
            else:
                print("WARNING: " + col + " has zero std.")
                df[col] = centered.astype("float32")

        if self.to_array:
            return df.values.astype(np.float32)
        else:
            return df

# Load `data_dedup.xlsx`

In [None]:
# Load the labelled set from Sheet1, drop the two id columns and normalise the
# column names to the ones used in the rest of the notebook.
data_sht1 = pd.read_excel("data/raw/data_dedup.xlsx", sheet_name="Sheet1")
data_sht1 = data_sht1.drop(["STT", "PUBCHEM_COMPOUND_CID"], axis=1)
data_sht1 = data_sht1.rename(columns={"SMILE": "smiles", "Delta_G": "dG"})
data_sht1["code"] = "labeled_pubchem"

# Mark every row as "train", then move 162 rows drawn with a fixed seed to "test".
data_sht1["train_test"] = "train"
rnd = np.random.RandomState(42)
test_idx = rnd.choice(data_sht1.shape[0], size=162, replace=False)
# Resolve the column position by name: a hard-coded position would silently
# overwrite a different column if the sheet ever carried extra columns.
train_test_pos = data_sht1.columns.get_loc("train_test")
data_sht1.iloc[test_idx, train_test_pos] = "test"

data_sht1["smiles_len"] = data_sht1["smiles"].str.len()

print(data_sht1.shape)
data_sht1.head()

# Load `vietherbs.smi`

In [None]:
# Read the whitespace-separated .smi file; column 0 is kept as "smiles" and
# column 1 (renamed "pdb") is discarded. The separator is a raw string because
# "\s" is not a valid escape sequence in a normal string literal.
vietherbs = pd.read_table("data/raw/vietherbs.smi", sep=r"\s+", header=None)
vietherbs = vietherbs.rename(columns={0: "smiles", 1: "pdb"})
vietherbs = vietherbs.drop(["pdb"], axis=1)
# These rows have no dG value, so they are tagged for prediction only.
vietherbs["dG"] = np.nan
vietherbs["code"] = "vietherbs"
vietherbs["train_test"] = "pred"
vietherbs["smiles_len"] = vietherbs["smiles"].str.len()
print(vietherbs.shape)
vietherbs.head()

# Load `chembl_27`

In [None]:
# Read the whitespace-separated .smi file; column 0 is kept as "smiles" and
# column 1 (renamed "pdb") is discarded. The separator is a raw string because
# "\s" is not a valid escape sequence in a normal string literal.
chembl_27 = pd.read_table("data/raw/chembl_27.smi", sep=r"\s+", header=None)
chembl_27 = chembl_27.rename(columns={0: "smiles", 1: "pdb"})
chembl_27 = chembl_27.drop(["pdb"], axis=1)
# These rows have no dG value, so they are tagged for prediction only.
chembl_27["dG"] = np.nan
chembl_27["code"] = "chembl_27"
chembl_27["train_test"] = "pred"
chembl_27["smiles_len"] = chembl_27["smiles"].str.len()

# Keep only SMILES strings of at most 200 characters.
chembl_27 = chembl_27[chembl_27["smiles_len"] <= 200]

print(chembl_27.shape)
chembl_27.head()

# Load 5 extra ligands

In [None]:
# Read the whitespace-separated .smi file; column 0 is kept as "smiles" and
# column 1 (renamed "pdb") is discarded. The separator is a raw string because
# "\s" is not a valid escape sequence in a normal string literal.
extra_5 = pd.read_table("data/raw/extra_5.smi", sep=r"\s+", header=None)
extra_5 = extra_5.rename(columns={0: "smiles", 1: "pdb"})
extra_5 = extra_5.drop(["pdb"], axis=1)
# These rows have no dG value, so they are tagged for prediction only.
extra_5["dG"] = np.nan
extra_5["code"] = "extra_5"
extra_5["train_test"] = "pred"
extra_5["smiles_len"] = extra_5["smiles"].str.len()
print(extra_5.shape)
extra_5.head()

# Combine `pdY`

In [None]:
# One-off build of the combined table, disabled by the `if False` switch:
# stack the four sources, turn the fresh row index into a string "id" with a
# trailing underscore, and cache the result on disk.
if False:
    pdY = (
        pd.concat([data_sht1, vietherbs, chembl_27, extra_5], axis=0, ignore_index=True)
        .reset_index()
        .rename(columns={"index": "id"})
    )
    pdY["id"] = pdY["id"].astype(str) + "_"
    print(pdY.shape)

    pdY.to_csv("data/process/pdY.csv", index=False)

# Normal path: reuse the cached table.
pdY = pd.read_csv("data/process/pdY.csv")

In [None]:
# One-off export of per-subset CSVs, disabled by the `if False` switch.
# Each entry is (column to filter on, value to keep, output file).
if False:
    pdY_exports = [
        ("train_test", "train", "data/process/pdY_train.csv"),
        ("train_test", "test", "data/process/pdY_test.csv"),
        ("code", "vietherbs", "data/process/pdY_vietherbs.csv"),
        ("code", "chembl_27", "data/process/pdY_chembl_27.csv"),
        ("code", "extra_5", "data/process/pdY_extra_5.csv"),
    ]
    for filter_col, keep_value, out_path in pdY_exports:
        pdY[pdY[filter_col] == keep_value].to_csv(out_path, index=False)

# Extract RDKitDescriptors

In [None]:
# Run guard: this cell stops here on purpose; remove the assert to recompute
# the descriptors.
assert False

rdkit_featurizer = dc.feat.RDKitDescriptors()
X = rdkit_featurizer(pdY["smiles"])

# Rows for which the featurizer returned an empty array are replaced by an
# all-NaN row. Its length comes from the featurizer's descriptor list (the same
# list that names the columns below) rather than a hard-coded 200, so the rows
# always match the column count.
n_descriptors = len(rdkit_featurizer.descriptors)
X1 = []
for y in X:
    if y.shape[0] > 0:
        X1.append(y.tolist())
    else:
        y = [np.nan] * n_descriptors
        X1.append(y)
X1 = np.array(X1)

# Attach the bookkeeping columns from pdY next to the descriptor columns.
X2 = pd.DataFrame(X1, columns=rdkit_featurizer.descriptors)
X2["id"] = pdY["id"]
X2["smiles"] = pdY["smiles"]
X2["dG"] = pdY["dG"]
X2["code"] = pdY["code"]
X2["train_test"] = pdY["train_test"]
X2["smiles_len"] = pdY["smiles_len"]
if False:
    X2.to_csv("data/process/pdXY_rdkit_descriptors_200ft.csv", index=False)

## 5 extra ligands

In [None]:
# Load the extra_5 subset of pdY from its cached CSV.
pdY_ex5 = pd.read_csv("data/process/pdY_extra_5.csv")


In [None]:
# Run guard: this cell stops here on purpose; remove the assert to recompute
# the descriptors for the extra ligands.
assert False
rdkit_featurizer = dc.feat.RDKitDescriptors()

pdY_ex5 = pd.read_csv("data/process/pdY_extra_5.csv")
X = rdkit_featurizer(pdY_ex5["smiles"])

# Rows for which the featurizer returned an empty array are replaced by an
# all-NaN row. Its length comes from the featurizer's descriptor list (the same
# list that names the columns below) rather than a hard-coded 200, so the rows
# always match the column count.
n_descriptors = len(rdkit_featurizer.descriptors)
X1 = []
for y in X:
    if y.shape[0] > 0:
        X1.append(y.tolist())
    else:
        y = [np.nan] * n_descriptors
        X1.append(y)
X1 = np.array(X1)

# Attach the bookkeeping columns from pdY_ex5 next to the descriptor columns.
X2 = pd.DataFrame(X1, columns=rdkit_featurizer.descriptors)
X2["id"] = pdY_ex5["id"]
X2["smiles"] = pdY_ex5["smiles"]
X2["dG"] = pdY_ex5["dG"]
X2["code"] = pdY_ex5["code"]
X2["train_test"] = pdY_ex5["train_test"]
X2["smiles_len"] = pdY_ex5["smiles_len"]
if False:
    X2.to_csv("data/process/pdXY_ex5_rdkit_descriptors_200ft.csv", index=False)

# Remove mostly zero columns

In [None]:
# Load the cached descriptor table (descriptor columns plus the pdY bookkeeping columns).
pdXY_200 = pd.read_csv("data/process/pdXY_rdkit_descriptors_200ft.csv")

In [None]:
# Bookkeeping columns carried over from pdY; everything else in pdXY_200 is
# treated as a descriptor (feature) column.
PDY_COLS = ["id", "smiles", "dG", "code", "train_test", "smiles_len"]
PDX_COLS = [
    name
    for name in pdXY_200.columns
    if name not in PDY_COLS
]

In [None]:
# The zero rate of each descriptor is measured on the training rows only.
train_mask = pdXY_200["train_test"] == "train"
pdXY_200_train = pdXY_200.loc[train_mask].copy()

# Collect descriptors that are exactly zero in more than 99% of training rows.
mostly_zero_cols = []
for col in PDX_COLS:
    zero_rate = pdXY_200_train[col].eq(0).mean()
    if not zero_rate > 0.99:
        continue
    print("{}    {}".format(col, zero_rate))
    mostly_zero_cols.append(col)

# Drop those descriptors from the full table (all rows, not just train) and cache it.
pdXY_156 = pdXY_200.drop(columns=mostly_zero_cols)
print(pdXY_156.shape)

pdXY_156.to_csv("data/process/pdXY_rdkit_descriptors_156ft.csv", index=False)

In [None]:
# Preview the first rows of the reduced table.
pdXY_156.head()

# Remove correlated columns

In [None]:
# Run guard: this cell stops here on purpose; remove the assert to redo the
# collinearity filtering.
assert False

pdXY_156 = pd.read_csv("data/process/pdXY_rdkit_descriptors_156ft.csv")
print(pdXY_156.shape)

train_mask = pdXY_156["train_test"] == "train"
pdXY_156_train = pdXY_156.loc[train_mask].copy()
print(pdXY_156_train.shape)

# Learn the columns to drop from the training rows only (correlation above
# 0.95), then apply the same drop to the full table.
remover = CollinearColumnRemover(0.95, exclude_cols=PDY_COLS).fit(pdXY_156_train)

pdXY_123 = remover.transform(pdXY_156)
print(pdXY_123.shape)

if False:
    pdXY_123.to_csv("data/process/pdXY_rdkit_descriptors_123ft.csv", index=False)

In [None]:
# Preview the first rows of the collinearity-filtered table.
# NOTE(review): pdXY_123 is assigned in the preceding cell, which stops at its
# `assert False` guard — on a fresh kernel this line presumably fails with
# NameError; confirm.
pdXY_123.head()

## 5 extra ligands

In [None]:
# Run guard: this cell stops here on purpose; remove the assert to redo the
# collinearity filtering for the extra ligands.
assert False

pdXY_156 = pd.read_csv("data/process/pdXY_rdkit_descriptors_156ft.csv")
print(pdXY_156.shape)

train_mask = pdXY_156["train_test"] == "train"
pdXY_156_train = pdXY_156.loc[train_mask].copy()
print(pdXY_156_train.shape)

# NOTE(review): no cell shown here writes this 156ft file for the extra
# ligands — confirm where it is produced.
pdXY_ex5_156 = pd.read_csv("data/process/pdXY_ex5_rdkit_descriptors_156ft.csv")
print(pdXY_ex5_156.shape)

# Learn the columns to drop from the main training rows (correlation above
# 0.95), then apply the same drop to the extra-ligand table.
remover = CollinearColumnRemover(0.95, exclude_cols=PDY_COLS).fit(pdXY_156_train)

pdXY_ex5_123 = remover.transform(pdXY_ex5_156)
print(pdXY_ex5_123.shape)

if False:
    pdXY_ex5_123.to_csv("data/process/pdXY_ex5_rdkit_descriptors_123ft.csv", index=False)

# Impute missing and standardize for `pdXY_123`

In [None]:
pdXY_123 = pd.read_csv("data/process/pdXY_rdkit_descriptors_123ft.csv")
print("pdXY_123.shape", pdXY_123.shape)

# Bookkeeping columns that must be neither imputed nor standardized.
PDY_COLS = ["id", "smiles", "dG", "code", "train_test", "smiles_len"]

# Median imputation: fill values are learned from the training rows only and
# then applied to every row.
impute_fit_rows = pdXY_123.loc[pdXY_123["train_test"] == "train"].copy()
imputer = NumImputer(method="median", exclude_cols=PDY_COLS).fit(impute_fit_rows)
pdXY_123_clean = imputer.transform(pdXY_123)
print("pdXY_123_clean.shape", pdXY_123_clean.shape)


# Standardization: mean/std are likewise learned from the (imputed) training rows.
scale_fit_rows = pdXY_123_clean.loc[pdXY_123_clean["train_test"] == "train"].copy()
std = Standardizer(exclude_cols=PDY_COLS).fit(scale_fit_rows)
pdXY_123_clean = std.transform(pdXY_123_clean)
print("pdXY_123_clean.shape", pdXY_123_clean.shape)

pdXY_123_clean.to_csv("data/process/pdXY_rdkit_descriptors_123ft_clean.csv", index=False)

## 5 extra ligands

In [None]:
# Run switch for this cell: `assert True` lets it execute.
assert True

pdXY_123 = pd.read_csv("data/process/pdXY_rdkit_descriptors_123ft.csv")
print("pdXY_123.shape", pdXY_123.shape)

# Bookkeeping columns that must be neither imputed nor standardized.
PDY_COLS = ["id", "smiles", "dG", "code", "train_test", "smiles_len"]

# Keep only the training rows of the main table and append the extra ligands,
# so both go through the same imputer/standardizer.
pdXY_123 = pdXY_123.loc[pdXY_123["train_test"] == "train"].copy()
pdXY_ex5_123 = pd.read_csv("data/process/pdXY_ex5_rdkit_descriptors_123ft.csv")
pdXY_123 = pd.concat([pdXY_123, pdXY_ex5_123], axis=0, ignore_index=True)


# Median imputation with fill values learned from the rows tagged "train".
impute_fit_rows = pdXY_123.loc[pdXY_123["train_test"] == "train"].copy()
imputer = NumImputer(method="median", exclude_cols=PDY_COLS).fit(impute_fit_rows)
pdXY_123_clean = imputer.transform(pdXY_123)
print("pdXY_123_clean.shape", pdXY_123_clean.shape)


# Standardization with mean/std learned from the (imputed) rows tagged "train".
scale_fit_rows = pdXY_123_clean.loc[pdXY_123_clean["train_test"] == "train"].copy()
std = Standardizer(exclude_cols=PDY_COLS).fit(scale_fit_rows)
pdXY_123_clean = std.transform(pdXY_123_clean)
print("pdXY_123_clean.shape", pdXY_123_clean.shape)

# Only the extra-ligand rows are written out.
is_extra_5 = pdXY_123_clean["code"] == "extra_5"
pdXY_123_clean = pdXY_123_clean[is_extra_5]
print("pdXY_123_clean.shape", pdXY_123_clean.shape)

pdXY_123_clean.to_csv("data/process/pdXY_ex5_rdkit_descriptors_123ft_clean.csv", index=False)

In [None]:
# Display the cleaned table (after the preceding cell it holds only the rows with code "extra_5").
pdXY_123_clean