In [None]:
import pandas as pd
import numpy as np
import deepchem as dc
from pubchempy import get_cids, get_compounds

import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
# Enable on-demand GPU memory allocation. Looping (instead of indexing [0])
# keeps the notebook runnable on CPU-only machines, where the list is empty,
# and applies the same setting to every visible GPU.
for _gpu in physical_devices:
    tf.config.experimental.set_memory_growth(_gpu, enable=True)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# `IPython.display` is the public import location for the display helpers;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the classic-notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler

In [None]:
def display_df(df):
    """Render ``df`` as an HTML table in the notebook output.

    The frame is converted with ``DataFrame.to_html()`` and shown through
    IPython's ``display``. Returns ``None``.
    """
    html_table = df.to_html()
    display(HTML(html_table))

In [None]:
class CollinearColumnRemover(BaseEstimator, TransformerMixin):
    """Drop numeric/bool columns that are highly correlated with an earlier column.

    ``fit`` computes the absolute pairwise correlation matrix of the selected
    columns and, scanning columns left to right, marks every later column whose
    correlation with a kept earlier column exceeds ``threshold``.
    ``transform`` drops the marked columns from the frame it is given.
    """

    def __init__(self, threshold, col_regex=None, exclude_cols=None):
        """
        :param threshold: float in [0, 1], if two columns have correlation greater than threshold
                          one of them will be removed
        :param col_regex: str, regular expression to select columns
        :param exclude_cols: list of column names that are never considered for removal
        """
        # Constructor arguments are stored unmodified under their own names so
        # that BaseEstimator.get_params / clone / repr can find them.
        self.threshold = threshold
        self.col_regex = col_regex
        self.exclude_cols = exclude_cols

    def _collinear_columns(self, df, threshold):
        """Return the columns to drop, in the order they were found."""
        if self.col_regex is None:
            df_sel = df.select_dtypes(["number", "bool"])
        else:
            df_sel = df.filter(regex=self.col_regex)
            df_sel = df_sel.select_dtypes(["number", "bool"])

        df_sel = df_sel.astype("float32")

        excluded = [] if self.exclude_cols is None else self.exclude_cols
        all_cols = [col for col in df_sel.columns.to_list() if col not in excluded]
        df_sel = df_sel[all_cols]

        corr_mat = df_sel.corr().abs()
        self._corr_mat = corr_mat
        # corr_mat rows/columns follow the order of all_cols, so positional
        # indexing on the raw array is equivalent to label lookups but cheaper.
        corr_values = corr_mat.to_numpy()

        collin_cols = []   # ordered result
        flagged = set()    # same content, for O(1) membership checks
        for i, col_i in enumerate(all_cols[:-1]):
            if col_i in flagged:
                continue

            for j in range(i + 1, len(all_cols)):
                col_j = all_cols[j]
                if col_j in flagged:
                    continue

                # NaN correlations compare False and are therefore kept.
                if corr_values[i, j] > threshold:
                    flagged.add(col_j)
                    collin_cols.append(col_j)

        return collin_cols

    def fit(self, df, y=None):
        """Identify collinear columns in ``df``; ``y`` is ignored."""
        self._collin_cols = self._collinear_columns(df, self.threshold)
        return self

    def transform(self, df):
        """Return ``df`` without the columns identified during ``fit``."""
        all_cols = df.columns.to_list()
        nonexist_cols = [col for col in self._collin_cols if col not in all_cols]
        if len(nonexist_cols) > 0:
            print("WARNING: These collinear cols to be droped do not exist in df:", nonexist_cols)

        droped_col = [col for col in self._collin_cols if col in all_cols]
        print("Number of columns droped due to collinearity:", len(droped_col))
        return df.drop(droped_col, axis="columns")

In [None]:
class NumImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in numeric columns with a statistic learned on the training frame.

    :param method: aggregation passed to ``Series.agg`` (e.g. "mean", "median")
    :param exclude_cols: list of column names that are never imputed
    """

    def __init__(self, method="mean", exclude_cols=None):
        # Stored unmodified under the parameter names so that
        # BaseEstimator.get_params / clone / repr work.
        self.method = method
        self.exclude_cols = exclude_cols

    def fit(self, df_train, y=None):
        """Compute one fill value per numeric, non-excluded column; ``y`` is ignored."""
        excluded = [] if self.exclude_cols is None else self.exclude_cols
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        num_cols = [col for col in num_cols if col not in excluded]

        # Remembered so transform can verify and restore the column layout.
        self._train_cols = df_train.columns.to_list()

        self._impute_values = {col: df_train[col].agg(self.method) for col in num_cols}
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing numeric values filled.

        Raises AssertionError when ``df`` does not have the same set of columns
        as the frame used in ``fit``.
        """
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "Do not have the same set of cols as train"

        for col, val in self._impute_values.items():
            if df[col].isnull().any():
                df[col] = df[col].fillna(val)

        # align columns
        return df[self._train_cols]
    

class CatImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in object/category/bool columns with a constant.

    :param val: replacement value used for every missing entry
    """

    def __init__(self, val="MISSING"):
        # Stored unmodified under the parameter name so that
        # BaseEstimator.get_params / clone / repr work.
        self.val = val

    def fit(self, df_train, y=None):
        """Record which columns are categorical-like; ``y`` is ignored."""
        cat_cols = df_train.select_dtypes(["object", "category", "bool"]).columns.to_list()
        # Remembered so transform can verify and restore the column layout.
        self._train_cols = df_train.columns.to_list()

        self._impute_values = {col: self.val for col in cat_cols}
        return self

    def transform(self, df):
        """Return a copy of ``df`` with missing categorical values filled.

        Columns that contained missing values come back with ``category``
        dtype; other columns are left as they were. Raises AssertionError when
        ``df`` does not have the same set of columns as the frame used in ``fit``.
        """
        df = df.copy()
        cols = df.columns.to_list()
        assert set(cols) == set(self._train_cols), "Do not have the same set of cols as train"

        for col, val in self._impute_values.items():
            if df[col].isnull().any():
                # Go through object dtype so the fill value does not need to be
                # an existing category.
                df[col] = df[col].astype("object").fillna(val).astype("category")

        # align columns
        return df[self._train_cols]

In [None]:
class Standardizer(BaseEstimator, TransformerMixin):
    """Center and scale numeric columns with the mean/std learned on the training frame.

    :param exclude_cols: list of column names that are never standardized
    :param to_array: when True, ``transform`` returns a float32 numpy array
                     instead of a DataFrame
    """

    def __init__(self, exclude_cols=None, to_array=False):
        # Stored unmodified under the parameter names so that
        # BaseEstimator.get_params / clone / repr work.
        self.exclude_cols = exclude_cols
        self.to_array = to_array

    def fit(self, df_train, y=None):
        """Compute per-column mean and standard deviation; ``y`` is ignored."""
        excluded = [] if self.exclude_cols is None else self.exclude_cols
        num_cols = df_train.select_dtypes(["number"]).columns.to_list()
        num_cols = [col for col in num_cols if col not in excluded]

        self._mean = {col: df_train[col].mean() for col in num_cols}
        self._std = {col: df_train[col].std() for col in num_cols}
        return self

    def transform(self, df):
        """Return a standardized copy of ``df``; the input frame is not modified."""
        # Work on a copy so the caller's frame is left untouched, matching the
        # imputers in this notebook.
        df = df.copy()
        for col in self._mean:
            if self._std[col] > 0:
                df[col] = (df[col] - self._mean[col]) / self._std[col]
            else:
                # Also reached when the std is NaN (the comparison is False);
                # such columns are only centered.
                print("WARNING: {} has zero std.".format(col))
                df[col] = df[col] - self._mean[col]
            df[col] = df[col].astype("float32")

        if self.to_array:
            return df.values.astype(np.float32)
        return df

In [None]:
!ls data/raw

# Load `IC50_list.xlsx`

In [None]:
# Load the labelled compounds and derive the helper columns used downstream.
bfe_df = pd.read_excel("data/raw/IC50_list.xlsx", sheet_name="IC50_all_clean")
bfe_df = bfe_df.drop(["No",], axis=1)
bfe_df["raw_Code"] = bfe_df["Code"].copy()
bfe_df["code"] = "labled"
bfe_df["dG"] = bfe_df["BFE"].copy()
bfe_df["smiles_len"] = bfe_df["smiles"].str.len()

# Number of rows sharing each SMILES string.
smiles_counts = bfe_df["smiles"].value_counts().to_dict()
bfe_df["smiles_counts"] = bfe_df["smiles"].map(smiles_counts)

# Hold out 120 compounds as the test set, sampled only from SMILES that occur
# exactly once. The draw must come from the seeded generator `rnd`; drawing
# from the global `np.random` gives a different split on every run.
bfe_df["train_test"] = "train"
rnd = np.random.RandomState(42)
unique_smiles = bfe_df.loc[bfe_df["smiles_counts"] == 1, "smiles"].values
test_smiles = rnd.choice(unique_smiles, size=120, replace=False)
bfe_df.loc[bfe_df["smiles"].isin(test_smiles), "train_test"] = "test"


print(bfe_df.shape)
bfe_df.head()

In [None]:
bfe_df["train_test"].value_counts()

In [None]:
bfe_df["smiles_len"].min(), bfe_df["smiles_len"].max()

In [None]:
bfe_df[bfe_df["smiles_len"] > 250]

In [None]:
bfe_df["train_test"].value_counts()

In [None]:
# Kernel-density plot of dG for the train and test subsets, saved as a PDF.
figure_size = (3.2, 3.2*6/8)
dpi = 300
fontsize = 7
font = {"fontname": "Arial"}
out = "figures/dg_train_test_distr.pdf"

plt.figure(figsize=figure_size)

bfe_df.loc[bfe_df["train_test"] == "train", "dG"].plot(kind="kde", label="train")
bfe_df.loc[bfe_df["train_test"] == "test", "dG"].plot(kind="kde", label="test")
plt.legend(fontsize=fontsize)
# Raw string: "\D" is not a valid Python escape sequence and triggers a
# warning in a normal string literal.
plt.xlabel(r"$\Delta G_{EXP}$ (kcal/mol)", fontsize=fontsize, **font)
plt.ylabel("Density", fontsize=fontsize, **font)

plt.tight_layout()
plt.savefig(out, dpi=dpi)

In [None]:
plt.legend?

# Load `vietherbs.smi`

In [None]:
!ls ../AchE_ML

In [None]:
# Whitespace-separated file without header; the first column is kept as the
# SMILES string, the second is labelled "pdb" and dropped.
# The separator is a raw string because "\s" is not a valid Python escape.
vietherbs = pd.read_table("../AchE_ML/data/raw/vietherbs.smi", sep=r"\s+", header=None)
vietherbs = vietherbs.rename(columns={0: "smiles", 1: "pdb"})
vietherbs = vietherbs.drop(["pdb"], axis=1)
# Unlabelled compounds: no source code and no measured dG; prediction only.
vietherbs["raw_Code"] = np.nan
vietherbs["dG"] = np.nan
vietherbs["code"] = "vietherbs"
vietherbs["train_test"] = "pred"
vietherbs["smiles_len"] = vietherbs["smiles"].str.len()
print(vietherbs.shape)
vietherbs.head()

In [None]:
vietherbs["smiles_len"].min(), vietherbs["smiles_len"].max()

# Load `chembl_27`

In [None]:
# Whitespace-separated file without header; the first column is kept as the
# SMILES string, the second is labelled "pdb" and dropped.
# The separator is a raw string because "\s" is not a valid Python escape.
chembl_27 = pd.read_table("../AchE_ML/data/raw/chembl_27.smi", sep=r"\s+", header=None)
chembl_27 = chembl_27.rename(columns={0: "smiles", 1: "pdb"})
chembl_27 = chembl_27.drop(["pdb"], axis=1)
# Unlabelled compounds: no source code and no measured dG; prediction only.
chembl_27["raw_Code"] = np.nan
chembl_27["dG"] = np.nan
chembl_27["code"] = "chembl_27"
chembl_27["train_test"] = "pred"
chembl_27["smiles_len"] = chembl_27["smiles"].str.len()

print(chembl_27.shape)
# Keep only SMILES strings of at most 250 characters.
chembl_27 = chembl_27[chembl_27["smiles_len"] <= 250]

print(chembl_27.shape)
chembl_27.head()

In [None]:
chembl_27["smiles_len"].min(), chembl_27["smiles_len"].max()

In [None]:
chembl_27["smiles_len"].quantile(q=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99])

# Load Nirmatrelvir (PF-07321332)

In [None]:
# Whitespace-separated file without header; column 0 is kept as the SMILES
# string and column 1 is dropped.
# The separator is a raw string because "\s" is not a valid Python escape.
nirm = pd.read_table("data/raw/nirmatrelvir.smi", sep=r"\s+", header=None)
nirm = nirm.rename(columns={0: "smiles"})
nirm = nirm.drop([1], axis=1)

# No measured dG; prediction only.
nirm["raw_Code"] = "Nirmatrelvir"
nirm["dG"] = np.nan
nirm["code"] = "PF-07321332"
nirm["train_test"] = "pred"
nirm["smiles_len"] = nirm["smiles"].str.len()

print(nirm.shape)
nirm.head()

# Combine `pdY`

In [None]:
# Manual switch: leave True to rebuild pdY.csv from the frames loaded above.
if True:
    selected_cols = ["raw_Code", "code", "smiles", "smiles_len", "train_test", "dG"]
    # Stack the four sources on the shared metadata columns.
    pdY = pd.concat(
        [
            bfe_df[selected_cols],
            vietherbs[selected_cols],
            chembl_27[selected_cols],
            nirm[selected_cols],
        ],
        axis=0,
        ignore_index=True,
    )

    print(pdY.shape)

    pdY.to_csv("data/process/pdY.csv", index=False)

# Always continue from the on-disk version.
pdY = pd.read_csv("data/process/pdY.csv")

In [None]:
pdY["code"].unique()

In [None]:
pdY["train_test"].value_counts()

In [None]:
# Manual switch: leave True to (re)write one CSV per subset of pdY.
if True:
    # Split by train/test assignment.
    train_rows = pdY["train_test"] == "train"
    pdY[train_rows].to_csv("data/process/pdY_train.csv", index=False)
    test_rows = pdY["train_test"] == "test"
    pdY[test_rows].to_csv("data/process/pdY_test.csv", index=False)

    # Split by source tag.
    vietherbs_rows = pdY["code"] == "vietherbs"
    pdY[vietherbs_rows].to_csv("data/process/pdY_vietherbs.csv", index=False)
    chembl_rows = pdY["code"] == "chembl_27"
    pdY[chembl_rows].to_csv("data/process/pdY_chembl_27.csv", index=False)

    nirm_rows = pdY["code"] == "PF-07321332"
    pdY[nirm_rows].to_csv("data/process/pdY_nirmatrelvir.csv", index=False)

# Extract RDKitDescriptors

In [None]:
# Manual run switch left by the author (a no-op while it is `assert True`).
assert True

rdkit_featurizer = dc.feat.RDKitDescriptors()
X = rdkit_featurizer(pdY["smiles"])

# Rows whose feature vector is empty are padded with NaN so that every row has
# one value per descriptor. The width is taken from the featurizer instead of
# a hard-coded 200, so the padding always matches the DataFrame columns below.
n_descriptors = len(rdkit_featurizer.descriptors)
X1 = []
for y in X:
    if y.shape[0] > 0:
        X1.append(y.tolist())
    else:
        X1.append([np.nan] * n_descriptors)
X1 = np.array(X1)

# Attach the metadata columns; both frames are expected to share the same
# default integer index — TODO confirm if pdY is ever re-indexed.
X2 = pd.DataFrame(X1, columns=rdkit_featurizer.descriptors)
X2["raw_Code"] = pdY["raw_Code"]
X2["code"] = pdY["code"]
X2["smiles"] = pdY["smiles"]
X2["smiles_len"] = pdY["smiles_len"]
X2["train_test"] = pdY["train_test"]
X2["dG"] = pdY["dG"]
# Manual switch: set to True to overwrite the descriptor file on disk.
if False:
    X2.to_csv("data/process/pdXY_rdkit_descriptors_200ft.csv", index=False)

In [None]:
pdY = pd.read_csv("data/process/pdY.csv")
rdkit_featurizer = dc.feat.RDKitDescriptors()
rdkit_featurizer(pdY["smiles"][:10])

## Nirmatrelvir (PF-07321332)

In [None]:
# Featurize the nirmatrelvir row(s) with the RDKit descriptor featurizer and
# attach the metadata columns in the same order as the main descriptor table.
rdkit_featurizer = dc.feat.RDKitDescriptors()

pdY_nirm = pd.read_csv("data/process/pdY_nirmatrelvir.csv")
X = rdkit_featurizer(pdY_nirm["smiles"])

X2 = pd.DataFrame(X, columns=rdkit_featurizer.descriptors)
for meta_col in ["raw_Code", "code", "smiles", "smiles_len", "train_test", "dG"]:
    X2[meta_col] = pdY_nirm[meta_col]

# Manual switch: leave True to write the descriptor file.
if True:
    X2.to_csv("data/process/pdXY_nirm_rdkit_descriptors_200ft.csv", index=False)

In [None]:
!ls data/process

# Remove mostly zero columns

In [None]:
pdXY_200 = pd.read_csv("data/process/pdXY_rdkit_descriptors_200ft.csv")

In [None]:
pdXY_200.head()

In [None]:
PDY_COLS = ["raw_Code", "code", "smiles", "smiles_len", "train_test", "dG"]
PDX_COLS = [col for col in pdXY_200.columns if col not in PDY_COLS]
len(PDX_COLS)

In [None]:
# The zero rate of every descriptor is measured on the training rows only.
pdXY_200_train = pdXY_200.loc[pdXY_200["train_test"] == "train"].copy()

mostly_zero_cols = []
for col in PDX_COLS:
    zero_rate = pdXY_200_train[col].eq(0).mean()
    print(col, zero_rate)
    if not zero_rate > 0.90:
        continue
    # Descriptor is zero in more than 90% of training rows: report and drop it.
    print(f"{col}    {zero_rate}")
    mostly_zero_cols.append(col)

# Drop the flagged descriptors from the full table and persist the result.
pdXY_132 = pdXY_200.drop(columns=mostly_zero_cols)
print(pdXY_132.shape)

pdXY_132.to_csv("data/process/pdXY_rdkit_descriptors_132ft.csv", index=False)

In [None]:
pdXY_132.head()

## Nirmatrelvir (PF-07321332)

In [None]:
pdXY_200 = pd.read_csv("data/process/pdXY_rdkit_descriptors_200ft.csv")
pdXY_nirm_200 = pd.read_csv("data/process/pdXY_nirm_rdkit_descriptors_200ft.csv")

PDY_COLS = ["raw_Code", "code", "smiles", "smiles_len", "train_test", "dG"]
PDX_COLS = [col for col in pdXY_200.columns if col not in PDY_COLS]
len(PDX_COLS)

In [None]:
# The zero rate of every descriptor is measured on the training rows of the
# main table; the resulting column list is then applied to the nirmatrelvir table.
pdXY_200_train = pdXY_200.loc[pdXY_200["train_test"] == "train"].copy()

mostly_zero_cols = []
for col in PDX_COLS:
    zero_rate = pdXY_200_train[col].eq(0).mean()
    print(col, zero_rate)
    if not zero_rate > 0.90:
        continue
    # Descriptor is zero in more than 90% of training rows: report and drop it.
    print(f"{col}    {zero_rate}")
    mostly_zero_cols.append(col)


pdXY_nirm_132 = pdXY_nirm_200.drop(columns=mostly_zero_cols)
print(pdXY_nirm_132.shape)

pdXY_nirm_132.to_csv("data/process/pdXY_nirm_rdkit_descriptors_132ft.csv", index=False)

# Remove correlated columns

In [None]:
import os

# The original cell started with `assert False`, so it always raised and
# stopped "Restart & Run All". A file cache guard replaces it: an existing
# result is reused, and the computation only runs (and is saved) when the
# file is missing. Delete the CSV to force a recomputation.
PDXY_105_PATH = "data/process/pdXY_rdkit_descriptors_105ft.csv"

if os.path.exists(PDXY_105_PATH):
    pdXY_105 = pd.read_csv(PDXY_105_PATH)
else:
    pdXY_132 = pd.read_csv("data/process/pdXY_rdkit_descriptors_132ft.csv")
    print(pdXY_132.shape)

    # Collinearity is estimated on the training rows only.
    pdXY_132_train = pdXY_132[pdXY_132["train_test"] == "train"].copy()
    print(pdXY_132_train.shape)

    remover = CollinearColumnRemover(0.95, exclude_cols=PDY_COLS)
    remover.fit(pdXY_132_train)

    pdXY_105 = remover.transform(pdXY_132)
    pdXY_105.to_csv(PDXY_105_PATH, index=False)

print(pdXY_105.shape)

In [None]:
pdXY_105.head()

## Nirmatrelvir (PF-07321332)

In [None]:
# Fit the collinearity filter on the training rows of the main table ...
pdXY_132 = pd.read_csv("data/process/pdXY_rdkit_descriptors_132ft.csv")
print(pdXY_132.shape)

train_rows = pdXY_132["train_test"] == "train"
pdXY_132_train = pdXY_132[train_rows].copy()
print(pdXY_132_train.shape)

remover = CollinearColumnRemover(0.95, exclude_cols=PDY_COLS)
remover.fit(pdXY_132_train)

# ... and apply the same column removal to the nirmatrelvir table.
pdXY_nirm_132 = pd.read_csv("data/process/pdXY_nirm_rdkit_descriptors_132ft.csv")
pdXY_nirm_105 = remover.transform(pdXY_nirm_132)
print(pdXY_nirm_105.shape)

# Manual switch: leave True to write the filtered table.
if True:
    pdXY_nirm_105.to_csv(
        "data/process/pdXY_nirm_rdkit_descriptors_105ft.csv",
        index=False,
    )

# Impute missing and standardize for `pdXY_105`

In [None]:
pdXY_105 = pd.read_csv("data/process/pdXY_rdkit_descriptors_105ft.csv")
print("pdXY_105.shape", pdXY_105.shape)

# Metadata columns are excluded from both imputation and standardization.
PDY_COLS = ["raw_Code", "code", "smiles", "smiles_len", "train_test", "dG"]

# Median imputation: statistics come from the training rows, and are applied
# to every row.
imputer = NumImputer(method="median", exclude_cols=PDY_COLS)
train_for_imputer = pdXY_105[pdXY_105["train_test"] == "train"].copy()
imputer.fit(train_for_imputer)
pdXY_105_clean = imputer.transform(pdXY_105)
print("pdXY_105_clean.shape", pdXY_105_clean.shape)


# Standardization: mean/std come from the (imputed) training rows.
std = Standardizer(exclude_cols=PDY_COLS)
train_for_std = pdXY_105_clean[pdXY_105_clean["train_test"] == "train"].copy()
std.fit(train_for_std)
pdXY_105_clean = std.transform(pdXY_105_clean)
print("pdXY_105_clean.shape", pdXY_105_clean.shape)

pdXY_105_clean.to_csv(
    "data/process/pdXY_rdkit_descriptors_105ft_clean.csv",
    index=False,
)

## Nirmatrelvir (PF-07321332)

In [None]:
pdXY_105 = pd.read_csv("data/process/pdXY_rdkit_descriptors_105ft.csv")
print("pdXY_105.shape", pdXY_105.shape)

# Metadata columns are excluded from both imputation and standardization.
PDY_COLS = ["raw_Code", "code", "smiles", "smiles_len", "train_test", "dG"]

# Append the nirmatrelvir rows so they pass through the same imputer and
# standardizer, both of which are fitted on the training rows only.
pdXY_nirm_105 = pd.read_csv("data/process/pdXY_nirm_rdkit_descriptors_105ft.csv")
pdXY_105 = pd.concat([pdXY_105, pdXY_nirm_105], axis=0, ignore_index=True)

imputer = NumImputer(method="median", exclude_cols=PDY_COLS)
train_for_imputer = pdXY_105[pdXY_105["train_test"] == "train"].copy()
imputer.fit(train_for_imputer)
pdXY_105_clean = imputer.transform(pdXY_105)
print("pdXY_105_clean.shape", pdXY_105_clean.shape)

std = Standardizer(exclude_cols=PDY_COLS)
train_for_std = pdXY_105_clean[pdXY_105_clean["train_test"] == "train"].copy()
std.fit(train_for_std)
pdXY_105_clean = std.transform(pdXY_105_clean)
print("pdXY_105_clean.shape", pdXY_105_clean.shape)

# Keep only the nirmatrelvir rows for the output file.
nirm_rows = pdXY_105_clean["code"] == "PF-07321332"
pdXY_105_clean = pdXY_105_clean[nirm_rows]
print("pdXY_105_clean.shape", pdXY_105_clean.shape)
pdXY_105_clean.to_csv(
    "data/process/pdXY_nirm_rdkit_descriptors_105ft_clean.csv",
    index=False,
)

In [None]:
pdXY_nirm_105

In [None]:
pdXY_105_clean.head()

In [None]:
del pdXY_105, pdXY_105_clean

In [None]:
pdXY_105 = pd.read_csv("data/process/pdXY_rdkit_descriptors_105ft.csv")

In [None]:
pdXY_105.head()

In [None]:
PDY_COLS = ["raw_Code", "code", "smiles", "smiles_len", "train_test", "dG"]
PDX_COLS = [col for col in pdXY_105.columns if col not in PDY_COLS]
print(len(PDX_COLS))

In [None]:
pdXY_105[PDX_COLS].max().sort_values()

In [None]:
(pdXY_105["MinAbsPartialCharge"] == np.inf).sum()

In [None]:
pdXY_105[pdXY_105["MaxPartialCharge"] == np.inf]