In [1]:
from pathlib import Path
from warnings import filterwarnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import requests
from io import StringIO
import rdkit
from sklearn import svm, metrics, clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import auc, accuracy_score, recall_score
from sklearn.metrics import roc_curve, roc_auc_score
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import rdmolops
from tqdm import tqdm_notebook
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from rdkit.Chem import SaltRemover
from IPython.display import SVG 
from rdkit.Chem import rdDepictor  
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.MolStandardize import rdMolStandardize
from rdkit import RDLogger 
import json
import pickle
import os
from pathlib import Path
import random

# Silence RDKit's own logger and every Python warning for the whole notebook.
# NOTE(review): the blanket 'ignore' filter also hides pandas/scikit-learn
# warnings that may point at real problems.
RDLogger.DisableLog('rdApp.*')
import warnings
warnings.filterwarnings('ignore')

# Single source of truth for every seeded operation in the notebook.
SEED = 22

random.seed(SEED)
# PYTHONHASHSEED is read at interpreter start-up, so this assignment does not
# change hashing in the already-running kernel; it is only inherited by child
# processes spawned afterwards.
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)


# Directory the notebook was started from; used as the root for output paths.
HERE = Path.cwd()


def s2m(s):
    """Parse a SMILES string into an RDKit molecule via ``Chem.MolFromSmiles``."""
    return Chem.MolFromSmiles(s)


def m2s(m):
    """Serialise an RDKit molecule to SMILES via ``Chem.MolToSmiles``."""
    return Chem.MolToSmiles(m)


def cancolise(s):
    """Round-trip a SMILES string through RDKit (SMILES -> molecule -> SMILES)."""
    return m2s(s2m(s))


def get_dict_query(q):
    """Look up ``data[str(int(q))]``.

    NOTE(review): ``data`` is a module-level name that is not defined in the
    visible cells; it must exist before this helper is called - confirm.
    """
    return data[str(int(q))]


def get_maccs(smiles):
    """Return the MACCS keys fingerprint of ``smiles`` as a NumPy array."""
    return np.array(MACCSkeys.GenMACCSKeys(s2m(smiles)))


def get_mrgn2(smiles):
    """Return the radius-2, 2048-bit Morgan fingerprint of ``smiles`` as a NumPy array."""
    return np.array(GetMorganFingerprintAsBitVect(s2m(smiles), 2, nBits=2048))


def get_mrgn3(smiles):
    """Return the radius-3, 2048-bit Morgan fingerprint of ``smiles`` as a NumPy array."""
    return np.array(GetMorganFingerprintAsBitVect(s2m(smiles), 3, nBits=2048))

In [3]:
# Read the assay export and keep only the columns used further down,
# in this fixed order.
dataset_file = 'AAP35567_example_data.csv'
col = ['sid', 'cid', 'activity', 'protacxn', 'smiles', 'active']
df = pd.read_csv(dataset_file).loc[:, col]
df

Unnamed: 0,sid,cid,activity,protacxn,smiles,active
0,24839317,4883364.0,Active,AAP35567,COc1ccccc1NS(=O)(=O)c1ccc(O)c(C(=O)OCC(=O)N2c3...,1
1,4250003,3243876.0,Active,AAP35567,CCN(CC)S(=O)(=O)N1CCC(C(=O)NCC2COc3ccccc3O2)CC1,1
2,26755795,444732.0,Active,AAP35567,CC(=CC(C)C=CC(O)=NO)C(=O)c1ccc(N(C)C)cc1,1
3,24840091,4883613.0,Active,AAP35567,NC(=O)C1CCN(c2ccc([N+](=O)[O-])cc2Cl)CC1,1
4,26747902,444899.0,Active,AAP35567,CCCCC/C=C\C/C=C\C/C=C\C/C=C\CCCC(=O)O,1
...,...,...,...,...,...,...
161355,24805552,3208561.0,Inactive,AAP35567,COc1cccc(N2CCN(Cc3nnnn3C3CCCCC3)CC2)c1,0
161356,7977452,1329847.0,Inactive,AAP35567,Cc1onc(-c2ccccc2Cl)c1C(=O)OCn1nnc2ccccc2c1=O,0
161357,24796843,1329926.0,Inactive,AAP35567,COc1ccc(CNC(=O)CN(C)S(=O)(=O)c2ccc(Cl)cc2)cc1,0
161358,22407532,2670545.0,Inactive,AAP35567,COc1ccc(OCC(=O)NNC(=O)Cc2coc3cc(OC)ccc23)cc1,0


In [4]:
def traing_validate_plot_model(DATA, fingerprint, target, fingerprint_any, label_to_model):
    """Train, evaluate, persist and plot RF and ANN classifiers for one fingerprint.

    The samples are split 80/20 (seeded with ``SEED``). Each classifier is
    fitted on the training part, pickled to
    ``{DATA}/{model_name}_is_active.pkl`` and scored on the test part. One ROC
    figure holding a curve per classifier is saved to
    ``{DATA}/{target}_{fingerprint}_roc_auc``.

    Parameters
    ----------
    DATA : str or path-like
        Output directory for pickles and the ROC figure; created if missing.
    fingerprint : str
        Fingerprint label, used in model names and output file names.
    target : str
        Target identifier, used in the ROC figure file name.
    fingerprint_any : sequence
        Feature vectors, one per sample.
    label_to_model : sequence
        Class labels aligned with ``fingerprint_any``; sensitivity is the
        recall of label 1 and specificity the recall of label 0.

    Returns
    -------
    dict
        ``{model_name: [split_sizes, performance]}`` where ``split_sizes`` has
        the keys ``'Training'`` and ``'Test'`` and ``performance`` has the keys
        ``'accuracy'``, ``'sens'``, ``'spec'`` and ``'auc'``.
    """
    # Make sure the output directory exists before the first file is written.
    Path(DATA).mkdir(parents=True, exist_ok=True)

    train_x, test_x, train_y, test_y = train_test_split(
        fingerprint_any, label_to_model, test_size=0.2, random_state=SEED
    )

    # NBVAL_CHECK_OUTPUT
    training_dict = {'Training': len(train_x), 'Test': len(test_x)}

    # Both estimators are seeded so that a re-run reproduces the same metrics.
    estimators = {
        f'RF_{fingerprint}': RandomForestClassifier(
            n_estimators=10, criterion="entropy", random_state=SEED
        ),
        f'ANN_{fingerprint}': MLPClassifier(hidden_layer_sizes=(30, 3), random_state=SEED),
    }

    model_dict = {}
    trained = []      # {"label", "model"} records for the models fitted in this call
    roc_curves = []   # (label, fpr, tpr, auc) tuples reused for plotting

    for model_name, ml_model in estimators.items():
        # Fit the model on the single train/test split
        ml_model.fit(train_x, train_y)

        # Save model; the context manager guarantees the file handle is closed
        with open(f'{DATA}/{model_name}_is_active.pkl', 'wb') as handle:
            pickle.dump(ml_model, handle)

        # Probability of the positive class and hard class prediction on the test set
        test_prob = ml_model.predict_proba(test_x)[:, 1]
        test_pred = ml_model.predict(test_x)

        # Performance of the model on the test set
        roc_auc = roc_auc_score(test_y, test_prob)
        model_performance_dict = {
            'accuracy': float(accuracy_score(test_y, test_pred)),
            'sens': float(recall_score(test_y, test_pred)),
            'spec': float(recall_score(test_y, test_pred, pos_label=0)),
            'auc': float(roc_auc),
        }
        model_dict[model_name] = [training_dict, model_performance_dict]

        # Keep the ROC curve now so the plot does not have to predict again
        fpr, tpr, _ = metrics.roc_curve(test_y, test_prob)
        roc_curves.append((model_name, fpr, tpr, roc_auc))
        trained.append({"label": model_name, "model": ml_model})

    # Backward compatibility: callers that keep a module-level ``models`` list
    # still receive the fitted models, but the function no longer needs that
    # global to exist and never plots stale entries left in it.
    shared_models = globals().get("models")
    if isinstance(shared_models, list):
        shared_models.extend(trained)

    fig, ax = plt.subplots()
    for curve_label, fpr, tpr, roc_auc in roc_curves:
        ax.plot(fpr, tpr, label=(f"{curve_label} AUC area = {roc_auc:.2f}"))

    # Custom settings for the plot; the dashed diagonal marks a random classifier
    ax.plot([0, 1], [0, 1], "r--")
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title("Receiver Operating Characteristic")
    ax.legend(loc="lower right")
    # Save plot, then release this specific figure
    fig.savefig(f"{DATA}/{target}_{fingerprint}_roc_auc", dpi=300, bbox_inches="tight", transparent=True)
    plt.close(fig)

    return model_dict

In [5]:
target = 'AAP35567'
target_items = []
DATA = HERE / f"{target}"
# Model pickles, ROC figures and the JSON summary are all written into DATA,
# so the directory has to exist before anything is saved.
DATA.mkdir(parents=True, exist_ok=True)

# Getting Data
# .copy() gives an independent frame, so the fingerprint columns added below
# are not assigned onto a slice of the previously loaded frame.
df = df[['cid', 'smiles', 'active']].copy()
n_active = int(df.active.sum())
data_count = {'Active': n_active, 'Inactive': len(df) - n_active}
target_items.append(data_count)

# Dict for saving model information
target_only_dict = {}

# Getting Descriptor
df["fp_maccs"] = df["smiles"].apply(get_maccs)  # MACCS keys
df["fp_mrgn"] = df["smiles"].apply(get_mrgn2)   # Morgan, radius 2 (ECFP4-like)
fingerprint_maccs = df.fp_maccs.tolist()
fingerprint_mrgn = df.fp_mrgn.tolist()

# label
label = df.active.tolist()

# One training/evaluation run per fingerprint type.
fingerprints_by_name = {'maccs': fingerprint_maccs, 'morgan2': fingerprint_mrgn}
for fingerprint, fingerprint_values in fingerprints_by_name.items():
    # Reset the module-level list that traing_validate_plot_model fills, so
    # models from the previous fingerprint are not carried over.
    models = []
    model_dict = traing_validate_plot_model(DATA, fingerprint, target, fingerprint_values, label)
    target_items.append(model_dict)
target_only_dict[target] = target_items

# Persist class counts and per-model metrics for this target.
with open(f'{DATA}/target_{target}.json', 'w') as fp:
    json.dump(target_only_dict, fp)
