In [1]:
import pandas as pd
import numpy as np
import json

import utils
import ml

import warnings
warnings.filterwarnings("ignore")

## Downloading dataset

In [2]:
# Load the dataset and keep only the SMILES string and the pIC50 value.
# .copy() detaches the two-column selection from the downloaded frame, so the
# column assignments below write into an independent DataFrame instead of a
# slice (the original triggered pandas' SettingWithCopyWarning, which the
# global warnings filter was hiding).
df = utils.download_data(filename='dataset.csv')
df = df[["smiles", "pIC50"]].copy()

# Binary activity label: 1.0 for a pIC50 of >= 8.0, 0.0 otherwise.
# Built in one vectorised comparison; a missing pIC50 compares False and is
# therefore labelled 0.0, exactly as with the previous zero-initialised column.
pIC50_cut_off = 8.0
df["active"] = (df["pIC50"] >= pIC50_cut_off).astype(float)

# Per-molecule representations derived from the SMILES string via the project helper.
df["finger print"] = df["smiles"].apply(utils.smiles_to_descriptors, type='morgan2')
df["selfies"] = df["smiles"].apply(utils.smiles_to_descriptors, type='selfies')

# Pre-computed Mordred descriptors, assigned to the frame as one object per row.
# NOTE(review): unpickling can execute arbitrary code - only load this file if
# it was produced by a trusted run of this project.
df['mordred'] = pd.read_pickle('saved_results/cleaned_up_mordred_descriptors.pkl')
# Stack the per-row descriptor vectors into a single 2-D array.
mordred_features = np.vstack(df['mordred'])


import selfies as sf

# Token -> index mapping for the SELFIES vocabulary, stored by an earlier run.
with open('saved_results/selfies_voc.json', 'r') as vocab_file:
    voc = json.load(vocab_file)

# Every encoded sequence is padded to the symbol count of the longest SELFIES string.
pad_to_len = max(map(sf.len_selfies, df["selfies"]))
df['selfies encoding'] = df["selfies"].apply(
    utils.selfies_to_encoding,
    vocab_stoi=voc,
    pad_to_len=pad_to_len,
)

df

dataset.csv already exists in the current directory.


Unnamed: 0,smiles,pIC50,active,finger print,selfies,mordred,selfies encoding
0,Brc1cccc(Nc2ncnc3cc4ccccc4cc23)c1,11.522879,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[Br][C][=C][C][=C][C][Branch2][Ring1][=Branch1...,"[0.0, 0.0, 20.0, 22.0, 34.0, 22.0, 0.0, 0.0, 4...","[27, 34, 15, 34, 15, 34, 29, 46, 13, 41, 34, 1..."
1,CCOc1cc2ncnc(Nc3cccc(Br)c3)c2cc1OCC,11.221849,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[C][C][O][C][=C][C][=N][C][=N][C][Branch1][=N]...,"[0.0, 0.0, 16.0, 17.0, 42.0, 24.0, 0.0, 0.0, 6...","[34, 34, 44, 34, 15, 34, 18, 34, 18, 34, 28, 1..."
2,CN(C)c1cc2c(Nc3cccc(Br)c3)ncnc2cn1,11.221849,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",[C][N][Branch1][C][C][C][=C][C][=C][Branch1][=...,"[0.0, 0.0, 16.0, 17.0, 35.0, 21.0, 0.0, 0.0, 6...","[34, 41, 28, 34, 34, 34, 15, 34, 15, 28, 18, 4..."
3,Brc1cccc(Nc2ncnc3cc4[nH]cnc4cc23)c1,11.096910,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[Br][C][=C][C][=C][C][Branch2][Ring1][Branch1]...,"[0.0, 0.0, 19.0, 21.0, 31.0, 21.0, 0.0, 0.0, 6...","[27, 34, 15, 34, 15, 34, 29, 46, 28, 41, 34, 1..."
4,CNc1cc2c(Nc3cccc(Br)c3)ncnc2cn1,11.096910,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",[C][N][C][=C][C][=C][Branch1][=N][N][C][=C][C]...,"[0.0, 0.0, 16.0, 17.0, 32.0, 20.0, 0.0, 0.0, 6...","[34, 41, 34, 15, 34, 15, 28, 18, 41, 34, 15, 3..."
...,...,...,...,...,...,...,...
4630,COc1cc(C=C(C#N)C#N)cc(C)c1O,2.585027,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[C][O][C][=C][C][Branch1][=Branch2][C][=C][Bra...,"[0.0, 0.0, 6.0, 6.0, 26.0, 16.0, 0.0, 0.0, 4.0...","[34, 44, 34, 15, 34, 28, 14, 34, 15, 28, 46, 3..."
4631,O=C(O)/C=C/c1ccc(O)cc1,2.522879,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[O][=C][Branch1][C][O][/C][=C][/C][=C][C][=C][...,"[1.0, 0.0, 6.0, 6.0, 20.0, 12.0, 0.0, 0.0, 3.0...","[44, 15, 28, 34, 44, 7, 15, 7, 15, 34, 15, 28,..."
4632,CN(c1cccnc1)c1cc2c(Nc3ccc(F)c(Cl)c3)c(C#N)cnc2cn1,2.301030,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",[C][N][Branch1][=Branch2][C][=C][C][=C][N][=C]...,"[0.0, 0.0, 22.0, 23.0, 43.0, 29.0, 0.0, 0.0, 8...","[34, 41, 28, 14, 34, 15, 34, 15, 41, 15, 46, 1..."
4633,N#CC(C#N)Cc1ccc(O)cc1,2.187087,0.0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[N][#C][C][Branch1][Ring1][C][#N][C][C][=C][C]...,"[0.0, 0.0, 6.0, 6.0, 21.0, 13.0, 0.0, 0.0, 3.0...","[41, 3, 34, 28, 46, 34, 4, 34, 34, 15, 34, 15,..."


## Cross validation

### Baseline model

In [40]:
from sklearn import svm, metrics, clone

# Baseline classifier: RBF-kernel support vector machine with probability
# estimates switched on.
model_SVM = svm.SVC(
    kernel="rbf",
    C=1,
    gamma=0.5,
    probability=True,
)

# Cross-validate the baseline on the fingerprint column against the binary label.
base_line_cv = utils.base_model_crossvalidation(
    model_SVM,
    df,
    X_columns='finger print',
    y_columns='active',
)

5it [07:42, 92.45s/it]

ACC:	 0.81 ± 0.01 
AUC:	 0.82 ± 0.02 
F1:	 0.10 ± 0.03 






In [41]:
# Persist the baseline cross-validation results as pretty-printed JSON.
with open('saved_results/base_line_cv.json', 'w') as results_file:
    json.dump(base_line_cv, results_file, indent=4)

### RNN model

In [6]:
# Load the stored recursive-feature-elimination results: a list of records,
# each holding a feature count, an AUC score and the selected feature indices.
saving_path_mordred_RFE = 'saved_results/Mordred_recursive_feature_selection.json'
with open(saving_path_mordred_RFE, 'r') as rfe_file:
    RFE_dict = json.load(rfe_file)

# Unpack the records into three parallel lists.
nummber_of_features = [record["Number of features"] for record in RFE_dict]
AUC_scores = [record["AUC Score"] for record in RFE_dict]
selected_modred_features_indices = [record["Selected Mordred features indices"] for record in RFE_dict]

# Position of the record with the highest AUC (first one on ties), looked up once.
best_rfe_position = np.argmax(AUC_scores)
optimal_mordred_features_indices = selected_modred_features_indices[best_rfe_position]

print(
    f'{len(optimal_mordred_features_indices)} Mordred features are selected, that result Test AUC: {AUC_scores[best_rfe_position]}'
    f'\n\nMordred feature indices:\n{optimal_mordred_features_indices}'
)


60 Mordred features are selected, that result Test AUC: 0.903

Mordred feature indices:
[56, 126, 129, 134, 135, 137, 138, 139, 154, 158, 162, 169, 176, 180, 190, 194, 199, 203, 206, 207, 246, 264, 343, 350, 354, 358, 387, 417, 423, 428, 429, 431, 441, 444, 445, 452, 455, 459, 464, 468, 470, 471, 473, 474, 485, 490, 491, 492, 493, 494, 495, 496, 514, 527, 600, 613, 616, 617, 618, 623]


In [33]:
from dataclasses import dataclass
@dataclass
class LSTM_Config:
    """Hyperparameter bundle handed to utils.RNN_model_crossvalidation.

    The fields are only declared here; how each one is consumed is decided
    inside `utils` and is not visible in this notebook. The per-field notes
    below are inferred from the names and should be confirmed against that code.
    """
    # Number of entries in `voc`, the SELFIES vocabulary loaded from JSON.
    # Evaluated once, when this class statement runs, so `voc` must already
    # be defined and later changes to `voc` are not picked up.
    vocab_size: int = len(voc)
    # presumably the mini-batch size used during training - TODO confirm
    batch_size: int = 64
    # presumably the width of the recurrent layer - TODO confirm
    rnn_units: int = 16
    # presumably the width of a dense hidden layer - TODO confirm
    hidden_dim: int = 32
    # presumably the token-embedding dimension - TODO confirm
    embedding_dim: int = 8
    # presumably a weight-regularisation coefficient - TODO confirm which kind
    reg_strength: float = 0.001
    # presumably the optimiser learning rate - TODO confirm
    lr: float = 1e-3
    # presumably the dropout rate - TODO confirm
    drop_rate: float = 0.2
    # presumably the maximum number of training epochs - TODO confirm
    epochs: int = 100
    # presumably the number of non-improving epochs tolerated before stopping - TODO confirm
    early_stopping_patience: int = 8
    

# Single shared instance with all defaults, used by the cross-validation cells.
lstm_config = LSTM_Config()


In [34]:
# Cross-validate the RNN with both optional inputs (fingerprint and Mordred)
# switched off, then store the results as JSON.
RNN_simple = utils.RNN_model_crossvalidation(
    df,
    lstm_config,
    optimal_mordred_features_indices=optimal_mordred_features_indices,
    add_finger_print=False,
    add_mordred=False,
)
with open('saved_results/RNN_simple_cv.json', 'w') as results_file:
    json.dump(RNN_simple, results_file, indent=4)

0it [00:00, ?it/s]



1it [07:30, 450.14s/it]



2it [12:37, 366.36s/it]



3it [21:36, 445.19s/it]



4it [27:19, 404.83s/it]



5it [37:28, 449.78s/it]

ACC:	 0.81 ± 0.01 
AUC:	 0.80 ± 0.01 
F1:	 0.30 ± 0.16 






In [36]:
# Cross-validate the RNN with the fingerprint input switched on and the
# Mordred input switched off, then store the results as JSON.
RNN_with_ECFP4 = utils.RNN_model_crossvalidation(
    df,
    lstm_config,
    optimal_mordred_features_indices=optimal_mordred_features_indices,
    add_finger_print=True,
    add_mordred=False,
)

with open('saved_results/RNN_with_ECFP4_cv.json', 'w') as results_file:
    json.dump(RNN_with_ECFP4, results_file, indent=4)

0it [00:00, ?it/s]



1it [03:32, 212.45s/it]



2it [07:09, 215.37s/it]



3it [09:24, 178.46s/it]



4it [14:01, 217.33s/it]



5it [16:50, 202.12s/it]

ACC:	 0.85 ± 0.01 
AUC:	 0.89 ± 0.01 
F1:	 0.59 ± 0.03 






In [38]:
# Cross-validate the RNN with both the fingerprint and the Mordred inputs
# switched on, then store the results as JSON.
# The helper is reached through the `utils` module, as in the other RNN cells:
# the bare name RNN_model_crossvalidation is never imported in this notebook
# and raises NameError on a fresh kernel.
RNN_with_mordred = utils.RNN_model_crossvalidation(
    df,
    lstm_config,
    optimal_mordred_features_indices=optimal_mordred_features_indices,
    add_finger_print=True,
    add_mordred=True,
)

with open('saved_results/RNN_with_ECFP4_mordred_cv.json', 'w') as f:
    json.dump(RNN_with_mordred, f, indent = 4)

0it [00:00, ?it/s]



1it [04:10, 250.95s/it]



2it [06:53, 198.89s/it]



3it [09:38, 183.56s/it]



4it [13:04, 192.41s/it]



5it [16:24, 196.95s/it]

ACC:	 0.86 ± 0.01 
AUC:	 0.89 ± 0.01 
F1:	 0.60 ± 0.02 




