In [1]:
import pandas as  pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn import metrics

import math
from rdkit.Chem import AllChem
import rdkit
Chem = rdkit.Chem
from rdkit.Chem import GraphDescriptors
from rdkit.Chem import MACCSkeys, rdFingerprintGenerator

from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, Conv1D, MaxPooling1D
from tensorflow.keras.optimizers import Adam

In [2]:
baseall = pd.read_csv('./data/base_all.tsv', sep = '\t')

In [3]:
baseall = baseall.drop_duplicates(subset=['Molecule ChEMBL ID'])

In [4]:
baseall = baseall[['Molecule ChEMBL ID', 'Standard Value', 'Smiles']]

In [6]:
baseall = baseall.rename(columns={'Molecule ChEMBL ID': 'ChEMBL ID', 'Standard Value': 'IC50'})

In [7]:
baseall['Unit'] = 'nM'

In [8]:
def convert_ic50_to_pic50(IC50_value):
    pIC50_value = 9 - math.log10(IC50_value)
    return pIC50_value

In [9]:
baseall["pIC50"] = baseall.apply(lambda x: round(convert_ic50_to_pic50(x.IC50), 2), axis=1)

In [10]:
# add a column
baseall['active'] = np.zeros(len(baseall))
# label 1 for a molecule with pIC50 >= 6.5, 0 otherwise
baseall.loc[baseall[baseall.pIC50 >= 6.5].index, "active"] = 1.0

In [11]:
def smiles_to_fp(smiles, method = 'maccs', n_bits = 2048):
    mol = Chem.MolFromSmiles(smiles)

    if method == 'maccs':
        return np.array(MACCSkeys.GenMACCSKeys(mol))
    
    if method == 'morgan2':
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius = 2, fsize = n_bits)
        return np.array(fpg.GetFingerprint(mol))

    if method == 'morgan3':
        fpg = rdFingerprintGenerator.GetMorganGenerator(radius = 3, fsize = n_bits)
        return np.array(fpg.GetFingerPrint(mol))

    else:
        return np.array(MACCkeys.GenMACCSKeys(mol))

In [12]:
baseall['fp'] = baseall['Smiles'].apply(smiles_to_fp)
baseall.head()

Unnamed: 0,ChEMBL ID,IC50,Smiles,Unit,pIC50,active,fp
0,CHEMBL247416,2.0,CC1CC1CNc1cc(C(=O)N[C@@H](Cc2ccccc2)[C@@H](N)C...,nM,8.7,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,CHEMBL429477,66.0,COCCN(C[C@H]1C[C@@H]1C)c1cc(C(=O)N[C@@H](Cc2cc...,nM,7.18,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,CHEMBL393850,279.0,CC1CC1CNc1cc(C(=O)N[C@H](CN)Cc2ccccc2)cc(N(C)S...,nM,6.55,1.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,CHEMBL1546,100000.0,CC(N)Cc1ccc(O)cc1,nM,4.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,CHEMBL502753,1000.0,CC(C)C[C@H](NC(=O)c1cc(C(=O)N[C@H](C)c2ccc(F)c...,nM,6.0,0.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [22]:
X = baseall['fp']
y = baseall['pIC50']

In [23]:
X = baseall.fp.tolist()
y = baseall.active.tolist()

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [28]:
sc = StandardScaler()
X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [36]:
knn = KNeighborsClassifier(n_neighbors=5)

In [37]:
knn.fit(X_train_sc, y_train)
knn.score(X_train_sc, y_train)

0.8591525707586669

In [38]:
knn.score(X_test_sc, y_test)

0.8057602143335566

In [39]:
cross_val_score(knn, X_train_sc, y_train, cv=10).mean()

0.8030500887940258

In [45]:
pred = knn.predict(X_test_sc)

In [50]:
metrics.precision_score(y_test, pred)

0.7685851318944844

In [51]:
metrics.recall_score(y_test, pred)

0.8685636856368564

In [52]:
print(metrics.classification_report(y_test, pred))

              precision    recall  f1-score   support

         0.0       0.85      0.74      0.79       755
         1.0       0.77      0.87      0.82       738

    accuracy                           0.81      1493
   macro avg       0.81      0.81      0.81      1493
weighted avg       0.81      0.81      0.81      1493

