In [2]:
from padelpy import from_smiles
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

SEED = 45

In [3]:
train_data_path = os.path.join('..', 'data', 'preprocessed', '001_preprocessed_train_data.csv')
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv('../data/preprocessed/001_preprocessed_test_data.csv')
train_df.head()

Unnamed: 0,CANONICAL_SMILES,ACTIVITY
0,C1CCN(C1)C(=O)C2=NOC(=C2)C3=CC=CC=C3Cl,1
1,C1C2CC3CC1CC(C2)(C3)C(=O)NCC4=NN=C(N4C5=CC=C(C...,1
2,CCN(CCC#N)C1=CC(=C(C=C1)/C=N/NC2=NC(=CC(=O)N2)C)C,0
3,CC1=C(N(C2=C1C=C(C=C2)C(=O)OCCCN(C)C)CC3=CC=CC...,1
4,C1=CC=C2C(=C1)NC3=C(C(C(=C(N23)N)C#N)C4=CC=C(C...,1


## Calculating fingerprints

In [4]:
train_smiles_data = train_df.copy()
train_smiles_data['Name'] = train_df.index
train_smiles_data.drop('ACTIVITY', axis=1, inplace=True)


test_smiles_data = test_df.copy()
test_smiles_data['Name'] = test_df.index
test_smiles_data.drop('ACTIVITY', axis=1, inplace=True)


train_smiles_data.head()

Unnamed: 0,CANONICAL_SMILES,Name
0,C1CCN(C1)C(=O)C2=NOC(=C2)C3=CC=CC=C3Cl,0
1,C1C2CC3CC1CC(C2)(C3)C(=O)NCC4=NN=C(N4C5=CC=C(C...,1
2,CCN(CCC#N)C1=CC(=C(C=C1)/C=N/NC2=NC(=CC(=O)N2)C)C,2
3,CC1=C(N(C2=C1C=C(C=C2)C(=O)OCCCN(C)C)CC3=CC=CC...,3
4,C1=CC=C2C(=C1)NC3=C(C(C(=C(N23)N)C#N)C4=CC=C(C...,4


In [None]:
# calculate padel descriptors
import sys
sys.path.append('../scripts/')

from calculatePadelDesc import run_padel
import sys

train_smiles_data.to_csv('train_smiles.smi', sep='\t', index=False, header=False)
run_padel('train_smiles.smi', 'train_padel_desc.csv', '../padelDescriptor/padelDescriptor.jar')

In [None]:
test_smiles_data.to_csv('test_smiles.smi', sep='\t', index=False, header=False)
run_padel('test_smiles.smi', 'test_padel_desc.csv', '../padelDescriptor/padelDescriptor.jar')

In [6]:
test_desc = pd.read_csv('test_padel_desc.csv')
test_desc.sort_values(by=['Name'], inplace=True)

test_desc = pd.merge(test_desc, test_smiles_data, on='Name', how='inner')
test_desc = pd.merge(test_desc, test_df, on='CANONICAL_SMILES', how='inner')

test_desc.drop(['Name', 'CANONICAL_SMILES'], axis=1, inplace = True)

test_desc.to_csv('../data/preprocessed/002_test_padel_fps.csv', index=False)

test_desc.head()

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,naAromAtom,nAromBond,nAtom,nHeavyAtom,nH,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ACTIVITY
0,2,1.6063,2.5802,111.9878,58.124239,0,0,50,27,23,...,52.65719,1.950266,18.058042,9.47162,6.061732,152000000000.0,30.0,0.95,122.0,1
1,0,0.4641,0.215389,116.4357,57.277067,0,0,47,28,19,...,56.786613,2.028093,23.257984,11.177582,9.092101,2512.0,35.0,1.795,138.0,1
2,0,0.6107,0.372954,106.3547,58.632204,0,0,52,24,28,...,49.039531,2.043314,9.465805,2.557161,6.908643,1362.0,37.0,3.074,122.0,1
3,0,1.3655,1.86459,65.9238,33.287516,0,0,28,16,12,...,32.273884,2.017118,8.437846,8.437846,0.0,410.0,25.0,2.088,80.0,0
4,0,1.9584,3.835331,124.5475,58.605895,0,0,47,32,15,...,64.307573,2.009612,27.136828,8.454813,6.055903,3377.0,52.0,3.785,162.0,1


## Preprocesiing

In [8]:
desc = pd.read_csv('train_padel_desc.csv')
desc.sort_values(by=['Name'], inplace=True)

desc = pd.merge(desc, train_smiles_data, on='Name', how='inner')
desc = pd.merge(desc, train_df, on='CANONICAL_SMILES', how='inner')

desc.drop(['Name', 'CANONICAL_SMILES'], axis=1, inplace = True)
desc.to_csv('../data/preprocessed/002_train_padel_fps.csv', index=False)

# # drop empty or one data columns
# low_variant_cols = desc.nunique()[desc.nunique() < 2].index
# low_variant_cols

# desc.drop(low_variant_cols, axis=1, inplace=True)

desc.shape

(9347, 1445)

In [41]:
desc.shape

(9346, 1198)

## Preprocessing categorical data

In [6]:
desc.describe()

  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _ensure_numeric((avg - values) ** 2)
  sqr = _e

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,nAtom,nHeavyAtom,nH,nB,nC,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ACTIVITY
count,9346.0,9253.0,9253.0,9253.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,...,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9267.0,9346.0,9346.0
mean,0.116199,0.573901,2.224509,104.141609,54.877386,46.125615,25.847528,20.278087,0.000107,18.916542,...,52.401802,2.024742,19.920766,7.840438,9.090722,5129041000.0,39.57126,1.922119,133.64327,0.401455
std,0.432471,1.376718,3.273835,24.442425,13.788791,12.119101,6.130049,6.973267,0.010344,4.895887,...,12.662543,0.038712,6.920471,4.878467,4.775299,31383720000.0,12.565586,1.321932,34.500398,0.490219
min,0.0,-7.5606,2e-06,21.465,11.139172,9.0,5.0,3.0,0.0,2.0,...,7.87132,1.574264,2.532151,0.0,0.0,18.0,1.0,-6.996,10.0,0.0
25%,0.0,-0.2907,0.239219,87.4975,45.554877,38.0,22.0,15.0,0.0,16.0,...,43.815496,2.001979,14.972028,5.027177,6.094195,1108.0,31.0,1.0895,110.0,0.0
50%,0.0,0.6265,1.058018,103.5187,54.424618,45.0,26.0,20.0,0.0,19.0,...,52.051752,2.027358,19.792143,7.659857,9.169569,1841.0,39.0,1.885,132.0,0.0
75%,0.0,1.5191,3.034216,120.4307,63.327799,54.0,30.0,24.0,0.0,22.0,...,60.592086,2.049783,24.094879,10.660637,12.397808,2864.75,47.0,2.7185,156.0,1.0
max,6.0,7.5686,57.283706,415.4782,330.021032,196.0,113.0,111.0,1.0,71.0,...,227.461918,2.154775,122.873867,70.160254,61.155255,1056000000000.0,209.0,10.916,608.0,1.0


In [None]:
# outlier
# variance
# correlation
# normalization



In [27]:
continuous_cols = desc.select_dtypes(include='float64').columns

Q1 = desc[continuous_cols].quantile(0.25)
Q3 = desc[continuous_cols].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
desc[continuous_cols] = desc[continuous_cols].clip(lower=lower_bound, upper=upper_bound, axis=1)
desc.describe()

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,nAtom,nHeavyAtom,nH,nB,nC,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ACTIVITY
count,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,...,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0,9346.0
mean,0.0,0.58,1.96,103.99,54.69,45.97,25.78,20.17,0.0,18.87,...,52.28,2.03,19.86,7.75,9.06,2191.58,39.37,1.92,133.26,0.4
std,0.0,1.33,2.16,23.43,12.71,11.4,5.76,6.52,0.0,4.67,...,11.96,0.04,6.54,4.5,4.63,1424.92,11.61,1.25,32.44,0.49
min,0.0,-2.96,0.0,38.65,18.9,14.0,10.0,3.0,0.0,7.0,...,18.65,1.93,2.53,0.0,0.0,18.0,7.0,-1.31,41.0,0.0
25%,0.0,-0.28,0.24,87.63,45.55,38.0,22.0,15.0,0.0,16.0,...,43.82,2.0,14.97,5.03,6.09,1108.0,31.0,1.1,110.0,0.0
50%,0.0,0.61,1.08,103.79,54.42,45.0,26.0,20.0,0.0,19.0,...,52.05,2.03,19.79,7.66,9.17,1841.0,39.0,1.9,132.0,0.0
75%,0.0,1.51,3.0,120.28,63.33,54.0,30.0,24.0,0.0,22.0,...,60.59,2.05,24.09,10.66,12.4,2864.75,47.0,2.71,156.0,1.0
max,0.0,4.19,7.13,169.25,89.99,78.0,42.0,37.5,0.0,31.0,...,85.76,2.12,37.78,19.11,21.85,5499.88,71.0,5.13,225.0,1.0


In [24]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')

impute_columns = desc.columns
imputed_df = imputer.fit_transform(desc)

desc = pd.DataFrame(imputed_df, columns=impute_columns)
desc.head()


Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,nAtom,nHeavyAtom,nH,nB,nC,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ACTIVITY
0,0.0,0.04,0.0,74.93,39.29,32.0,19.0,13.0,0.0,14.0,...,38.84,2.04,14.64,5.66,6.43,726.0,26.0,2.17,100.0,1.0
1,0.0,1.69,2.85,139.57,75.41,63.0,35.0,28.0,0.0,27.0,...,73.19,2.09,23.7,2.59,12.94,3948.0,57.0,5.16,198.0,1.0
2,0.0,1.38,1.91,100.69,53.75,47.0,25.0,22.0,0.0,18.0,...,49.89,2.0,20.5,2.52,17.98,1794.0,35.0,2.1,120.0,0.0
3,0.0,2.57,6.59,117.77,62.95,55.0,27.0,28.0,0.0,23.0,...,54.64,2.02,12.04,5.55,6.49,2074.0,41.0,3.09,138.0,1.0
4,0.0,1.71,2.93,107.62,49.99,37.0,25.0,12.0,0.0,19.0,...,51.68,2.07,16.92,0.0,14.4,1320.0,47.0,2.42,138.0,1.0


In [26]:
# desc.drop('Name',axis=1, inplace=True)
desc.to_csv('../data/preprocessed/002_train_pubchem_fps.csv', index=False)

## Initial modeling

In [19]:
train_dataset = pd.read_csv('../data/preprocessed/002_train_pubchem_fps.csv')
y = train_dataset['ACTIVITY']
X = train_dataset.drop('ACTIVITY', axis=1)

train_dataset.head()

Unnamed: 0,nAcid,ALogP,ALogp2,AMR,apol,nAtom,nHeavyAtom,nH,nB,nC,...,WTPT-1,WTPT-2,WTPT-3,WTPT-4,WTPT-5,WPATH,WPOL,XLogP,Zagreb,ACTIVITY
0,0,0.0435,0.001892,74.9344,39.292309,32,19,13,0,14,...,38.842627,2.044349,14.641057,5.661056,6.431126,726.0,26.0,2.168,100.0,1
1,0,1.6888,2.852045,139.5712,75.406204,63,35,28,0,27,...,73.193694,2.091248,23.704766,2.586746,12.936973,3948.0,57.0,5.162,198.0,1
2,0,1.381,1.907161,100.6898,53.751446,47,25,22,0,18,...,49.88537,1.995415,20.501919,2.522207,17.979712,1794.0,35.0,2.103,120.0,0
3,0,2.5668,6.588462,117.7691,62.954204,55,27,28,0,23,...,54.636352,2.023569,12.038128,5.551697,6.486431,2074.0,41.0,3.09,138.0,1
4,0,1.7116,2.929575,107.6245,49.991516,37,25,12,0,19,...,51.678547,2.067142,16.921402,0.0,14.395067,1320.0,47.0,2.42,138.0,1


In [20]:
y.value_counts()

ACTIVITY
0    5594
1    3752
Name: count, dtype: int64

In [21]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=45)
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models

 97%|█████████▋| 31/32 [09:58<00:08,  8.80s/it]

[LightGBM] [Info] Number of positive: 2613, number of negative: 3929
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.150047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 223781
[LightGBM] [Info] Number of data points in the train set: 6542, number of used features: 983
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.399419 -> initscore=-0.407886
[LightGBM] [Info] Start training from score -0.407886


100%|██████████| 32/32 [10:06<00:00, 18.97s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NuSVC,0.78,0.76,0.76,0.77,49.6
SVC,0.76,0.75,0.75,0.76,41.86
LGBMClassifier,0.76,0.74,0.74,0.75,7.91
RidgeClassifierCV,0.75,0.73,0.73,0.75,3.42
ExtraTreesClassifier,0.75,0.73,0.73,0.75,5.77
RidgeClassifier,0.74,0.72,0.72,0.74,0.87
LogisticRegression,0.74,0.72,0.72,0.74,10.5
LinearDiscriminantAnalysis,0.74,0.72,0.72,0.74,3.43
LinearSVC,0.73,0.72,0.72,0.73,47.17
RandomForestClassifier,0.74,0.72,0.72,0.74,16.93


## Random Forest Classifier - Data processing

In [40]:
train_fps.head()

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
