In [41]:
from padelpy import from_smiles
import pandas as pd
import numpy as np
import os

SEED = 45

In [None]:
train_data_path = os.path.join('..', 'data', 'preprocessed', '001_preprocessed_train_data.csv')
train_df = pd.read_csv(train_data_path)
test_df = pd.read_csv('../data/preprocessed/001_preprocessed_test_data.csv')
train_df.head()

Unnamed: 0,CANONICAL_SMILES,ACTIVITY
0,C1CCN(C1)C(=O)C2=NOC(=C2)C3=CC=CC=C3Cl,1
1,C1C2CC3CC1CC(C2)(C3)C(=O)NCC4=NN=C(N4C5=CC=C(C...,1
2,CCN(CCC#N)C1=CC(=C(C=C1)/C=N/NC2=NC(=CC(=O)N2)C)C,0
3,CC1=C(N(C2=C1C=C(C=C2)C(=O)OCCCN(C)C)CC3=CC=CC...,1
4,C1=CC=C2C(=C1)NC3=C(C(C(=C(N23)N)C#N)C4=CC=C(C...,1


## Calculating fingerprints

In [None]:
! wget https://github.com/dataprofessor/padel/raw/main/fingerprints_xml.zip -P ../fingerprints_xml
! unzip ../fingerprints_xml/fingerprints_xml.zip -d ../fingerprints_xml/

In [21]:
import glob

xml_files = glob.glob("../fingerprints_xml/*.xml")
xml_files.sort()
xml_files

['../fingerprints_xml/AtomPairs2DFingerprintCount.xml',
 '../fingerprints_xml/AtomPairs2DFingerprinter.xml',
 '../fingerprints_xml/EStateFingerprinter.xml',
 '../fingerprints_xml/ExtendedFingerprinter.xml',
 '../fingerprints_xml/Fingerprinter.xml',
 '../fingerprints_xml/GraphOnlyFingerprinter.xml',
 '../fingerprints_xml/KlekotaRothFingerprintCount.xml',
 '../fingerprints_xml/KlekotaRothFingerprinter.xml',
 '../fingerprints_xml/MACCSFingerprinter.xml',
 '../fingerprints_xml/PubchemFingerprinter.xml',
 '../fingerprints_xml/SubstructureFingerprintCount.xml',
 '../fingerprints_xml/SubstructureFingerprinter.xml']

In [22]:
FP_List = [
    'AtomPairs2DCount',
    'AtomPairs2D',
    'EState',
    'CDKextended',
    'CDK',
    'CDKgraphonly',
    'KlekotaRothCount',
    'KlekotaRoth',
    'MACCS',
    'PubChem',
    'SubstructureCount',
    'Substructure'
]

fp = dict(zip(FP_List, xml_files))
fp

{'AtomPairs2DCount': '../fingerprints_xml/AtomPairs2DFingerprintCount.xml',
 'AtomPairs2D': '../fingerprints_xml/AtomPairs2DFingerprinter.xml',
 'EState': '../fingerprints_xml/EStateFingerprinter.xml',
 'CDKextended': '../fingerprints_xml/ExtendedFingerprinter.xml',
 'CDK': '../fingerprints_xml/Fingerprinter.xml',
 'CDKgraphonly': '../fingerprints_xml/GraphOnlyFingerprinter.xml',
 'KlekotaRothCount': '../fingerprints_xml/KlekotaRothFingerprintCount.xml',
 'KlekotaRoth': '../fingerprints_xml/KlekotaRothFingerprinter.xml',
 'MACCS': '../fingerprints_xml/MACCSFingerprinter.xml',
 'PubChem': '../fingerprints_xml/PubchemFingerprinter.xml',
 'SubstructureCount': '../fingerprints_xml/SubstructureFingerprintCount.xml',
 'Substructure': '../fingerprints_xml/SubstructureFingerprinter.xml'}

In [23]:
train_df.head()

Unnamed: 0,CANONICAL_SMILES,ACTIVITY
0,C1CCN(C1)C(=O)C2=NOC(=C2)C3=CC=CC=C3Cl,1
1,C1C2CC3CC1CC(C2)(C3)C(=O)NCC4=NN=C(N4C5=CC=C(C...,1
2,CCN(CCC#N)C1=CC(=C(C=C1)/C=N/NC2=NC(=CC(=O)N2)C)C,0
3,CC1=C(N(C2=C1C=C(C=C2)C(=O)OCCCN(C)C)CC3=CC=CC...,1
4,C1=CC=C2C(=C1)NC3=C(C(C(=C(N23)N)C#N)C4=CC=C(C...,1


In [43]:
test_df.to_csv('molecule.smi', sep='\t', index=False, header=False)

In [26]:
train_df.to_csv('molecule.smi', sep='\t', index=False, header=False)

In [46]:
from padelpy import padeldescriptor


fingerprint = 'PubChem'
fingerprint_output_file = ''.join([fingerprint, '.csv'])
fingerprint_descriptortypes = fp[fingerprint]

padeldescriptor(mol_dir='molecule.smi',d_file=fingerprint_output_file, #'Substructure.csv'
                #descriptortypes='SubstructureFingerprint.xml', 
                descriptortypes= fingerprint_descriptortypes,
                detectaromaticity=True,
                standardizenitro=True,
                standardizetautomers=True,
                threads=2,
                removesalt=True,
                log=True,
                fingerprints=True)

In [47]:
desc = pd.read_csv(fingerprint_output_file)
desc

Unnamed: 0,Name,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2330,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2331,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2332,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2333,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
desc.drop('Name',axis=1, inplace=True)
desc.to_csv('../data/preprocessed/002_train_pubchem_fps.csv', index=False)

## Initial modeling

In [36]:
train_fps = pd.read_csv('../data/preprocessed/002_train_pubchem_fps.csv')
activity = train_df['ACTIVITY']
train_fps.head()

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [38]:
activity.value_counts()

ACTIVITY
0    5589
1    3747
Name: count, dtype: int64

In [39]:
from lazypredict.Supervised import LazyClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(train_fps, activity, test_size=.3, random_state=45)
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models

 97%|█████████▋| 31/32 [08:26<00:04,  4.22s/it]

[LightGBM] [Info] Number of positive: 2605, number of negative: 3930
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011665 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1614
[LightGBM] [Info] Number of data points in the train set: 6535, number of used features: 538
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.398623 -> initscore=-0.411207
[LightGBM] [Info] Start training from score -0.411207


100%|██████████| 32/32 [08:27<00:00, 15.86s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NuSVC,0.75,0.73,0.73,0.75,33.26
ExtraTreesClassifier,0.76,0.73,0.73,0.75,3.69
LGBMClassifier,0.75,0.73,0.73,0.74,1.0
RandomForestClassifier,0.76,0.73,0.73,0.75,2.79
LogisticRegression,0.74,0.72,0.72,0.74,8.73
RidgeClassifier,0.74,0.72,0.72,0.73,0.56
RidgeClassifierCV,0.74,0.72,0.72,0.73,2.2
LinearDiscriminantAnalysis,0.74,0.72,0.72,0.73,2.19
LinearSVC,0.74,0.72,0.72,0.73,94.73
SVC,0.74,0.71,0.71,0.73,30.29


## Random Forest Classifier - Data processing

In [40]:
train_fps.head()

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
