# Loading all the libraries

In [44]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdmolfiles
from rdkit.Chem.SaltRemover import SaltRemover
from rdkit.Chem.rdmolfiles import SmilesWriter
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprint
from rdkit.SimDivFilters.rdSimDivPickers import MaxMinPicker
from rdkit import DataStructs
import os
import csv
from mordred import Calculator, descriptors
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculati
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix , matthews_corrcoef , cohen_kappa_score

# Loading given dataset.
The given SMILES were converted into SDF format using KNIME analytics. The structures were loaded using RDKit and descriptors were calculated using MORDRED python library. Filter based feature selection technique was applied to select the features on the basis of feature importance obtained from descision tree classifier.

In [8]:
data=pd.read_csv('input_forms/final_data.csv')

In [9]:
label=pd.read_csv('label_training.csv')

In [None]:
X=data.iloc[:,:]

In [14]:
y=label.iloc[:,-1]

# Data imbalance
The data was evenly balanced to remove under and over-sampling

In [16]:
rus = RandomUnderSampler(random_state=42, replacement=True)# fit predictor and target variable
x_rus, y_rus = rus.fit_resample(X, y)

print('original dataset shape:', Counter(y))
print('Resample dataset shape', Counter(y_rus))

original dataset shape: Counter({1: 7550, 0: 1556})
Resample dataset shape Counter({0: 1556, 1: 1556})


# To split the data into Training and test in the ratio 70:30

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Applying gradient boosted algorithm
Several classification algorithms were applied and the best results were obtained from the Gradient boosted algorithm.

In [29]:
forest = GradientBoostingClassifier(n_estimators=500,random_state=0)
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))

Accuracy on training set: 0.936
Accuracy on test set: 0.880


In [31]:
y_train_pred=forest.predict(X_train)

In [32]:
y_test_pred=forest.predict(X_test)

# Model Evaluation

In [33]:
tn, fp, fn, tp = confusion_matrix(y_train, y_train_pred).ravel()

In [34]:
print(tn, fp, fn, tp)

718 380 27 5249


In [35]:
tn_t, fp_t, fn_t, tp_t = confusion_matrix(y_test, y_test_pred).ravel()

In [36]:
tn_t, fp_t, fn_t, tp_t

(210, 248, 81, 2193)

In [45]:
cohen_kappa_score(y_test, y_test_pred)

0.4949587348964327

In [46]:
matthews_corrcoef(y_test, y_test_pred)

0.512071225730151

# To predict the CAS anti-viral dataset
The SDF file was downloaded from ACS website and same molecular descriptors(as used in training set) were calculated.

# Loading data for CAS anti-viral dataset

In [47]:
cas=pd.read_csv('selected_cas.csv')

In [54]:
X_cas=cas.iloc[:,0:20]

In [55]:
y_cas=forest.predict(X_cas)

In [57]:
y_cas_result=pd.DataFrame(y_cas,columns=['cas_result'])

In [None]:
y_cas_result.to_csv('cas_res.csv')