# **QSAR Model Building of H. pylori Urease Inhibitors**

EL ATOUCH MOUNSIF

# Read in data

In [1]:
import pandas as pd

In [2]:
# Remote CSV with the PubChem fingerprint columns (PubchemFP0 ... PubchemFP880)
# and the `activity` label column.
dataset_url = 'https://raw.githubusercontent.com/mounsifelatouch/cdd/master/data/bioactivity_data_PubchemFingerprinter.csv'

# Download and parse the table. The CSV's unnamed leading column is kept as a
# regular column and shows up as `Unnamed: 0`.
dataset = pd.read_csv(filepath_or_buffer=dataset_url)

# Bare expression so the notebook renders the frame.
dataset

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880,activity
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
608,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
609,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
610,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Feature matrix: the fingerprint columns only.
#
# `activity` is the label and is removed as before (a missing label column
# still raises KeyError).
#
# `Unnamed: 0` is the row counter stored as the CSV's first column (it runs
# 0..610 in the preview of `dataset`). It is not a molecular descriptor, yet
# its variance is large enough to survive the variance filter, so it would be
# used as a model input and written to descriptor_list.csv. It is dropped
# here; errors='ignore' keeps the cell working if the CSV is ever re-exported
# without that column.
#
# NOTE(review): the outputs recorded further down were produced with
# `Unnamed: 0` still present; re-run the notebook to refresh them.
X = (
    dataset
    .drop(columns=['activity'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

# Render the feature table.
X

Unnamed: 0,PubchemFP0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP4,PubchemFP5,PubchemFP6,PubchemFP7,PubchemFP8,PubchemFP9,...,PubchemFP871,PubchemFP872,PubchemFP873,PubchemFP874,PubchemFP875,PubchemFP876,PubchemFP877,PubchemFP878,PubchemFP879,PubchemFP880
0,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,1,1,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
608,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
609,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
610,1,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Target vector: the `activity` column (0/1 values in the preview of `dataset`).
y = dataset.loc[:, 'activity']

# Remove low variance features

In [16]:
from sklearn.feature_selection import VarianceThreshold

def remove_low_variance(input_data, threshold=0.01):
    """Return ``input_data`` restricted to the columns kept by a variance filter.

    A ``VarianceThreshold`` selector is fitted on ``input_data``; the columns
    it reports as supported are then selected from the frame by label.

    Args:
        input_data: DataFrame of features to filter (must expose ``.columns``
            and label-based column selection).
        threshold: Cut-off handed to ``VarianceThreshold``. Defaults to 0.01.

    Returns:
        A frame containing only the retained columns.
    """
    selector = VarianceThreshold(threshold).fit(input_data)

    # Integer positions of the surviving columns, mapped back to their labels.
    kept_positions = selector.get_support(indices=True)
    kept_labels = input_data.columns[kept_positions]

    return input_data[kept_labels]

# Apply the variance filter with a 0.01 cut-off and rebind X to the reduced
# feature table.
X = remove_low_variance(input_data=X, threshold=0.01)

# Render the filtered features.
X

Unnamed: 0,PubchemFP1,PubchemFP2,PubchemFP3,PubchemFP11,PubchemFP12,PubchemFP13,PubchemFP14,PubchemFP15,PubchemFP16,PubchemFP17,...,PubchemFP819,PubchemFP820,PubchemFP821,PubchemFP822,PubchemFP824,PubchemFP826,PubchemFP830,PubchemFP833,PubchemFP834,PubchemFP836
0,1,0,0,1,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,1,0,0,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,1,0,1,1,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0
3,1,1,0,1,1,0,1,0,0,0,...,1,0,1,0,0,0,0,0,0,0
4,1,0,0,1,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,1,0,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
608,1,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
609,1,0,0,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
610,1,0,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,1,0,1


In [17]:
# Write the retained feature table to disk; index=False leaves the row index
# out of the CSV.
X.to_csv(path_or_buf='descriptor_list.csv', index=False)

# Logistic Regression Model

In [22]:
import warnings

from sklearn.linear_model import LogisticRegression

# Fixed seed, handed to the classifier as random_state.
seed = 123

# NOTE(review): this filter has no category or module restriction, so it
# hides every warning for the rest of the session (solver convergence
# warnings included, if any are raised). Consider narrowing it.
warnings.filterwarnings('ignore')

In [20]:
# L2-penalised logistic regression fitted on the full (X, y) table.
# class_weight='balanced' reweights the classes by their frequencies.
# NOTE(review): the C and max_iter values look like the result of a
# hyperparameter search that is not part of this notebook; confirm their origin.
model = LogisticRegression(
    penalty='l2',
    C=1.7118630423373915,
    class_weight='balanced',
    solver='sag',
    max_iter=335,
    random_state=seed,
).fit(X, y)

## Model Prediction

In [26]:
# Probabilities for the same rows the model was fitted on. Column 1 of
# predict_proba belongs to the second entry of model.classes_ (presumably
# label 1, given the 0/1 `activity` column).
y_pred_proba = model.predict_proba(X)[:, 1]

# Render the probability vector.
y_pred_proba

array([5.22142531e-04, 4.92609690e-04, 3.05829633e-04, 1.63370164e-03,
       1.04969183e-04, 6.84532133e-05, 2.56353142e-04, 5.75563375e-04,
       3.39577669e-03, 2.95149580e-03, 4.70095829e-04, 1.15091743e-04,
       1.15091743e-04, 4.70095829e-04, 2.41171065e-05, 2.55178551e-05,
       1.31898654e-03, 1.15091743e-04, 2.65740317e-04, 1.53749517e-04,
       1.25566394e-04, 1.15091743e-04, 2.59661861e-03, 3.16588801e-04,
       3.78700301e-04, 1.19005176e-04, 7.79439996e-05, 4.20954648e-04,
       5.78285264e-06, 1.15181135e-01, 1.23860722e-04, 1.58454272e-03,
       9.18378618e-04, 1.89359838e-02, 1.47958522e-01, 1.01255917e-01,
       5.89011674e-02, 6.08341069e-01, 4.82411704e-03, 8.82069779e-03,
       5.12797707e-03, 9.91401968e-01, 6.61206449e-01, 9.93145131e-01,
       9.97518485e-01, 9.86292144e-01, 7.44060859e-01, 9.89386778e-01,
       9.97779609e-01, 9.74666608e-01, 9.87850394e-01, 9.00622670e-01,
       9.46354061e-01, 9.89148877e-01, 9.83348584e-01, 9.96483267e-01,
      

## Model Performance

In [23]:
from sklearn.metrics import *

In [27]:
# Area under the ROC curve, computed from the probabilities predicted for the
# data the model was fitted on.
roc_auc = roc_auc_score(y_true=y, y_score=y_pred_proba)

roc_auc

0.9974488384955752

In [30]:
# Precision/recall pairs over the decision thresholds, again on the fitted data.
(precision,
 recall,
 thresholds_log) = precision_recall_curve(y, y_pred_proba)

# Area under the precision-recall curve: recall on the x axis, precision on y.
auc_precision_recall = auc(x=recall, y=precision)

auc_precision_recall

0.9901237103269839

In [34]:
# Predict once and reuse the labels for every metric; previously
# model.predict(X) was recomputed for each score.
y_pred = model.predict(X)

# average=None yields one value per class instead of a single aggregate.
precision_sc = precision_score(y, y_pred, average=None)
recall_sc = recall_score(y, y_pred, average=None)

# Default averaging: a single F1 value.
f1 = f1_score(y, y_pred)

# Accuracy in percent on the data the model was fitted on.
train_score = accuracy_score(y, y_pred) * 100

# NOTE(review): no hold-out split exists in this notebook, so `test_score` is
# the very same training-set accuracy and says nothing about generalisation.
# The name is kept so that any later cell relying on it still works.
test_score = train_score

In [35]:
# Show the per-class precision array, the per-class recall array and the
# scalar F1 score together as one tuple.
(precision_sc, recall_sc, f1)

(array([0.9954955 , 0.94047619]),
 array([0.97787611, 0.9875    ]),
 0.9634146341463415)

# Save Model as Pickle Object

In [36]:
import pickle

In [37]:
# Serialise the fitted model. The context manager guarantees the file is
# flushed and closed even if pickling fails; the previous bare open() call
# left the handle open.
# NOTE: only unpickle this file from a trusted location, since loading a
# pickle can execute arbitrary code.
with open('HPU_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)