In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('final_data6.csv')
df

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,CC1=NN(C(=C1C(=O)OCC(=O)NCCC2=CC=CC=C2)Cl)C3=C...,1
1,CC1=C(SC2=C1C(=O)N(C(=N2)SCC3=C(ON=C3C)C)C4CCC...,1
2,CCN(CC)CCCN(CC1=CC2=CC(=C(C=C2NC1=O)OC)OC)C(=O...,0
3,CC1=CC=CC=C1NC2=C/C(=N\S(=O)(=O)C3=CC=CS3)/C4=...,1
4,COC1=CC=C(C=C1)NC(=O)N2CCN(CC2)C3=NC(=CS3)C4=C...,1
...,...,...
9395,CCN(CC)S(=O)(=O)N1CCC(CC1)C(=O)NCC2=CC=CO2,0
9396,C1=CC=C(C=C1)N2C(=NN=N2)NCCC3=CC=CC=C3F,0
9397,CC1=CC(=NC(=C1C#N)SCC2=CC=C(C=C2)C#N)C,0
9398,COC1=CC=C(C=C1)CS(=O)(=O)/C(=C(\C(=O)NC2=CN=C(...,1


In [2]:
from rdkit import Chem
from rdkit.Chem import AllChem

smiles_list = df['PUBCHEM_EXT_DATASOURCE_SMILES'].tolist() 
mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

In [3]:
def morgan_fps(data):
    fps = [AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048) for mol in data]
    fp_array = [np.array(fp) for fp in fps]  
    column_names = ["morgan_" + str(i) for i in range(2048)]  
    return pd.DataFrame(fp_array, columns=column_names)

X = morgan_fps(mols)
X













Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9395,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9396,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9397,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9398,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
y = df["PUBCHEM_ACTIVITY_OUTCOME"]
y

0       1
1       1
2       0
3       1
4       1
       ..
9395    0
9396    0
9397    0
9398    1
9399    0
Name: PUBCHEM_ACTIVITY_OUTCOME, Length: 9400, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 5)
X_train


Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
7568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1107,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4364,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2022,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5155,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3046,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4079,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2254,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
from imblearn.over_sampling import SMOTE

# assuming X_train and y_train are your training features and labels

smote = SMOTE(sampling_strategy='minority',random_state = 9)

X_sm, y_sm = smote.fit_resample(X_train, y_train)


In [15]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# Initialize AdaBoost with Decision Tree as the base estimator
adaboost_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),  # Weak learner
    n_estimators=185, 
    learning_rate=0.1, 
    random_state=2
)

# Train the model on the synthetic dataset
adaboost_model.fit(X_sm, y_sm)

# Make predictions
preds = adaboost_model.predict(X_test)
proba = adaboost_model.predict_proba(X_test)




In [16]:
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
roc_auc = roc_auc_score(y_test, proba[:, 1]) 


print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1} accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")

sensitivity: 0.6435233160621762, specificity: 0.7355191256830601, precision: 0.7195828505214369, f1: 0.6794310722100657 accuracy: 0.6882978723404255, roc_auc: 0.7534284662646167 
balanced_accuracy: 0.6895212208726181


In [17]:
import pandas as pd
import numpy as np
dp= pd.read_csv('val_data.csv')
dp

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,C1=NC(=S)C2=C(N1)N(C=N2)C3C(C(C(O3)CO)O)O,1
1,C1=C2C3=C(C(=C1O)O)OC(=O)C4=CC(=C(C(=C43)OC2=O...,1
2,CC(CC1=CC(=C(C=C1)O)O)(C(=O)O)NN,0
3,CCCCOC1=CC=C(C=C1)C(=O)CCN2CCCCC2.Cl,0
4,CC1CC(=O)NN=C1C2=CC3=C(C=C2)N=C(N3)C4=CC=C(C=C...,0
...,...,...
1815,CC1=C2C(=CC=C1)C(=NC(=N2)CNS(=O)(=O)C3=C(C=CC(...,1
1816,C1=CC(=C(C=C1CC(C(=O)O)N)O)O,1
1817,COCC1=C(C=NN1C2=NC=C3CCCC4=CC=CC=C4C3=N2)C(=O)...,0
1818,CC[C@]1(CCC(O1)[C@@]2(CCC3(O2)C[C@@H]([C@H]([C...,1


In [18]:
valdata = dp.copy()
colname = ["PUBCHEM_EXT_DATASOURCE_SMILES", "PUBCHEM_ACTIVITY_OUTCOME"]
valdata = valdata[colname]

valdata.drop_duplicates(inplace = True, ignore_index = True)
valdata.reset_index(inplace = True, drop = True)
valdata

Unnamed: 0,PUBCHEM_EXT_DATASOURCE_SMILES,PUBCHEM_ACTIVITY_OUTCOME
0,C1=NC(=S)C2=C(N1)N(C=N2)C3C(C(C(O3)CO)O)O,1
1,C1=C2C3=C(C(=C1O)O)OC(=O)C4=CC(=C(C(=C43)OC2=O...,1
2,CC(CC1=CC(=C(C=C1)O)O)(C(=O)O)NN,0
3,CCCCOC1=CC=C(C=C1)C(=O)CCN2CCCCC2.Cl,0
4,CC1CC(=O)NN=C1C2=CC3=C(C=C2)N=C(N3)C4=CC=C(C=C...,0
...,...,...
1815,CC1=C2C(=CC=C1)C(=NC(=N2)CNS(=O)(=O)C3=C(C=CC(...,1
1816,C1=CC(=C(C=C1CC(C(=O)O)N)O)O,1
1817,COCC1=C(C=NN1C2=NC=C3CCCC4=CC=CC=C4C3=N2)C(=O)...,0
1818,CC[C@]1(CCC(O1)[C@@]2(CCC3(O2)C[C@@H]([C@H]([C...,1


In [19]:
smi_list = valdata["PUBCHEM_EXT_DATASOURCE_SMILES"].to_list()
molsv = [Chem.MolFromSmiles(smi) for smi in smi_list]

X_val = morgan_fps(molsv)
X_val





Unnamed: 0,morgan_0,morgan_1,morgan_2,morgan_3,morgan_4,morgan_5,morgan_6,morgan_7,morgan_8,morgan_9,...,morgan_2038,morgan_2039,morgan_2040,morgan_2041,morgan_2042,morgan_2043,morgan_2044,morgan_2045,morgan_2046,morgan_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1816,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1817,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1818,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [20]:
y_val = valdata["PUBCHEM_ACTIVITY_OUTCOME"]
y_val

0       1
1       1
2       0
3       0
4       0
       ..
1815    1
1816    1
1817    0
1818    1
1819    1
Name: PUBCHEM_ACTIVITY_OUTCOME, Length: 1820, dtype: int64

In [21]:
y_valpred = adaboost_model.predict(X_val)
y_valproba = adaboost_model.predict_proba(X_val)

In [22]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_auc_score

tn, fp, fn, tp = confusion_matrix(y_val, y_valpred).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
accuracy = (tn + tp) / (tn+tp+fn+fp)
balanced_accuracy = (sensitivity + specificity) / 2
precision = tp / (tp + fp)
f1 = 2 * (precision * sensitivity) / (precision + sensitivity)
#roc_auc = roc_auc_score(y_val, y_valproba[:, 1]) # calculate ROC AUC

print(f"sensitivity: {sensitivity}, specificity: {specificity}, precision: {precision}, f1: {f1}, accuracy: {accuracy}, roc_auc: {roc_auc} \nbalanced_accuracy: {balanced_accuracy}")


sensitivity: 0.7615384615384615, specificity: 0.589010989010989, precision: 0.6494845360824743, f1: 0.7010622154779971, accuracy: 0.6752747252747253, roc_auc: 0.7534284662646167 
balanced_accuracy: 0.6752747252747253
