In [None]:
import warnings
from functools import lru_cache

warnings.filterwarnings(action='ignore')

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from rdkit import Chem
from rdkit.Chem import MACCSkeys
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from tqdm.notebook import tqdm

In [None]:
def cf(cm):
    """Render a 2x2 confusion matrix as an annotated green heatmap and show it.

    Parameters
    ----------
    cm : array-like of shape (2, 2)
        Counts to plot. The y axis is labelled 'True Class' and the x axis
        'Predicted Class'; both axes use the tick labels 'NRB' and 'RB'.

    Notes
    -----
    Sets ``plt.rcParams["figure.figsize"]`` globally, so figures created
    afterwards also default to 7x5 inches.
    """
    class_ticks = ['NRB', 'RB']
    plt.rcParams["figure.figsize"] = (7, 5)

    # Values are cast to integers so the "d" annotation format is valid.
    counts = pd.DataFrame(data=cm, columns=['0', '1'], index=['0', '1']).astype('int')
    sns.heatmap(counts, annot=True, cmap='Greens', fmt="d",
                xticklabels=class_ticks, yticklabels=class_ticks,
                annot_kws={"size": 20})

    # Hide the tick marks but keep the tick labels.
    plt.tick_params(left=False, bottom=False)
    plt.xlabel('Predicted Class', fontsize=25)
    plt.ylabel('True Class', fontsize=25)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18, rotation=0)
    plt.show()

In [None]:
# Notebook-level seed value.
# NOTE(review): no later cell visible in this notebook reads this name;
# ensemb() and predict() take their own `seed` argument (default 123) and the
# sweep cells pass seeds from range(0, 1000, 10) explicitly.
seed = 2021

In [None]:
@lru_cache(maxsize=None)
def _ensemb_dataset(fp):
    """Load descriptors, labels and (optionally) MACCS bits once per kernel.

    The result is cached because ensemb() is called once per seed and test
    fraction, and reading the CSV plus parsing every SMILES string is identical
    on each call. Call ``_ensemb_dataset.cache_clear()`` after editing the CSV.
    Callers must treat the returned arrays as read-only.
    """
    df = pd.read_csv('data/All-Public_dataset_Mordred.csv').fillna(0)

    # NOTE(review): every column from index 2 onwards is used as a feature.
    # This assumes the first two columns hold 'SMILES' and 'Class' — TODO
    # confirm, otherwise the label leaks into the feature matrix.
    descriptors = df.iloc[:, 2:].to_numpy()
    labels = df['Class'].to_numpy()

    fingerprints = None
    if fp:
        bit_rows = []
        for smiles in df['SMILES']:
            # fillna(0) turns a missing SMILES into 0, and MolFromSmiles
            # returns None for strings it cannot parse; report both clearly.
            mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
            if mol is None:
                raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
            bit_rows.append([int(bit) for bit in MACCSkeys.GenMACCSKeys(mol).ToBitString()])
        fingerprints = np.array(bit_rows, dtype=int)

    return descriptors, fingerprints, labels


def ensemb(vot='soft', pca=False, fp=True, scaling=True, 
           undersampling=False, seed=123, test=0.2):
    """Fit the KNN/SVM/RF/GB voting ensemble on one stratified split and score it.

    Features are the columns of 'data/All-Public_dataset_Mordred.csv' from
    index 2 onwards (missing values replaced by 0), labels are the 'Class'
    column.

    Parameters
    ----------
    vot : str
        Passed to ``VotingClassifier(voting=...)``.
    pca : bool
        If true, replace the descriptor columns by their first 4 principal
        components. This is now honoured for ``fp=False`` as well; previously
        the flag was silently ignored in that case.
    fp : bool
        If true, append the MACCS-key bits computed from the 'SMILES' column.
    scaling : bool
        If true, standardise the final feature matrix with ``StandardScaler``.
    undersampling : bool
        If true, balance the classes with ``RandomUnderSampler`` before the
        split (so the test part is drawn from the balanced data too).
    seed : int
        Seeds the undersampler, the split, the PCA and the SVC/RF/GB
        estimators, so equal seeds give equal results.
    test : float
        ``test_size`` passed to ``train_test_split``.

    Returns
    -------
    ba : float
        Balanced accuracy on the test part.
    sn : float
        cm[1, 1] / (cm[1, 1] + cm[1, 0]), i.e. recall of the second class in
        sorted label order.
    sp : float
        cm[0, 0] / (cm[0, 0] + cm[0, 1]), i.e. recall of the first class.
    er : float
        Off-diagonal count divided by the total count.
    cm : numpy.ndarray
        Confusion matrix (rows: true class, columns: predicted class).

    Raises
    ------
    ValueError
        If ``fp`` is true and a SMILES entry cannot be parsed by RDKit.
    """
    descriptors, fingerprints, y = _ensemb_dataset(bool(fp))

    knn = KNeighborsClassifier(metric='manhattan', n_neighbors=5, weights='distance')
    # probability=True makes SVC usable for soft voting; its internal
    # cross-validation is random, hence the explicit random_state.
    svm = SVC(C=1, gamma=0.001, kernel='rbf', probability=True, random_state=seed)
    rf = RandomForestClassifier(max_depth=8, min_samples_leaf=8, min_samples_split=16,
                                n_estimators=100, random_state=seed)
    gb = GradientBoostingClassifier(learning_rate=0.1, max_depth=4, n_estimators=75,
                                    subsample=0.7, random_state=seed)

    votingC = VotingClassifier(estimators=[('knn', knn),
                                           ('svm', svm),
                                           ('rf', rf),
                                           ('gb', gb)], voting=vot, n_jobs=-1)

    X = descriptors
    if pca:
        # Only the descriptors are projected; fingerprint bits stay untouched.
        X = PCA(n_components=4, random_state=seed).fit_transform(X)
    if fp:
        X = np.hstack((X, fingerprints))

    # NOTE(review): the PCA and the scaler are fitted on the full data set
    # before the split, so test-set statistics leak into the preprocessing.
    # Left unchanged so results stay comparable with prepro()/predict().
    if scaling:
        X = preprocessing.StandardScaler().fit_transform(X)

    # np.array copies, so the cached arrays are never handed out directly.
    X = np.array(X)

    if undersampling:
        print('Undersampling...')
        X, y = RandomUnderSampler(random_state=seed).fit_resample(X, y)

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test, shuffle=True, stratify=y, random_state=seed)

    votingC.fit(x_train, y_train)
    y_pred = votingC.predict(x_test)

    ba = balanced_accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test.tolist(), y_pred)
    sn = cm[1, 1] / (cm[1, 1] + cm[1, 0])
    sp = cm[0, 0] / (cm[0, 0] + cm[0, 1])
    er = (cm[0, 1] + cm[1, 0]) / cm.sum()

    return ba, sn, sp, er, cm

In [None]:
def prepro(model, pca=False, scaling=True):
    """Build the classifier and feature matrix for one of the base models.

    Parameters
    ----------
    model : {'KNN', 'SVM', 'RF', 'GB'}
        Which classifier to configure. 'KNN' reads
        'data/All-Public_dataset_Mordred_GA_KNN.csv'; the other models read
        'data/All-Public_dataset_Mordred.csv'.
    pca : bool
        If true, replace the descriptor columns by their first 4 principal
        components before the fingerprint bits are appended (same treatment
        as in ensemb()). Previously this flag was accepted but ignored.
    scaling : bool
        If true, standardise the final feature matrix with ``StandardScaler``.

    Returns
    -------
    clf : estimator
        The unfitted scikit-learn classifier for ``model``.
    X : numpy.ndarray
        Descriptor columns (index 2 onwards, missing values replaced by 0)
        followed by the MACCS-key bits computed from the 'SMILES' column.
    y : numpy.ndarray
        Values of the 'Class' column.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names, or if a SMILES entry
        cannot be parsed by RDKit.
    """
    csv_path = 'data/All-Public_dataset_Mordred.csv'
    if model == 'KNN':
        csv_path = 'data/All-Public_dataset_Mordred_GA_KNN.csv'
        clf = KNeighborsClassifier(metric='manhattan', n_neighbors=5, weights='distance')
    elif model == 'SVM':
        clf = SVC(C=1, gamma=0.001, kernel='rbf')
    elif model == 'RF':
        clf = RandomForestClassifier(max_depth=8, min_samples_leaf=8, min_samples_split=16, n_estimators=100)
    elif model == 'GB':
        clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=4, n_estimators=75, subsample=0.7)
    else:
        # Fail before any file is read instead of hitting an unbound `clf`
        # at the return statement.
        raise ValueError(f"Unknown model {model!r}; expected 'KNN', 'SVM', 'RF' or 'GB'")

    # Only the file that is actually needed is read.
    df = pd.read_csv(csv_path).fillna(0)

    bit_rows = []
    for smiles in df['SMILES']:
        # fillna(0) turns a missing SMILES into 0, and MolFromSmiles returns
        # None for strings it cannot parse; report both with a clear message.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
        bit_rows.append([int(bit) for bit in MACCSkeys.GenMACCSKeys(mol).ToBitString()])

    # NOTE(review): every column from index 2 onwards is used as a feature.
    # This assumes the first two columns hold 'SMILES' and 'Class' — TODO
    # confirm, otherwise the label leaks into the feature matrix.
    x = df.iloc[:, 2:].to_numpy()
    if pca:
        x = PCA(n_components=4).fit_transform(x)

    X = np.hstack((x, np.array(bit_rows, dtype=int)))

    # NOTE(review): the scaler is fitted on the full data set, i.e. before
    # predict() splits it, so test-set statistics leak into the scaling.
    if scaling:
        X = preprocessing.StandardScaler().fit_transform(X)
    y = df['Class'].to_numpy()

    return clf, X, y

In [None]:
def predict(clf, X, y, seed=123, test=0.2, undersampling=False):
    """Fit ``clf`` on one stratified split of (X, y) and score the held-out part.

    Parameters
    ----------
    clf : estimator
        Object with ``fit`` and ``predict``. It is fitted in place. If it
        exposes a ``random_state`` parameter, that parameter is set to
        ``seed`` so that equal seeds give equal results.
    X, y : array-like
        Features and labels.
    seed : int
        Seeds the undersampler, the split and (where supported) ``clf``.
    test : float
        ``test_size`` passed to ``train_test_split``.
    undersampling : bool
        If true, balance the classes with ``RandomUnderSampler`` before the
        split (so the test part is drawn from the balanced data too).

    Returns
    -------
    ba : float
        Balanced accuracy on the test part.
    sn : float
        cm[1, 1] / (cm[1, 1] + cm[1, 0]), i.e. recall of the second class in
        sorted label order.
    sp : float
        cm[0, 0] / (cm[0, 0] + cm[0, 1]), i.e. recall of the first class.
    er : float
        Off-diagonal count divided by the total count.
    """
    # Without this, estimators such as random forests draw fresh randomness on
    # every fit and the metrics for a given `seed` are not reproducible.
    get_params = getattr(clf, 'get_params', None)
    if callable(get_params) and 'random_state' in get_params(deep=False):
        clf.set_params(random_state=seed)

    if undersampling:
        X, y = RandomUnderSampler(random_state=seed).fit_resample(X, y)

    x_train, x_test, y_train, y_test = train_test_split(
        X, y, test_size=test, shuffle=True, stratify=y, random_state=seed)

    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    ba = balanced_accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test.tolist(), y_pred)
    sn = cm[1, 1] / (cm[1, 1] + cm[1, 0])
    sp = cm[0, 0] / (cm[0, 0] + cm[0, 1])
    er = (cm[0, 1] + cm[1, 0]) / cm.sum()

    return ba, sn, sp, er

In [None]:
clf, X, y = prepro('KNN')

# Outer lists get one entry per test fraction; every entry is the list of
# per-seed values (seeds 0, 10, ..., 990) for that fraction.
knn_ba_list, knn_sn_list, knn_sp_list, knn_er_list = [], [], [], []
for t in tqdm([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]):
    runs = [predict(clf, X, y, seed=r, test=t) for r in range(0, 1000, 10)]
    # Transpose the (ba, sn, sp, er) tuples into one list per metric.
    ba_list, sn_list, sp_list, er_list = map(list, zip(*runs))
    knn_ba_list.append(ba_list)
    knn_sn_list.append(sn_list)
    knn_sp_list.append(sp_list)
    knn_er_list.append(er_list)

In [None]:
clf, X, y = prepro('SVM')

# Outer lists get one entry per test fraction; every entry is the list of
# per-seed values (seeds 0, 10, ..., 990) for that fraction.
svm_ba_list, svm_sn_list, svm_sp_list, svm_er_list = [], [], [], []
for t in tqdm([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]):
    runs = [predict(clf, X, y, seed=r, test=t) for r in range(0, 1000, 10)]
    # Transpose the (ba, sn, sp, er) tuples into one list per metric.
    ba_list, sn_list, sp_list, er_list = map(list, zip(*runs))
    svm_ba_list.append(ba_list)
    svm_sn_list.append(sn_list)
    svm_sp_list.append(sp_list)
    svm_er_list.append(er_list)

In [None]:
clf, X, y = prepro('RF')

# Outer lists get one entry per test fraction; every entry is the list of
# per-seed values (seeds 0, 10, ..., 990) for that fraction.
rf_ba_list, rf_sn_list, rf_sp_list, rf_er_list = [], [], [], []
for t in tqdm([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]):
    runs = [predict(clf, X, y, seed=r, test=t) for r in range(0, 1000, 10)]
    # Transpose the (ba, sn, sp, er) tuples into one list per metric.
    ba_list, sn_list, sp_list, er_list = map(list, zip(*runs))
    rf_ba_list.append(ba_list)
    rf_sn_list.append(sn_list)
    rf_sp_list.append(sp_list)
    rf_er_list.append(er_list)

In [None]:
clf, X, y = prepro('GB')

# Outer lists get one entry per test fraction; every entry is the list of
# per-seed values (seeds 0, 10, ..., 990) for that fraction.
gb_ba_list, gb_sn_list, gb_sp_list, gb_er_list = [], [], [], []
for t in tqdm([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]):
    runs = [predict(clf, X, y, seed=r, test=t) for r in range(0, 1000, 10)]
    # Transpose the (ba, sn, sp, er) tuples into one list per metric.
    ba_list, sn_list, sp_list, er_list = map(list, zip(*runs))
    gb_ba_list.append(ba_list)
    gb_sn_list.append(sn_list)
    gb_sp_list.append(sp_list)
    gb_er_list.append(er_list)

In [None]:
# Soft-voting ensemble sweep. Outer lists get one entry per test fraction;
# every entry is the list of per-seed values (seeds 0, 10, ..., 990).
enso_ba_list, enso_sn_list, enso_sp_list, enso_er_list = [], [], [], []
for t in tqdm([0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]):
    # ensemb() also returns the confusion matrix; only the four scalar
    # metrics are kept here.
    runs = [ensemb(vot='soft', seed=r, test=t)[:4] for r in range(0, 1000, 10)]
    ba_list, sn_list, sp_list, er_list = map(list, zip(*runs))
    enso_ba_list.append(ba_list)
    enso_sn_list.append(sn_list)
    enso_sp_list.append(sp_list)
    enso_er_list.append(er_list)

In [None]:
# Collect every sweep into one table: one row per test fraction, one column
# per model/metric pair, each cell holding the list of per-seed values.
metric_lists = {
    'KNN': (knn_ba_list, knn_sn_list, knn_sp_list, knn_er_list),
    'SVM': (svm_ba_list, svm_sn_list, svm_sp_list, svm_er_list),
    'RF': (rf_ba_list, rf_sn_list, rf_sp_list, rf_er_list),
    'GB': (gb_ba_list, gb_sn_list, gb_sp_list, gb_er_list),
    'ENSO': (enso_ba_list, enso_sn_list, enso_sp_list, enso_er_list),
}
results_df = pd.DataFrame({
    f'{model_name}_{metric_name}': values
    for model_name, per_metric in metric_lists.items()
    for metric_name, values in zip(('BA', 'Sn', 'Sp', 'ER'), per_metric)
})
results_df

In [None]:
# Persist the collected sweep results as JSON (pandas 'table' orientation)
# to 'ML_random_test_Main.json', resolved against the current working directory.
results_df.to_json('ML_random_test_Main.json', orient='table')