In [1]:
import csv
import statistics
import warnings

import numpy as np
import pandas as pd
from hyperopt import Trials, fmin, hp, tpe
from imblearn.under_sampling import RandomUnderSampler
from joblib import load
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    make_scorer,
    matthews_corrcoef,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import KFold, cross_val_score

warnings.filterwarnings("ignore")

In [2]:
# Load the N315 table. index_col=False tells pandas not to promote the first
# column to the row index, so every column stays available as data.
df = pd.read_csv(
    "N315output.csv",
    index_col=False,
)

In [3]:
# Undersampler that the cross-validation cells apply to each training fold.
# The integer seed makes every fit_resample call reproducible.
rus = RandomUnderSampler(
    random_state=42,
    sampling_strategy='auto',
)

In [4]:
# Columns excluded from the feature matrix: the target column ("essential")
# plus three others that are not used as model inputs.
selected_columns = [
    "locus tag",
    "essential",
    "DNA",
    "protein sequence",
]

# Target vector first, then the feature matrix (everything that is left).
y = df["essential"]
Xs = df.drop(columns=selected_columns)

In [5]:
# 10-fold cross-validation of the stored model on the `feature_rf` column subset.
# NOTE(review): joblib.load unpickles arbitrary objects - only load model files
# that come from a trusted source.
model_rf_lassoRFE = load('../../../model/RF_model_FSS.joblib')

# One list per metric; each receives one value per fold.
scores_fss_ac = []
scores_fss_mcc= []
scores_fss_f1 = []
scores_fss_auc = []
scores_fss_precision = []
scores_fss_recall = []

feature_rf = ['GC_Content', 'CAI', 'A', 'R', 'N', 'D', 'G', 'K', 'F', 'S', 'T', 'Y', 'V', 'nSE2', 'nSE3', 'nGE2', 'nGE3']

# `kf` is reused by the later cross-validation cells; with a fixed integer
# seed every kf.split(Xs) call yields the same folds.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for train_index, test_index in kf.split(Xs):
    X_train_fold, X_test_fold = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]

    # Undersample the training fold only; the test fold is evaluated as-is.
    X_train_undersampled, y_train_undersampled = rus.fit_resample(X_train_fold, y_train_fold)
    model_rf_lassoRFE.fit(X_train_undersampled[feature_rf], y_train_undersampled)

    X_test_selected = X_test_fold[feature_rf]
    y_predict = model_rf_lassoRFE.predict(X_test_selected)

    # roc_auc_score expects a continuous score for the positive class. Hard
    # class predictions reduce the ROC curve to a single operating point.
    if hasattr(model_rf_lassoRFE, 'predict_proba'):
        y_score = model_rf_lassoRFE.predict_proba(X_test_selected)[:, 1]
    elif hasattr(model_rf_lassoRFE, 'decision_function'):
        y_score = model_rf_lassoRFE.decision_function(X_test_selected)
    else:
        # No continuous score available: keep the label-based value.
        y_score = y_predict

    scores_fss_ac.append(accuracy_score(y_test, y_predict))
    scores_fss_mcc.append(matthews_corrcoef(y_test, y_predict))
    scores_fss_auc.append(roc_auc_score(y_test, y_score))
    scores_fss_f1.append(f1_score(y_test, y_predict))
    scores_fss_precision.append(precision_score(y_test, y_predict))
    scores_fss_recall.append(recall_score(y_test, y_predict))

In [6]:
# Per-fold accuracies, followed by their mean and sample variance
# (statistics.variance divides by n - 1).
fss_acc_mean = statistics.mean(scores_fss_ac)
fss_acc_variance = statistics.variance(scores_fss_ac)
print(scores_fss_ac)
print(fss_acc_mean, fss_acc_variance)

[0.5813953488372093, 0.5697674418604651, 0.5503875968992248, 0.6201550387596899, 0.5736434108527132, 0.5581395348837209, 0.5271317829457365, 0.6201550387596899, 0.6550387596899225, 0.624031007751938]
0.587984496124031 0.0016059731987260381


In [7]:
# 10-fold cross-validation of the stored model on all feature columns of Xs.
# NOTE(review): joblib.load unpickles arbitrary objects - only load model files
# that come from a trusted source.
model_xgb_nfs = load('../../../model/XGB_model.joblib')

# One list per metric; each receives one value per fold.
scores_nfs_ac = []
scores_nfs_mcc = []
scores_nfs_auc = []
scores_nfs_f1 = []
scores_nfs_precision = []
scores_nfs_recall = []

for train_index, test_index in kf.split(Xs):
    X_train_fold, X_test_fold = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]

    # Undersample the training fold only; the test fold is evaluated as-is.
    X_train_undersampled, y_train_undersampled = rus.fit_resample(X_train_fold, y_train_fold)
    model_xgb_nfs.fit(X_train_undersampled, y_train_undersampled)
    y_predict = model_xgb_nfs.predict(X_test_fold)

    # roc_auc_score expects a continuous score for the positive class. Hard
    # class predictions reduce the ROC curve to a single operating point.
    if hasattr(model_xgb_nfs, 'predict_proba'):
        y_score = model_xgb_nfs.predict_proba(X_test_fold)[:, 1]
    elif hasattr(model_xgb_nfs, 'decision_function'):
        y_score = model_xgb_nfs.decision_function(X_test_fold)
    else:
        # No continuous score available: keep the label-based value.
        y_score = y_predict

    scores_nfs_ac.append(accuracy_score(y_test, y_predict))
    scores_nfs_mcc.append(matthews_corrcoef(y_test, y_predict))
    scores_nfs_auc.append(roc_auc_score(y_test, y_score))
    scores_nfs_f1.append(f1_score(y_test, y_predict))
    scores_nfs_precision.append(precision_score(y_test, y_predict))
    scores_nfs_recall.append(recall_score(y_test, y_predict))

In [8]:
# Per-fold accuracies, followed by their mean and sample variance
# (statistics.variance divides by n - 1).
nfs_acc_mean = statistics.mean(scores_nfs_ac)
nfs_acc_variance = statistics.variance(scores_nfs_ac)
print(scores_nfs_ac)
print(nfs_acc_mean, nfs_acc_variance)

[0.6434108527131783, 0.7093023255813954, 0.6705426356589147, 0.7054263565891473, 0.6744186046511628, 0.6589147286821705, 0.686046511627907, 0.6937984496124031, 0.7170542635658915, 0.6976744186046512]
0.6856589147286822 0.0005556890945389243


In [9]:
# 10-fold cross-validation of the stored model on the `feature_lasso` column subset.
# NOTE(review): joblib.load unpickles arbitrary objects - only load model files
# that come from a trusted source.
model_mlp_lasso = load('../../../model/MLP_model_lasso.joblib')

# One list per metric; each receives one value per fold.
scores_lasso_ac = []
scores_lasso_mcc = []
scores_lasso_auc = []
scores_lasso_f1 = []
scores_lasso_precision = []
scores_lasso_recall = []

feature_lasso = ['GC_Content', 'CAI', 'A', 'N', 'H', 'nSE3', 'nGE3']

for train_index, test_index in kf.split(Xs):
    X_train_fold, X_test_fold = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]

    # Undersample the training fold only; the test fold is evaluated as-is.
    X_train_undersampled, y_train_undersampled = rus.fit_resample(X_train_fold, y_train_fold)
    model_mlp_lasso.fit(X_train_undersampled[feature_lasso], y_train_undersampled)

    X_test_selected = X_test_fold[feature_lasso]
    y_predict = model_mlp_lasso.predict(X_test_selected)

    # roc_auc_score expects a continuous score for the positive class. Hard
    # class predictions reduce the ROC curve to a single operating point.
    if hasattr(model_mlp_lasso, 'predict_proba'):
        y_score = model_mlp_lasso.predict_proba(X_test_selected)[:, 1]
    elif hasattr(model_mlp_lasso, 'decision_function'):
        y_score = model_mlp_lasso.decision_function(X_test_selected)
    else:
        # No continuous score available: keep the label-based value.
        y_score = y_predict

    scores_lasso_ac.append(accuracy_score(y_test, y_predict))
    scores_lasso_mcc.append(matthews_corrcoef(y_test, y_predict))
    scores_lasso_auc.append(roc_auc_score(y_test, y_score))
    scores_lasso_f1.append(f1_score(y_test, y_predict))
    scores_lasso_precision.append(precision_score(y_test, y_predict))
    scores_lasso_recall.append(recall_score(y_test, y_predict))

In [10]:
# Per-fold accuracies, followed by their mean and population variance
# (np.var defaults to ddof=0).
lasso_acc_mean = np.mean(scores_lasso_ac)
lasso_acc_variance = np.var(scores_lasso_ac)
print(scores_lasso_ac)
print(lasso_acc_mean, lasso_acc_variance)

[0.4844961240310077, 0.6085271317829457, 0.686046511627907, 0.5891472868217055, 0.437984496124031, 0.4108527131782946, 0.375968992248062, 0.5232558139534884, 0.8062015503875969, 0.7403100775193798]
0.5662790697674419 0.018987590889970554


In [11]:
# 10-fold cross-validation of the stored model on the `feature_rfe` column subset.
# NOTE(review): joblib.load unpickles arbitrary objects - only load model files
# that come from a trusted source.
model_svm_rfe = load('../../../model/SVM_model.joblib')

# One list per metric; each receives one value per fold.
scores_rfe_ac = []
scores_rfe_mcc = []
scores_rfe_auc = []
scores_rfe_f1 = []
scores_rfe_precision = []
scores_rfe_recall = []

feature_rfe = ['CAI', 'R', 'N', 'D', 'C', 'Q', 'G', 'H', 'F', 'P', 'T', 'Y', 'nSE2', 'nGE2']

for train_index, test_index in kf.split(Xs):
    X_train_fold, X_test_fold = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]

    # Undersample the training fold only; the test fold is evaluated as-is.
    X_train_undersampled, y_train_undersampled = rus.fit_resample(X_train_fold, y_train_fold)
    model_svm_rfe.fit(X_train_undersampled[feature_rfe], y_train_undersampled)

    X_test_selected = X_test_fold[feature_rfe]
    y_predict = model_svm_rfe.predict(X_test_selected)

    # roc_auc_score expects a continuous score for the positive class. Hard
    # class predictions reduce the ROC curve to a single operating point.
    # The decision_function branch covers estimators that expose no
    # probability estimates.
    if hasattr(model_svm_rfe, 'predict_proba'):
        y_score = model_svm_rfe.predict_proba(X_test_selected)[:, 1]
    elif hasattr(model_svm_rfe, 'decision_function'):
        y_score = model_svm_rfe.decision_function(X_test_selected)
    else:
        # No continuous score available: keep the label-based value.
        y_score = y_predict

    scores_rfe_ac.append(accuracy_score(y_test, y_predict))
    scores_rfe_mcc.append(matthews_corrcoef(y_test, y_predict))
    scores_rfe_auc.append(roc_auc_score(y_test, y_score))
    scores_rfe_f1.append(f1_score(y_test, y_predict))
    scores_rfe_precision.append(precision_score(y_test, y_predict))
    scores_rfe_recall.append(recall_score(y_test, y_predict))

In [12]:
# Per-fold accuracies, followed by their mean and population variance
# (np.var defaults to ddof=0).
rfe_acc_mean = np.mean(scores_rfe_ac)
rfe_acc_variance = np.var(scores_rfe_ac)
print(scores_rfe_ac)
print(rfe_acc_mean, rfe_acc_variance)

[0.6007751937984496, 0.627906976744186, 0.6046511627906976, 0.686046511627907, 0.6550387596899225, 0.627906976744186, 0.627906976744186, 0.6356589147286822, 0.6821705426356589, 0.6705426356589147]
0.6418604651162791 0.0008268733850129203


In [18]:
def generateString(scores, ndigits=4, spread="var"):
    """Format a collection of scores as ``'<mean>±<spread>'``.

    Parameters
    ----------
    scores : sequence of numbers
        Values to summarise (e.g. one metric value per cross-validation fold).
    ndigits : int, default 4
        Number of decimal places both numbers are rounded to.
    spread : {"var", "std"}, default "var"
        Dispersion reported after the "±" sign. "var" is the population
        variance (``np.var``, ddof=0) and reproduces the original output;
        "std" reports the population standard deviation (``np.std``), which
        is what "±" conventionally denotes.

    Returns
    -------
    str
        The rounded mean and dispersion joined by "±".

    Raises
    ------
    ValueError
        If ``spread`` is neither "var" nor "std".
    """
    spread_functions = {"var": np.var, "std": np.std}
    if spread not in spread_functions:
        raise ValueError(f"spread must be 'var' or 'std', got {spread!r}")

    # Convert to built-in floats so the text never depends on how NumPy
    # scalars render themselves.
    mean_score = round(float(np.mean(scores)), ndigits)
    spread_score = round(float(spread_functions[spread](scores)), ndigits)
    return f'{mean_score}±{spread_score}'

In [19]:
# Write one CSV row per model; every cell is the string returned by
# generateString for that metric's per-fold scores.
# Score lists are ordered to match the header: ACC, MCC, AUC, F1, Precision, Recall.
summary_rows = {
    'nFS': (scores_nfs_ac, scores_nfs_mcc, scores_nfs_auc, scores_nfs_f1, scores_nfs_precision, scores_nfs_recall),
    'Lasso': (scores_lasso_ac, scores_lasso_mcc, scores_lasso_auc, scores_lasso_f1, scores_lasso_precision, scores_lasso_recall),
    'RFE': (scores_rfe_ac, scores_rfe_mcc, scores_rfe_auc, scores_rfe_f1, scores_rfe_precision, scores_rfe_recall),
    'FSS': (scores_fss_ac, scores_fss_mcc, scores_fss_auc, scores_fss_f1, scores_fss_precision, scores_fss_recall),
}

# The encoding is explicit because every cell contains the non-ASCII "±";
# without it the bytes written depend on the platform's default encoding.
with open('resultsN315.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Type','ACC','MCC','AUC','F1','Precision', 'Recall'])
    for model_label, metric_scores in summary_rows.items():
        writer.writerow([model_label, *(generateString(fold_scores) for fold_scores in metric_scores)])