In [1]:
import pandas as pd
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp
import numpy as np
from hyperopt import Trials, tpe, fmin
import warnings
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from nbimporter import NotebookLoader

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("../code/output.csv",index_col=False)

In [3]:
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)

In [4]:
selected_columns=["locus tag","essential","DNA","protein sequence"]
Xs= df.drop(columns=selected_columns)
y=df["essential"]

In [5]:
from sklearn.linear_model import LassoCV

lassoCV = LassoCV(cv=20,random_state=10)
all_feature_names = ["GC_Content","CAI","A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V","nSE2","nSE3","nGE2","nGE3"]

In [6]:
space = {
    'C': hp.loguniform('C', -5, 2), 
    'degree': hp.quniform('degree', 2, 5, 1),
    'gamma': hp.choice('gamma', ['auto', 'scale', hp.loguniform('gamma_val', -5, 2)])  
}

In [7]:
from sklearn.metrics import accuracy_score

def svm_ac_cv(params):
    model = SVC(C = params["C"],kernel ="linear", degree = int(params["degree"]), gamma = params["gamma"])
    scorer = make_scorer(accuracy_score)
    score = -cross_val_score(model, X_resampled, y_resampled, cv=5,scoring=scorer ).mean()
    return score

In [19]:
from sklearn.model_selection import KFold
from joblib import load
from sklearn.feature_selection import RFE

scores = []
score = 0
models = []
model_svm =load("../../model/SVM_model.joblib")
kf = KFold(n_splits=10, shuffle=True, random_state=42)
feature_svm = []
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    rfe = RFE(model_svm, n_features_to_select=20)
    rfe.fit(X_resampled, y_resampled)
    X_resampled = rfe.transform(X_resampled)  # For training data
    Xs_test = rfe.transform(Xs_test)  # For testing data
    lassoCV.fit(X_resampled,y_resampled)
    lassoCV.fit(Xs_test,y_test)
    selected_features_rfe = [all_feature_names[i] for i in range(len(all_feature_names)) if rfe.support_[i]]
    features= [selected_features_rfe[i] for i, coef in enumerate(lassoCV.coef_) if coef != 0]
    print(features)
    trials = Trials()
    params=fmin(fn=svm_ac_cv, # function to optimize
              space=space, 
              algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
              max_evals=50, # maximum number of iterations
              trials=trials, # logging
              rstate=np.random.default_rng(42) # fixing random state for the reproducibility
    )
    model = SVC(C = params["C"],kernel ="linear", degree = int(params["degree"]), gamma = params["gamma"])
    models.append(model)
    model.fit(X_resampled,y_resampled)
    tpe_test_score=accuracy_score(y_test, model.predict(Xs_test))
    scores.append(tpe_test_score)
    print(tpe_test_score)
    if score < tpe_test_score:
        score = tpe_test_score
        best_model2 = model
        feature_svm = features

['GC_Content', 'CAI', 'A', 'R', 'D', 'E', 'H', 'I', 'F', 'P', 'T', 'W', 'V', 'nSE3']
100%|██████████| 50/50 [00:06<00:00,  8.09trial/s, best loss: -0.5779155435759209]
0.6183574879227053
['GC_Content', 'CAI']
100%|██████████| 50/50 [00:06<00:00,  7.42trial/s, best loss: -0.5793447993447993]
0.7294685990338164
['CAI', 'A', 'E', 'G', 'I', 'L']
100%|██████████| 50/50 [00:06<00:00,  7.70trial/s, best loss: -0.590566037735849] 
0.644927536231884
['GC_Content', 'CAI', 'A', 'D', 'G', 'I', 'L', 'F', 'P', 'S', 'T', 'V']
100%|██████████| 50/50 [00:06<00:00,  7.80trial/s, best loss: -0.5800359389038634]
0.572463768115942
['CAI', 'F', 'nSE2', 'nGE2']
100%|██████████| 50/50 [00:06<00:00,  7.59trial/s, best loss: -0.5580996884735203]
0.5483091787439613
['CAI', 'D', 'C', 'Q', 'G', 'H', 'L', 'F', 'V', 'nSE2']
100%|██████████| 50/50 [00:06<00:00,  7.63trial/s, best loss: -0.5704176904176904]
0.34541062801932365
['CAI', 'nSE3', 'nGE3']
100%|██████████| 50/50 [00:06<00:00,  8.28trial/s, best loss: -0.602

In [12]:
print(score)

0.7294685990338164


In [13]:
print(feature_svm)

['GC_Content', 'CAI']


In [15]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

scores_svm_ac = []
scores_svm_mcc = []
scores_svm_auc = []
scores_svm_f1 = []
scores_svm_precision = []
scores_svm_recall = []
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    best_model.fit(X_resampled[feature_svm],y_resampled)
    y_predict = best_model.predict(Xs_test[feature_svm])
    acc=accuracy_score(y_test, y_predict)
    scores_svm_ac.append(acc)
    auc = roc_auc_score(y_test,y_predict)
    scores_svm_auc.append(auc)
    f1 = f1_score(y_test,y_predict)
    scores_svm_f1.append(f1)
    precision = precision_score(y_test,y_predict)
    scores_svm_precision.append(precision)
    recall = recall_score(y_test,y_predict)
    scores_svm_recall.append(recall)

In [13]:
from joblib import dump

dump(best_model, '../../model/SVM_model_Ecoli.joblib')

['../../model/RF_model_Ecoli.joblib']

In [17]:
def generateString(scores):
    mean_score = round(np.mean(scores),4)
    var_score = round(np.var(scores),4)
    return f'{mean_score}±{var_score}'


In [18]:
scores = []
score = 0
models = []
model_svm =load("../../model/SVM_model.joblib")
kf = KFold(n_splits=10, shuffle=True, random_state=42)
feature_svm = []
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    trials = Trials()
    params=fmin(fn=svm_ac_cv, # function to optimize
              space=space, 
              algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
              max_evals=50, # maximum number of iterations
              trials=trials, # logging
              rstate=np.random.default_rng(42) # fixing random state for the reproducibility
    )
    model = SVC(C = params["C"],kernel ="linear", degree = int(params["degree"]), gamma = params["gamma"])
    models.append(model)
    model.fit(X_resampled,y_resampled)
    tpe_test_score=accuracy_score(y_test, model.predict(Xs_test))
    scores.append(tpe_test_score)
    print(tpe_test_score)
    if score < tpe_test_score:
        score = tpe_test_score
        best_model = model

100%|██████████| 50/50 [00:03<00:00, 16.60trial/s, best loss: -0.5685175202156334]
0.5700483091787439
100%|██████████| 50/50 [00:03<00:00, 15.67trial/s, best loss: -0.5775102375102376]
0.7342995169082126
100%|██████████| 50/50 [00:02<00:00, 16.79trial/s, best loss: -0.5773584905660377]
0.6473429951690821
100%|██████████| 50/50 [00:02<00:00, 16.70trial/s, best loss: -0.585696316262354] 
0.572463768115942
100%|██████████| 50/50 [00:03<00:00, 16.62trial/s, best loss: -0.548753894080997] 
0.5483091787439613
100%|██████████| 50/50 [00:03<00:00, 15.78trial/s, best loss: -0.5613923013923013]
0.642512077294686
100%|██████████| 50/50 [00:02<00:00, 16.91trial/s, best loss: -0.5890589992531741]
0.5531400966183575
100%|██████████| 50/50 [00:03<00:00, 16.59trial/s, best loss: -0.6004672057502246]
0.5217391304347826
100%|██████████| 50/50 [00:02<00:00, 16.85trial/s, best loss: -0.5852470799640611]
0.6400966183574879
100%|██████████| 50/50 [00:03<00:00, 16.49trial/s, best loss: -0.5663432165318958]
0

In [20]:
scores_svm_ac2 = []
scores_svm_mcc2 = []
scores_svm_auc2 = []
scores_svm_f12 = []
scores_svm_precision2 = []
scores_svm_recall2 = []
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    best_model.fit(X_resampled[feature_svm],y_resampled)
    y_predict = best_model.predict(Xs_test[feature_svm])
    acc=accuracy_score(y_test, y_predict)
    scores_svm_ac2.append(acc)
    auc = roc_auc_score(y_test,y_predict)
    scores_svm_auc2.append(auc)
    f1 = f1_score(y_test,y_predict)
    scores_svm_f12.append(f1)
    precision = precision_score(y_test,y_predict)
    scores_svm_precision2.append(precision)
    recall = recall_score(y_test,y_predict)
    scores_svm_recall2.append(recall)

In [21]:
import csv
with open('SVM.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Type','ACC','AUC','F1','Precision', 'Recall'])
    writer.writerow(['SVM',generateString(scores_svm_ac),generateString(scores_svm_auc),generateString(scores_svm_f1),generateString(scores_svm_precision),generateString(scores_svm_recall)])
    writer.writerow(['SVM',generateString(scores_svm_ac2),generateString(scores_svm_auc2),generateString(scores_svm_f12),generateString(scores_svm_precision2),generateString(scores_svm_recall2)])

In [23]:
from scipy.stats import ks_2samp
p_values = []
features_svm = ['GC_Content', 'CAI']
best_model2 = load('../../model/SVM_model_Ecoli.joblib')
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    best_model2.fit(X_resampled[features_svm],y_resampled)
    y_predict_fs = best_model2.predict(Xs_test[features_svm])
    best_model.fit(X_resampled,y_resampled)
    y_predict = best_model.predict(Xs_test)
    ks_statistic, p_value = ks_2samp(y_predict_fs,y_predict)
    p_values.append(p_value)
print(np.mean(p_values))

0.09843263891125446
