In [1]:
import pandas as pd
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp
import numpy as np
from hyperopt import Trials, tpe, fmin
import warnings
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from nbimporter import NotebookLoader

warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("../code/output.csv",index_col=False)

In [3]:
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)

In [4]:
selected_columns=["locus tag","essential","DNA","protein sequence"]
Xs= df.drop(columns=selected_columns)
y=df["essential"]

In [5]:
from sklearn.linear_model import LassoCV

lassoCV = LassoCV(cv=20,random_state=10)
all_feature_names = ["GC_Content","CAI","A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V","nSE2","nSE3","nGE2","nGE3"]

In [6]:
space = {
    'n_estimators': hp.quniform('n_estimators', 50, 700, 1),
    'max_depth': hp.quniform('max_depth', 3, 20, 1),
    'min_samples_split': hp.uniform('min_samples_split', 0.1, 1),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.1, 0.5),
}

In [7]:
from sklearn.metrics import accuracy_score

def rf_ac_cv(params):
    params = {
    'n_estimators': int(params['n_estimators']), 
    'max_depth': int(params['max_depth']), 
    'min_samples_split': params['min_samples_split'],
    'min_samples_leaf': params['min_samples_leaf']
}
    scorer = make_scorer(accuracy_score)
    model = RandomForestClassifier(random_state=42, **params)
    score = -cross_val_score(model, X_resampled, y_resampled, cv=5,scoring=scorer, ).mean()
    return score

In [9]:
from sklearn.model_selection import KFold
from joblib import load
from sklearn.feature_selection import RFE

scores = []
score = 0
models = []
model_rf = load("../../model/RF_model.joblib")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
features_rf = []
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    rfe = RFE(model_rf, n_features_to_select=20)
    rfe.fit(X_resampled, y_resampled)
    X_resampled = rfe.transform(X_resampled)  # For training data
    Xs_test = rfe.transform(Xs_test)  # For testing data
    lassoCV.fit(X_resampled,y_resampled)
    lassoCV.fit(Xs_test,y_test)
    selected_features_rfe = [all_feature_names[i] for i in range(len(all_feature_names)) if rfe.support_[i]]
    features= [selected_features_rfe[i] for i, coef in enumerate(lassoCV.coef_) if coef != 0]
    print(features)
    trials = Trials()
    best=fmin(fn=rf_ac_cv, # function to optimize
              space=space, 
              algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
              max_evals=20, # maximum number of iterations
              trials=trials, # logging
              rstate=np.random.default_rng(42) # fixing random state for the reproducibility
    )
    model = RandomForestClassifier(random_state=42, n_estimators=int(best['n_estimators']),max_depth=int(best['max_depth']),min_samples_split=best['min_samples_split'],min_samples_leaf=best['min_samples_leaf'])
    models.append(model)
    model.fit(X_resampled,y_resampled)
    tpe_test_score=accuracy_score(y_test, model.predict(Xs_test))
    scores.append(tpe_test_score)
    print(tpe_test_score)
    if score < tpe_test_score:
        score = tpe_test_score
        best_model = model
        features_rf = features

['GC_Content', 'CAI', 'N', 'D', 'C', 'E', 'G', 'H', 'I', 'K', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'nGE3']
100%|██████████| 20/20 [01:39<00:00,  4.99s/trial, best loss: -0.5614348832316433]
0.6570048309178744
['GC_Content', 'CAI']
100%|██████████| 20/20 [01:30<00:00,  4.53s/trial, best loss: -0.5472650771388499]
0.6111111111111112
['CAI', 'D', 'C', 'G', 'H', 'I', 'F', 'T', 'nSE2']
100%|██████████| 20/20 [00:55<00:00,  2.75s/trial, best loss: -0.57610101010101]  
0.6256038647342995
['GC_Content', 'CAI', 'R', 'D', 'E', 'G', 'H', 'I', 'L', 'F', 'P', 'T', 'V']
100%|██████████| 20/20 [00:47<00:00,  2.38s/trial, best loss: -0.5197069597069597]
0.5676328502415459
['GC_Content', 'CAI', 'E', 'H', 'I', 'K', 'F', 'P', 'T', 'V', 'nSE2', 'nGE3']
100%|██████████| 20/20 [00:43<00:00,  2.17s/trial, best loss: -0.5497896213183731]
0.5804111245465539


In [10]:
print(score)

0.6570048309178744


In [12]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

scores_rf_ac = []
scores_rf_mcc= []
scores_rf_f1 = []
scores_rf_auc = []
scores_rf_precision = []
scores_rf_recall = []
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    best_model.fit(X_resampled[features_rf],y_resampled)
    y_predict = best_model.predict(Xs_test[features_rf])
    acc=accuracy_score(y_test, y_predict)
    scores_rf_ac.append(acc)
    auc = roc_auc_score(y_test,y_predict)
    scores_rf_auc.append(auc)
    f1 = f1_score(y_test,y_predict)
    scores_rf_f1.append(f1)
    precision = precision_score(y_test,y_predict)
    scores_rf_precision.append(precision)
    recall = recall_score(y_test,y_predict)
    scores_rf_recall.append(recall)

In [22]:
from joblib import dump

dump(best_model, '../../model/RF_model_Ecoli.joblib')

['../../model/RF_model_Ecoli.joblib']

In [14]:
def generateString(scores):
    mean_score = round(np.mean(scores),4)
    var_score = round(np.var(scores),4)
    return f'{mean_score}±{var_score}'


In [15]:
scores = []
score = 0
models = []
model_rf = load("../../model/RF_model.joblib")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    trials = Trials()
    best=fmin(fn=rf_ac_cv, # function to optimize
              space=space, 
              algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
              max_evals=20, # maximum number of iterations
              trials=trials, # logging
              rstate=np.random.default_rng(42) # fixing random state for the reproducibility
    )
    model = RandomForestClassifier(random_state=42, n_estimators=int(best['n_estimators']),max_depth=int(best['max_depth']),min_samples_split=best['min_samples_split'],min_samples_leaf=best['min_samples_leaf'])
    models.append(model)
    model.fit(X_resampled,y_resampled)
    tpe_test_score=accuracy_score(y_test, model.predict(Xs_test))
    scores.append(tpe_test_score)
    print(tpe_test_score)
    if score < tpe_test_score:
        score = tpe_test_score
        best_model2 = model

100%|██████████| 20/20 [00:42<00:00,  2.13s/trial, best loss: -0.547065011571639]
0.607487922705314
100%|██████████| 20/20 [00:44<00:00,  2.23s/trial, best loss: -0.553763440860215]
0.6316425120772947
100%|██████████| 20/20 [01:16<00:00,  3.83s/trial, best loss: -0.564141414141414]
0.6002415458937198
100%|██████████| 20/20 [01:17<00:00,  3.85s/trial, best loss: -0.5]              
0.08333333333333333
100%|██████████| 20/20 [01:20<00:00,  4.03s/trial, best loss: -0.5520102851799906]
0.5731559854897219


In [18]:
scores_rf_ac2 = []
scores_rf_f12 = []
scores_rf_auc2 = []
scores_rf_precision2 = []
scores_rf_recall2 = []
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    best_model.fit(X_resampled,y_resampled)
    y_predict = best_model.predict(Xs_test)
    acc=accuracy_score(y_test, y_predict)
    scores_rf_ac2.append(acc)
    auc = roc_auc_score(y_test,y_predict)
    scores_rf_auc2.append(auc)
    f1 = f1_score(y_test,y_predict)
    scores_rf_f12.append(f1)
    precision = precision_score(y_test,y_predict)
    scores_rf_precision2.append(precision)
    recall = recall_score(y_test,y_predict)
    scores_rf_recall2.append(recall)

In [19]:
import csv
with open('RF.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Type','ACC','AUC','F1','Precision', 'Recall'])
    writer.writerow(['FS',generateString(scores_rf_ac),generateString(scores_rf_auc),generateString(scores_rf_f1),generateString(scores_rf_precision),generateString(scores_rf_recall)])
    writer.writerow(['RF',generateString(scores_rf_ac2),generateString(scores_rf_auc2),generateString(scores_rf_f12),generateString(scores_rf_precision2),generateString(scores_rf_recall2)])

In [21]:
from scipy.stats import ks_2samp
p_values = []
features_rf = ['GC_Content', 'CAI', 'N', 'D', 'C', 'E', 'G', 'H', 'I', 'K', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'nGE3']
for train_index, test_index in kf.split(Xs):
    X_train_fold, Xs_test = Xs.iloc[train_index], Xs.iloc[test_index]
    y_train_fold, y_test = y.iloc[train_index], y.iloc[test_index]
    X_resampled, y_resampled = rus.fit_resample(X_train_fold, y_train_fold)
    best_model.fit(X_resampled[features_rf],y_resampled)
    y_predict_fs = best_model.predict(Xs_test[features_rf])
    best_model2.fit(X_resampled,y_resampled)
    y_predict = best_model2.predict(Xs_test)
    ks_statistic, p_value = ks_2samp(y_predict_fs,y_predict)
    p_values.append(p_value)
print(np.mean(p_values))

0.43123640308815697
