In [1]:
import pandas as pd
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp
import numpy as np
from hyperopt import Trials, tpe, fmin
import warnings
from imblearn.under_sampling import RandomUnderSampler
from joblib import load
import statistics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, matthews_corrcoef, precision_score, recall_score
warnings.filterwarnings("ignore")

In [2]:
# Load one DataFrame per CSV file. The order of the paths below matches the
# order of the variable names in the unpacking assignment that follows.
# NOTE(review): the directory name mentions 16, but only 15 files are read
# here -- confirm that no dataset is missing.
# NOTE(review): the name `map` shadows Python's builtin `map`; it is kept
# because other cells refer to it by that name.
_dataset_csv_paths = [
    "../../16 Mycoplosma/ma.csv",
    "../../16 Mycoplosma/map.csv",
    "../../16 Mycoplosma/marth.csv",
    "../../16 Mycoplosma/mcap.csv",
    "../../16 Mycoplosma/mcon.csv",
    "../../16 Mycoplosma/mcro.csv",
    "../../16 Mycoplosma/mgal.csv",
    "../../16 Mycoplosma/mhyoJ.csv",
    "../../16 Mycoplosma/mhyo232.csv",
    "../../16 Mycoplosma/mhyo7448.csv",
    "../../16 Mycoplosma/mm163K.csv",
    "../../16 Mycoplosma/mms.csv",
    "../../16 Mycoplosma/mpHF.csv",
    "../../16 Mycoplosma/mpM.csv",
    "../../16 Mycoplosma/ms53.csv",
]
(
    ma, map, marth, mcap, mcon,
    mcro, mgal, mhyoJ, mhyo232, mhyo7448,
    mm163K, mms, mpHF, mpM, ms53,
) = [pd.read_csv(_csv_path) for _csv_path in _dataset_csv_paths]

In [3]:
# Shared under-sampler with a fixed seed so resampling is repeatable; it is
# applied to the training portion of every split in the evaluation loop.
rus = RandomUnderSampler(
    random_state=42,
    sampling_strategy='auto',
)

In [6]:
def generate_train_test_dataset(index, datasets=None):
    """Build a leave-one-dataset-out split.

    Parameters
    ----------
    index : int
        Position of the dataset to hold out as the test set. Negative
        values count from the end, as with normal list indexing.
    datasets : sequence of pandas.DataFrame, optional
        Datasets to split. When omitted, the 15 DataFrames loaded at the
        top of the notebook are used, in their original order.

    Returns
    -------
    (train_data, test_data)
        ``train_data`` is the concatenation of every dataset except the
        held-out one (original row indices are kept); ``test_data`` is the
        held-out DataFrame itself.

    Raises
    ------
    IndexError
        If ``index`` is outside the range of available datasets.
    """
    if datasets is None:
        datasets = [ma, map, marth, mcap, mcon, mcro, mgal, mhyoJ, mhyo232,
                    mhyo7448, mm163K, mms, mpHF, mpM, ms53]
    else:
        datasets = list(datasets)

    n_datasets = len(datasets)
    if not -n_datasets <= index < n_datasets:
        raise IndexError(
            f"dataset index {index} out of range for {n_datasets} datasets"
        )
    # Normalise negative indices first. With a raw negative index the slices
    # `[:index] + [index+1:]` would overlap (e.g. -1 yields every dataset),
    # silently leaking the held-out data into the training set.
    index %= n_datasets

    merged_train = datasets[:index] + datasets[index + 1:]
    train_data = pd.concat(merged_train)
    test_data = datasets[index]
    return train_data, test_data

In [5]:
def handle_dataset(dataset):
    """Split a dataset into a feature table and its label column.

    Returns a tuple ``(Xs, y)``: ``Xs`` is a copy of ``dataset`` without the
    'identity', 'name', 'Amino acid', 'protein_sequence' and 'essentiality'
    columns, and ``y`` is the 'essentiality' column. The input frame is not
    modified.
    """
    label_column = 'essentiality'
    non_feature_columns = ['identity', 'name', 'Amino acid', 'protein_sequence', label_column]
    labels = dataset[label_column]
    features = dataset.drop(columns=non_feature_columns)
    return features, labels

In [8]:
from sklearn.model_selection import KFold

# Leave-one-dataset-out evaluation of the "Sta" model: for each of the 15
# datasets, train on the other 14 (after random under-sampling) and score on
# the held-out one. One value per split is appended to each scores_sta_* list.

# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files from a trusted source.
# Presumably an XGBoost classifier, judging by the file name -- confirm.
model_Sta = load('../../../model/XGB_model_Sta.joblib')
scores_sta_ac = []
scores_sta_f1 = []
scores_sta_auc = []
scores_sta_precision = []
scores_sta_recall = []
# Feature subset the model is trained and evaluated on.
feature_sta = ['GC_Content', 'CAI', 'R', 'D', 'E', 'S', 'nSE2', 'nSE3', 'nGE3']

for i in range(0,15):
    train_data, test_data = generate_train_test_dataset(i)
    Xs_train,y_train = handle_dataset(train_data)
    Xs_test,y_test = handle_dataset(test_data)
    # Balance only the training data; the held-out dataset keeps its
    # original class distribution.
    X_resampled, y_resampled = rus.fit_resample(Xs_train, y_train)
    model_Sta.fit(X_resampled[feature_sta], y_resampled)

    X_test_sta = Xs_test[feature_sta]
    y_predict = model_Sta.predict(X_test_sta)
    # ROC AUC is a ranking metric and needs continuous scores. Feeding it
    # hard 0/1 predictions collapses the curve to a single operating point,
    # so use the positive-class probability when the model provides one.
    if hasattr(model_Sta, "predict_proba"):
        y_score = model_Sta.predict_proba(X_test_sta)[:, 1]
    else:
        y_score = y_predict

    acc=accuracy_score(y_test, y_predict)
    scores_sta_ac.append(acc)
    auc = roc_auc_score(y_test,y_score)
    scores_sta_auc.append(auc)
    f1 = f1_score(y_test,y_predict)
    scores_sta_f1.append(f1)
    precision = precision_score(y_test,y_predict)
    scores_sta_precision.append(precision)
    recall = recall_score(y_test,y_predict)
    scores_sta_recall.append(recall)

In [9]:
# Show the per-split accuracies, then their mean and sample variance.
print(scores_sta_ac)
acc_mean = statistics.mean(scores_sta_ac)
acc_variance = statistics.variance(scores_sta_ac)
print(acc_mean, acc_variance)

[0.896551724137931, 0.8509316770186336, 0.720125786163522, 0.9061224489795918, 0.8926380368098159, 0.923728813559322, 0.7046004842615012, 0.8554913294797688, 0.8621190130624092, 0.869942196531792, 0.9614325068870524, 0.942643391521197, 0.955, 0.46153846153846156, 0.8904494382022472]
0.8462210205435498 0.01682171161730699


In [10]:
import csv
def generateString(scores):
    """Format a list of scores as ``'<mean>±<variance>'``.

    Both numbers are rounded to 4 decimal places. The variance is the sample
    variance (ddof=1), matching ``statistics.variance`` used for the summary
    printed in the notebook, so the CSV and the printed figures agree. With
    fewer than two scores the sample variance is undefined, so the population
    variance (0.0 for a single score) is reported instead.
    """
    ddof = 1 if len(scores) > 1 else 0
    mean_score = round(np.mean(scores),4)
    var_score = round(np.var(scores, ddof=ddof),4)
    return f'{mean_score}±{var_score}'
# Write the summary table: a header row followed by one row for the 'Sta'
# model, with each metric formatted by generateString.
with open('results.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    header = ['Type','ACC','AUC','F1','Precision', 'Recall']
    writer.writerow(header)
    sta_row = ['Sta']
    # Same order as the metric columns in the header.
    for metric_scores in (scores_sta_ac, scores_sta_auc, scores_sta_f1,
                          scores_sta_precision, scores_sta_recall):
        sta_row.append(generateString(metric_scores))
    writer.writerow(sta_row)