In [1]:
import pandas as pd
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp
import numpy as np
from hyperopt import Trials, tpe, fmin
import warnings
from imblearn.under_sampling import RandomUnderSampler
from joblib import load
import statistics
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, matthews_corrcoef, precision_score, recall_score
warnings.filterwarnings("ignore")

In [2]:
# Load one feature table per genome/strain from CSV. All paths are relative to
# the notebook's working directory, so the cell only works when run from there.
# NOTE(review): the folder is called "16 Mycoplosma" but only 15 tables are
# loaded here — TODO confirm that no genome is missing.
# These module-level names are referenced directly by
# generate_train_test_dataset, so they must keep their current spelling.
ma = pd.read_csv("../../16 Mycoplosma/ma.csv")
# NOTE: `map` shadows Python's built-in map() for the rest of the kernel session.
map = pd.read_csv("../../16 Mycoplosma/map.csv")
marth = pd.read_csv("../../16 Mycoplosma/marth.csv")
mcap = pd.read_csv("../../16 Mycoplosma/mcap.csv")
mcon = pd.read_csv("../../16 Mycoplosma/mcon.csv")
mcro = pd.read_csv("../../16 Mycoplosma/mcro.csv")
mgal = pd.read_csv("../../16 Mycoplosma/mgal.csv")
mhyoJ = pd.read_csv("../../16 Mycoplosma/mhyoJ.csv")
mhyo232 = pd.read_csv("../../16 Mycoplosma/mhyo232.csv")
mhyo7448 = pd.read_csv("../../16 Mycoplosma/mhyo7448.csv")
mm163K = pd.read_csv("../../16 Mycoplosma/mm163K.csv")
mms = pd.read_csv("../../16 Mycoplosma/mms.csv")
mpHF = pd.read_csv("../../16 Mycoplosma/mpHF.csv")
mpM = pd.read_csv("../../16 Mycoplosma/mpM.csv")
ms53 = pd.read_csv("../../16 Mycoplosma/ms53.csv")

In [10]:
# Shared under-sampler applied to the training portion of every fold; the
# fixed seed makes the sampled rows repeatable between runs.
rus = RandomUnderSampler(
    random_state=42,
    sampling_strategy='auto',
)

In [11]:
def generate_train_test_dataset(index):
    """Build a leave-one-table-out split from the module-level genome tables.

    The table at position ``index`` becomes the test set; the remaining
    tables are concatenated (row-wise, original row labels kept) into the
    training set.

    Parameters
    ----------
    index : int
        Position of the held-out table in the fixed table order used below.
        Negative values count from the end, as with normal list indexing.

    Returns
    -------
    tuple
        ``(train_data, test_data)`` where ``train_data`` is the concatenation
        of every table except the held-out one and ``test_data`` is the
        held-out table itself (not a copy).

    Raises
    ------
    IndexError
        If ``index`` does not address one of the tables.
    """
    datasets = [ma, map, marth, mcap, mcon, mcro, mgal, mhyoJ, mhyo232, mhyo7448, mm163K, mms, mpHF, mpM, ms53]
    n_datasets = len(datasets)
    if not -n_datasets <= index < n_datasets:
        raise IndexError(
            f"index {index} is out of range for {n_datasets} datasets"
        )
    # Normalise negative positions before slicing. With the raw value,
    # index=-1 would turn `datasets[index + 1:]` into `datasets[0:]`, i.e. the
    # full list, and the held-out table would leak into the training data.
    index %= n_datasets
    train_frames = datasets[:index] + datasets[index + 1:]
    train_data = pd.concat(train_frames)
    test_data = datasets[index]
    return train_data, test_data

In [12]:
def handle_dataset(dataset):
    """Split a genome table into model inputs and the target column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing the columns 'identity', 'name', 'Amino acid',
        'protein_sequence' and 'essentiality' in addition to the features.

    Returns
    -------
    tuple
        ``(Xs, y)``: a new frame with the five columns above removed, and the
        'essentiality' column of the input. The input frame is not modified.
    """
    target_column = 'essentiality'
    # Identifier / raw-sequence columns that are not used as model inputs.
    non_feature_columns = ['identity', 'name', 'Amino acid', 'protein_sequence']
    feature_frame = dataset.drop(columns=non_feature_columns + [target_column])
    labels = dataset[target_column]
    return feature_frame, labels

In [13]:
from sklearn.model_selection import KFold

# Leave-one-table-out evaluation of the stored estimator on a fixed feature
# subset. One score per held-out table is appended to each list below.
#
# SECURITY NOTE: joblib.load unpickles the file, which can run arbitrary code.
# Only load model files that come from a trusted source.
# NOTE(review): the estimator is refit on every fold, so presumably only its
# hyperparameters (not its previously learned state) carry over — confirm.
model_ecoli = load('../../../model/XGB_model_Ecoli.joblib')
scores_ecoli_ac = []
scores_ecoli_f1 = []
scores_ecoli_auc = []
scores_ecoli_precision = []
scores_ecoli_recall = []
feature_ecoli = ['GC_Content', 'CAI', 'D', 'C', 'Q', 'H', 'L', 'F', 'V', 'nSE2']

# 15 = number of tables listed in generate_train_test_dataset.
for i in range(0, 15):
    train_data, test_data = generate_train_test_dataset(i)
    Xs_train, y_train = handle_dataset(train_data)
    Xs_test, y_test = handle_dataset(test_data)

    # Balance only the training data; the held-out table keeps its own
    # class distribution.
    X_resampled, y_resampled = rus.fit_resample(Xs_train, y_train)
    model_ecoli.fit(X_resampled[feature_ecoli], y_resampled)

    X_eval = Xs_test[feature_ecoli]
    y_predict = model_ecoli.predict(X_eval)

    # ROC AUC is a ranking metric and needs continuous scores. Feeding it the
    # thresholded labels collapses the curve to a single operating point, so
    # use the probability of the greater-label class when it is available.
    if hasattr(model_ecoli, 'predict_proba'):
        y_score = model_ecoli.predict_proba(X_eval)[:, 1]
    else:
        # Fallback keeps the previous behaviour for estimators without
        # probability output.
        y_score = y_predict

    acc = accuracy_score(y_test, y_predict)
    scores_ecoli_ac.append(acc)
    auc = roc_auc_score(y_test, y_score)
    scores_ecoli_auc.append(auc)
    f1 = f1_score(y_test, y_predict)
    scores_ecoli_f1.append(f1)
    precision = precision_score(y_test, y_predict)
    scores_ecoli_precision.append(precision)
    recall = recall_score(y_test, y_predict)
    scores_ecoli_recall.append(recall)

In [14]:
# Per-fold accuracies, followed by their mean and *sample* variance
# (statistics.variance divides by n - 1).
print(scores_ecoli_ac)
accuracy_mean = statistics.mean(scores_ecoli_ac)
accuracy_variance = statistics.variance(scores_ecoli_ac)
print(accuracy_mean, accuracy_variance)

[0.8541114058355438, 0.8509316770186336, 0.7452830188679245, 0.9224489795918367, 0.8773006134969326, 0.8926553672316384, 0.7191283292978208, 0.523121387283237, 0.8925979680696662, 0.9046242774566474, 0.928374655647383, 0.9476309226932669, 0.9375, 0.4817813765182186, 0.8735955056179775]
0.8234056989751152 0.021173998578612716


In [15]:
import csv
def generateString(scores):
    """Format a list of scores as ``'<mean>±<variance>'``.

    Both numbers are rounded to 4 decimal places. The value after "±" is the
    population variance (``np.var`` with its default ``ddof=0``), not a
    standard deviation.

    Parameters
    ----------
    scores : sequence of numbers
        The per-fold values to summarise.

    Returns
    -------
    str
        The formatted summary string.
    """
    centre = np.mean(scores)
    spread = np.var(scores)
    centre_rounded = round(centre, 4)
    spread_rounded = round(spread, 4)
    return f'{centre_rounded}±{spread_rounded}'
# Write the summary as a header row plus one 'ecoli' row of
# generateString(...) values. Those values contain the non-ASCII "±"
# character, so the encoding is pinned to UTF-8 instead of relying on the
# platform's locale default, which would make the file's bytes differ between
# machines.
with open('resulte.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Type','ACC','AUC','F1','Precision', 'Recall'])
    writer.writerow(
        ['ecoli']
        + [
            generateString(metric_scores)
            for metric_scores in (
                scores_ecoli_ac,
                scores_ecoli_auc,
                scores_ecoli_f1,
                scores_ecoli_precision,
                scores_ecoli_recall,
            )
        ]
    )