In [1]:
import pandas as pd
from sklearn.metrics import make_scorer, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from hyperopt import hp
import numpy as np
from hyperopt import Trials, tpe, fmin
import warnings
from imblearn.under_sampling import RandomUnderSampler
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from nbimporter import NotebookLoader

warnings.filterwarnings("ignore")

In [2]:
# Load the 16 per-dataset CSV files; each frame is later used as one fold
# (train on the others, test on this one) by generate_train_test_dataset.
# NOTE(review): the name `map` below shadows Python's built-in map() for the
# rest of the kernel session; it is kept because later cells refer to it.
ma = pd.read_csv("../16 Mycoplosma/ma.csv")
map = pd.read_csv("../16 Mycoplosma/map.csv")
marth = pd.read_csv("../16 Mycoplosma/marth.csv")
mcap = pd.read_csv("../16 Mycoplosma/mcap.csv")
mcon = pd.read_csv("../16 Mycoplosma/mcon.csv")
mcro = pd.read_csv("../16 Mycoplosma/mcro.csv")
mgal = pd.read_csv("../16 Mycoplosma/mgal.csv")
mhom = pd.read_csv("../16 Mycoplosma/mhom.csv")
mhyoJ = pd.read_csv("../16 Mycoplosma/mhyoJ.csv")
mhyo232 = pd.read_csv("../16 Mycoplosma/mhyo232.csv")
mhyo7448 = pd.read_csv("../16 Mycoplosma/mhyo7448.csv")
mm163K = pd.read_csv("../16 Mycoplosma/mm163K.csv")
mms = pd.read_csv("../16 Mycoplosma/mms.csv")
mpHF = pd.read_csv("../16 Mycoplosma/mpHF.csv")
mpM = pd.read_csv("../16 Mycoplosma/mpM.csv")
ms53 = pd.read_csv("../16 Mycoplosma/ms53.csv")

In [20]:
def calculate_essential(df, threshold=0.5):
    """Print how many rows lie above / not above an ``identity`` threshold.

    Each row is flagged 1 when ``df['identity'] > threshold`` and 0 otherwise,
    and the number of rows per flag is printed as a two-column table
    (``essentiality_0.6``, ``count``).

    The input frame is NOT modified. Earlier versions added an
    ``essentiality_0.6`` column to ``df`` in place, which then leaked into the
    feature matrix of any later cell that reused the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``identity`` column.
    threshold : float, default 0.5
        Strict lower bound on ``identity`` for a row to be flagged 1.
        NOTE(review): the printed column label says "0.6" while the default
        threshold is 0.5 -- confirm which value is intended.

    Returns
    -------
    None
        The table is printed, not returned.

    Raises
    ------
    KeyError
        If ``df`` has no ``identity`` column.
    """
    # Work on a temporary Series so the caller's DataFrame keeps its columns.
    flags = (df['identity'] > threshold).astype(int)
    counts = flags.value_counts().reset_index()
    # Column labels from reset_index() differ between pandas versions, so set
    # them explicitly.
    counts.columns = ['essentiality_0.6', 'count']
    print(counts)

In [35]:
# Print the count of rows above / not above the identity threshold for the ms53 dataset.
calculate_essential(ms53)

   essentiality_0.6  count
0                 0    202
1                 1    154


In [3]:
def generate_train_test_dataset(index):
    """Build a leave-one-dataset-out split from the module-level frames.

    Parameters
    ----------
    index : int
        Position of the held-out dataset in the fixed dataset order
        (ma, map, marth, ..., ms53). Negative values count from the end,
        as with normal Python indexing.

    Returns
    -------
    (train_data, test_data) : tuple of pandas.DataFrame
        ``train_data`` is the row-wise concatenation of every dataset except
        the held-out one (original row labels are kept); ``test_data`` is the
        held-out frame itself, not a copy.

    Raises
    ------
    IndexError
        If ``index`` is outside the range of available datasets.
    """
    dataset_names = [ma, map, marth, mcap, mcon, mcro, mgal, mhom, mhyoJ, mhyo232, mhyo7448, mm163K, mms, mpHF, mpM, ms53]
    n_datasets = len(dataset_names)
    # Fail fast, before the concat, on an index that cannot select a dataset.
    if not -n_datasets <= index < n_datasets:
        raise IndexError(
            f"dataset index {index} out of range for {n_datasets} datasets"
        )
    # Normalise negative indices: with the raw value, e.g. -1, the slices
    # [:index] + [index+1:] would put the held-out frame (and duplicates of
    # the others) into the training set.
    index %= n_datasets
    merged_train = dataset_names[:index] + dataset_names[index + 1:]
    train_data = pd.concat(merged_train)
    test_data = dataset_names[index]
    return train_data, test_data

In [4]:
def handle_dataset(dataset):
    """Separate a dataset frame into model inputs and labels.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``identity``, ``name``, ``Amino acid``,
        ``protein_sequence`` and ``essentiality``.

    Returns
    -------
    (Xs, y) : tuple
        ``Xs`` is a new frame holding every column except the five listed
        above; ``y`` is the ``essentiality`` column.

    Raises
    ------
    KeyError
        If any of the five expected columns is missing.
    """
    label_column = 'essentiality'
    non_feature_columns = ['identity', 'name', 'Amino acid', 'protein_sequence']
    y = dataset[label_column]
    # drop() returns a new frame, so the caller's data is left untouched.
    Xs = dataset.drop(columns=non_feature_columns + [label_column])
    return Xs, y

In [5]:
# Seeded random under-sampler used to balance the classes of the training data.
# NOTE(review): the training-loop cell rebinds `rus` to an identically
# configured sampler, so later cells use whichever definition ran last.
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)

In [21]:
from sklearn.linear_model import LassoCV

# Lasso with 20-fold cross-validation and a fixed seed. Later cells use it as
# a feature selector: features whose fitted coefficient is 0 are discarded.
lassoCV = LassoCV(cv=20,random_state=10)
# The 26 feature names used to label Lasso coefficients by position.
# NOTE(review): assumes this order matches the feature columns returned by
# handle_dataset -- confirm against the CSV headers.
all_feature_names = ["GC_Content","CAI","A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V","nSE2","nSE3","nGE2","nGE3"]

In [22]:
# Hyperopt search space for the XGBoost hyperparameters tuned by xgb_ac_cv.
# The two quniform entries are cast with int() before being passed to the model.
space = {
    'n_estimators': hp.quniform('n_estimators', 100, 1000, 100),  # number of trees
    'max_depth': hp.quniform('max_depth', 3, 10, 1),  # maximum depth of each tree
    'learning_rate': hp.loguniform('learning_rate', -3, 0),  # learning rate
    'subsample': hp.uniform('subsample', 0.5, 1.0),  # fraction of samples drawn for each tree
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1.0),  # fraction of features drawn for each tree
    'gamma': hp.loguniform('gamma', -3, 2),  # minimum loss reduction required to split a node
    'reg_alpha': hp.loguniform('reg_alpha', -3, 2),  # L1 regularization coefficient
    'reg_lambda': hp.loguniform('reg_lambda', -3, 2),  # L2 regularization coefficient
}

In [23]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score


def xgb_ac_cv(params):
    """Hyperopt objective: negated mean 5-fold CV accuracy of an XGBoost model.

    Reads the training data from the module-level names ``X_resampled`` and
    ``y_resampled``, which must be set before this function is called.

    Parameters
    ----------
    params : dict
        One sample from the search space. ``n_estimators`` and ``max_depth``
        are cast to int; the remaining entries are passed through unchanged.

    Returns
    -------
    float
        ``-mean(accuracy)`` over the 5 folds, so that minimising this value
        maximises accuracy.
    """
    integer_keys = ('n_estimators', 'max_depth')
    passthrough_keys = (
        'learning_rate',
        'subsample',
        'colsample_bytree',
        'gamma',
        'reg_alpha',
        'reg_lambda',
    )
    model_kwargs = {key: int(params[key]) for key in integer_keys}
    model_kwargs.update({key: params[key] for key in passthrough_keys})
    # Binary classification objective.
    classifier = XGBClassifier(objective='binary:logistic', **model_kwargs)
    fold_accuracies = cross_val_score(
        classifier,
        X_resampled,
        y_resampled,
        cv=5,
        scoring=make_scorer(accuracy_score),
    )
    return -fold_accuracies.mean()

In [24]:
from joblib import load
from sklearn.feature_selection import RFE

# Leave-one-dataset-out training loop.
# For every fold: balance the training data, select features in two stages
# (RFE down to 20 columns, then Lasso non-zero coefficients), tune XGBoost
# with hyperopt, refit with the best parameters and score on the held-out set.
scores = []   # held-out accuracy of each fold, in fold order
score = 0     # best held-out accuracy seen so far
models = []   # fitted classifier of each fold, in fold order
# joblib.load unpickles the file: only load model files from a trusted source.
model_xgb_fs = load("../../model/XGB_model.joblib")
# The sampler is loop-invariant, so it is built once instead of once per fold.
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
# NOTE(review): 16 datasets are loaded but only folds 0-14 are run, so the
# last dataset (ms53) is never used as the held-out set -- confirm this is
# intentional before changing the range.
for i in range(0,15):
    train_data, test_data = generate_train_test_dataset(i)
    Xs_train,y_train = handle_dataset(train_data)
    Xs_test,y_test = handle_dataset(test_data)
    X_resampled, y_resampled = rus.fit_resample(Xs_train, y_train)

    # Stage 1: recursive feature elimination down to 20 columns.
    rfe = RFE(model_xgb_fs, n_features_to_select=20)
    rfe.fit(X_resampled, y_resampled)
    # Names of the columns RFE kept, taken from the data itself so they stay
    # aligned with the reduced matrices below.
    rfe_feature_names = list(Xs_train.columns[rfe.get_support()])
    # np.asarray keeps the positional slicing below valid even if scikit-learn
    # is configured to return DataFrames from transform().
    X_resampled = np.asarray(rfe.transform(X_resampled))  # For training data
    Xs_test = np.asarray(rfe.transform(Xs_test))  # For testing data

    # Stage 2: keep the RFE columns whose Lasso coefficient is non-zero.
    lassoCV.fit(X_resampled,y_resampled)
    # coef_ has one entry per RFE-kept column, so these are positions in the
    # reduced matrices. (Previously the coefficients were zipped with the
    # first 20 entries of the full name list, which mislabelled the features.)
    selected_feature_indices = np.flatnonzero(lassoCV.coef_)
    if selected_feature_indices.size == 0:
        # Lasso rejected every column: fall back to all RFE-kept columns
        # rather than training on an empty matrix.
        selected_feature_indices = np.arange(X_resampled.shape[1])
    selected_features = [rfe_feature_names[j] for j in selected_feature_indices]
    X_resampled = X_resampled[:, selected_feature_indices]
    Xs_test = Xs_test[:,selected_feature_indices]

    # xgb_ac_cv reads the module-level X_resampled / y_resampled set above.
    trials = Trials()
    best=fmin(fn=xgb_ac_cv, # function to optimize
              space=space,
              algo=tpe.suggest, # optimization algorithm, hyperopt will select its parameters automatically
              max_evals=50, # maximum number of iterations
              trials=trials, # logging
              rstate=np.random.default_rng(42) # fixing random state for the reproducibility
    )

    model = XGBClassifier(
        n_estimators=int(best['n_estimators']),
        max_depth=int(best['max_depth']),
        learning_rate=best['learning_rate'],
        subsample=best['subsample'],
        colsample_bytree=best['colsample_bytree'],
        gamma=best['gamma'],
        reg_alpha=best['reg_alpha'],
        reg_lambda=best['reg_lambda'],
        objective='binary:logistic'  # binary classification
    )
    model.fit(X_resampled,y_resampled)
    models.append(model)
    tpe_test_score=accuracy_score(y_test, model.predict(Xs_test))
    scores.append(tpe_test_score)
    print(tpe_test_score)
    # NOTE(review): the "best" model is chosen by held-out accuracy, so `score`
    # is an optimistic estimate of its performance on unseen data.
    if score < tpe_test_score:
        score = tpe_test_score
        best_model = model
        # Remember which columns the best model expects as input.
        best_model_features = selected_features

100%|██████████| 50/50 [00:37<00:00,  1.34trial/s, best loss: -0.8561163337250294]
0.870026525198939
100%|██████████| 50/50 [01:01<00:00,  1.23s/trial, best loss: -0.879120879120879] 
0.8674948240165632
100%|██████████| 50/50 [00:32<00:00,  1.56trial/s, best loss: -0.8402264193878711]
0.6792452830188679
100%|██████████| 50/50 [00:54<00:00,  1.09s/trial, best loss: -0.8665081993483253]
0.9142857142857143
100%|██████████| 50/50 [00:50<00:00,  1.01s/trial, best loss: -0.8756138232419067]
0.8987730061349694
100%|██████████| 50/50 [00:53<00:00,  1.06s/trial, best loss: -0.8715550011879307]
0.8983050847457628
100%|██████████| 50/50 [00:47<00:00,  1.05trial/s, best loss: -0.875599294947121] 
0.784503631961259
100%|██████████| 50/50 [01:01<00:00,  1.23s/trial, best loss: -0.8810074176298086]
0.9258064516129032
100%|██████████| 50/50 [00:32<00:00,  1.54trial/s, best loss: -0.8413243827511663]
0.8815028901734104
100%|██████████| 50/50 [00:30<00:00,  1.64trial/s, best loss: -0.798615748891992] 
0

In [25]:
# Highest held-out accuracy recorded by the training loop.
print(score)

0.9476584022038568


In [29]:
# Re-run the two-stage feature selection (RFE, then Lasso) on a single fold and
# print the feature names kept by each stage.
# NOTE(review): fold index 12 is hard-coded -- confirm it is the fold of interest
# (e.g. the one that produced best_model).
train_data, test_data = generate_train_test_dataset(12)
Xs_train,y_train = handle_dataset(train_data)
# Uses whichever `rus` is currently bound in the kernel.
X_resampled, y_resampled = rus.fit_resample(Xs_train, y_train)
rfe = RFE(model_xgb_fs, n_features_to_select=20)
rfe.fit(X_resampled, y_resampled)
X_resampled = rfe.transform(X_resampled)
lassoCV.fit(X_resampled,y_resampled)
# Column positions kept by RFE, translated to names through all_feature_names.
# NOTE(review): assumes all_feature_names is in the same order as the columns
# of Xs_train -- confirm.
selected_feature_indices = rfe.get_support(indices=True)
selected_feature_names = [all_feature_names[index] for index in selected_feature_indices]
# Lasso was fitted on the RFE-reduced matrix, so its coefficients are paired
# with the RFE-kept names (not with the full name list).
selected_features = [feature for feature, coef in zip(selected_feature_names, lassoCV.coef_) if coef !=0]
print("Selected feature RFE:", selected_feature_names)
print("Selected feature Lasso:", selected_features)

Selected feature RFE: ['CAI', 'A', 'N', 'D', 'C', 'Q', 'G', 'H', 'L', 'K', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', 'nSE2', 'nSE3', 'nGE2']
Selected feature Lasso: ['CAI', 'A', 'N', 'D', 'C', 'G', 'H', 'T', 'V', 'nSE2', 'nGE2']


In [26]:
from joblib import dump

# Persist the fold model with the highest held-out accuracy.
# NOTE(review): the file stores only the classifier; the list of feature columns
# it was trained on differs per fold and is not saved with it.
dump(best_model, '../../model/XGB_model_FSS.joblib')

['../../model/XGB_model_FSS.joblib']

In [12]:
# Lasso-only feature selection (no RFE step) on fold 12: fit LassoCV on the
# under-sampled training data and print the features with a non-zero coefficient.
train_data, test_data = generate_train_test_dataset(12)
Xs_train,y_train = handle_dataset(train_data)
# Xs_test / y_test are not used further in this cell.
Xs_test,y_test = handle_dataset(test_data)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = rus.fit_resample(Xs_train, y_train)
lassoCV.fit(X_resampled,y_resampled)
# No columns were removed before the fit, so coefficients are paired with the
# full name list.
# NOTE(review): assumes all_feature_names matches the column order of Xs_train -- confirm.
selected_features = [feature for feature, coef in zip(all_feature_names, lassoCV.coef_) if coef !=0]
print(selected_features)

['GC_Content', 'CAI', 'A', 'N', 'D', 'H', 'M', 'nSE3', 'nGE3']
