In [2]:
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from hyperopt import hp
import numpy as np
from hyperopt import Trials, tpe, fmin
import warnings
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LassoCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

warnings.filterwarnings("ignore")

In [3]:
ma = pd.read_csv("../16 Mycoplosma/ma.csv")
map = pd.read_csv("../16 Mycoplosma/map.csv")
marth = pd.read_csv("../16 Mycoplosma/marth.csv")
mcap = pd.read_csv("../16 Mycoplosma/mcap.csv")
mcon = pd.read_csv("../16 Mycoplosma/mcon.csv")
mcro = pd.read_csv("../16 Mycoplosma/mcro.csv")
mgal = pd.read_csv("../16 Mycoplosma/mgal.csv")
mhom = pd.read_csv("../16 Mycoplosma/mhom.csv")
mhyoJ = pd.read_csv("../16 Mycoplosma/mhyoJ.csv")
mhyo232 = pd.read_csv("../16 Mycoplosma/mhyo232.csv")
mhyo7448 = pd.read_csv("../16 Mycoplosma/mhyo7448.csv")
mm163K = pd.read_csv("../16 Mycoplosma/mm163K.csv")
mms = pd.read_csv("../16 Mycoplosma/mms.csv")
mpHF = pd.read_csv("../16 Mycoplosma/mpHF.csv")
mpM = pd.read_csv("../16 Mycoplosma/mpM.csv")
ms53 = pd.read_csv("../16 Mycoplosma/ms53.csv")

In [4]:
def generate_train_test_dataset(index):
    dataset_names = [ma, map, marth, mcap, mcon, mcro, mgal, mhom, mhyoJ, mhyo232, mhyo7448, mm163K, mms, mpHF, mpM, ms53]
    merged_train = dataset_names[:index]+dataset_names[index+1:]
    train_data = pd.concat(merged_train)
    test_data = dataset_names[index]
    return train_data,test_data

In [5]:
def handle_dataset(dataset):
    columns_to_drop = ['identity','name', 'Amino acid', 'protein_sequence', 'essentiality']
    Xs = dataset.drop(columns = columns_to_drop)
    y = dataset['essentiality']
    return Xs,y

In [6]:
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)

In [9]:
space = {
    'var_smoothing': hp.loguniform('var_smoothing', -9, 0)
}
lassoCV = LassoCV(cv=20,random_state=10)
all_feature_names = ["GC_Content","CAI","A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V","nSE2","nSE3","nGE2","nGE3"]

In [8]:
def nb_ac_cv(params):
    scorer = make_scorer(accuracy_score)
    model = GaussianNB(var_smoothing=params['var_smoothing'])
    score = -cross_val_score(model, X_resampled[selected_features], y_resampled, cv=5,scoring=scorer, ).mean()
    return score

In [14]:
scores = []
score = 0
models = []
features = []
for j in range(0,3):
    for i in range(0,15):
        train_data, test_data = generate_train_test_dataset(i)
        Xs_train,y_train = handle_dataset(train_data)
        Xs_test,y_test = handle_dataset(test_data)
        rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
        X_resampled, y_resampled = rus.fit_resample(Xs_train, y_train)
        lassoCV.fit(X_resampled,y_resampled)
        selected_features = [feature for feature, coef in zip(all_feature_names, lassoCV.coef_) if coef !=0]
        trials = Trials()
        best=fmin(fn=nb_ac_cv, # function to optimize
              space=space, 
              algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
              max_evals=50, # maximum number of iterations
              trials=trials, # logging
              rstate=np.random.default_rng(42) # fixing random state for the reproducibility
        )
        model = GaussianNB(**best)
        models.append(model)
        model.fit(X_resampled[selected_features],y_resampled)
        tpe_test_score=accuracy_score(y_test, model.predict(Xs_test[selected_features]))
        scores.append(tpe_test_score)
        print(tpe_test_score)
        if score < tpe_test_score:
            score = tpe_test_score
            best_model = model
            features = selected_features

100%|██████████| 50/50 [00:01<00:00, 41.77trial/s, best loss: -0.8117626321974148]
0.8408488063660478
100%|██████████| 50/50 [00:01<00:00, 45.78trial/s, best loss: -0.8010989010989011]
0.8115942028985508
100%|██████████| 50/50 [00:01<00:00, 46.69trial/s, best loss: -0.8252645352144727]
0.6383647798742138
100%|██████████| 50/50 [00:01<00:00, 41.98trial/s, best loss: -0.799220127129961] 
0.9469387755102041
100%|██████████| 50/50 [00:01<00:00, 46.00trial/s, best loss: -0.8219538841929734]
0.8496932515337423
100%|██████████| 50/50 [00:01<00:00, 47.35trial/s, best loss: -0.82472083630316]
0.884180790960452
100%|██████████| 50/50 [00:01<00:00, 45.51trial/s, best loss: -0.8128319623971798]
0.6876513317191283
100%|██████████| 50/50 [00:01<00:00, 45.76trial/s, best loss: -0.8165947904088322]
0.9354838709677419
100%|██████████| 50/50 [00:01<00:00, 46.47trial/s, best loss: -0.8231482534986915]
0.5867052023121387
100%|██████████| 50/50 [00:01<00:00, 47.62trial/s, best loss: -0.7953069030417097]
0.

In [15]:
print(scores)

[0.8408488063660478, 0.8115942028985508, 0.6383647798742138, 0.9469387755102041, 0.8496932515337423, 0.884180790960452, 0.6876513317191283, 0.9354838709677419, 0.5867052023121387, 0.8113207547169812, 0.7832369942196532, 0.9173553719008265, 0.9376558603491272, 0.97, 0.5020242914979757, 0.8408488063660478, 0.8115942028985508, 0.6383647798742138, 0.9469387755102041, 0.8496932515337423, 0.884180790960452, 0.6876513317191283, 0.9354838709677419, 0.5867052023121387, 0.8113207547169812, 0.7832369942196532, 0.9173553719008265, 0.9376558603491272, 0.97, 0.5020242914979757, 0.8408488063660478, 0.8115942028985508, 0.6383647798742138, 0.9469387755102041, 0.8496932515337423, 0.884180790960452, 0.6876513317191283, 0.9354838709677419, 0.5867052023121387, 0.8113207547169812, 0.7832369942196532, 0.9173553719008265, 0.9376558603491272, 0.97, 0.5020242914979757]


In [16]:
from joblib import dump

dump(best_model, '../../model/NB_model_lasso.joblib')

['../../model/NB_model_lasso.joblib']

In [17]:
print(features)

['GC_Content', 'CAI', 'N']
