In [1]:
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from hyperopt import hp
import numpy as np
from hyperopt import Trials, tpe, fmin
import warnings
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LassoCV
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

warnings.filterwarnings("ignore")

In [2]:
ma = pd.read_csv("../16 Mycoplosma/ma.csv")
map = pd.read_csv("../16 Mycoplosma/map.csv")
marth = pd.read_csv("../16 Mycoplosma/marth.csv")
mcap = pd.read_csv("../16 Mycoplosma/mcap.csv")
mcon = pd.read_csv("../16 Mycoplosma/mcon.csv")
mcro = pd.read_csv("../16 Mycoplosma/mcro.csv")
mgal = pd.read_csv("../16 Mycoplosma/mgal.csv")
mhom = pd.read_csv("../16 Mycoplosma/mhom.csv")
mhyoJ = pd.read_csv("../16 Mycoplosma/mhyoJ.csv")
mhyo232 = pd.read_csv("../16 Mycoplosma/mhyo232.csv")
mhyo7448 = pd.read_csv("../16 Mycoplosma/mhyo7448.csv")
mm163K = pd.read_csv("../16 Mycoplosma/mm163K.csv")
mms = pd.read_csv("../16 Mycoplosma/mms.csv")
mpHF = pd.read_csv("../16 Mycoplosma/mpHF.csv")
mpM = pd.read_csv("../16 Mycoplosma/mpM.csv")
ms53 = pd.read_csv("../16 Mycoplosma/ms53.csv")

In [3]:
def generate_train_test_dataset(index):
    dataset_names = [ma, map, marth, mcap, mcon, mcro, mgal, mhom, mhyoJ, mhyo232, mhyo7448, mm163K, mms, mpHF, mpM, ms53]
    merged_train = dataset_names[:index]+dataset_names[index+1:]
    train_data = pd.concat(merged_train)
    test_data = dataset_names[index]
    return train_data,test_data

In [4]:
def handle_dataset(dataset):
    columns_to_drop = ['identity','name', 'Amino acid', 'protein_sequence', 'essentiality']
    Xs = dataset.drop(columns = columns_to_drop)
    y = dataset['essentiality']
    return Xs,y

In [5]:
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)

In [6]:
lassoCV = LassoCV(cv=20,random_state=10)
all_feature_names = ["GC_Content","CAI","A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V","nSE2","nSE3","nGE2","nGE3"]


In [7]:
space = {'layer_size':hp.quniform('layer_size', 25, 300, 1),
         'alpha':hp.lognormal('alpha', mu=np.log(1e-4), sigma=1),
         'activation':hp.choice('activation', ['logistic', 'tanh', 'relu']),
         'max_iter': hp.quniform('max_iter', 100, 3000, 100),
         'learning_rate':hp.loguniform('learning_rate', low=np.log(1e-4), high=np.log(1.)),
        }

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
def mlp_ac_cv(args):
    scorer = make_scorer(accuracy_score)
    model = MLPClassifier(hidden_layer_sizes=(int(args['layer_size']),),                         max_iter=int(args['max_iter']),
                        alpha=args['alpha'], tol=1e-4, 
                        random_state=1, activation=args['activation'], 
                        learning_rate_init=args['learning_rate']
                       )
    score = -cross_val_score(model, X_resampled[selected_features], y_resampled, cv=5,scoring=scorer ).mean()
    return score

In [11]:
scores = []
score = 0
models = []
features = []
for i in range(0,15):
    train_data, test_data = generate_train_test_dataset(i)
    Xs_train,y_train = handle_dataset(train_data)
    Xs_test,y_test = handle_dataset(test_data)
    rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
    X_resampled, y_resampled = rus.fit_resample(Xs_train, y_train)
    lassoCV.fit(X_resampled,y_resampled)
    selected_features = [feature for feature, coef in zip(all_feature_names, lassoCV.coef_) if coef !=0]
    trials = Trials()
    best=fmin(fn=mlp_ac_cv, # function to optimize
          space=space, 
          algo=tpe.suggest, # optimization algorithm, hyperotp will select its parameters automatically
          max_evals=15, # maximum number of iterations
          trials=trials, # logging
          rstate=np.random.default_rng(42) # fixing random state for the reproducibility
    )
    activations = ['logistic', 'tanh', 'relu']
    model = MLPClassifier(hidden_layer_sizes=int(best['layer_size']),                            max_iter=int(best['max_iter']),alpha=best['alpha'], tol=1e-4,  random_state=1, activation=activations[best['activation']], learning_rate_init=best['learning_rate'])
    models.append(model)
    model.fit(X_resampled[selected_features],y_resampled)
    tpe_test_score=accuracy_score(y_test, model.predict(Xs_test[selected_features]))
    scores.append(tpe_test_score)
    print(tpe_test_score)
    if score < tpe_test_score:
        score = tpe_test_score
        best_model = model
        features = selected_features

100%|██████████| 15/15 [00:46<00:00,  3.13s/trial, best loss: -0.8312397179788483]
0.8673740053050398
100%|██████████| 15/15 [00:47<00:00,  3.19s/trial, best loss: -0.8351648351648352]
0.7950310559006211
100%|██████████| 15/15 [00:47<00:00,  3.16s/trial, best loss: -0.8370121743087949]
0.60062893081761
100%|██████████| 15/15 [00:52<00:00,  3.51s/trial, best loss: -0.8250841301212543]
0.9510204081632653
100%|██████████| 15/15 [00:40<00:00,  2.70s/trial, best loss: -0.8326950721637628]
0.8558282208588958
100%|██████████| 15/15 [00:43<00:00,  2.93s/trial, best loss: -0.8301734378712282]
0.6807909604519774
100%|██████████| 15/15 [00:47<00:00,  3.17s/trial, best loss: -0.8279905992949471]
0.6464891041162227
100%|██████████| 15/15 [00:54<00:00,  3.64s/trial, best loss: -0.8284284974987063]
0.9129032258064517
100%|██████████| 15/15 [00:50<00:00,  3.37s/trial, best loss: -0.8370121743087952]
0.5635838150289018
100%|██████████| 15/15 [01:03<00:00,  4.22s/trial, best loss: -0.8338534393783013]
0

In [12]:
print(scores)

[0.8673740053050398, 0.7950310559006211, 0.60062893081761, 0.9510204081632653, 0.8558282208588958, 0.6807909604519774, 0.6464891041162227, 0.9129032258064517, 0.5635838150289018, 0.8142235123367199, 0.9161849710982659, 0.9366391184573003, 0.9102244389027432, 0.945, 0.45546558704453444]


In [13]:
from joblib import dump

dump(best_model, '../../model/MLP_model_lasso.joblib')

['../../model/MLP_model_lasso.joblib']

In [14]:
print(features)

['GC_Content', 'CAI', 'A', 'N', 'H', 'nSE3', 'nGE3']
