<div class="alert alert-block alert-info">
__Name__: pangenome_hyperopt<br/>
__Description__: Automated search for the best classifier hyper-parameters (random forest, extra trees, gradient boosting, XGBoost) using hyperopt-sklearn<br/>
__Author__: Matthew Whiteside matthew dot whiteside at canada dot ca<br/>
__Date__: Oct 11, 2017<br/>
__TODO__:<br/>
</div>

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [13]:
# Load external libs
from hpsklearn import HyperoptEstimator, any_classifier, xgboost_classification, random_forest, gradient_boosting, extra_trees
from hyperopt import tpe
import numpy as np
from sklearn.metrics import f1_score, classification_report

In [6]:
# Load data
# The working directory is switched first; presumably this is what makes the
# project modules below (config, utils, classify) importable -- TODO confirm.
# NOTE: this makes the cell non-idempotent; re-running it applies the relative
# chdir a second time from the new location.
import os
os.chdir('../pangenome')
import config
import utils
import classify  # NOTE(review): not referenced in the cells visible here
# Pangenome matrix plus its genome and locus labels (rows are indexed by the
# same mask as `amr` in the next cell, so rows presumably correspond to genomes).
pg, genome_list, locus_list = utils.read_panseq(config.PANSEQ['pangenome_file'])
# AMR phenotype matrix and the list of drug names; `genome_list` is passed in,
# presumably so rows line up with `pg` -- verify against utils.read_amr.
amr,amr_list = utils.read_amr(config.PHENOTYPE['amr_file'], genome_list)
# BLAST-derived annotations; not used by the cells visible here.
annot = utils.read_annot(config.ANNOTATION['blast_file'])

In [7]:
# Split into train & test for ampicillin
# Column index of the ampicillin phenotype within the AMR matrix.
d = np.argwhere(amr_list == 'ampicillin').item(0)
# Keep only rows that have a recorded (non-NaN) ampicillin value.
validrows = ~np.isnan(amr[:,d])
X = pg[validrows,:]
y = amr[validrows,d]

# Hold out 20% of the labelled rows for testing.
test_size = int( 0.2 * len( y ) )
np.random.seed( 21 )  # fixed seed so the shuffle is repeatable
indices = np.random.permutation(X.shape[0])

# Split on a non-negative boundary index instead of negative slices:
# `indices[:-test_size]` / `indices[-test_size:]` invert when test_size == 0
# (empty train set, everything in test). For test_size > 0 the result is
# identical to the negative-slice form.
split_at = max(len(indices) - test_size, 0)
train_idx = indices[:split_at]
test_idx = indices[split_at:]

X_train = X[train_idx]
y_train = y[train_idx]
X_test = X[test_idx]
y_test = y[test_idx]

In [8]:
# Define loss function
def loss_fn(y_target, y_prediction):
    """Return the loss minimised by the hyper-parameter search: 1 - F1.

    Args:
        y_target: True labels, passed to ``f1_score`` as its first argument.
        y_prediction: Predicted labels, passed to ``f1_score`` as its second
            argument.

    Returns:
        ``1.0`` minus the F1 score, so a higher F1 gives a lower loss.
    """
    f1 = f1_score(y_target, y_prediction)
    return 1.0 - f1

In [12]:
# HP search for Random Forest
# Convert to dense arrays once; the original called X_test.toarray() three
# times in this cell, rebuilding the same dense matrix each time.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# TPE-driven search over random-forest hyper-parameters, no preprocessing
# steps, scored with loss_fn.
rfc = HyperoptEstimator( classifier=random_forest('rfc'), preprocessing=[], algo=tpe.suggest, loss_fn=loss_fn, trial_timeout=2000)
rfc.fit( X_train_dense, y_train )
print( rfc.score( X_test_dense, y_test ) )
print( rfc.best_model() )
predictions = rfc.predict( X_test_dense )
print(classification_report(y_test, predictions))

0.941176470588
{'learner': RandomForestClassifier(bootstrap=False, class_weight=None,
            criterion='entropy', max_depth=None,
            max_features=0.344782910872078, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=67, n_jobs=1, oob_score=False, random_state=0,
            verbose=False, warm_start=False), 'preprocs': (), 'ex_preprocs': ()}


NameError: name 'classification_report' is not defined

In [18]:
# Predict on the held-out set with the fitted random-forest search result
# and print the per-class precision / recall / F1 table.
X_test_dense = X_test.toarray()
predictions = rfc.predict( X_test_dense )
rfc_report = classification_report(y_test, predictions)
print(rfc_report)

             precision    recall  f1-score   support

        0.0       0.82      0.93      0.87        15
        1.0       0.98      0.94      0.96        53

avg / total       0.95      0.94      0.94        68



In [15]:
# HP search for Extra Trees
# Convert to dense arrays once; the original called X_test.toarray() three
# times in this cell, rebuilding the same dense matrix each time.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# TPE-driven search over extra-trees hyper-parameters, no preprocessing
# steps, scored with loss_fn.
etc = HyperoptEstimator( classifier=extra_trees('etc'), preprocessing=[], algo=tpe.suggest, loss_fn=loss_fn, trial_timeout=2000)
etc.fit( X_train_dense, y_train )
print( etc.score( X_test_dense, y_test ) )
print( etc.best_model() )
predictions = etc.predict( X_test_dense )
print(classification_report(y_test, predictions))

0.955882352941
{'learner': ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='entropy',
           max_depth=None, max_features=0.9043349350446708,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=347, n_jobs=1,
           oob_score=False, random_state=1, verbose=False,
           warm_start=False), 'preprocs': (), 'ex_preprocs': ()}
             precision    recall  f1-score   support

        0.0       0.88      0.93      0.90        15
        1.0       0.98      0.96      0.97        53

avg / total       0.96      0.96      0.96        68



In [16]:
# HP search for Gradient Boosting
# Convert to dense arrays once; the original called X_test.toarray() three
# times in this cell, rebuilding the same dense matrix each time.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# TPE-driven search over gradient-boosting hyper-parameters, no preprocessing
# steps, scored with loss_fn.
gbc = HyperoptEstimator( classifier=gradient_boosting('gbc'), preprocessing=[], algo=tpe.suggest, loss_fn=loss_fn, trial_timeout=2000)
gbc.fit( X_train_dense, y_train )
print( gbc.score( X_test_dense, y_test ) )
print( gbc.best_model() )
predictions = gbc.predict( X_test_dense )
print(classification_report(y_test, predictions))

0.955882352941
{'learner': GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.0765306839796841, loss='deviance',
              max_depth=None, max_features=0.8356287036892351,
              max_leaf_nodes=None, min_impurity_split=1e-07,
              min_samples_leaf=5, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=292,
              presort='auto', random_state=2, subsample=1.0, verbose=0,
              warm_start=False), 'preprocs': (), 'ex_preprocs': ()}
             precision    recall  f1-score   support

        0.0       0.88      0.93      0.90        15
        1.0       0.98      0.96      0.97        53

avg / total       0.96      0.96      0.96        68



In [17]:
# HP search for XGBoost
# Convert to dense arrays once; the original called X_test.toarray() three
# times in this cell, rebuilding the same dense matrix each time.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# TPE-driven search over XGBoost classifier hyper-parameters, no
# preprocessing steps, scored with loss_fn.
xbc = HyperoptEstimator( classifier=xgboost_classification('xbc'), preprocessing=[], algo=tpe.suggest, loss_fn=loss_fn, trial_timeout=2000)
xbc.fit( X_train_dense, y_train )
print( xbc.score( X_test_dense, y_test ) )
print( xbc.best_model() )
predictions = xbc.predict( X_test_dense )
print(classification_report(y_test, predictions))

  'precision', 'predicted', average, warn_for)


0.941176470588
{'learner': XGBClassifier(base_score=0.5, colsample_bylevel=0.7028641308580167,
       colsample_bytree=0.7980603649782954, gamma=0.003328636988102734,
       learning_rate=0.00990360535843554, max_delta_step=0, max_depth=10,
       min_child_weight=2, missing=nan, n_estimators=4400, nthread=-1,
       objective='binary:logistic', reg_alpha=0.0007025873903998169,
       reg_lambda=1.0246535683564084, scale_pos_weight=1, seed=1,
       silent=True, subsample=0.9738872114011073), 'preprocs': (), 'ex_preprocs': ()}
             precision    recall  f1-score   support

        0.0       0.82      0.93      0.87        15
        1.0       0.98      0.94      0.96        53

avg / total       0.95      0.94      0.94        68

