In [9]:
from hyperopt import tpe
from hyperopt import STATUS_OK, Trials, hp, fmin
import pandas as pd 
import numpy as np 
import mlflow
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, confusion_matrix 
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler

In [10]:
df = pd.read_csv('https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
X = df.iloc[:,:-1]
Y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=0, stratify=Y)
mx = MinMaxScaler()

X_train = mx.fit_transform(X_train)
X_test = mx.transform(X_test)

In [12]:
N_FOLDS = 4 
MAX_EVALS = 10
def objective(params, n_folds = N_FOLDS):
    mlflow.sklearn.autolog()
    with mlflow.start_run(nested=True):
        clf = LogisticRegression(**params, random_state=0)
        scores = cross_val_score(clf, X_train, y_train, cv=N_FOLDS, scoring='f1_macro')
        
        best_score = max(scores)
        
        loss = 1 - best_score
        
        return {'loss': loss, 'params':params, 'status': STATUS_OK}

In [13]:
space ={
    'warm_start': hp.choice('warm_start', [True, False]),
    'fit_intercept': hp.choice('fit_intercept',[True, False]),
    'tol': hp.uniform('tol', 0.00001, 0.0001),
    'C': hp.uniform('C', 0.05, 3),
    'solver': hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
    'max_iter': hp.choice('max_iter', range(5,1000))
}

In [14]:
mlflow.set_experiment('Hyperopt_Optimization')

2022/10/18 12:40:20 INFO mlflow.tracking.fluent: Experiment with name 'Hyperopt_Optimization' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///Users/phamthanhtu/Documents/mlflow/tuning/mlruns/3', creation_time=1666071620166, experiment_id='3', last_update_time=1666071620166, lifecycle_stage='active', name='Hyperopt_Optimization', tags={}>

In [15]:
# Algorithm
tpe_algorithm = tpe.suggest

bayes_trials = Trials()
with mlflow.start_run(run_name='hyper_opt_logistic') as run: 
    best = fmin(fn=objective, space=space, algo=tpe_algorithm, max_evals=MAX_EVALS, trials=bayes_trials)
    best = {k:float(v) for k,v in best.items()}
    mlflow.log_dict(best, "best_params.json")

 20%|██        | 2/10 [00:23<01:24, 10.59s/trial, best loss: 0.23324922904401468]







100%|██████████| 10/10 [01:12<00:00,  7.26s/trial, best loss: 0.23324922904401468]
