In [1]:
!pip install mlflow hyperopt

Collecting mlflow
  Downloading mlflow-2.9.2-py3-none-any.whl (19.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.1/19.1 MB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
Collecting databricks-cli<1,>=0.8.7 (from mlflow)
  Downloading databricks_cli-0.18.0-py2.py3-none-any.whl (150 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.3/150.3 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=2.1.0 (from mlflow)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting docker<7,>=4.0.0 (from mlflow)
  Downloading docker-6.1.3-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━

# Utility functions

In [4]:
import pandas as pd
from sklearn.datasets import make_classification

# Build a dummy dataset for testing
def create_dataset(
    n_samples: int = 10000,
    n_features: int = 50,
    n_informative: int = 10,
    class_sep: float = 1.0,
    random_state: int = 42,
) -> pd.DataFrame:
    """
    Create a synthetic classification dataset as a DataFrame.

    The feature matrix and labels come from ``make_classification``; the
    features are stored in columns ``feature_0 .. feature_{n_features-1}`` and
    the labels in a ``target`` column.

    :param n_samples: Number of rows to generate.
    :param n_features: Total number of feature columns.
    :param n_informative: Number of informative features.
    :param class_sep: Class separation passed to ``make_classification``.
    :param random_state: Seed passed to ``make_classification``. Defaults to 42,
        the value that used to be hard-coded.
    :return: DataFrame with ``n_features`` feature columns plus ``target``.
    """
    features, labels = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        class_sep=class_sep,
        random_state=random_state,
    )

    column_names = [f"feature_{i}" for i in range(n_features)]
    dataset = pd.DataFrame(features, columns=column_names)
    dataset["target"] = labels

    return dataset

In [5]:
import mlflow
from typing import Any
# Create (or reuse) an experiment in MLflow
def create_mlflow_experiment(
    experiment_name: str, artifact_location: str, tags: dict[str, Any]
) -> str:
    """
    Create an MLflow experiment, or reuse it when the name already exists,
    and make it the active experiment.

    The experiment is looked up by name first. Creation is only attempted when
    no experiment with that name is found, so failures unrelated to
    "already exists" are no longer swallowed and surface to the caller.

    :param experiment_name: Name of the experiment.
    :param artifact_location: Artifact location used when creating the experiment.
    :param tags: Tags attached to the experiment when it is created.
    :return: The id of the created or existing experiment.
    """
    existing = mlflow.get_experiment_by_name(experiment_name)

    if existing is not None:
        print(f"Experiment {experiment_name} already exists.")
        experiment_id = existing.experiment_id
    else:
        # Any error raised here is a real failure; let it propagate.
        experiment_id = mlflow.create_experiment(
            name=experiment_name, artifact_location=artifact_location, tags=tags
        )

    mlflow.set_experiment(experiment_name=experiment_name)

    return experiment_id

# Basic hyperparameter tuning on a dummy objective
The task is to find the value of the hyperparameter $x$ that minimizes the function $(x+3)^2 + 2$.

In [6]:
# Used to find the minimum value (the one we are looking for)
from hyperopt import fmin, tpe, Trials, hp

# Toy case: search for the hyperparameter x of the formula (x+3)^2 + 2
def objective_function(params):
    """
    Evaluate the toy objective (x + 3)^2 + 2.

    :param params: Mapping that holds the candidate value under the key 'x'.
    :return: The value of the objective at params['x'].
    """
    shifted = params['x'] + 3
    return shifted ** 2 + 2

# The single hyperparameter x ranges from -10 to 10 with a uniform distribution
search_space = dict(x=hp.uniform('x', -10, 10))

# Trials object handed to fmin below
trials = Trials()

# Search for the best hyperparameter (the x that gives the smallest objective value)
best = fmin(
    fn=objective_function,
    space=search_space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

print(best)

100%|██████████| 100/100 [00:00<00:00, 194.10trial/s, best loss: 2.0000349536313244]
{'x': -3.005912159615937}


# hyperparameter tuning with model

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from hyperopt import fmin
from hyperopt import tpe
from hyperopt import Trials
from hyperopt import hp

from typing import Dict
from typing import List
from typing import Optional

import pandas as pd
import mlflow
from functools import partial


def get_classification_metrics(y_true: pd.Series, y_pred: pd.Series, prefix: str) -> Dict[str, float]:
    """
    Compute accuracy, precision, recall and F1 for a set of predictions.

    :param y_true: The true target values.
    :param y_pred: The predicted target values.
    :param prefix: Prepended (with an underscore) to every metric name.
    :return: Mapping from "<prefix>_<metric>" to the metric value.
    """
    # (metric suffix, scoring function) pairs, evaluated in this order.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )

    results = {}
    for metric_name, scorer in scorers:
        results[f"{prefix}_{metric_name}"] = scorer(y_true=y_true, y_pred=y_pred)
    return results


def get_sklearn_pipeline(numerical_features: List[str], categorical_features: Optional[List[str]] = None) -> Pipeline:
    """
    Get the sklearn pipeline.

    The pipeline has two steps: "preprocessing", a ColumnTransformer that
    applies a median SimpleImputer to the numerical features and a
    OneHotEncoder to the categorical features, and "model", a
    RandomForestClassifier created with default arguments.

    :param numerical_features: The numerical features.
    :param categorical_features: The categorical features. ``None`` (the
        default) means there are no categorical features.
    :return: The sklearn pipeline.
    """
    # A None default replaces the former mutable ``[]`` default, which was a
    # single list object shared by every pipeline built without this argument.
    # It also makes an explicit ``None`` from callers behave like "no columns".
    if categorical_features is None:
        categorical_features = []

    preprocessing = ColumnTransformer(
        transformers=[
            ("numerical", SimpleImputer(strategy="median"), numerical_features),
            (
                "categorical",
                OneHotEncoder(),
                categorical_features,
            ),
        ]
    )

    pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessing),
            ("model", RandomForestClassifier()),
        ]
    )

    return pipeline

def objective_function(
    params: Dict,
    x_train: pd.DataFrame,
    x_test: pd.DataFrame,
    y_train: pd.DataFrame,
    y_test: pd.DataFrame,
    numerical_features: List[str],
    categorical_features: List[str],
) -> float:
    """
    Objective function to minimize.

    Builds the pipeline, applies the candidate hyperparameters, fits on the
    training data, scores on the test data, and logs parameters, metrics and
    the fitted model to a nested MLflow run.

    :param params: The hyperparameter values to evaluate. Must contain
        "model__max_depth" and "model__n_estimators". The dict is not modified.
    :param x_train: The training data.
    :param x_test: The test data.
    :param y_train: The training target.
    :param y_test: The test target.
    :param numerical_features: The numerical features.
    :param categorical_features: The categorical features; ``None`` is treated
        as "no categorical features".
    :return: The negated test F1 score, so that minimizing it maximizes F1.
    """
    # Cast to int because the values may arrive as floats (e.g. from
    # hp.quniform). A new dict is built so the caller's params are untouched.
    model_params = {
        **params,
        "model__max_depth": int(params["model__max_depth"]),
        "model__n_estimators": int(params["model__n_estimators"]),
    }

    # Forward the categorical features; previously this argument was accepted
    # but never reached the pipeline builder.
    pipeline = get_sklearn_pipeline(
        numerical_features=numerical_features,
        categorical_features=[] if categorical_features is None else categorical_features,
    )
    pipeline.set_params(**model_params)

    with mlflow.start_run(nested=True) as run:
        pipeline.fit(x_train, y_train)
        y_pred = pipeline.predict(x_test)
        metrics = get_classification_metrics(
            y_true=y_test, y_pred=y_pred, prefix="test"
        )

        mlflow.log_params(pipeline["model"].get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(pipeline, f"{run.info.run_id}-model")
    return -metrics["test_f1"]


if __name__ == "__main__":
    # Synthetic data, split 80/20 with a fixed seed.
    df = create_dataset()
    x_train, x_test, y_train, y_test = train_test_split(
        df.drop(columns="target"),
        df["target"],
        test_size=0.2,
        random_state=42,
    )

    # Every column whose name starts with "feature" is treated as numerical.
    numerical_features = [
        column for column in x_train.columns if column.startswith("feature")
    ]
    print(numerical_features)

    # Search space over the "model" step of the pipeline.
    space = {
        "model__n_estimators": hp.quniform("model__n_estimators", 20, 200, 10),
        "model__max_depth": hp.quniform("model__max_depth", 10, 100, 10),
    }

    experiment_id = create_mlflow_experiment(
        experiment_name="hyperopt_experiment",
        artifact_location="hyperopt_mlflow_artifacts",
        tags={"mlflow.note.content": "hyperopt experiment"},
    )

    # Parent run; each objective evaluation opens its own nested run.
    # NOTE(review): the run name is misspelled ("opmization"); it is left
    # unchanged here because it is a runtime string.
    with mlflow.start_run(run_name="hyperparameter_opmization") as run:
        best_params = fmin(
            fn=partial(
                objective_function,
                x_train=x_train,
                x_test=x_test,
                y_train=y_train,
                y_test=y_test,
                numerical_features=numerical_features,
                categorical_features=None,
            ),
            space=space,
            algo=tpe.suggest,
            max_evals=10,
            trials=Trials(),
        )

        # Refit a fresh pipeline with the best hyperparameters, cast to int
        # the same way the objective function does.
        pipeline = get_sklearn_pipeline(numerical_features=numerical_features)
        best_params.update(
            {
                "model__max_depth": int(best_params["model__max_depth"]),
                "model__n_estimators": int(best_params["model__n_estimators"]),
            }
        )
        pipeline.set_params(**best_params)
        pipeline.fit(x_train, y_train)

        # Score the refitted model on the test split and log it on the parent run.
        y_pred = pipeline.predict(x_test)
        metrics = get_classification_metrics(
            y_true=y_test, y_pred=y_pred, prefix="best_model_test"
        )

        mlflow.log_params(pipeline["model"].get_params())
        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(pipeline, f"{run.info.run_id}-best-model")

['feature_0', 'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5', 'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10', 'feature_11', 'feature_12', 'feature_13', 'feature_14', 'feature_15', 'feature_16', 'feature_17', 'feature_18', 'feature_19', 'feature_20', 'feature_21', 'feature_22', 'feature_23', 'feature_24', 'feature_25', 'feature_26', 'feature_27', 'feature_28', 'feature_29', 'feature_30', 'feature_31', 'feature_32', 'feature_33', 'feature_34', 'feature_35', 'feature_36', 'feature_37', 'feature_38', 'feature_39', 'feature_40', 'feature_41', 'feature_42', 'feature_43', 'feature_44', 'feature_45', 'feature_46', 'feature_47', 'feature_48', 'feature_49']
 10%|█         | 1/10 [00:07<01:04,  7.11s/trial, best loss: -0.8808184143222506]




100%|██████████| 10/10 [01:38<00:00,  9.89s/trial, best loss: -0.9058882737795673]
