In [9]:
import mlflow

In [5]:
from train import run
import os
import pickle
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [14]:
def load_pickle(filename: str):
    """Return the object deserialized from the pickle file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, "rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded


def run(data_path):
    """Train a random-forest regressor and track the run with MLflow.

    Loads ``train.pkl`` and ``valid.pkl`` from ``data_path`` (each is unpacked
    into a ``(features, target)`` pair), fits a ``RandomForestRegressor`` with
    ``max_depth=10`` and ``random_state=0`` inside a new MLflow run with
    scikit-learn autologging enabled, and logs the validation RMSE under the
    metric name ``rmse``.

    Args:
        data_path: Directory containing ``train.pkl`` and ``valid.pkl``.
    """
    mlflow.sklearn.autolog()
    with mlflow.start_run():
        X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
        X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl"))

        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_valid)

        # `squared=False` was deprecated in scikit-learn 1.4 and removed in
        # 1.6; the square root of the MSE works on every version.
        # NOTE(review): assumes a single regression target -- TODO confirm.
        rmse = mean_squared_error(y_valid, y_pred) ** 0.5

        # The value used to be computed and then dropped; record it explicitly
        # so the run carries a stable "rmse" metric.
        mlflow.log_metric("rmse", rmse)


In [15]:
# Time a single training run of the one-argument `run` defined in the cell above.
# NOTE(review): the data directory is a hard-coded absolute local path; it must be changed to run on another machine.
%time run('/Users/m.shark/Documents/mlops/mlops-zoomcamp/02-experiment-tracking/homework/output/')



CPU times: user 8.76 s, sys: 247 ms, total: 9.01 s
Wall time: 14.8 s


In [13]:
# Point MLflow at the tracking server on 127.0.0.1:5000 and make "exp1" the
# active experiment (the captured output shows it being created on first use).
# NOTE(review): this cell was executed as In [13], i.e. before the In [14] /
# In [15] cells that appear above it; on a top-to-bottom re-run it would execute
# after the training call -- confirm the intended cell order.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("exp1")

2022/05/29 23:24:29 INFO mlflow.tracking.fluent: Experiment with name 'exp1' does not exist. Creating a new experiment.


<Experiment: artifact_location='./mlruns/1', experiment_id='1', lifecycle_stage='active', name='exp1', tags={}>

In [16]:
import argparse
import os
import pickle

import mlflow
import numpy as np
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Point MLflow at the tracking server on 127.0.0.1:5000 and make
# "random-forest-hyperopt" the active experiment for the runs started below.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-hyperopt")


def load_pickle(filename):
    """Read the pickle file at ``filename`` and return the stored object.

    Note: unpickling can run arbitrary code; only load files you trust.
    """
    with open(filename, "rb") as source:
        contents = pickle.load(source)
    return contents


def run(data_path, num_trials):
    """Tune a random-forest regressor with hyperopt, logging each trial to MLflow.

    Loads ``train.pkl`` and ``valid.pkl`` from ``data_path`` (each is unpacked
    into a ``(features, target)`` pair) and runs a TPE search for
    ``num_trials`` evaluations. Every evaluation opens its own MLflow run,
    logs the sampled hyperparameters and the validation RMSE (metric name
    ``rmse``), and reports that RMSE to hyperopt as the loss.

    Args:
        data_path: Directory containing ``train.pkl`` and ``valid.pkl``.
        num_trials: Number of hyperopt evaluations (passed as ``max_evals``).
    """
    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_valid, y_valid = load_pickle(os.path.join(data_path, "valid.pkl"))

    def objective(params):
        """Fit one forest with ``params`` and return its validation RMSE as the loss."""
        with mlflow.start_run():
            mlflow.log_params(params)

            rf = RandomForestRegressor(**params)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_valid)

            # `squared=False` was deprecated in scikit-learn 1.4 and removed
            # in 1.6; the square root of the MSE works on every version.
            # NOTE(review): assumes a single regression target -- TODO confirm.
            rmse = mean_squared_error(y_valid, y_pred) ** 0.5

            mlflow.log_metric("rmse", rmse)
            return {'loss': rmse, 'status': STATUS_OK}

    # quniform samples floats; scope.int casts them to the integers the
    # estimator parameters require. random_state is held fixed across trials.
    search_space = {
        'max_depth': scope.int(hp.quniform('max_depth', 1, 20, 1)),
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 50, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 10, 1)),
        'min_samples_leaf': scope.int(hp.quniform('min_samples_leaf', 1, 4, 1)),
        'random_state': 42
    }

    rstate = np.random.default_rng(42)  # for reproducible results
    fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=Trials(),
        rstate=rstate
    )

In [18]:
# Launch the 50-trial hyperparameter search using the two-argument `run` defined above.
# NOTE(review): the data directory is a hard-coded absolute local path; it must be changed to run on another machine.
run(data_path = "/Users/m.shark/Documents/mlops/mlops-zoomcamp/02-experiment-tracking/homework/output/", num_trials = 50)

  0%|                                    | 0/50 [00:00<?, ?trial/s, best loss=?]




  2%|▏          | 1/50 [00:06<05:18,  6.51s/trial, best loss: 6.658956269343007]




  4%|▍          | 2/50 [00:07<02:40,  3.34s/trial, best loss: 6.658956269343007]




  6%|▋          | 3/50 [00:08<01:51,  2.38s/trial, best loss: 6.658956269343007]




  8%|▉          | 4/50 [00:13<02:24,  3.15s/trial, best loss: 6.651438559376775]




 10%|█          | 5/50 [00:15<02:13,  2.97s/trial, best loss: 6.651438559376775]




 12%|█▎         | 6/50 [00:22<03:08,  4.28s/trial, best loss: 6.651438559376775]




 14%|█▌         | 7/50 [00:29<03:34,  4.98s/trial, best loss: 6.651438559376775]




 16%|█▊         | 8/50 [00:34<03:33,  5.09s/trial, best loss: 6.651438559376775]




 18%|█▉         | 9/50 [00:38<03:21,  4.92s/trial, best loss: 6.651438559376775]




 20%|██        | 10/50 [00:42<03:01,  4.54s/trial, best loss: 6.651438559376775]




 22%|██▏       | 11/50 [00:46<02:46,  4.26s/trial, best loss: 6.642137287429206]




 24%|██▍       | 12/50 [00:49<02:26,  3.87s/trial, best loss: 6.642137287429206]




 26%|██▌       | 13/50 [00:50<01:57,  3.17s/trial, best loss: 6.642137287429206]




 28%|██▊       | 14/50 [00:53<01:52,  3.12s/trial, best loss: 6.642137287429206]




 30%|███       | 15/50 [00:57<01:58,  3.40s/trial, best loss: 6.642137287429206]




 32%|███▏      | 16/50 [01:01<01:52,  3.32s/trial, best loss: 6.642137287429206]




 34%|███▍      | 17/50 [01:05<02:00,  3.65s/trial, best loss: 6.642137287429206]




 36%|███▌      | 18/50 [01:11<02:19,  4.36s/trial, best loss: 6.629728007710133]




 38%|███▊      | 19/50 [01:13<01:54,  3.70s/trial, best loss: 6.629728007710133]




 40%|████      | 20/50 [01:15<01:34,  3.14s/trial, best loss: 6.629728007710133]




 42%|████▏     | 21/50 [01:23<02:15,  4.67s/trial, best loss: 6.629728007710133]




 44%|████▍     | 22/50 [01:30<02:26,  5.21s/trial, best loss: 6.629728007710133]




 46%|████▌     | 23/50 [01:39<02:57,  6.57s/trial, best loss: 6.629728007710133]




 48%|████▊     | 24/50 [01:48<03:06,  7.16s/trial, best loss: 6.629728007710133]




 50%|█████     | 25/50 [01:55<02:54,  7.00s/trial, best loss: 6.629728007710133]




 52%|█████▏    | 26/50 [02:04<03:04,  7.70s/trial, best loss: 6.629728007710133]




 54%|█████▍    | 27/50 [02:08<02:33,  6.66s/trial, best loss: 6.629728007710133]




 56%|█████▌    | 28/50 [02:21<03:08,  8.56s/trial, best loss: 6.629728007710133]




 58%|█████▊    | 29/50 [02:24<02:24,  6.87s/trial, best loss: 6.629728007710133]




 60%|██████    | 30/50 [02:31<02:20,  7.05s/trial, best loss: 6.629728007710133]




 62%|██████▏   | 31/50 [02:37<02:07,  6.71s/trial, best loss: 6.629728007710133]




 64%|██████▍   | 32/50 [02:42<01:51,  6.17s/trial, best loss: 6.629728007710133]




 66%|██████▌   | 33/50 [02:50<01:53,  6.70s/trial, best loss: 6.629728007710133]




 68%|██████   | 34/50 [02:57<01:49,  6.82s/trial, best loss: 6.6284257482044735]




 70%|██████▎  | 35/50 [03:01<01:29,  5.95s/trial, best loss: 6.6284257482044735]




 72%|██████▍  | 36/50 [03:06<01:17,  5.51s/trial, best loss: 6.6284257482044735]




 74%|██████▋  | 37/50 [03:07<00:54,  4.16s/trial, best loss: 6.6284257482044735]




 76%|██████▊  | 38/50 [03:12<00:53,  4.46s/trial, best loss: 6.6284257482044735]




 78%|███████  | 39/50 [03:18<00:55,  5.05s/trial, best loss: 6.6284257482044735]




 80%|███████▏ | 40/50 [03:25<00:54,  5.44s/trial, best loss: 6.6284257482044735]




 82%|███████▍ | 41/50 [03:26<00:39,  4.34s/trial, best loss: 6.6284257482044735]




 84%|███████▌ | 42/50 [03:30<00:32,  4.01s/trial, best loss: 6.6284257482044735]




 86%|███████▋ | 43/50 [03:36<00:33,  4.78s/trial, best loss: 6.6284257482044735]




 88%|███████▉ | 44/50 [03:42<00:30,  5.02s/trial, best loss: 6.6284257482044735]




 90%|████████ | 45/50 [03:45<00:22,  4.45s/trial, best loss: 6.6284257482044735]




 92%|████████▎| 46/50 [03:47<00:14,  3.66s/trial, best loss: 6.6284257482044735]




 94%|████████▍| 47/50 [03:51<00:11,  3.87s/trial, best loss: 6.6284257482044735]




 96%|████████▋| 48/50 [03:54<00:07,  3.53s/trial, best loss: 6.6284257482044735]




 98%|████████▊| 49/50 [03:58<00:03,  3.65s/trial, best loss: 6.6284257482044735]




100%|█████████| 50/50 [04:03<00:00,  4.87s/trial, best loss: 6.6284257482044735]
