In [94]:
import os

import mlflow
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
from deltalake import DeltaTable
from deltalake import fs
from deltalake.writer import write_deltalake
import boto3

In [16]:
# MinIO credentials are read from the environment so no secret is written into the notebook.
minio_user = os.environ.get("MINIO_ROOT_USER")
minio_password = os.environ.get("MINIO_ROOT_PASSWORD")

# Options handed to the Delta Lake read/write calls in this notebook.
storage_options = {
    "AWS_ENDPOINT_URL": 'http://minio:9000',
    "AWS_REGION": "",
    "AWS_ACCESS_KEY_ID": minio_user,
    "AWS_SECRET_ACCESS_KEY": minio_password,
    # The endpoint above is plain http, so HTTP must be explicitly allowed.
    "AWS_ALLOW_HTTP": "true",
    "AWS_S3_ALLOW_UNSAFE_RENAME": "true",
}

In [3]:
# Point the MLflow client at the tracking server used by the rest of this notebook.
mlflow.set_tracking_uri("http://mlflow:8080")

### create dataset

In [12]:
# Connect to the MinIO S3 endpoint using the root credentials from the environment.
session = boto3.Session(
    aws_access_key_id=os.environ.get("MINIO_ROOT_USER"),
    aws_secret_access_key=os.environ.get("MINIO_ROOT_PASSWORD"),
)
s3 = session.resource(
    's3',
    endpoint_url='http://minio:9000',
    config=boto3.session.Config(signature_version='s3v4'),
)

# Keep this cell re-runnable: if the bucket was already created by an earlier
# run, the "already owned by you" error is treated as success instead of
# aborting the notebook.
try:
    s3.create_bucket(Bucket='mlflow-example-data')
except s3.meta.client.exceptions.BucketAlreadyOwnedByYou:
    pass

# Last expression of the cell: displays the bucket resource.
s3.Bucket(name='mlflow-example-data')

In [70]:
# Combine the Iris measurements and their integer class labels into one frame.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    species_index=iris.target
)

In [71]:
# Lookup table from species_index (the row position in target_names) to the species name.
iris_species = (
    pd.DataFrame({"species": iris.target_names})
    .rename_axis("species_index")
    .reset_index()
)

In [72]:
# Preview the first five rows of the assembled frame.
iris_df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species_index
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [76]:
# mode="overwrite" keeps this cell re-runnable: the default mode raises once
# the table already exists, which would break a second run of the notebook.
write_deltalake(
    "s3a://mlflow-example-data/iris",
    iris_df,
    storage_options=storage_options,
    mode="overwrite",
)

In [78]:
# mode="overwrite" keeps this cell re-runnable: the default mode raises once
# the table already exists, which would break a second run of the notebook.
write_deltalake(
    "s3a://mlflow-example-data/species",
    iris_species,
    storage_options=storage_options,
    mode="overwrite",
)

In [79]:
# Discard the in-memory copies; the following cells read the data back from storage.
del iris
del iris_df
del iris_species

### Load data
##### Reading the data back from Delta Lake is just for educational purposes

In [80]:
# Open the Delta table on MinIO, then materialise it as a pandas DataFrame.
iris_table = DeltaTable(
    "s3a://mlflow-example-data/iris",
    storage_options=storage_options,
)
iris = iris_table.to_pandas()

In [81]:
# Preview the first five rows read back from the Delta table.
iris.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species_index
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [91]:
# Select the features by name rather than by position: every column except the
# label is a feature, so a reordered or extended table schema cannot silently
# leak the label into X or drop a measurement.
X = iris.drop(columns=["species_index"])
y = iris["species_index"]

In [92]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [99]:
mlflow.set_experiment("iris")
# Autologging is enabled with model logging switched off; the best model is
# logged explicitly at the end of the run below.
mlflow.autolog(log_models=False)
with mlflow.start_run(run_name="Iris RF Experiment") as run:
  # Hyper-parameter grid searched with 3-fold cross-validation.
  param_grid = {
    'n_estimators': [10, 20, 50],
    'min_samples_leaf': [2, 5, 10]
  }
  # Seed the forest (same seed as the train/test split) so repeated runs of
  # this cell produce comparable results.
  rfc = RFC(random_state=42)
  opt = GridSearchCV(rfc, param_grid, cv=3)
  opt.fit(X_train, y_train)
  best_rfc = opt.best_estimator_
  # Score the refitted best estimator on the held-out test split.
  mlflow.log_metric("test_accuracy_score", best_rfc.score(X_test, y_test))
  mlflow.sklearn.log_model(best_rfc, "random-forest-model")
# No explicit mlflow.end_run(): leaving the `with` block ends the run. Ending it
# by hand inside the block risks later logging calls landing in a different,
# auto-named run.

2024/12/09 10:50:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2024/12/09 10:50:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2024/12/09 10:50:44 INFO mlflow.sklearn.utils: Logging the 5 best runs, 4 runs will be omitted.
2024/12/09 10:50:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run Iris RF Experiment at: http://mlflow:8080/#/experiments/1/runs/74e7686fefae419ea227511d1f3bb127.
2024/12/09 10:50:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:8080/#/experiments/1.
2024/12/09 10:50:47 INFO mlflow.tracking._tracking_service.client: 🏃 View run omniscient-foal-676 at: http://mlflow:8080/#/experiments/1/runs/ceb7f215eb1e441fa129e0fcac8d20d0.
2024/12/09 10:50:47 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:8080/#/experiments/1.


mlflow-artifacts:/1/ceb7f215eb1e441fa129e0fcac8d20d0/artifacts
runID: 74e7686fefae419ea227511d1f3bb127


### You can find your experiment at http://localhost:8080

### If you wish to clean everything up, you can delete the `mlflow-example-data` bucket on MinIO and the `iris` experiment in MLflow