In [1]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import boto3
import pandas as pd
from minio import Minio
from dotenv import load_dotenv
import os

ModuleNotFoundError: No module named 'boto3'

In [2]:
# Load settings from a .env file into the process environment before reading them.
load_dotenv()

# Object-store credentials; each value is None when its variable is not set.
access_key = os.environ.get("AWS_ACCESS_KEY_ID")
secret_key = os.environ.get("AWS_SECRET_ACCESS_KEY")

# Service endpoints: the S3-compatible store and the MLflow tracking server.
s3_url = os.environ.get("S3_URL")
tracker_url = os.environ.get("MLFLOW_URL")


In [4]:
# Sanity check: display the tracking URL that was read from MLFLOW_URL.
tracker_url

'tracker'

In [None]:
# Where the dataset lives in the object store: bucket first, then object key.
bucket_name, object_name = "dataset", "Iris.csv"

In [None]:
# Connection settings for the S3-compatible endpoint, taken from the
# environment-derived values loaded earlier in the notebook.
s3_settings = {
    "endpoint_url": s3_url,
    "aws_access_key_id": access_key,
    "aws_secret_access_key": secret_key,
}

# Low-level S3 client used to fetch the dataset object.
s3 = boto3.client("s3", **s3_settings)


In [11]:
# Fetch the CSV object from the S3-compatible store and parse it into `df`.
try:
    response = s3.get_object(Bucket=bucket_name, Key=object_name)
    # 'Body' is a streaming, file-like object that pandas can read directly.
    # Index it (rather than .get) so a missing body fails with a clear KeyError.
    dataset_content = response['Body']
    # Load the dataset into a DataFrame
    df = pd.read_csv(dataset_content)
    print("Dataset loaded successfully:")
    print(df.head())
except Exception as e:
    print("Error fetching dataset from MinIO:", e)
    # Re-raise so the notebook stops here: swallowing the error would leave
    # `df` undefined (or stale from an earlier run) for every later cell.
    raise

Dataset loaded successfully:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [19]:
# Point MLflow at the tracking server and select the experiment by name.
# NOTE(review): the URI is hard-coded here even though `tracker_url` is read
# from the environment earlier in the notebook — confirm which one should be
# authoritative before switching.
mlflow.set_tracking_uri("http://mlflow:5000")
experiment_name = "Iris_Classification_Experiment"
mlflow.set_experiment(experiment_name)

# Features are every column except the label and, when present, `Id`.
# `Id` appears to be a sequential row number (see the head() output), not a
# measurement; leaving it in lets the model key on row position and leaks the
# label if the file is ordered by species.
data = df.drop(columns=['Species']).drop(columns=['Id'], errors='ignore')
target = df['Species']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

In [20]:
# Train a random forest inside one MLflow run, recording its hyperparameters,
# its accuracy on the held-out split, and the fitted model.
with mlflow.start_run() as run:
    # Hyperparameters for the forest, kept as named values and as a mapping
    n_estimators, max_depth, random_state = 100, 5, 42
    hyperparams = {
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "random_state": random_state,
    }

    # Record every hyperparameter on the run
    for param_name, param_value in hyperparams.items():
        mlflow.log_param(param_name, param_value)

    # Fit the classifier on the training split
    model = RandomForestClassifier(**hyperparams)
    model.fit(X_train, y_train)

    # Score the held-out split and record the result
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)

    # Store the fitted model as a run artifact under the "model" path
    mlflow.sklearn.log_model(model, "model")

    # Report the run identifier and score for reference
    run_id = run.info.run_id
    print(f"Run ID: {run_id}")
    print(f"Model accuracy: {accuracy}")

# To check results:
# - Open MLflow tracking server UI at http://localhost:5001
# - Go to the MinIO console at http://localhost:9001 to view the artifacts


2024/11/03 15:11:11 INFO mlflow.tracking._tracking_service.client: 🏃 View run handsome-kite-956 at: http://mlflow:5000/#/experiments/1/runs/fb5c9c8b48b14601aba27d8d9b7c5a96.
2024/11/03 15:11:11 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://mlflow:5000/#/experiments/1.


Run ID: fb5c9c8b48b14601aba27d8d9b7c5a96
Model accuracy: 1.0
