In [1]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import boto3
import pandas as pd
from dotenv import load_dotenv
import os
import io
from google.cloud import storage

In [12]:
# Configuration cell: pull settings from a local .env file / the process
# environment and expose them as notebook-level names used by later cells.
load_dotenv()


def _require_env(name: str) -> str:
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty, so a misconfigured
            environment fails here with a readable message instead of as an
            opaque error further down the notebook.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Required environment variable {name!r} is not set "
            "(define it in the environment or in the .env file)."
        )
    return value


# Optional settings: may be None when not configured.
instance = os.getenv("GOOGLE_CLOUD_INSTANCE")  # not referenced by the cells visible here
tracker_url = os.getenv("MLFLOW_URL")

# Required settings: the notebook cannot read the dataset without them.
bucket_name = _require_env("GCS_BUCKET_NAME_DATASET")
# Assigning None into os.environ raises TypeError, hence the explicit check.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = _require_env("GOOGLE_APPLICATION_CREDENTIALS_NOTEBOOK")

# Name of the dataset object inside the bucket.
object_name = "iris.csv"

In [13]:
# NOTE: an earlier version of this cell built a boto3 S3 client from an endpoint
# URL and access/secret keys. That code was already disabled and referenced
# names that are no longer defined in this notebook, so it has been dropped;
# the dataset is now read from Google Cloud Storage.

# Create the Google Cloud Storage client used by the following cells.
# NOTE(review): presumably authenticates via the GOOGLE_APPLICATION_CREDENTIALS
# variable exported in the configuration cell — confirm.
client = storage.Client()

In [14]:
# Resolve the bucket and the dataset object inside it.
bucket = client.get_bucket(bucket_name)
blob = bucket.get_blob(object_name)

# Bucket.get_blob returns None (rather than raising) when the object does not
# exist; stop here with a clear message instead of failing later on `blob.open`.
if blob is None:
    raise FileNotFoundError(
        f"Object {object_name!r} was not found in bucket {bucket_name!r}"
    )

In [17]:
# Load the CSV object from GCS into a DataFrame.
# (A disabled MinIO/S3 variant of this loader used to live here; it has been
# removed because it was dead code.)
try:
    # In text mode the blob is exposed as a file-like object, which pandas can
    # parse directly; no intermediate in-memory copy of the file is needed.
    with blob.open("r") as f:
        df = pd.read_csv(f)
except Exception as e:
    print("Error fetching dataset from GCS:", e)
    # Re-raise: swallowing the error would leave `df` undefined (or stale from a
    # previous run of this cell), and later cells would fail or use old data.
    raise

print("Dataset loaded successfully:")
print(df.head())


Dataset loaded successfully:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [18]:
# Point MLflow at the tracking server configured through MLFLOW_URL (read into
# `tracker_url` in the configuration cell). Fall back to the previously
# hardcoded local server when the variable is not set.
mlflow.set_tracking_uri(tracker_url or "http://localhost")
experiment_name = "Iris_Classification_Experiment"
mlflow.set_experiment(experiment_name)

# Label column.
target = df['Species']

# Feature matrix: everything except the label. The 'Id' column is a sequential
# row number, not a measurement; leaving it in lets the model key on row order
# and inflates accuracy, so it is removed as well. `errors='ignore'` keeps the
# cell working for copies of the dataset that do not carry an 'Id' column,
# while a missing 'Species' column still raises KeyError.
data = df.drop(columns=['Species']).drop(columns=['Id'], errors='ignore')

# 80/20 split with a fixed seed so the run is reproducible.
X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2, random_state=42)

2024/11/05 22:44:54 INFO mlflow.tracking.fluent: Experiment with name 'Iris_Classification_Experiment' does not exist. Creating a new experiment.


In [19]:
# Train a random forest inside one MLflow run, recording its hyperparameters,
# its accuracy on the held-out split, and the fitted model.

# Hyperparameters (kept as notebook-level names, and gathered in one mapping so
# the logged values and the values used by the estimator cannot diverge).
n_estimators = 100
max_depth = 5
random_state = 42
hyperparams = {
    "n_estimators": n_estimators,
    "max_depth": max_depth,
    "random_state": random_state,
}

with mlflow.start_run() as tracked_run:
    # Record each hyperparameter on the run.
    for param_name, param_value in hyperparams.items():
        mlflow.log_param(param_name, param_value)

    # Fit the classifier on the training split.
    model = RandomForestClassifier(**hyperparams)
    model.fit(X_train, y_train)

    # Score on the held-out split.
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    mlflow.log_metric("accuracy", accuracy)

    # Store the fitted model as a run artifact under the name "model"
    # (it ends up in whatever artifact store the tracking server uses).
    mlflow.sklearn.log_model(model, "model")

    # Report the run identifier and the score for reference.
    run_id = tracked_run.info.run_id
    print(f"Run ID: {run_id}")
    print(f"Model accuracy: {accuracy}")


2024/11/05 22:45:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run clumsy-shark-187 at: http://localhost/#/experiments/1/runs/f8a9c84fdccc4c32b1eccad89f4e370b.
2024/11/05 22:45:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://localhost/#/experiments/1.


Run ID: f8a9c84fdccc4c32b1eccad89f4e370b
Model accuracy: 1.0
