In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
import pandas as pd
import mlflow



In [3]:
# Fetch the Breast Cancer dataset shipped with scikit-learn and expose it as
# pandas objects: `X` is the feature table (one column per feature name) and
# `y` is the label vector, stored as a Series named 'target'.
data = load_breast_cancer()
X = pd.DataFrame(data=data["data"], columns=data["feature_names"])
y = pd.Series(data=data["target"], name='target')




In [4]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# partition reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base estimator handed to the hyper-parameter search. It is seeded as well,
# so the forests built for each candidate are reproducible.
rf = RandomForestClassifier(random_state=42)

In [5]:
# Search space: every combination of forest size and maximum tree depth
# (3 x 4 = 12 candidates).
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 10, 20, 30],
)

# Exhaustive search over the grid with 5-fold cross-validation. Nothing is
# fitted here; the search only runs once `grid_search.fit(...)` is called.
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [None]:
# Alternative: remote tracking through DagsHub (fill in the repository details
# and uncomment to enable).
# import dagshub
# dagshub.init(repo_owner='', repo_name='', mlflow=True)
# mlflow.set_tracking_uri("https://dagshub.com/")

# Track against a local MLflow server; it has to be reachable at this address
# before the cell is run.
mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.set_experiment('breast-cancer-rf-hp')

# The parent run represents the whole search; each grid candidate is recorded
# as a nested child run beneath it.
with mlflow.start_run() as parent:
    grid_search.fit(X_train, y_train)

    # One child run per candidate: its parameter combination plus the mean
    # cross-validated score that GridSearchCV computed for it.
    cv_results = grid_search.cv_results_
    for params, mean_score in zip(cv_results["params"], cv_results["mean_test_score"]):
        with mlflow.start_run(nested=True):
            mlflow.log_params(params)
            mlflow.log_metric("accuracy", mean_score)

    # Winning configuration and its mean cross-validated score.
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    mlflow.log_params(best_params)
    mlflow.log_metric("accuracy", best_score)

    # Score the refitted best model on the held-out split as well, so the run
    # carries an estimate that did not take part in model selection. It is
    # logged under its own key to keep it apart from the CV "accuracy" above.
    test_accuracy = grid_search.score(X_test, y_test)
    mlflow.log_metric("test_accuracy", test_accuracy)

    # Record the exact train/test tables (features + 'target' column) as run
    # inputs. `assign` returns a new frame, so X_train / X_test are untouched.
    train_df = mlflow.data.from_pandas(X_train.assign(target=y_train))
    mlflow.log_input(train_df, "training")

    test_df = mlflow.data.from_pandas(X_test.assign(target=y_test))
    mlflow.log_input(test_df, "testing")

    # Source code is not logged: `__file__` is normally undefined inside a
    # notebook kernel, so the call below only works from a .py script.
    # mlflow.log_artifact(__file__)

    # Persist the refitted best estimator as a run artifact.
    mlflow.sklearn.log_model(grid_search.best_estimator_, "random_forest")

    mlflow.set_tag("author", "Vikash Das")

    print(best_params)
    print(best_score)


2025/05/26 03:09:21 INFO mlflow.tracking.fluent: Experiment with name 'breast-cancer-rf-hp' does not exist. Creating a new experiment.


Fitting 5 folds for each of 12 candidates, totalling 60 fits
🏃 View run selective-rat-625 at: http://127.0.0.1:5000/#/experiments/894759314910987422/runs/3f8dc101c0a3459fb25c75f77e4dce17
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/894759314910987422
🏃 View run bald-perch-797 at: http://127.0.0.1:5000/#/experiments/894759314910987422/runs/baa111b7fb5d44a6835b1f03680d3f5c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/894759314910987422
🏃 View run auspicious-bird-441 at: http://127.0.0.1:5000/#/experiments/894759314910987422/runs/f9cc4537924249e18b3769da3ff5f90e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/894759314910987422
🏃 View run gregarious-crane-566 at: http://127.0.0.1:5000/#/experiments/894759314910987422/runs/af8b51d9b99a4558b535935750ba6536
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/894759314910987422
🏃 View run selective-stoat-82 at: http://127.0.0.1:5000/#/experiments/894759314910987422/runs/c84c5773a77d4722b82ece2d0650d80



{'max_depth': None, 'n_estimators': 100}
0.9582417582417582
🏃 View run tasteful-bird-904 at: http://127.0.0.1:5000/#/experiments/894759314910987422/runs/248e32e333544f85ab4d841bb33420a4
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/894759314910987422
