In [1]:
import mlflow

# Run the login function to authenticate with Databricks CE
mlflow.login()


AttributeError: module 'mlflow' has no attribute 'login'

In [None]:

# dummy experiment from tutorial
mlflow.set_tracking_uri("databricks")

mlflow.set_experiment("/check-databricks-connection")

with mlflow.start_run():
    mlflow.log_metric("foo", 1)
    mlflow.log_metric("bar", 2)



2024/08/29 02:00:34 INFO mlflow.tracking._tracking_service.client: 🏃 View run awesome-seal-612 at: https://community.cloud.databricks.com/ml/experiments/2597702965538188/runs/a4d786667a4f4e408d7a75d0062d959b.
2024/08/29 02:00:34 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://community.cloud.databricks.com/ml/experiments/2597702965538188.


## Simple Model and MLflow Experiment with Iris Dataset
Uses classic Iris dataset and RandomForest algo to train an ML model for classifying species of irises.

### Key Steps
1. Load dataset
2. Split data set into training and testing subsets (0.7, 0.3)
3. Train a Random Forest Classifier model with a range of n_estimators
4. Make predictions on the test data for each
5. Calculate accuracy for each model and track in MLflow 
6. Log the best model and log train_n_track.ipynb to MLflow

In [None]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn

# kill old processes
if mlflow.active_run():
    mlflow.end_run()

# set up mlflow tracking
mlflow.set_tracking_uri('databricks')
mlflow.set_experiment('/iris-random-forest')

# load iris dataset
data = load_iris()
# split data into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.3, random_state=23)

# list of n_estimators to try
n_estimators_list = [50, 100, 200, 500]

# variables to store results
accuracies = []
best_accuracy = 0
best_model = None
best_n_estimators = None

# start an mlflow experiment
with mlflow.start_run() as active_run:  # active_run is now defined
    for n in n_estimators_list:
        # initialize and train the model
        model = RandomForestClassifier(n_estimators=n, random_state=23)
        model.fit(X_train, y_train)
        
        # predict on the test set
        predictions = model.predict(X_test)
        
        # calculate accuracy on the test set
        accuracy = accuracy_score(y_test, predictions)
        accuracies.append(accuracy)
        
        # log the accuracy to mlflow
        mlflow.log_param(f'n_estimators_{n}', n)
        mlflow.log_metric(f'accuracy_{n}', accuracy)
        
        # print the accuracy for the current model
        print(f'n_estimators = {n}, test set accuracy = {accuracy:.4f}')
        
        # check if this is the best model
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
            best_n_estimators = n

    # calculate and log the average accuracy
    avg_accuracy = sum(accuracies) / len(accuracies)
    mlflow.log_metric('average_accuracy', avg_accuracy)
    print(f'\nAverage accuracy across all models: {avg_accuracy:.4f}')
    
    # log and print the best model's accuracy
    mlflow.log_param('best_n_estimators', best_n_estimators)
    mlflow.log_metric('best_accuracy', best_accuracy)
    mlflow.sklearn.log_model(best_model, 'best_model')
    
    print(f'Best model: n_estimators = {best_n_estimators}, accuracy = {best_accuracy:.4f}')
    print('Run ID:', active_run.info.run_id)  # use active_run.info.run_id to access the run ID


n_estimators = 50, test set accuracy = 0.9778
n_estimators = 100, test set accuracy = 0.9778
n_estimators = 200, test set accuracy = 0.9778
n_estimators = 500, test set accuracy = 0.9778

Average accuracy across all models: 0.9778




Best model: n_estimators = 50, accuracy = 0.9778
Run ID: 509923ed07c345b8894918effad86f63


2024/08/29 20:42:01 INFO mlflow.tracking._tracking_service.client: 🏃 View run smiling-turtle-739 at: https://community.cloud.databricks.com/ml/experiments/3146434618114174/runs/509923ed07c345b8894918effad86f63.
2024/08/29 20:42:01 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://community.cloud.databricks.com/ml/experiments/3146434618114174.
