# MLflow Tutorial: Hands-on Examples

This notebook provides practical examples of MLflow usage for data science MLOps.

## Prerequisites

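Make sure you have installed the dependencies using `uv` (`scipy` and `joblib`, which this notebook also imports, are installed automatically as dependencies of scikit-learn):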
```bash
uv add mlflow scikit-learn pandas numpy matplotlib seaborn
```

## 1. Setup and Basic Configuration

In [1]:
import mlflow
import mlflow.sklearn
import mlflow.models
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_iris, load_wine
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import joblib
import time
import warnings
# Suppress every warning so the tutorial output stays compact.
# Note that this also hides deprecation notices from the libraries used below.
warnings.filterwarnings(action='ignore')

# Configure MLflow: keep all runs in a local, file-based store next to the notebook
tracking_uri = "file:./mlruns"
mlflow.set_tracking_uri(tracking_uri)

# Report the effective configuration
for label, value in (
    ("MLflow tracking URI", mlflow.get_tracking_uri()),
    ("MLflow version", mlflow.__version__),
):
    print(f"{label}: {value}")

MLflow tracking URI: file:./mlruns
MLflow version: 3.1.4


## 2. Load and Prepare Data

In [2]:
# Both datasets get the same 80/20 split with a fixed seed so results are repeatable
split_options = {"test_size": 0.2, "random_state": 42}

# Iris: the main dataset for the tracking and tuning examples
iris = load_iris()
X_iris = iris.data
y_iris = iris.target
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    X_iris, y_iris, **split_options
)

# Wine: used by the autologging and custom-artifact examples
wine = load_wine()
X_wine = wine.data
y_wine = wine.target
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    X_wine, y_wine, **split_options
)

# Summarise shape and number of classes for each dataset
for dataset_label, features, labels in (
    ("Iris", X_iris, y_iris),
    ("Wine", X_wine, y_wine),
):
    print(f"{dataset_label} dataset: {features.shape}, classes: {len(np.unique(labels))}")

Iris dataset: (150, 4), classes: 3
Wine dataset: (178, 13), classes: 3


## 3. Basic MLflow Tracking

In [None]:
# Select the experiment that groups this tutorial's runs, creating it on first use
experiment_name = "iris-classification-tutorial"

# Look the experiment up first only so we can report whether it already existed.
already_exists = mlflow.get_experiment_by_name(experiment_name) is not None

# set_experiment() activates the experiment and creates it when it is missing,
# so no separate create_experiment() call is needed. Errors (for example an
# unusable tracking store) are left to propagate: retrying the same create call
# after a failure would only fail again and hide the original cause.
experiment = mlflow.set_experiment(experiment_name)
experiment_id = experiment.experiment_id

if already_exists:
    print(f"✅ Using existing experiment: {experiment_name} (ID: {experiment_id})")
else:
    print(f"✅ Created new experiment: {experiment_name} (ID: {experiment_id})")

# Show the active experiment as the cell output
experiment

✅ Created new experiment: iris-classification-tutorial (ID: 623039503078250544)


<Experiment: artifact_location='file:///e:/mlflow_testing/mlruns/623039503078250544', creation_time=1753396716576, experiment_id='623039503078250544', last_update_time=1753396716576, lifecycle_stage='active', name='iris-classification-tutorial', tags={}>

In [None]:
# Baseline: one tracked run of a single RandomForest with fixed hyperparameters
with mlflow.start_run(run_name="baseline-random-forest") as baseline_run:

    # Scalar hyperparameters for this run (grids come in the tuning sections)
    n_estimators, max_depth, min_samples_split = 100, 10, 2

    # Record the run configuration in one batch call
    mlflow.log_params({
        "model_type": "RandomForest",
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "dataset": "iris",
        "dataset_size": len(X_train_iris),
    })

    # Build and fit the model, timing construction + fit together
    start_time = time.perf_counter()
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42,
    )
    model.fit(X_train_iris, y_train_iris)
    training_time = time.perf_counter() - start_time

    # Predict on both splits
    y_pred_train = model.predict(X_train_iris)
    y_pred_test = model.predict(X_test_iris)

    # Accuracy on both splits; class-weighted precision/recall/F1 on the test split
    weighted = {"average": 'weighted'}
    train_accuracy = accuracy_score(y_train_iris, y_pred_train)
    test_accuracy = accuracy_score(y_test_iris, y_pred_test)
    precision = precision_score(y_test_iris, y_pred_test, **weighted)
    recall = recall_score(y_test_iris, y_pred_test, **weighted)
    f1 = f1_score(y_test_iris, y_pred_test, **weighted)

    run_metrics = {
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1,
        "training_time": training_time,
    }
    mlflow.log_metrics(run_metrics)

    # Log the model together with an input example and an inferred signature
    input_example = X_train_iris[:5]
    model_signature = mlflow.models.infer_signature(X_train_iris, y_pred_train)
    mlflow.sklearn.log_model(
        model,
        "model",
        input_example=input_example,
        signature=model_signature,
    )

    # Render the test-set confusion matrix, save it to disk and attach it to the run
    cm = confusion_matrix(y_test_iris, y_pred_test)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_title('Confusion Matrix - Random Forest')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.savefig('confusion_matrix_rf.png', dpi=150, bbox_inches='tight')
    mlflow.log_artifact('confusion_matrix_rf.png')
    plt.show()

    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Training time: {training_time:.2f} seconds")
    print(f"Run ID: {baseline_run.info.run_id}")


### 3.1. Option 1: Manual Nested Runs (Educational)

In [None]:
# HYPERPARAMETER TUNING WITH NESTED RUNS - BEST PRACTICE
import itertools

# Define hyperparameter grid
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5]
}

# Expand the grid once; the same list drives both the logged count and the loop.
# itertools.product varies the last key fastest, which is the same order as
# nesting one for-loop per key.
param_combinations = [
    dict(zip(param_grid, values))
    for values in itertools.product(*param_grid.values())
]

best_accuracy = 0
best_params = None
best_model = None
run_count = 0  # stays 0 if the grid is empty

# Parent run for the entire tuning experiment
with mlflow.start_run(run_name="hyperparameter-tuning-experiment") as parent_run:

    # Log experiment metadata
    mlflow.log_param("tuning_strategy", "grid_search")
    mlflow.log_param("param_space", str(param_grid))
    mlflow.log_param("total_combinations", len(param_combinations))

    # Grid search: one nested (child) run per parameter combination
    for run_count, current_params in enumerate(param_combinations, start=1):

        with mlflow.start_run(run_name=f"run_{run_count:02d}", nested=True):

            # Log parameters
            mlflow.log_params({
                **current_params,
                "model_type": "RandomForest",
                "dataset": "iris",
                "random_state": 42
            })

            # Train model
            start_time = time.perf_counter()
            model = RandomForestClassifier(**current_params, random_state=42)
            model.fit(X_train_iris, y_train_iris)
            training_time = time.perf_counter() - start_time

            # Make predictions
            y_pred_train = model.predict(X_train_iris)
            y_pred_test = model.predict(X_test_iris)

            # Calculate metrics
            train_accuracy = accuracy_score(y_train_iris, y_pred_train)
            test_accuracy = accuracy_score(y_test_iris, y_pred_test)
            precision = precision_score(y_test_iris, y_pred_test, average='weighted')
            recall = recall_score(y_test_iris, y_pred_test, average='weighted')
            f1 = f1_score(y_test_iris, y_pred_test, average='weighted')

            # Log metrics
            mlflow.log_metrics({
                "train_accuracy": train_accuracy,
                "test_accuracy": test_accuracy,
                "precision": precision,
                "recall": recall,
                "f1_score": f1,
                "training_time": training_time
            })

            # Log model with signature. The signature is inferred from inputs
            # and the predictions made on those same inputs (training split).
            input_example = X_train_iris[:3]
            mlflow.sklearn.log_model(
                model,
                "model",
                input_example=input_example,
                signature=mlflow.models.infer_signature(X_train_iris, y_pred_train)
            )

            # Track best model. NOTE: selection here uses the held-out test
            # accuracy, which keeps the example short but makes the reported
            # best score optimistic; the cross-validated searches in the next
            # sections avoid that.
            if test_accuracy > best_accuracy:
                best_accuracy = test_accuracy
                best_params = current_params.copy()
                best_model = model

            print(
                f"Run {run_count:2d} | "
                f"n_est={current_params['n_estimators']:3d}, "
                f"depth={current_params['max_depth']:2d}, "
                f"min_split={current_params['min_samples_split']:2d} | "
                f"Accuracy: {test_accuracy:.4f}"
            )

    # Summarise the search on the parent run
    mlflow.log_metric("best_test_accuracy", best_accuracy)
    mlflow.log_metric("total_runs", run_count)

    # Log and register the best model; skipped when no combination improved on
    # the initial best_accuracy (best_params / best_model are then still None).
    if best_model is not None:
        mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})
        input_example = X_train_iris[:5]
        mlflow.sklearn.log_model(
            best_model,
            "best_model",
            input_example=input_example,
            signature=mlflow.models.infer_signature(
                X_train_iris, best_model.predict(X_train_iris)
            ),
            registered_model_name="iris_best_rf_tuned"
        )

    print(f"\n🎯 TUNING COMPLETE!")
    print(f"📊 Tested {run_count} parameter combinations")
    print(f"🏆 Best accuracy: {best_accuracy:.4f}")
    print(f"⚙️  Best parameters: {best_params}")
    print(f"🔄 Parent run ID: {parent_run.info.run_id}")

### 3.2. Option 2: Using GridSearchCV with MLflow Autologging (Recommended for Hyperparameter Tuning)

In [None]:
# OPTION 2: GridSearchCV with MLflow Autologging - EVEN EASIER!

# Number of cross-validation folds. Defined once so the search itself, the
# progress message and the logged "cv_folds" parameter always agree.
cv_folds = 3

# Define parameter grid for GridSearchCV
param_grid_cv = {
    'n_estimators': [50, 100],
    'max_depth': [5, 10],
    'min_samples_split': [2, 5]
}

# Create experiment for GridSearchCV
mlflow.set_experiment("iris-gridsearch-autolog")

# Enable autologging for sklearn (captures GridSearchCV automatically)
mlflow.sklearn.autolog()
try:
    with mlflow.start_run(run_name="gridsearch-with-autolog") as run:

        # Create base model
        rf = RandomForestClassifier(random_state=42)

        # Create GridSearchCV
        grid_search = GridSearchCV(
            estimator=rf,
            param_grid=param_grid_cv,
            cv=cv_folds,
            scoring='accuracy',  # select the best candidate by accuracy
            n_jobs=-1,  # Use all cores
            verbose=1
        )

        print(f"🔍 Running GridSearchCV with {cv_folds}-fold CV...")

        # Fit - this will automatically log everything to MLflow!
        grid_search.fit(X_train_iris, y_train_iris)

        # Get best results
        best_model = grid_search.best_estimator_
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        # Test on holdout set (not used during the search)
        test_accuracy = best_model.score(X_test_iris, y_test_iris)

        # Log additional custom metrics
        mlflow.log_metric("holdout_test_accuracy", test_accuracy)
        mlflow.log_param("cv_folds", cv_folds)
        mlflow.log_param("total_combinations_tested", len(grid_search.cv_results_['mean_test_score']))

        print(f"\n🏆 GRIDSEARCH COMPLETE!")
        print(f"📊 Best CV score: {best_score:.4f}")
        print(f"🎯 Holdout test accuracy: {test_accuracy:.4f}")
        print(f"⚙️  Best parameters: {best_params}")
        print(f"🤖 All runs auto-logged to MLflow!")
finally:
    # Disable autologging even if the search fails, so cells that log
    # manually afterwards are not affected by a left-over autolog hook.
    mlflow.sklearn.autolog(disable=True)

2025/07/25 06:01:42 INFO mlflow.tracking.fluent: Experiment with name 'iris-gridsearch-autolog' does not exist. Creating a new experiment.


🔍 Running GridSearchCV with 3-fold CV...
Fitting 3 folds for each of 8 candidates, totalling 24 fits


2025/07/25 06:01:56 INFO mlflow.sklearn.utils: Logging the 5 best runs, 3 runs will be omitted.



🏆 GRIDSEARCH COMPLETE!
📊 Best CV score: 0.9583
🎯 Holdout test accuracy: 1.0000
⚙️  Best parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
🤖 All runs auto-logged to MLflow!


### 3.3. Option 3: RandomizedSearchCV for Large Parameter Spaces

In [None]:
# OPTION 3: RandomizedSearchCV for large parameter spaces
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

# Number of sampled parameter combinations. Defined once so the search, the
# progress message and the logged "n_iter" parameter cannot drift apart.
# NOTE(review): autologging may also record n_iter from the estimator's own
# parameters; MLflow rejects a second log of the same param with a different
# value, so both must come from this single variable.
n_iter = 20

# Define LARGE parameter space with distributions.
# scipy's randint(low, high) excludes `high`; uniform(loc, scale) spans
# [loc, loc + scale].
param_distributions = {
    'n_estimators': randint(50, 300),     # integers 50..299
    'max_depth': randint(3, 20),          # integers 3..19
    'min_samples_split': randint(2, 20),  # integers 2..19
    'min_samples_leaf': randint(1, 10),   # integers 1..9
    'max_features': uniform(0.1, 0.8)     # floats in [0.1, 0.9]
}

# Set experiment
mlflow.set_experiment("iris-randomized-search")

# Enable autologging again
mlflow.sklearn.autolog()
try:
    with mlflow.start_run(run_name="randomized-search-efficient") as run:

        # Create base model
        rf = RandomForestClassifier(random_state=42)

        # Create RandomizedSearchCV - only a fixed number of random draws is tested
        random_search = RandomizedSearchCV(
            estimator=rf,
            param_distributions=param_distributions,
            n_iter=n_iter,  # far fewer fits than an exhaustive grid over this space
            cv=3,           # 3-fold CV for speed
            scoring='accuracy',
            n_jobs=-1,
            verbose=1,
            random_state=42
        )

        print(f"🎲 Running RandomizedSearchCV ({n_iter} random combinations)...")

        # Fit - auto-logged to MLflow
        random_search.fit(X_train_iris, y_train_iris)

        # Get results
        best_model = random_search.best_estimator_
        best_params = random_search.best_params_
        best_score = random_search.best_score_
        test_accuracy = best_model.score(X_test_iris, y_test_iris)

        # Log additional metrics
        mlflow.log_metric("holdout_test_accuracy", test_accuracy)
        mlflow.log_param("search_type", "randomized")
        mlflow.log_param("n_iter", n_iter)

        print(f"\n🎯 RANDOMIZED SEARCH COMPLETE!")
        print(f"📊 Best CV score: {best_score:.4f}")
        print(f"🎯 Holdout test accuracy: {test_accuracy:.4f}")
        print(f"⚙️  Best parameters: {best_params}")
        print(f"⚡ Much faster than full grid search!")
finally:
    # Disable autologging even if the search fails, so cells that log
    # manually afterwards are not affected by a left-over autolog hook.
    mlflow.sklearn.autolog(disable=True)

### 🎯 Hyperparameter Tuning Summary: Choose Your Approach

| **Method** | **When to Use** | **Pros** | **Cons** |
|------------|-----------------|----------|----------|
| **Manual Nested Runs** | Learning MLflow, custom logic needed | Full control, educational | More code, manual loops |
| **GridSearchCV + Autolog** | Small parameter spaces, want all combinations | Easy, automatic logging | Can be slow for large grids |
| **RandomizedSearchCV + Autolog** | Large parameter spaces, time constraints | Fast, good coverage, automatic | May miss optimal combination |

### 🏆 **Best Practices Recommendations:**

1. **Start Simple**: Use GridSearchCV with autologging for most cases
2. **Go Manual**: Use nested runs when you need custom experiment logic
3. **Scale Up**: Use RandomizedSearchCV for large parameter spaces (>100 combinations)
4. **Always Log**: Include `input_example` and `signature` in model logging
5. **Track Best**: Use parent runs to summarize tuning experiments
6. **Name Wisely**: Use descriptive run names and experiment names


## 4. Compare Multiple Models

In [None]:
# Compare different models

# The search cells above switch the active experiment; switch back so these
# runs land in the main tutorial experiment (experiment_name is set in section 3),
# which is also where the analysis section looks for them.
mlflow.set_experiment(experiment_name)

# Each entry: a run name, an unfitted estimator, and the parameters to log for it
models_config = [
    {
        "name": "logistic-regression",
        "model": LogisticRegression(random_state=42, max_iter=1000),
        "params": {"solver": "lbfgs", "max_iter": 1000}
    },
    {
        "name": "random-forest-small",
        "model": RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42),
        "params": {"n_estimators": 50, "max_depth": 5}
    },
    {
        "name": "random-forest-large", 
        "model": RandomForestClassifier(n_estimators=200, max_depth=15, random_state=42),
        "params": {"n_estimators": 200, "max_depth": 15}
    }
]

results = []

# One parent run groups the comparison; each model gets its own nested run
with mlflow.start_run(run_name="model-comparison"):
    for config in models_config:
        with mlflow.start_run(run_name=config["name"], nested=True):
            # Log parameters
            mlflow.log_param("model_type", config["name"])
            mlflow.log_params(config["params"])

            # Train model (perf_counter matches the timing used in the earlier cells)
            start_time = time.perf_counter()
            model = config["model"]
            model.fit(X_train_iris, y_train_iris)
            training_time = time.perf_counter() - start_time

            # Evaluate; the predictions are reused for the signature below
            y_pred_test = model.predict(X_test_iris)
            test_accuracy = accuracy_score(y_test_iris, y_pred_test)

            # Log metrics
            mlflow.log_metrics({
                "test_accuracy": test_accuracy,
                "training_time": training_time
            })

            # Log model with input example and a signature inferred from
            # matching inputs and predictions (both from the test split)
            input_example = X_train_iris[:3]
            mlflow.sklearn.log_model(
                model, 
                "model",
                input_example=input_example,
                signature=mlflow.models.infer_signature(X_test_iris, y_pred_test)
            )

            results.append({
                "model": config["name"],
                "accuracy": test_accuracy,
                "training_time": training_time
            })

            print(f"{config['name']}: {test_accuracy:.4f} accuracy, {training_time:.2f}s")

# Display results
results_df = pd.DataFrame(results)
print("\nModel Comparison Results:")
print(results_df.sort_values('accuracy', ascending=False))


## 5. Hyperparameter Tuning with MLflow

In [None]:
# Hyperparameter tuning with nested runs

# Log to the main tutorial experiment (experiment_name is set in section 3);
# earlier search cells leave a different experiment active.
mlflow.set_experiment(experiment_name)

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None]
}

best_accuracy = 0
best_params = None

with mlflow.start_run(run_name="hyperparameter-tuning"):
    mlflow.log_param("tuning_strategy", "grid_search")
    mlflow.log_param("param_space", str(param_grid))

    for n_est in param_grid['n_estimators']:
        for depth in param_grid['max_depth']:
            # Name each child run after its combination so it is identifiable in the UI
            with mlflow.start_run(run_name=f"n_est={n_est}_depth={depth}", nested=True):
                # Log parameters (an unlimited depth is recorded as the string "None")
                params = {
                    "n_estimators": n_est,
                    "max_depth": depth if depth is not None else "None"
                }
                mlflow.log_params(params)

                # Train model
                model = RandomForestClassifier(
                    n_estimators=n_est, 
                    max_depth=depth, 
                    random_state=42
                )
                model.fit(X_train_iris, y_train_iris)

                # Evaluate
                y_pred_test = model.predict(X_test_iris)
                accuracy = accuracy_score(y_test_iris, y_pred_test)
                mlflow.log_metric("test_accuracy", accuracy)

                # Track best model; a model is logged only for runs that
                # improve on the best accuracy seen so far
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_params = params.copy()
                    input_example = X_train_iris[:3]
                    mlflow.sklearn.log_model(
                        model, 
                        "model",
                        input_example=input_example,
                        # signature from matching inputs and predictions
                        signature=mlflow.models.infer_signature(X_test_iris, y_pred_test)
                    )

                print(f"n_est={n_est}, depth={depth}: {accuracy:.4f}")

    # Log best results to parent run (best_params stays None if nothing improved)
    if best_params is not None:
        mlflow.log_params({f"best_{k}": v for k, v in best_params.items()})
    mlflow.log_metric("best_accuracy", best_accuracy)

print(f"\nBest accuracy: {best_accuracy:.4f}")
print(f"Best parameters: {best_params}")


## 6. Model Registration and Versioning

In [None]:
# Train final model and register it

# Log to the main tutorial experiment (experiment_name is set in section 3);
# earlier search cells leave a different experiment active.
mlflow.set_experiment(experiment_name)

# Hyperparameters of the final model. These are fixed values, not read from the
# tuning results above; edit them here if tuning suggests another configuration.
# Kept in one dict so the trained model and the logged params cannot disagree.
final_params = {"n_estimators": 100, "max_depth": 10}

with mlflow.start_run(run_name="production-model"):
    final_model = RandomForestClassifier(**final_params, random_state=42)
    final_model.fit(X_train_iris, y_train_iris)

    # Evaluate; the predictions are reused for the signature below
    y_pred_test = final_model.predict(X_test_iris)
    test_accuracy = accuracy_score(y_test_iris, y_pred_test)

    # Log everything
    mlflow.log_params({**final_params, "model_purpose": "production"})
    mlflow.log_metric("test_accuracy", test_accuracy)

    # Register model with input example and a signature inferred from matching
    # inputs and predictions; registered_model_name creates a new version of
    # "iris_classifier" in the model registry
    input_example = X_train_iris[:5]
    model_info = mlflow.sklearn.log_model(
        final_model, 
        "model",
        input_example=input_example,
        signature=mlflow.models.infer_signature(X_test_iris, y_pred_test),
        registered_model_name="iris_classifier"
    )

    print(f"Model registered with accuracy: {test_accuracy:.4f}")
    print(f"Model URI: {model_info.model_uri}")


## 7. Model Registry Management

In [None]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# List registered models with the latest version in each stage
registered_models = client.search_registered_models()
print("Registered models:")
for registered_model in registered_models:
    print(f"- {registered_model.name}")
    for version in registered_model.latest_versions:
        print(f"  Version {version.version}: {version.current_stage}")

# Promote the newest version of the tutorial model to Staging.
# NOTE(review): model stages are deprecated in recent MLflow releases in favour
# of aliases; they are kept here because the loading cell resolves
# "models:/iris_classifier/Staging".
model_name = "iris_classifier"

# Only query versions when this specific model is registered
if model_name in {rm.name for rm in registered_models}:
    stage_versions = client.get_latest_versions(model_name)
else:
    stage_versions = []

if stage_versions:
    # get_latest_versions returns one entry per stage, in no guaranteed order,
    # so pick the highest version number explicitly rather than taking [0].
    model_version = max(stage_versions, key=lambda mv: int(mv.version))
    print(f"\nLatest version of {model_name}: {model_version.version}")

    # Transition to staging
    client.transition_model_version_stage(
        name=model_name,
        version=model_version.version,
        stage="Staging"
    )
    print(f"Model {model_name} v{model_version.version} moved to Staging")
else:
    print(f"\nNo registered versions found for {model_name}; run the registration cell first")


## 8. Loading and Using Registered Models

In [None]:
# Load the model currently in Staging and score a few held-out samples
try:
    # Resolve the registry URI for the latest Staging version
    staging_uri = "models:/iris_classifier/Staging"
    staging_model = mlflow.pyfunc.load_model(staging_uri)
    print("Loaded model from Staging")

    # Predict on the first five test rows
    sample_data = X_test_iris[:5]
    predictions = staging_model.predict(sample_data)

    print("\nSample predictions:")
    labelled_pairs = zip(predictions, y_test_iris[:5])
    for sample_number, (pred, actual) in enumerate(labelled_pairs, start=1):
        print(f"Sample {sample_number}: Predicted={pred}, Actual={actual}")

except Exception as e:
    # Report the failure instead of stopping the notebook
    print(f"Error loading model: {e}")
    print("This might happen if no model is in Staging yet")


## 9. Advanced Features: Autologging


In [None]:
# Create new experiment for autologging
mlflow.set_experiment("autologging-demo")

# Enable autologging
mlflow.sklearn.autolog()
try:
    with mlflow.start_run(run_name="autolog-example"):
        # Train model - parameters, training metrics and the model are logged automatically
        auto_model = RandomForestClassifier(n_estimators=150, max_depth=12, random_state=42)
        auto_model.fit(X_train_wine, y_train_wine)

        # The predictions themselves are not stored. By default, sklearn
        # autologging does record metrics computed from them with
        # sklearn.metrics functions (post-training metrics), such as the
        # accuracy_score call below.
        y_pred = auto_model.predict(X_test_wine)
        accuracy = accuracy_score(y_test_wine, y_pred)

        print(f"Autologged model accuracy on wine dataset: {accuracy:.4f}")
        print("Check MLflow UI to see all automatically logged parameters and metrics!")
finally:
    # Disable autologging even if training fails, so cells that log manually
    # afterwards are not affected by a left-over autolog hook.
    mlflow.sklearn.autolog(disable=True)


## 10. Custom Artifacts and Visualizations


In [None]:
from sklearn.metrics import classification_report

# Create experiment for custom artifacts
mlflow.set_experiment("custom-artifacts-demo")

with mlflow.start_run(run_name="custom-artifacts"):
    # Train model with preprocessing: the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_wine)
    X_test_scaled = scaler.transform(X_test_wine)

    model = LogisticRegression(random_state=42, max_iter=1000)
    model.fit(X_train_scaled, y_train_wine)

    y_pred = model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test_wine, y_pred)

    # Log basic metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_param("preprocessing", "StandardScaler")

    # Save and log preprocessing artifacts. The logged model expects scaled
    # inputs, so the fitted scaler is needed to use it on raw data.
    joblib.dump(scaler, "scaler.pkl")
    mlflow.log_artifact("scaler.pkl", "preprocessing")

    # Create feature importance plot.
    # coef_ holds one row of coefficients per class for this 3-class problem;
    # average the absolute values over classes so the ranking reflects every
    # class rather than only the first one.
    feature_names = wine.feature_names
    importance = np.abs(model.coef_).mean(axis=0)
    top_n = min(10, len(feature_names))  # never ask for more bars than features
    indices = np.argsort(importance)[::-1][:top_n]

    plt.figure(figsize=(10, 6))
    plt.title(f'Top {top_n} Feature Importance (Logistic Regression)')
    plt.bar(range(top_n), importance[indices])
    plt.xticks(range(top_n), [feature_names[i] for i in indices], rotation=45)
    plt.tight_layout()
    plt.savefig('feature_importance.png', dpi=150, bbox_inches='tight')
    mlflow.log_artifact('feature_importance.png', "plots")
    plt.show()

    # Create classification report and attach it as a text artifact
    report = classification_report(y_test_wine, y_pred, target_names=wine.target_names)

    with open("classification_report.txt", "w") as f:
        f.write(report)
    mlflow.log_artifact("classification_report.txt", "reports")

    # Log model with signature and input example (both in scaled feature space)
    input_example = X_train_scaled[:5]
    mlflow.sklearn.log_model(
        model, 
        "model",
        input_example=input_example,
        signature=mlflow.models.infer_signature(X_test_scaled, y_pred)
    )

    print(f"Wine classification accuracy: {accuracy:.4f}")
    print("Custom artifacts logged: scaler, feature importance plot, classification report")


## 11. Experiment Analysis and Comparison


In [None]:
# Inspect what has been tracked so far: list experiments, then summarise runs
from mlflow.entities import ViewType

# Every experiment known to the tracking store
experiments = client.search_experiments()
print("Available experiments:")
for exp in experiments:
    print(f"- {exp.name} (ID: {exp.experiment_id})")

# Up to ten active runs from the main tutorial experiment
iris_exp = mlflow.get_experiment_by_name("iris-classification-tutorial")
if iris_exp:
    runs = client.search_runs(
        experiment_ids=[iris_exp.experiment_id],
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=10,
    )

    print(f"\nRuns in {iris_exp.name}:")
    for run in runs:
        run_metrics = run.data.metrics
        run_params = run.data.params
        # Four summary lines followed by one blank separator line per run
        print(
            f"Run: {run.info.run_name}",
            f"  Accuracy: {run_metrics.get('test_accuracy', 'N/A')}",
            f"  Model: {run_params.get('model_type', 'N/A')}",
            f"  Status: {run.info.status}",
            "",
            sep="\n",
        )


## 12. Cleanup and Best Practices Demo


In [None]:
# Remove the scratch files written to the working directory by earlier cells
import os

temp_files = [
    'confusion_matrix_rf.png',
    'scaler.pkl',
    'feature_importance.png',
    'classification_report.txt',
]

# Delete only the files that are actually present
for temp_path in temp_files:
    if not os.path.exists(temp_path):
        continue
    os.remove(temp_path)
    print(f"Removed {temp_path}")

# Closing summary, printed one line at a time
closing_lines = [
    "\n=== MLflow Tutorial Complete ===",
    "\nNext steps:",
    "1. Start MLflow UI: uv run mlflow ui",
    "2. Open http://localhost:5000 in your browser",
    "3. Explore your experiments, runs, and models!",
    "4. Try model serving: mlflow models serve -m 'models:/iris_classifier/Staging' -p 5001",
    "\nKey takeaways:",
    "- Use experiments to organize related runs",
    "- Log parameters, metrics, and artifacts consistently",
    "- Use nested runs for hyperparameter tuning",
    "- Register important models for production use",
    "- Leverage autologging for quick experimentation",
    "- Save preprocessing artifacts for reproducibility",
]
for line in closing_lines:
    print(line)
