# Triggering actions in case of drifts

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import mlflow

# Train a random-forest heart-disease classifier and record the run in MLflow.

# Hyperparameters live in one place so that the values used to fit the model
# and the values logged to MLflow cannot drift apart. The forest is seeded
# (like the train/test split below) so the logged accuracy is reproducible.
RF_PARAMS = {"n_estimators": 200, "max_depth": 30, "random_state": 0}

# Assuming heart.csv is in the correct path and contains the right data
df = pd.read_csv('heart.csv')
# Keep six numeric feature columns plus the target column.
df = df[['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak', 'HeartDisease']]
# The target is cast to string so the classifier sees class labels.
df['HeartDisease'] = df['HeartDisease'].astype('str')
df = df.rename(columns={'HeartDisease': 'label'})

features = df.drop('label', axis=1)
target = df['label']
# 70/30 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=0)

model = RandomForestClassifier(**RF_PARAMS)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
# Compute the hold-out accuracy once; it is both printed and logged below.
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy}")

# Setting the MLflow tracking URI and experiment
mlflow.set_tracking_uri('http://127.0.0.1:5000')  # Adjust as necessary
experiment_name = 'MLBook_Experiment3'
mlflow.set_experiment(experiment_name)

# Start an MLflow run; the run is ended automatically when the context
# manager exits.
with mlflow.start_run(run_name='HeartDiseaseRF'):
    # Log exactly the hyperparameters the model was built with.
    mlflow.log_params(RF_PARAMS)

    # Log model accuracy
    mlflow.log_metric("accuracy", accuracy)

    # Log the model itself
    mlflow.sklearn.log_model(model, "HeartDiseaseRF")

print("Model logged to MLFlow.")

In [None]:
# Register the model logged by the most recent run of the experiment.
client = mlflow.tracking.MlflowClient()

experiment = client.get_experiment_by_name(experiment_name)
if experiment is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError(f"MLflow experiment {experiment_name!r} was not found")
experiment_id = experiment.experiment_id

# Ask explicitly for the newest run rather than relying on default ordering,
# and fetch only that one run.
run_infos = client.search_runs(
    [experiment_id],
    order_by=["attributes.start_time DESC"],
    max_results=1,
)
if not run_infos:
    # Fail with a clear message instead of an IndexError on an empty list.
    raise RuntimeError(f"No runs found in MLflow experiment {experiment_name!r}")

# A runs:/ URI lets the registry record which run produced the model version,
# and does not depend on how the artifact store lays out its paths.
model_url = f"runs:/{run_infos[0].info.run_id}/HeartDiseaseRF"
result = mlflow.register_model(
    model_url, "sk-learn-random-forest-HD"
)

In [None]:
import mlflow
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Fetch the trained classifier from the MLflow model registry.
# Registry entries are addressed as models:/<name>/<version>.
model_name = "sk-learn-random-forest-HD"
model_version = 1  # Pinned version; assumed to be the latest production model

# The tracking server must be configured before the registry lookup below.
mlflow.set_tracking_uri('http://127.0.0.1:5000')  # Set to your MLFlow server

model_uri = "models:/" + model_name + "/" + str(model_version)
model = mlflow.sklearn.load_model(model_uri)

# Step 2: Define the data pipeline and simulate incoming data
def get_incoming_data(n_samples=10, n_features=6, random_state=42):
    """Return a simulated batch of feature rows for inference.

    In a real scenario this function would be replaced by a data ingestion
    mechanism; here the batch is generated with ``make_classification`` and
    the generated labels are discarded.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns; it should match the number of
            features the model was trained on. Very small values may be
            rejected by ``make_classification``.
        random_state: Seed passed to ``make_classification`` so the batch is
            reproducible.

    Returns:
        The feature matrix ``X`` produced by ``make_classification`` for a
        two-class problem.
    """
    X, _ = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=2,
        random_state=random_state,
    )
    return X

# Pull the next batch of feature rows to score. This is the data *source* for
# inference; here it is the simulated batch returned by get_incoming_data().
incoming_data = get_incoming_data()

# Step 3: Inference using the model
# NOTE(review): the training cell fits the model on a pandas DataFrame with
# named columns, while this batch is whatever get_incoming_data() returns
# (no column names are attached here) - scikit-learn may warn about missing
# feature names; confirm and attach column names in a real pipeline.
predictions = model.predict(incoming_data)

In [None]:
# Step 4: Drift check (placeholder implementation).
# A production pipeline should swap this for a real drift detection method,
# e.g. a statistical test or a comparison against an expected distribution.
def drift_detection(predictions):
    """Report whether drift was detected for a batch of predictions.

    This is a demo stub: ``predictions`` is accepted but not inspected, and
    the verdict is drawn at random from ``[True, False]`` using NumPy's
    global random state.

    Args:
        predictions: Model outputs for the current batch (currently unused).

    Returns:
        The randomly chosen drift flag.
    """
    return np.random.choice([True, False])

# Step 5: Automated response if drift is detected

# Minimum hold-out accuracy a retrained model must exceed before it is
# logged and registered.
ACCURACY_THRESHOLD = 0.8

if drift_detection(predictions):
    # Alerting the team (can be an email, a message in a team chat, etc.)
    print("Drift detected! Alerting the team...")

    # Retraining the model with new data. For simplicity, synthetic data from
    # make_classification stands in for freshly collected, labelled data.
    X, y = make_classification(n_samples=100, n_features=6, n_classes=2, random_state=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Seeded like the data and the split, so the threshold check below gives
    # the same answer on every run.
    new_model = RandomForestClassifier(random_state=42)
    new_model.fit(X_train, y_train)

    # Evaluate the new model
    new_accuracy = accuracy_score(y_test, new_model.predict(X_test))
    print(f"New model accuracy: {new_accuracy}")

    # The retrained model is accepted only if it clears the fixed threshold;
    # it is not compared against the currently deployed model.
    if new_accuracy > ACCURACY_THRESHOLD:
        with mlflow.start_run():
            # registered_model_name makes log_model also create a new version
            # in the model registry; without it the model would only be
            # stored as a run artifact, contradicting the message below.
            mlflow.sklearn.log_model(
                new_model,
                "RandomForestModel",
                registered_model_name=model_name,
            )
            mlflow.log_metric("accuracy", new_accuracy)
            print("New model trained and registered to MLFlow due to drift.")

# The above code demonstrates how an inference pipeline can handle data drifts
# by incorporating a drift detection mechanism and an automated response system.
