# MLOps with datalab

## 1. Introduction

### 1.1 What is MLOps?

**MLOps** stands for `Machine Learning Operations`. It is a set of best practices that seek to increase automation and improve the efficiency of model development and deployment.

### 1.2 Why do we need MLOps? Is Git not enough?

Putting a machine learning model into production is difficult. It involves many complex components, such as:
- data collection/ingest,
- data prep (e.g. cleaning, feature engineering, etc),
- model development
- model training,
- model tuning
- model deployment
- model monitoring,
- model explainability
- etc.

The figure below shows the MLOps competence requirements:

![ml_technical_debt.PNG](img/ml_technical_debt.PNG)

### 1.3 ML Operations

We need to address the following MLOps principles:

- **Model tracking**: Track all the elements necessary to reproduce the model, such as code, hyperparameters and training data.
- **Model review**: Test the model and produce a quality assurance report. Evaluate production-specific inference properties such as model response time.
- **Model governance**: Manage model versions, model artifacts and transitions through their lifecycle (e.g. staging, production, archived, etc.).
                     
- **Model deployment**: Automate the process of deploying registered models (e.g. permissions, cluster creation, API management, etc.)
- **Model monitoring**: Monitor the state of the model production server (e.g. number of requests, response time, serving data anomalies, etc.)
- **Model retraining**: Create alerts and automation to take corrective action in case of **model drift** due to 
                    differences in training and inference data or `data evolution`.
                    

### 1.4 Continuous X

- **CI**: Track model code, training data (e.g. feature engineering/selection) and hyper-parameter optimization.
- **CD**: Deliver not only an executable package, but also the complete pipeline describing how the model is trained.
- **CT (Continuous training)**: Models need to be retrained automatically, because evolving data makes a model decay. Data validation is essential at this step, because data drift can be caused by either genuine evolution or errors.
 
## 2. Illustrating MLOps with an application example

The context 

In [None]:
import logging
import sys
import warnings

import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# Phase 1. Train a model in an old school way

In [None]:
# calculate an accuracy from the confusion matrix
def get_model_accuracy(cf_matrix):
    """Return the accuracy encoded in a confusion matrix.

    The accuracy is the trace of the matrix (the sum of its diagonal)
    divided by the sum of all of its cells.
    """
    return cf_matrix.trace() / cf_matrix.sum()


def train_model(data_url:str,n_estimator:int, max_depth:int, min_samples_split:int):
    """Train a random forest on the data at ``data_url`` and print its accuracy.

    The features and labels returned by ``prepare_data`` are split 80/20 into
    a training and a testing set with a fixed ``random_state``. A
    ``RandomForestClassifier`` is fitted on the training set and evaluated on
    the testing set. The accuracy is derived from the confusion matrix and
    printed; nothing is returned and the fitted model is not kept.

    Args:
        data_url: location of the CSV file, forwarded to ``prepare_data``.
        n_estimator: passed to the classifier as ``n_estimators``.
        max_depth: passed to the classifier as ``max_depth``.
        min_samples_split: passed to the classifier as ``min_samples_split``.
    """
    print(f"data source: {data_url}")
    feature_data, label_data = prepare_data(data_url)
    train_X, test_X, train_y, test_y = train_test_split(
        feature_data, label_data, train_size=0.8, test_size=0.2, random_state=0
    )

    # create a random forest classifier; random_state is fixed so runs are repeatable
    rf_clf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth,
                                    min_samples_split=min_samples_split,
                                    n_jobs=2, random_state=0)
    # train the model with the training data
    rf_clf.fit(train_X, train_y)
    # predict the testing data
    predicts_val = rf_clf.predict(test_X)

    # derive the accuracy from the confusion matrix
    cm = confusion_matrix(test_y, predicts_val)
    model_accuracy = get_model_accuracy(cm)
    # The hyper-parameters are printed as given; "%f" used to render the
    # integers as floats (e.g. "n_estimator=50.000000").
    print(
        "RandomForest model with hyper-parameters: "
        f"(n_estimator={n_estimator}, max_depth={max_depth}, min_samples_split={min_samples_split}):"
    )
    print("accuracy: %f" % model_accuracy)


def prepare_data(data_url):
    """Load the dataset and separate it into features and label.

    The CSV at ``data_url`` is read with its first column as the index. The
    ``legendary`` column is the label. The features are the remaining columns
    minus ``generation`` and ``total``, with object-dtype columns excluded.

    Args:
        data_url: path, URL or file-like object accepted by ``pd.read_csv``.

    Returns:
        A ``(feature, label)`` tuple: a DataFrame and the ``legendary`` column.

    Raises:
        Exception: whatever ``pd.read_csv`` raised, re-raised after the hint
            message is printed.
    """
    # read data as df
    try:
        input_df = pd.read_csv(data_url, index_col=0)
    except Exception as e:
        # print() does not interpolate logging-style "%s" arguments, so the
        # message is formatted explicitly.
        print(f"Unable to read data from the giving path, check your data location. Error: {e}")
        # Without data nothing below can work; surface the real error instead
        # of failing later on an unbound ``input_df``.
        raise
    # Prepare data for ml model
    label = input_df.legendary
    feature = input_df.drop(['legendary', 'generation', 'total'], axis=1).select_dtypes(exclude=['object'])
    return feature, label

In [None]:
# fix numpy's global random seed for this run
np.random.seed(40)

# location of the training data
data_url = "https://minio.lab.sspcloud.fr/pengfei/sspcloud-demo/pokemon-cleaned.csv"

# hyper-parameters of the random forest
n_estimator, max_depth, min_samples_split = 50, 30, 2

# train the model and print its accuracy
train_model(
    data_url=data_url,
    n_estimator=n_estimator,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
)

In [None]:




# calculate an accuracy from the confusion matrix
def get_model_accuracy(cf_matrix):
    """Compute the accuracy from a confusion matrix.

    Divides the sum of the diagonal (the trace) by the sum of every cell
    of ``cf_matrix``.
    """
    return cf_matrix.trace() / cf_matrix.sum()


def run_workflow(mlflow_experiment_name: str, mlflow_run_name: str, data_url: str, n_estimator: int, max_depth: int,
                 min_samples_split: int):
    """Train a random forest and record the run in MLflow.

    The train/test split comes from ``prepare_data``. Inside an MLflow run of
    the given experiment, a ``RandomForestClassifier`` is fitted and evaluated
    on the test split. The data location, the hyper-parameters, the accuracy
    and the fitted model are then logged to MLflow. Nothing is returned.

    Args:
        mlflow_experiment_name: experiment selected with ``mlflow.set_experiment``.
        mlflow_run_name: name given to the MLflow run.
        data_url: location of the CSV file, forwarded to ``prepare_data``.
        n_estimator: passed to the classifier as ``n_estimators``.
        max_depth: passed to the classifier as ``max_depth``.
        min_samples_split: passed to the classifier as ``min_samples_split``.
    """
    # Step1: Prepare data
    train_X, test_X, train_y, test_y = prepare_data(data_url)
    # set up mlflow context
    mlflow.set_experiment(mlflow_experiment_name)
    with mlflow.start_run(run_name=mlflow_run_name):
        # create a random forest classifier
        rf_clf = RandomForestClassifier(n_estimators=n_estimator, max_depth=max_depth,
                                        min_samples_split=min_samples_split,
                                        n_jobs=2, random_state=0)
        # train the model with the training data
        rf_clf.fit(train_X, train_y)
        # predict the testing data
        predicts_val = rf_clf.predict(test_X)

        # derive the accuracy from the confusion matrix
        cm = confusion_matrix(test_y, predicts_val)
        model_accuracy = get_model_accuracy(cm)
        # The hyper-parameters are printed as given; "%f" used to render the
        # integers as floats (e.g. "n_estimator=10.000000").
        print(
            f"RandomForest model (n_estimator={n_estimator}, max_depth={max_depth}, "
            f"min_samples_split={min_samples_split}):"
        )
        print("accuracy: %f" % model_accuracy)
        # log everything passed in for this run: data location and hyper-parameters
        mlflow.log_param("data_url", data_url)
        mlflow.log_param("n_estimator", n_estimator)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("min_samples_split", min_samples_split)
        # log shap feature explanation extension. This will generate a graph of feature importance of the model
        # mlflow.shap.log_explanation(rf_clf.predict, test_X.sample(70))
        mlflow.log_metric("model_accuracy", model_accuracy)
        mlflow.sklearn.log_model(rf_clf, "model")


def prepare_data(data_url: str):
    """Load the dataset and split it into training and testing sets.

    The CSV at ``data_url`` is read with its first column as the index. The
    ``legendary`` column is the label. The features are the remaining columns
    minus ``generation`` and ``total``, with object-dtype columns excluded.
    The data is then split 80/20 with a fixed ``random_state``.

    Args:
        data_url: location of the CSV file, as accepted by ``pd.read_csv``.

    Returns:
        A ``(train_X, test_X, train_y, test_y)`` tuple.

    Raises:
        Exception: whatever ``pd.read_csv`` raised, re-raised after being logged.
    """
    # read data as df
    try:
        input_df = pd.read_csv(data_url, index_col=0)
    except Exception as e:
        # No module-level ``logger`` exists in this file, so one is obtained
        # from the imported ``logging`` module here.
        logging.getLogger(__name__).exception(
            "Unable to read data from the giving path, check your data location. Error: %s", e
        )
        # Without data nothing below can work; surface the real error.
        raise
    # Prepare data for ml model
    label = input_df.legendary
    feature = input_df.drop(['legendary', 'generation', 'total'], axis=1).select_dtypes(exclude=['object'])
    train_X, test_X, train_y, test_y = train_test_split(feature, label, train_size=0.8, test_size=0.2,
                                                        random_state=0)
    return train_X, test_X, train_y, test_y


def main():
    """Read the experiment settings from the command line and run the training workflow.

    Positional command-line arguments, in order:
        1. MLflow server URL (required)
        2. MLflow experiment name (required)
        3. run name (default ``"default"``)
        4. data URL (default: the pokemon-cleaned.csv demo file)
        5. n_estimator (default 10)
        6. max_depth (default 5)
        7. min_samples_split (default 2)

    The process exits with an error message when either of the two required
    arguments is missing.
    """
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # default configuration
    default_data_url = "https://minio.lab.sspcloud.fr/pengfei/sspcloud-demo/pokemon-cleaned.csv"
    default_run_name = "default"

    default_n_estimator = 10
    default_max_depth = 5
    default_samples_split = 2

    # Get experiment setting from cli
    argv = sys.argv
    if len(argv) <= 1:
        sys.exit("Must provide a mlflow server url ")
    if len(argv) <= 2:
        sys.exit("Must provide a mlflow experiment name ")
    remote_server_uri = str(argv[1])
    experiment_name = str(argv[2])
    run_name = str(argv[3]) if len(argv) > 3 else default_run_name

    # Get data path
    data_url = str(argv[4]) if len(argv) > 4 else default_data_url

    # Get hyper parameters from cli arguments
    n_estimator = int(argv[5]) if len(argv) > 5 else default_n_estimator
    max_depth = int(argv[6]) if len(argv) > 6 else default_max_depth
    min_samples_split = int(argv[7]) if len(argv) > 7 else default_samples_split

    # The server URL was previously parsed but never applied, so runs were not
    # sent to the requested server. Point MLflow at it before any run starts.
    mlflow.set_tracking_uri(remote_server_uri)

    # run the main model training pipeline
    run_workflow(experiment_name, run_name, data_url, n_estimator, max_depth, min_samples_split)


# Run the training pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()