# Showcase 3: Pipeline Experimentation based on CaliforniaHousing Data

## 0) Setting up Modeva

In [None]:
## =============================================================
## Install or update packages (recommended to run in a terminal)
## =============================================================
!pip show modeva
# !pip uninstall modeva
#!pip install modeva

## 1) Prepare Step Functions

In [None]:
import numpy as np

from modeva import DataSet
from modeva import ModelZoo
from modeva import FactSheet
from modeva.models import MoLGBMRegressor
from modeva.models import MoMoERegressor
from modeva.models.tune import ModelTuneRandomSearch

from modeva.automation.pipeline import Pipeline

def load_data(name, inactive_features, target_feature, task_type, test_ratio):
    """Load ``name`` through Modeva's ``DataSet`` and prepare it for modelling.

    Parameters
    ----------
    name : str
        Used both as the ``DataSet`` name and as the argument to ``load``.
    inactive_features : passed to ``set_inactive_features(features=...)``.
    target_feature : passed to ``set_target(feature=...)``.
    task_type : passed to ``set_task_type``.
    test_ratio : passed to ``set_random_split(test_ratio=...)``.

    Returns
    -------
    DataSet
        The configured dataset after preprocessing and the random split.
    """
    dataset = DataSet(name=name)
    dataset.load(name)

    # Preprocessing: start from a clean state, register the steps, then apply.
    dataset.reset_preprocess()
    dataset.impute_missing()
    dataset.scale_numerical(method="minmax")
    dataset.encode_categorical(method="ordinal")
    dataset.preprocess()

    # Modelling setup: feature roles, task type and train/test split.
    dataset.set_inactive_features(features=inactive_features)
    dataset.set_target(feature=target_feature)
    dataset.set_task_type(task_type)
    dataset.set_random_split(test_ratio=test_ratio)

    return dataset

def train_lgbm(ds):
    """Fit a ``MoLGBMRegressor`` named "LGBM" on the training split of ``ds``.

    Returns the fitted model.
    """
    regressor = MoLGBMRegressor(
        name="LGBM",
        max_depth=2,
        n_estimators=100,
        verbose=-1,
    )
    features = ds.train_x
    target = ds.train_y.ravel()  # flatten the target before fitting
    regressor.fit(features, target)
    return regressor

def train_moe(ds):
    """Fit a ``MoMoERegressor`` named "MOE" (5 clusters) on the training split.

    Returns the fitted model.
    """
    regressor = MoMoERegressor(
        name="MOE",
        n_clusters=5,
        max_depth=2,
        n_estimators=100,
        verbose=-1,
    )
    features = ds.train_x
    target = ds.train_y.ravel()  # flatten the target before fitting
    regressor.fit(features, target)
    return regressor

def train_moe_tuned(ds):
    """Tune ``n_clusters`` of a ``MoMoERegressor`` by random search and refit.

    A ``ModelTuneRandomSearch`` is run over ``n_clusters`` in 2..10 with
    5-fold cross-validation and the "MSE" metric. The parameter set ranked
    first on that metric is then used to build a fresh model named
    "MoE-Tuned", which is fitted on the training split of ``ds``.

    Parameters
    ----------
    ds : DataSet
        Dataset providing ``train_x`` and ``train_y``.

    Returns
    -------
    MoMoERegressor
        The refitted model using the best parameters found.
    """
    hyperspace = dict(n_clusters=[2, 3, 4, 5, 6, 7, 8, 9, 10])
    hpo = ModelTuneRandomSearch(dataset=ds,
                                model=MoMoERegressor(verbose=-1))
    result = hpo.run(param_distributions=hyperspace,
                     n_iter=10,
                     metric="MSE",
                     cv=5,
                     random_state=0)

    # Index of the first candidate whose rank on the MSE metric is 1.
    best_param_idx = np.where(result.value["rank_test_MSE"] == 1)[0][0]
    model = MoMoERegressor(**result.value["params"][best_param_idx],
                           name="MoE-Tuned",
                           verbose=-1)
    # Flatten the target exactly as train_lgbm / train_moe do, so all three
    # models are fitted on identically shaped inputs.
    model.fit(ds.train_x, ds.train_y.ravel())
    return model

def interpret_model(ds, model):
    """Run the FactSheet ``interpret_*`` tests for ``model`` and plot each one.

    Returns a 4-tuple with the results of ``interpret_fi``, ``interpret_ei``,
    ``interpret_effects`` (feature "MedInc") and ``interpret_local_fi``
    (test sample 0), in that order.
    """
    factsheet = FactSheet(ds, model=model)

    def _plotted(result, figsize):
        # Draw the result immediately, then hand it back for the return tuple.
        result.plot(figsize=figsize)
        return result

    importance = _plotted(factsheet.interpret_fi(), (6, 4))
    effect_importance = _plotted(factsheet.interpret_ei(), (6.5, 4))
    medinc_effect = _plotted(factsheet.interpret_effects(features="MedInc"), (6, 4))
    local_importance = _plotted(
        factsheet.interpret_local_fi(dataset='test', sample_index=0, centered=True),
        (6, 4),
    )
    return importance, effect_importance, medinc_effect, local_importance

def explain_model(ds, model):
    """Run the FactSheet ``explain_*`` tests for ``model`` and plot each one.

    Returns a 4-tuple with the results of ``explain_pfi``,
    ``explain_hstatistic``, ``explain_pdp`` (feature "MedInc") and
    ``explain_lime`` (test sample 0), in that order.
    """
    factsheet = FactSheet(ds, model=model)

    def _plotted(result, figsize):
        # Draw the result immediately, then hand it back for the return tuple.
        result.plot(figsize=figsize)
        return result

    permutation_fi = _plotted(factsheet.explain_pfi(), (6, 4))
    h_statistic = _plotted(
        factsheet.explain_hstatistic(sample_size=1000, grid_resolution=10),
        (6, 5),
    )
    medinc_pdp = _plotted(factsheet.explain_pdp(features="MedInc"), (6, 5))
    lime = _plotted(
        factsheet.explain_lime(dataset="test", sample_index=0, centered=False),
        (6, 4),
    )
    return permutation_fi, h_statistic, medinc_pdp, lime

def test_model(ds, model):
    """Run the FactSheet ``diagnose_*`` tests for ``model`` and show the results.

    Tabular results (accuracy table, slicing accuracy) are printed; the
    robustness and residual feature-importance results are plotted.

    Parameters
    ----------
    ds : DataSet
        Dataset the model was trained on.
    model : fitted Modeva model
        Model to diagnose.

    Returns
    -------
    tuple
        Results of ``diagnose_accuracy_table``, ``diagnose_robustness``,
        ``diagnose_residual_fi`` and ``diagnose_slicing_accuracy``, in that
        order.
    """
    fs = FactSheet(ds, model=model)

    result1 = fs.diagnose_accuracy_table(train_dataset="train", test_dataset="test", metric=None)
    print(result1.table)

    result2 = fs.diagnose_robustness(dataset="test", perturb_features=None,
                                     noise_levels=(0.2, 0.4, 0.6, 0.8), metric="MAE")
    result2.plot(figsize=(6, 4))

    result3 = fs.diagnose_residual_fi(method="uniform")
    result3.plot(figsize=(6, 4))

    result4 = fs.diagnose_slicing_accuracy(features=(("MedInc",), ("Population", ), ), metric="MAE",
                                           method="quantile", threshold=None)
    # A bare `result4.table` expression has no effect inside a function (only
    # the last expression of a notebook cell is echoed), so print it explicitly.
    print(result4.table)
    return result1, result2, result3, result4

def compare_models(ds, model1, model2, model3):
    """Compare three models on accuracy and robustness and plot both results.

    Returns a 2-tuple with the results of ``compare_accuracy_table`` and
    ``compare_robustness``, in that order.
    """
    candidates = [model1, model2, model3]
    factsheet = FactSheet(ds, models=candidates)

    accuracy = factsheet.compare_accuracy_table(
        train_dataset="train",
        test_dataset="test",
        metric=("MSE", "MAE"),
    )
    accuracy.plot(figsize=(6.5, 4))

    robustness = factsheet.compare_robustness(
        noise_levels=(0.1, 0.2, 0.3, 0.4),
        perturb_method="quantile",
        metric="MAE",
    )
    robustness.plot(figsize=(6.5, 4))

    return accuracy, robustness

## 2) Construct Pipeline with Step Functions

In [None]:
exp = Pipeline(name='CH-Pipeline')

# --- Data step: the only step with explicit inputs ---------------------------
exp.add_step(
    name='load_data',
    func=load_data,
    func_inputs={
        'name': 'CaliforniaHousing',
        "target_feature": "MedHouseVal",
        "inactive_features": None,
        "task_type": "Regression",
        "test_ratio": 0.33,
    },
    save_data=True,
)

# --- Training steps: each one hangs off 'load_data' --------------------------
exp.add_step(
    name='train_lgbm',
    parent='load_data',
    func=train_lgbm,
    func_inputs={},  # auto map from parent steps
    save_model=True,
)

exp.add_step(
    name='train_moe',
    parent='load_data',
    func=train_moe,
    func_inputs={},  # auto map from parent steps
    save_model=True,
)

exp.add_step(
    name='train_moe_tuned',
    parent='load_data',
    func=train_moe_tuned,
    func_inputs={},  # auto map from parent steps
    save_model=True,
)

# --- Single-model analysis steps: run on the LGBM model ----------------------
exp.add_step(
    name='interpret_model',
    parent=['load_data', 'train_lgbm'],
    func=interpret_model,
    func_inputs={},  # auto map from parent steps
    save_factsheet=True,
)

exp.add_step(
    name='explain_model',
    parent=['load_data', 'train_lgbm'],
    func=explain_model,
    func_inputs={},  # auto map from parent steps
    save_factsheet=True,
)

exp.add_step(
    name='test_model',
    parent=['load_data', 'train_lgbm'],
    func=test_model,
    func_inputs={},  # auto map from parent steps
    save_factsheet=True,
)

# --- Comparison step: depends on the data and all three trained models -------
exp.add_step(
    name='compare_model',
    parent=['load_data', 'train_lgbm', 'train_moe', 'train_moe_tuned'],
    func=compare_models,
    func_inputs={},  # auto map from parent steps
    save_factsheet=True,
)

## 3) Run Pipeline

In [None]:
# Run the pipeline `exp` built in the previous cell (all steps added via add_step).
exp.run()

## 4) Export Factsheet Results to HTML Report

In [None]:
## =============================================================
## Factsheet-export API (to be merged into Modeva in the next release)
## =============================================================

import json
from modeva.dashboard.utils.report import create_html_reprt

def export_report(fs, path: str = "report.html"):
    """Export every registered test of a FactSheet to an HTML report file.

    For each registered test the stored data, model, inputs, table and plot
    options are serialised to JSON strings and handed to
    ``create_html_reprt``; the HTML it returns is written to ``path``.

    Parameters
    ----------
    fs : FactSheet
        FactSheet to export. The function uses ``fs.list_registered_tests()``,
        ``fs.load_registered_test(name=...)`` and ``fs.name``.
    path : str, optional
        The export path, by default "report.html"
    """
    test_names = fs.list_registered_tests().Name.unique().tolist()
    records = []
    for test_name in test_names:
        result = fs.load_registered_test(name=test_name)

        # 'options' is either one chart option (recognised by its 'chart_id'
        # key) or a mapping holding several chart options.
        options = result['options']
        plots = []
        if options:
            if 'chart_id' in options:
                plots.append(options)
            else:
                # Take the values only: unpacking the keys into a loop
                # variable called `name` would overwrite the test name that
                # is stored in the record below.
                plots.extend(options.values())

        if result['table'] is not None:
            # Round while the columns are still numeric, then swap NaN for
            # None so json.dumps emits `null`. Replacing first turns the
            # affected columns into object dtype, which round() skips.
            table = (
                result['table']
                .round(6)
                .replace({float('nan'): None})
                .to_dict(orient="split")
            )
        else:
            table = {}

        records.append({
            "name": test_name,
            "data": json.dumps(result['data']),
            "model": json.dumps(result['model']),
            "inputs": json.dumps(result['inputs']),
            "table": json.dumps(table),
            "plots": json.dumps(plots)
        })

    html_str = create_html_reprt(fs.name, records)
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(html_str)

In [None]:
# Open the FactSheet by name and export its registered tests to report.html.
# NOTE(review): presumably 'CH-Pipeline-FactSheet' is the FactSheet saved by the
# 'CH-Pipeline' steps that set save_factsheet=True — confirm the naming convention.
fs = FactSheet(name='CH-Pipeline-FactSheet')
export_report(fs, path="report.html")