# Pipeline Example: Experimentation based on CaliforniaHousing Data

## 0) Setting up Modeva

In [2]:
## =============================================================
## Install or update packages(recommended to run in Terminal)
## =============================================================
!pip show modeva
# !pip uninstall modeva
#!pip install modeva



In [3]:
# To get authentication, use the following command: (To get full access please replace the token to your own token)
from modeva.utils.authenticate import authenticate
authenticate(token='eaaa4301-b140-484c-8e93-f9f633c8bacb')

## 1) Prepare Step Functions

In [7]:
import numpy as np

from modeva import DataSet
from modeva import ModelZoo
from modeva import TestSuite
from modeva.models import MoLGBMRegressor
from modeva.models import MoMoERegressor
from modeva.models.tune import ModelTuneRandomSearch

from modeva.automation.pipeline import Pipeline

def load_data(name, inactive_features, target_feature, task_type, test_ratio):
    ds = DataSet(name=name)
    ds.load(name)
    ds.reset_preprocess()
    ds.impute_missing()
    ds.scale_numerical(method="minmax")
    ds.encode_categorical(method="ordinal")
    ds.preprocess()
    ds.set_inactive_features(features=inactive_features)
    ds.set_target(feature=target_feature)
    ds.set_task_type(task_type)
    ds.set_random_split(test_ratio=test_ratio)
    return ds

def train_lgbm(ds):
    model = MoLGBMRegressor(name="LGBM", max_depth=2, n_estimators=100, verbose=-1)
    model.fit(ds.train_x, ds.train_y.ravel())
    return model

def train_moe(ds):
    model = MoMoERegressor(name="MOE", n_clusters=5, max_depth=2, n_estimators=100, verbose=-1)
    model.fit(ds.train_x, ds.train_y.ravel())
    return model

def train_moe_tuned(ds):
    hyperspace = dict(n_clusters=[2, 3, 4, 5, 6, 7, 8, 9, 10])
    hpo = ModelTuneRandomSearch(dataset=ds,
                                model=MoMoERegressor(verbose=-1))
    result = hpo.run(param_distributions=hyperspace,
                     n_iter=10,
                     metric="MSE",
                     cv=5,
                     random_state=0)

    best_param_idx = np.where(result.value["rank_test_MSE"] == 1)[0][0]
    model = MoMoERegressor(**result.value["params"][best_param_idx],
                           name="MoE-Tuned",
                           verbose=-1)
    model.fit(ds.train_x, ds.train_y)
    return model

def interpret_model(ds, model):
    ts = TestSuite(ds, model=model)

    result1 = ts.interpret_fi()
    result1.plot(figsize=(6, 4))

    result2 = ts.interpret_ei()
    result2.plot(figsize=(6.5, 4))

    result3 = ts.interpret_effects(features="MedInc")
    result3.plot(figsize=(6, 4))

    result4 = ts.interpret_local_fi(dataset='test', sample_index=0, centered=True)
    result4.plot(figsize=(6, 4))
    return result1, result2, result3, result4

def explain_model(ds, model):
    ts = TestSuite(ds, model=model)

    result1 = ts.explain_pfi()
    result1.plot(figsize=(6, 4))

    result2 = ts.explain_hstatistic(sample_size=1000, grid_resolution=10)
    result2.plot(figsize=(6, 5))

    result3 = ts.explain_pdp(features="MedInc")
    result3.plot(figsize=(6, 5))

    result4 = ts.explain_lime(dataset="test", sample_index=0, centered=False)
    result4.plot(figsize=(6, 4))
    return result1, result2, result3, result4

def test_model(ds, model):
    ts = TestSuite(ds, model=model)

    result1 = ts.diagnose_accuracy_table(train_dataset="train", test_dataset="test", metric=None)
    print(result1.table)

    result2 = ts.diagnose_robustness(dataset="test", perturb_features=None, 
                                     noise_levels=(0.2, 0.4, 0.6, 0.8), metric="MAE")
    result2.plot(figsize=(6, 4))

    result3 = ts.diagnose_residual_cluster()
    result3.plot(figsize=(6, 4))

    result4 = ts.diagnose_slicing_accuracy(features=(("MedInc",), ("Population", ), ), metric="MAE",
                                           method="quantile", threshold=None)
    result4.table
    return result1, result2, result3, result4

def compare_models(ds, model1, model2, model3):
    tsc = TestSuite(ds, models=[model1, model2, model3])

    result1 = tsc.compare_accuracy_table(train_dataset="train", test_dataset="test", metric=("MSE", "MAE"))
    result1.plot(figsize=(6.5, 4))

    result2 = tsc.compare_robustness(noise_levels=(0.1, 0.2, 0.3, 0.4), 
                                     perturb_method="quantile", metric="MAE")
    result2.plot(figsize=(6.5, 4))
    return result1, result2

## 2) Construct Pipeline with Step Functions

In [8]:
exp = Pipeline(name='CH-Pipeline')

exp.add_step(
    name='load_data',
    func=load_data,
    func_inputs={'name': 'CaliforniaHousing',
                 "target_feature": "MedHouseVal",
                 "inactive_features": None,
                 "task_type": "Regression",
                 "test_ratio": 0.33},
    save_data=True,
)

exp.add_step(
    name='train_lgbm', 
    parent='load_data',
    func=train_lgbm,
    func_inputs={}, # auto map from parent steps
    save_model=True,
)

exp.add_step(
    name='train_moe', 
    parent='load_data',
    func=train_moe,
    func_inputs={}, # auto map from parent steps
    save_model=True,
)

exp.add_step(
    name='train_moe_tuned',
    parent='load_data',
    func=train_moe_tuned,
    func_inputs={}, # auto map from parent steps
    save_model=True,
)

exp.add_step(
    func=interpret_model,
    func_inputs={}, # auto map from parent steps
    name='interpret_model', parent=['load_data', 'train_lgbm'],
    save_testsuite=True,
)

exp.add_step(
    func=explain_model,
    func_inputs={}, # auto map from parent steps
    name='explain_model', parent=['load_data', 'train_lgbm'],
    save_testsuite=True,
)

exp.add_step(
    func=test_model,
    func_inputs={}, # auto map from parent steps
    name='test_model', parent=['load_data', 'train_lgbm'],
    save_testsuite=True,
)

exp.add_step(
    func=compare_models,
    func_inputs={}, # auto map from parent steps
    name='compare_model', parent=['load_data', 'train_lgbm', 'train_moe', 'train_moe_tuned'],
    save_testsuite=True,
)

## 3) Run Pipeline

In [9]:
exp.run()

Executing step: load_data
Executing step: train_lgbm
Executing step: train_moe
Executing step: train_moe_tuned
Executing step: interpret_model
Executing step: explain_model
Executing step: test_model
            MSE       MAE        R2
train  0.013251  0.081484  0.766098
test   0.014154  0.082798  0.749555
GAP    0.000903  0.001315 -0.016543
Executing step: compare_model


## 3) Export Results to HTML Report

In [10]:
ts = TestSuite(name='CH-Pipeline-TestSuite')
ts.export_report(path='CH-Pipeline-Report.html')

FileNotFoundError: [WinError 3] 系统找不到指定的路径。: ''