### Regression and One-click deployment example

In this Notebook I walk through all the steps needed to develop a **regression model** and save the model to the OCI Data Science Model Catalog.

Then, I deploy the model as a service.

In [74]:
import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype

# the dataset used for the example
from sklearn.datasets import fetch_california_housing

from sklearn.model_selection import train_test_split

# the GBM used
import xgboost as xgb

# to use ADSTuner
from ads.hpo.search_cv import ADSTuner
from ads.hpo.stopping_criterion import *
from ads.hpo.distributions import *

import pickle

import os
from ads import set_auth
from ads.common.model_artifact import ModelArtifact
from ads.common.model_export_util import prepare_generic_model
from ads.common.model_metadata import (MetadataCustomCategory,
                                       UseCaseType)

In [2]:
# functions
def get_general_info(data_df):
    print(f"There are: {len(data_df.columns)} columns in the dataset")
    print()
    print(
        "The list of column names, in alphabetical order:",
        sorted(list(data_df.columns)),
    )
    print()
    print(f"There are {data_df.shape[0]} records in the dataset")
    print()
    
    return

# well you have to decide a threshold in term of a fraction
# to decide if the col is categorical
FRAC = 0.1

def analyze_df(data_df):
    # it is ok to use isna, isnull is an alias of isna
    missing_val = data_df.isna().sum()

    # cardinality

    THR = data_df.shape[0] * FRAC

    list_card = []
    list_cat = []
    list_dtypes = []
    list_num_zeros = []

    for col in data_df.columns:
        # count the # of distinct values
        n_distinct = data_df[col].nunique()
        list_card.append(n_distinct)
        
        # is categorical is decide on this rule
        if n_distinct < THR:
            # categorical
            list_cat.append("Yes")
        else:
            list_cat.append("No")

        list_dtypes.append(data_df[col].dtype)

    # build the results DF
    result_df = pd.DataFrame(
        {
            "col_name": list(data_df.columns),
            "missing_vals": missing_val,
            "cardinality": list_card,
            "is_categorical": list_cat,
            "data_type": list_dtypes,
        },
        index=None,
    )

    # if you don't want cols as index
    result_df.reset_index(drop=True, inplace=True)

    return result_df

def show_tuner_results(tuner):

    # to count completed
    result_df = tuner.trials[tuner.trials["state"] == "COMPLETE"].sort_values(
        by=["value"], ascending=False
    )

    print("ADSTuner session results:")
    print(f"ADSTuner has completed {result_df.shape[0]} trials")
    print()
    print(f"The best trial is the #: {tuner.best_index}")
    print(f"Parameters for the best trial are: {tuner.best_params}")
    print(f"The metric used to optimize is: {tuner.scoring_name}")
    print(f"The best score is: {round(tuner.best_score, 4)}")

In [3]:
# load the dataset
housing = fetch_california_housing(as_frame=True)

orig_df = housing.frame

In [4]:
orig_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


### some EDA

In [5]:
get_general_info(orig_df)

analyze_df(orig_df)

There are: 9 columns in the dataset

The list of column names, in alphabetical order: ['AveBedrms', 'AveOccup', 'AveRooms', 'HouseAge', 'Latitude', 'Longitude', 'MedHouseVal', 'MedInc', 'Population']

There are 20640 records in the dataset



Unnamed: 0,col_name,missing_vals,cardinality,is_categorical,data_type
0,MedInc,0,12928,No,float64
1,HouseAge,0,52,Yes,float64
2,AveRooms,0,19392,No,float64
3,AveBedrms,0,14233,No,float64
4,Population,0,3888,No,float64
5,AveOccup,0,18841,No,float64
6,Latitude,0,862,Yes,float64
7,Longitude,0,844,Yes,float64
8,MedHouseVal,0,3842,No,float64


In [6]:
# In this example I'll use all the columns (ex MedHouseVal) as features, except Lat, Long, to simplify

TARGET = "MedHouseVal"
all_cols = list(orig_df.columns)
cols_to_drop = ['Latitude', 'Longitude']

cat_cols = ['HouseAge']
# take care, I have sorted
FEATURES = sorted(list(set(all_cols) - set([TARGET])- set(cols_to_drop)))

# for LightGBM
cat_columns_idxs = [i for i, col in enumerate(FEATURES) if col in cat_cols]

FEATURES

['AveBedrms', 'AveOccup', 'AveRooms', 'HouseAge', 'MedInc', 'Population']

In [7]:
cat_columns_idxs

[3]

In [8]:
# the only important thing is that we have 1 categorical column: HouseAge

# as per LightGBM doc (https://lightgbm.readthedocs.io/en/latest/Advanced-Topics.html) we will code categorical as integer starting from zero
# in this case it is easy, since the minimum is 1... so we need only to subtract 1

In [9]:
used_df = orig_df.copy()

used_df['HouseAge'] = used_df['HouseAge'] - 1.

used_df['HouseAge'] = used_df['HouseAge'].astype(int)
used_df['HouseAge'] = used_df['HouseAge'].astype("category")

In [10]:
# let's make a simple train/test split
X = used_df[FEATURES].values
y = used_df[TARGET].values

TEST_SIZE = 0.2

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=1)

### using ADSTuner

In [11]:
STUDY_NAME = "xgb001"
FOLDS = 5
TIME_BUDGET = 900

#
# Here we define the strategy, the space for hyper-parameters we want to explore
#
params = {
    "n_estimators": CategoricalDistribution([100, 200, 300, 400, 500]),
    "learning_rate": LogUniformDistribution(low=1e-4, high=1e-2),
    "max_depth": IntUniformDistribution(5, 10),
}

alg_reg = xgb.XGBRegressor()

tuner = ADSTuner(
    alg_reg, cv=FOLDS, strategy=params, study_name=STUDY_NAME,  scoring='neg_mean_absolute_error'
)

tuner.tune(X_train, y_train, exit_criterion=[TimeBudget(TIME_BUDGET)])

[32m[I 2022-03-30 10:38:17,451][0m A new study created in RDB with name: xgb001[0m


In [43]:
# get the status to see if completed
print(f"The tuner status is: {tuner.get_status()}")

print(f"Remaining time is: {round(tuner.time_remaining, 1)} sec.")

The tuner status is: State.RUNNING
Remaining time is: 0 sec.


In [54]:
show_tuner_results(tuner)

ADSTuner session results:
ADSTuner has completed 3 trials

The best trial is the #: 1
Parameters for the best trial are: {'learning_rate': 0.009368194311641513, 'max_depth': 5, 'n_estimators': 100}
The metric used to optimize is: neg_mean_absolute_error
The best score is: -0.7217


In [55]:
# look only at completed trials, sorted with best on top. Metric chosen is in the value col.
result_df = tuner.trials[tuner.trials["state"] == "COMPLETE"].sort_values(
    by=["value"], ascending=False
)

result_df.head(10)

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_learning_rate,params_max_depth,params_n_estimators,user_attrs_mean_fit_time,user_attrs_mean_score_time,...,user_attrs_metric,user_attrs_split0_test_score,user_attrs_split1_test_score,user_attrs_split2_test_score,user_attrs_split3_test_score,user_attrs_split4_test_score,user_attrs_std_fit_time,user_attrs_std_score_time,user_attrs_std_test_score,state
1,1,-0.721742,2022-03-30 10:47:37.721241,2022-03-30 10:49:06.555894,0 days 00:01:28.834653,0.009368,5,100,17.721438,0.0138,...,neg_mean_absolute_error,-0.727963,-0.726046,-0.716097,-0.710933,-0.727672,0.104303,0.00013,0.006944,COMPLETE
2,2,-0.744392,2022-03-30 10:49:06.570064,2022-03-30 10:56:28.031449,0 days 00:07:21.461385,0.00178,5,500,88.244876,0.015779,...,neg_mean_absolute_error,-0.751375,-0.748336,-0.737855,-0.733915,-0.750482,0.104166,0.001137,0.007126,COMPLETE
0,0,-1.40867,2022-03-30 10:38:17.832500,2022-03-30 10:47:37.702838,0 days 00:09:19.870338,0.000232,6,500,111.91969,0.017269,...,neg_mean_absolute_error,-1.424512,-1.412303,-1.39465,-1.399688,-1.412197,0.749321,0.000772,0.010525,COMPLETE


In [57]:
%%time
### train with best params

model = xgb.XGBRegressor(**tuner.best_params)

model.fit(X_train, y_train)

CPU times: user 8min 18s, sys: 860 ms, total: 8min 19s
Wall time: 17.6 s


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.009368194311641513,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=32,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

### Save the model

In [88]:
# save the model
MODEL_FILE_NAME = "model.pkl"

pickle.dump(model, open(MODEL_FILE_NAME, "wb"))

In [89]:
loaded_model = pickle.load(open(MODEL_FILE_NAME, "rb"))

In [90]:
loaded_model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.009368194311641513,
             max_delta_step=0, max_depth=5, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=32,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)

### Prepare for Model Catalog

In [91]:
PATH_ARTEFACT = f"./model-files"

if not os.path.exists(PATH_ARTEFACT):
    os.mkdir(PATH_ARTEFACT)

In [92]:
# se passi x-test, y_test genera nei metadati anche lo schema
artifact = prepare_generic_model(model=model, model_path=PATH_ARTEFACT,
                                 force_overwrite=True, data_science_env=True,
                                 X_sample=X_test,
                                 y_sample=y_test,
                                 use_case_type=UseCaseType.REGRESSION)

# add the model file to the directory
pickle.dump(model, open(PATH_ARTEFACT + "/" + MODEL_FILE_NAME, "wb"))

loop1:   0%|          | 0/4 [00:00<?, ?it/s]



### TODO: customize score.py

In [93]:
# Saving the model artifact to the model catalog.
compartment_id = os.environ['NB_SESSION_COMPARTMENT_OCID']
project_id = os.environ['PROJECT_OCID']

set_auth(auth='resource_principal')

# to set the serialization format in metadata
artifact.reload(model_file_name=MODEL_FILE_NAME)

catalog_entry = artifact.save(display_name='california-housing1', 
                              description='A model for regression',
                              # to avoid to commit (be careful)
                              ignore_pending_changes=True)

loop1:   0%|          | 0/5 [00:00<?, ?it/s]

artifact:/tmp/saved_model_fe2fa2bc-1c80-4736-94da-be09a412f848.zip
