### Test model deployment

In this notebook we run a set of detailed tests on the code used in the Model Deployment.

In [1]:
import os
import requests
import oci
from oci.signer import Signer

from sklearn.model_selection import train_test_split

import pandas as pd

import ads
from ads.dataset.factory import DatasetFactory

In [2]:
# tell ADS to authenticate with the resource principal
ads.set_auth(auth='resource_principal')

# resource-principal signer from the OCI SDK; it is passed as `auth=` to
# requests.post() when the Model Deployment REST endpoint is invoked
auth = oci.auth.signers.get_resource_principals_signer()



### Ovviamente, carichiamo i dati per i test

In [3]:
def load_as_dataframe(path, target="Attrition", positive_class="Yes",
                      cols_to_drop=None, test_size=0.2, random_state=1234):
    """Load a dataset, up-sample it and split it into train and test DataFrames.

    With the default arguments the behavior is the same as the original
    hard-coded version (Attrition target, 80/20 split, seed 1234).

    Parameters
    ----------
    path : str
        Location of the dataset, passed to ``DatasetFactory.open``.
    target : str, default "Attrition"
        Name of the target column.
    positive_class : str, default "Yes"
        Value of the target passed to ``set_positive_class``.
    cols_to_drop : list of str, optional
        Columns removed before the split. When ``None`` the unneeded columns
        of the attrition dataset are dropped.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 1234
        Seed forwarded to ``train_test_split``.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Both frames still contain the target column.
    """
    if cols_to_drop is None:
        # unneeded columns of the attrition dataset
        cols_to_drop = ['Directs', 'name', 'Over18', 'WeeklyWorkedHours', 'EmployeeNumber']

    ds = DatasetFactory.open(path, target=target).set_positive_class(positive_class)

    # NOTE(review): up-sampling is applied BEFORE the train/test split; if
    # up_sample() replicates rows, copies of the same record may end up in
    # both sets - confirm this is acceptable for the intended use.
    ds_up = ds.up_sample()

    # drop unneeded columns and move to a plain pandas DataFrame
    df_used = ds_up.drop(columns=list(cols_to_drop)).to_pandas()

    # train/test split (done directly on the DataFrame)
    df_train, df_test = train_test_split(df_used, shuffle=True,
                                         test_size=test_size,
                                         random_state=random_state)

    print("# of samples in train set", df_train.shape[0])
    print("# of samples in test set", df_test.shape[0])

    return df_train, df_test

In [4]:
# load the dataset and do upsampling
TARGET = 'Attrition'

# NOTE(review): absolute path, valid only where the ADS examples are installed
# at this location - adjust when running elsewhere
attrition_path = "/opt/notebooks/ads-examples/oracle_data/orcl_attrition.csv"

# returns the train and test DataFrames obtained from the up-sampled dataset
df_train, df_test = load_as_dataframe(attrition_path)

loop1:   0%|          | 0/4 [00:00<?, ?it/s]

# of samples in train set 1972
# of samples in test set 494


In [5]:
# separate the target column from the features, for both splits
y_train = df_train[TARGET]
X_train = df_train.drop(columns=[TARGET])

y_test = df_test[TARGET]
X_test = df_test.drop(columns=[TARGET])

### Definiamo l'insieme di dati di input per il test

In [9]:
# input_data MUST be serializable: it is what gets sent to the model

N_ROWS = 20

# first N_ROWS rows (by position) of the test features, as a JSON string
input_data = X_test.iloc[:N_ROWS].to_json()

input_data

'{"Age":{"1204":49,"1681":20,"248":38,"2305":35,"441":43,"2360":49,"1925":31,"2192":30,"142":39,"852":30,"27":43,"1496":34,"208":41,"1452":51,"320":28,"2414":35,"1969":31,"1435":45,"454":30,"1178":21},"TravelForWork":{"1204":"often","1681":"infrequent","248":"infrequent","2305":"none","441":"often","2360":"infrequent","1925":"infrequent","2192":"infrequent","142":"infrequent","852":"infrequent","27":"infrequent","1496":"infrequent","208":"none","1452":"often","320":"infrequent","2414":"none","1969":"often","1435":"infrequent","454":"infrequent","1178":"infrequent"},"SalaryLevel":{"1204":3246,"1681":1388,"248":4664,"2305":6246,"441":6760,"2360":2870,"1925":632,"2192":1026,"142":1668,"852":6426,"27":3168,"1496":3728,"208":5278,"1452":4026,"320":6316,"2414":4434,"1969":2128,"1435":4756,"454":1064,"1178":5232},"JobFunction":{"1204":"Product Management","1681":"Software Developer","248":"Software Developer","2305":"Product Management","441":"Software Developer","2360":"Software Developer","

### Come fare i test passo passo

In [32]:
%load_ext autoreload
%autoreload 2

# le due istruzioni sopra sono fondamentali per caricare l'ultima versione di score.py

# importare score.py
from model_dir.score import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [33]:
# we must pass a "serializable" object

# a DataFrame is not
# a numpy array is not

# check the type of what we pass
assert isinstance(input_data, str), f"input_data must be a str, got {type(input_data).__name__}"

print("Controllo OK")

Controllo OK


In [34]:
# load the model

model = load_model(model_file_name="model.joblib")

Start loading model.joblib from model directory /home/datascience/workshop-july-2022/model_dir ...
Model is successfully loaded.


In [35]:
# check that the model is OK by looking at its pipeline steps
model.steps

[('preprocessor',
  ColumnTransformer(transformers=[('num',
                                   Pipeline(steps=[('imputer', SimpleImputer()),
                                                   ('standard_scaler',
                                                    StandardScaler())]),
                                   ['Age', 'SalaryLevel', 'CommuteLength',
                                    'HourlyRate', 'MonthlyIncome', 'MonthlyRate',
                                    'NumCompaniesWorked', 'PercentSalaryHike',
                                    'YearsinIndustry', 'YearsOnJob',
                                    'YearsAtCurrentLevel',
                                    'YearsSinceLastPromotion',
                                    'YearsWithCurrManager']),
                                  ('c...
                                                    OrdinalEncoder(handle_unknown='use_encoded_value',
                                                                   unknown_value=-

In [36]:
# predict calls, in order, the steps reproduced in the next cells; first: pre_inference
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept because the following cells reference it
input = pre_inference(input_data, "model_dir/input_schema.json")

---> deserialize...
Input data: {"Age":{"1204":49,"1681":20,"248":38,"2305":35,"441":43,"2360":49},"TravelForWork":{"1204":"often","1681":"infrequent","248":"infrequent","2305":"none","441":"often","2360":"infrequent"},"SalaryLevel":{"1204":3246,"1681":1388,"248":4664,"2305":6246,"441":6760,"2360":2870},"JobFunction":{"1204":"Product Management","1681":"Software Developer","248":"Software Developer","2305":"Product Management","441":"Software Developer","2360":"Software Developer"},"CommuteLength":{"1204":8,"1681":3,"248":2,"2305":20,"441":6,"2360":2},"EducationalLevel":{"1204":"L2","1681":"L3","248":"L2","2305":"L3","441":"L2","2360":"L2"},"EducationField":{"1204":"Medical","1681":"Life Sciences","248":"Medical","2305":"Marketing","441":"Other","2360":"Life Sciences"},"EnvironmentSatisfaction":{"1204":4,"1681":2,"248":3,"2305":1,"441":2,"2360":1},"Gender":{"1204":"Female","1681":"Male","248":"Female","2305":"Male","441":"Male","2360":"Male"},"HourlyRate":{"1204":95,"1681":47,"248":83,

In [27]:
# the output of pre_inference must be a DataFrame before it is fed to model.predict
assert isinstance(input, pd.DataFrame), f"expected a pandas DataFrame, got {type(input).__name__}"

print("Controllo tipo dati input a predict OK")

Controllo tipo dati input a predict OK


In [28]:
# run the model on the pre-processed input, then post-process the raw predictions
yhat = post_inference(model.predict(input))

In [29]:
# show the post-processed predictions
yhat

[1, 1, 0, 1, 0, 1]

In [31]:
# expected labels for the first N_ROWS test rows, to compare with yhat
y_test[:N_ROWS].values

array([1, 1, 0, 1, 0, 1])

### Ed ora siamo pronti per invocare il servizio REST (Model Deployment)

* notare che usiamo il parametro "data" e non "json"

In [10]:
%%time

#
endpoint='https://modeldeployment.eu-milan-1.oci.customer-oci.com/ocid1.datasciencemodeldeployment.oc1.eu-milan-1.amaaaaaangencdyafafaigxeuy6mdg7jycpjcgn7kp3mqyjzwlutjmkgoyeq/predict'

# if we want to passa a DataFrame serialize, use the data params of request
requests.post(endpoint, data=input_data, auth=auth).json()

CPU times: user 17 ms, sys: 2.69 ms, total: 19.6 ms
Wall time: 121 ms


{'prediction': [1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0]}

In [11]:
# compare with expected values: labels of the same first N_ROWS test rows
# that were serialized into input_data
y_test[0:N_ROWS].values

array([1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0])