From cf05d722049c71aa5acb54484f516762271c0e60 Mon Sep 17 00:00:00 2001 From: Lu Peng <118394507+lu-ohai@users.noreply.github.com> Date: Mon, 1 May 2023 16:44:37 -0700 Subject: [PATCH 1/9] Update 1-model-training.ipynb --- labs/MLSummit21/notebooks/1-model-training.ipynb | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/labs/MLSummit21/notebooks/1-model-training.ipynb b/labs/MLSummit21/notebooks/1-model-training.ipynb index c91b1913..1c6dbe35 100644 --- a/labs/MLSummit21/notebooks/1-model-training.ipynb +++ b/labs/MLSummit21/notebooks/1-model-training.ipynb @@ -68,7 +68,7 @@ "from collections import defaultdict\n", "\n", "from ads.common.model import ADSModel\n", - "from ads.dataset.factory import DatasetFactory\n", + "from ads.dataset.dataset_with_target import ADSDatasetWithTarget\n", "from ads.evaluations.evaluator import ADSEvaluator\n", "\n", "import pandas as pd\n", @@ -147,9 +147,11 @@ "source": [ "bucket_name = \"hosted-ds-datasets\"\n", "namespace = \"bigdatadatasciencelarge\"\n", - "employees = DatasetFactory.open(\n", - " \"oci://{}@{}/synthetic/orcl_attrition.csv\".format(bucket_name, namespace), \n", - " target=\"Attrition\", storage_options={'config':{},'region':'us-ashburn-1'}).set_positive_class('Yes')" + "employees = ADSDatasetWithTarget.from_dataframe(\n", + " df=pd.read_csv(\"oci://{}@{}/synthetic/orcl_attrition.csv\".format(bucket_name, namespace)),\n", + " target=\"Attrition\",\n", + " storage_options={'config':{},'region':'us-ashburn-1'}\n", + ").set_positive_class('Yes')" ] }, { @@ -585,9 +587,9 @@ "metadata": { "celltoolbar": "Raw Cell Format", "kernelspec": { - "display_name": "Python [conda env:tensorflow27_p37_cpu_v1]", + "display_name": "Python [conda env:tensorflow28_p38_cpu_v1]", "language": "python", - "name": "conda-env-tensorflow27_p37_cpu_v1-py" + "name": "conda-env-tensorflow28_p38_cpu_v1-py" }, "language_info": { "codemirror_mode": { @@ -599,7 +601,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.12" + "version": "3.8.13" }, "pycharm": { "stem_cell": { From ac97834e76c3c6bd887ca7852c15bcc43419be5d Mon Sep 17 00:00:00 2001 From: Lu Peng <118394507+lu-ohai@users.noreply.github.com> Date: Mon, 1 May 2023 16:49:25 -0700 Subject: [PATCH 2/9] Update 1-model-training.ipynb --- labs/MLSummit21/notebooks/1-model-training.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/labs/MLSummit21/notebooks/1-model-training.ipynb b/labs/MLSummit21/notebooks/1-model-training.ipynb index 1c6dbe35..1f2b6232 100644 --- a/labs/MLSummit21/notebooks/1-model-training.ipynb +++ b/labs/MLSummit21/notebooks/1-model-training.ipynb @@ -106,7 +106,7 @@ "\n", "This is a fictional data set which contains 1,470 rows. There are 36 features. 22 features are ordinal, 11 are categorical, and 3 are constant values. The features include basic demographic information, compensation level, job characteristics, job satisfaction and employee performance metrics. The data is not balanced as fewer employees leave than stay.\n", "\n", - "The first step is to load in the dataset. To do this the `DatasetFactory` singleton object will be used. It is part of the `ADS` library. It is a powerful class to work with datasets from different sources.\n", + "The first step is to load in the dataset. To do this the `ADSDatasetWithTarget` singleton object will be used. It is part of the `ADS` library. 
It is a powerful class to work with datasets from different sources.\n", "\n", "Datasets are provided as a convenience. Datasets are considered Third Party Content and are not considered Materials under Your agreement with Oracle applicable to the Services. You can access the `orcl_attrition` dataset license [here](oracle_data/UPL.txt). Dataset `orcl_attrition` is distributed under UPL license. \n", "" @@ -135,7 +135,7 @@ "\n", "3. Run the following instruction instead of the code cell below.\n", "```\n", - "employees = DatasetFactory.open(\"synthetic_orcl_attrition.csv\", format='csv', delimiter=\",\", target=\"Attrition\").set_positive_class('Yes')\n", + "employees = ADSDatasetWithTarget.from_dataframe(df=pd.read_csv(\"synthetic_orcl_attrition.csv\"), target=\"Attrition\",).set_positive_class('Yes')\n", "```" ] }, From 19aefb1688398ee19e4e659b48bf344eecd286d7 Mon Sep 17 00:00:00 2001 From: Lu Peng Date: Wed, 3 May 2023 15:19:45 -0700 Subject: [PATCH 3/9] Updated pr. --- .../notebooks/1-model-training.ipynb | 22 +- ...dataset-data_load_and_model_training.ipynb | 618 ++++++++++++++++++ 2 files changed, 628 insertions(+), 12 deletions(-) create mode 100644 notebook_examples/dataset-data_load_and_model_training.ipynb diff --git a/labs/MLSummit21/notebooks/1-model-training.ipynb b/labs/MLSummit21/notebooks/1-model-training.ipynb index 1f2b6232..a6412124 100644 --- a/labs/MLSummit21/notebooks/1-model-training.ipynb +++ b/labs/MLSummit21/notebooks/1-model-training.ipynb @@ -68,7 +68,7 @@ "from collections import defaultdict\n", "\n", "from ads.common.model import ADSModel\n", - "from ads.dataset.dataset_with_target import ADSDatasetWithTarget\n", + "from ads.dataset.factory import DatasetFactory\n", "from ads.evaluations.evaluator import ADSEvaluator\n", "\n", "import pandas as pd\n", @@ -106,7 +106,7 @@ "\n", "This is a fictional data set which contains 1,470 rows. There are 36 features. 22 features are ordinal, 11 are categorical, and 3 are constant values. The features include basic demographic information, compensation level, job characteristics, job satisfaction and employee performance metrics. The data is not balanced as fewer employees leave than stay.\n", "\n", - "The first step is to load in the dataset. To do this the `ADSDatasetWithTarget` singleton object will be used. It is part of the `ADS` library. It is a powerful class to work with datasets from different sources.\n", + "The first step is to load in the dataset. To do this the `DatasetFactory` singleton object will be used. It is part of the `ADS` library. It is a powerful class to work with datasets from different sources.\n", "\n", "Datasets are provided as a convenience. Datasets are considered Third Party Content and are not considered Materials under Your agreement with Oracle applicable to the Services. You can access the `orcl_attrition` dataset license [here](oracle_data/UPL.txt). Dataset `orcl_attrition` is distributed under UPL license. \n", "" @@ -135,7 +135,7 @@ "\n", "3. 
Run the following instruction instead of the code cell below.\n", "```\n", - "employees = ADSDatasetWithTarget.from_dataframe(df=pd.read_csv(\"synthetic_orcl_attrition.csv\"), target=\"Attrition\",).set_positive_class('Yes')\n", + "employees = DatasetFactory.open(\"synthetic_orcl_attrition.csv\", format='csv', delimiter=\",\", target=\"Attrition\").set_positive_class('Yes')\n", "```" ] }, @@ -147,11 +147,9 @@ "source": [ "bucket_name = \"hosted-ds-datasets\"\n", "namespace = \"bigdatadatasciencelarge\"\n", - "employees = ADSDatasetWithTarget.from_dataframe(\n", - " df=pd.read_csv(\"oci://{}@{}/synthetic/orcl_attrition.csv\".format(bucket_name, namespace)),\n", - " target=\"Attrition\",\n", - " storage_options={'config':{},'region':'us-ashburn-1'}\n", - ").set_positive_class('Yes')" + "employees = DatasetFactory.open(\n", + " \"oci://{}@{}/synthetic/orcl_attrition.csv\".format(bucket_name, namespace), \n", + " target=\"Attrition\", storage_options={'config':{},'region':'us-ashburn-1'}).set_positive_class('Yes')" ] }, { @@ -587,9 +585,9 @@ "metadata": { "celltoolbar": "Raw Cell Format", "kernelspec": { - "display_name": "Python [conda env:tensorflow28_p38_cpu_v1]", + "display_name": "Python [conda env:tensorflow27_p37_cpu_v1]", "language": "python", - "name": "conda-env-tensorflow28_p38_cpu_v1-py" + "name": "conda-env-tensorflow27_p37_cpu_v1-py" }, "language_info": { "codemirror_mode": { @@ -601,7 +599,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.7.12" }, "pycharm": { "stem_cell": { @@ -615,4 +613,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/notebook_examples/dataset-data_load_and_model_training.ipynb b/notebook_examples/dataset-data_load_and_model_training.ipynb new file mode 100644 index 00000000..1f2b6232 --- /dev/null +++ b/notebook_examples/dataset-data_load_and_model_training.ipynb @@ -0,0 +1,618 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Oracle Cloud Infrastructure Data Science Demo Notebook\n", + "\n", + "Copyright (c) 2021 Oracle, Inc.
\n", + "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***\n", + "# Predicting Employee Attrition with ADS\n", + "

by the OCI Data Science PM Team

\n", + "\n", + "***" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview:\n", + "\n", + "In this notebook, we will be using an employee attrition dataset. We will start by doing an exploratory data analysis (EDA) to understand the data. Then a model will be trained using `scikit-learn`. The model will be used to make predictions and evaluate the model to determine how well it generalizes to new data. You will prepare and save the resulting model to the model catalog using Oracle's Accelerated Data Science, (`ADS`) library." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's do all of the imports necessary to get this notebook working up here.\n", + "\n", + "**NOTE: This notebook was run in the TensorFlow 2.7 for CPU (slug: `tensorflow27_p37_cpu_v1`) conda environment with ADS version 2.5.10. Upgrade your version of ADS (see cell below) and restart your kernel.**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install oracle-ads==2.5.10\n", + "!pip install onnxconverter-common --upgrade" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "import warnings\n", + "import logging\n", + "import os\n", + "from os import path \n", + "from os.path import expanduser\n", + "from os.path import join\n", + "\n", + "from category_encoders.ordinal import OrdinalEncoder\n", + "from collections import defaultdict\n", + "\n", + "from ads.common.model import ADSModel\n", + "from ads.dataset.dataset_with_target import ADSDatasetWithTarget\n", + "from ads.evaluations.evaluator import ADSEvaluator\n", + "\n", + "import pandas as pd\n", + "\n", + "from sklearn.base import TransformerMixin\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.pipeline import make_pipeline\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import get_scorer\n", + "import numpy as np \n", + "\n", + "import ads \n", + "ads.set_auth(auth='resource_principal') \n", + "\n", + "warnings.filterwarnings('ignore')\n", + "logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.ERROR)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Open and Visualize the Attrition Dataset using `ADS`\n", + "\n", + "\n", + "### Binary Classification\n", + "\n", + "Binary classification is a technique of classifying observations into one of two groups. In this notebook, the two groups are those employees that will leave the organization and those that will not. \n", + "\n", + "Given the features in the data, the model will determine the optimal criteria for classifying an observation as leaving or not leaving. This optimization is based on the training data. However, we will holdout some of the data to test the model's preformance. Models can over-fit on the training data, that is learn the noise in a dataset and then it will not do a good job at predicting the results on new data (test data). Since we already know the truth for the data in the training dataset, we are really interested in how well it performs on the test data.\n", + "\n", + "\n", + "### The Dataset\n", + "\n", + "This is a fictional data set which contains 1,470 rows. There are 36 features. 22 features are ordinal, 11 are categorical, and 3 are constant values. 
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Open and Visualize the Attrition Dataset using `ADS`\n",
    "\n",
    "### Binary Classification\n",
    "\n",
    "Binary classification is a technique of classifying observations into one of two groups. In this notebook, the two groups are those employees that will leave the organization and those that will not.\n",
    "\n",
    "Given the features in the data, the model will determine the optimal criteria for classifying an observation as leaving or not leaving. This optimization is based on the training data. However, we will hold out some of the data to test the model's performance. Models can overfit the training data, that is, learn the noise in the dataset, and then predict poorly on new data (the test data). Since we already know the truth for the data in the training dataset, we are really interested in how well the model performs on the test data.\n",
    "\n",
    "### The Dataset\n",
    "\n",
    "This is a fictional data set which contains 1,470 rows. There are 36 features. 22 features are ordinal, 11 are categorical, and 3 are constant values. The features include basic demographic information, compensation level, job characteristics, job satisfaction, and employee performance metrics. The data is not balanced, as fewer employees leave than stay.\n",
    "\n",
    "The first step is to load in the dataset. To do this, the `ADSDatasetWithTarget` class will be used. It is part of the `ADS` library, and it is a powerful class to work with datasets from different sources.\n",
    "\n",
    "Datasets are provided as a convenience. Datasets are considered Third Party Content and are not considered Materials under Your agreement with Oracle applicable to the Services. You can access the `orcl_attrition` dataset license [here](oracle_data/UPL.txt). Dataset `orcl_attrition` is distributed under the UPL license. \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ADS version used in this notebook: \n",
    "print(ads.__version__)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The code cell below will work only if your notebook session is running in the **Ashburn** region. If not, use these instructions to download the dataset to your local computer, upload it to the notebook session, and then use it to create a dataset.\n",
    "\n",
    "1. Download the file from this public URL and save it on your local computer: \n",
    "https://objectstorage.us-ashburn-1.oraclecloud.com/n/bigdatadatasciencelarge/b/hosted-ds-datasets/o/synthetic%2Forcl_attrition.csv\n",
    "\n",
    "2. Use the Upload Files button (or drag and drop) to upload the csv file to the same folder as 1-model-training.ipynb.\n",
    "\n",
    "3. Run the following instruction instead of the code cell below.\n",
    "```\n",
    "employees = ADSDatasetWithTarget.from_dataframe(df=pd.read_csv(\"synthetic_orcl_attrition.csv\"), target=\"Attrition\").set_positive_class('Yes')\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "bucket_name = \"hosted-ds-datasets\"\n",
    "namespace = \"bigdatadatasciencelarge\"\n",
    "employees = ADSDatasetWithTarget.from_dataframe(\n",
    "    df=pd.read_csv(\"oci://{}@{}/synthetic/orcl_attrition.csv\".format(bucket_name, namespace)),\n",
    "    target=\"Attrition\",\n",
    "    storage_options={'config': {}, 'region': 'us-ashburn-1'}\n",
    ").set_positive_class('Yes')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### Visualize the Dataset Object\n",
    "\n",
    "The `show_in_notebook` method can be applied to the dataset itself. When this is done, the following is produced:\n",
    "\n",
    " - Summary: a brief description of the dataset, its shape, and a breakdown by feature type\n",
    " - Feature summary: a visualization created on a dataset sample to give an idea of the distribution of each feature\n",
    " - Correlations: a map which shows how all features (numeric and categorical) are correlated\n",
    " - Data preview: the first five rows of the data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#employees.show_in_notebook()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#employees.show_corr()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "### Get and Apply Transformation Recommendations\n",
    "\n",
    "`ADS` can help with feature engineering by transforming datasets. 
For example, it can fix class imbalance by up- or down-sampling. This is just one of the many transforms that `ADS` can apply. You can have `ADS` perform an analysis of the data and automatically perform the transformations that it thinks would improve the model. This is done with the `auto_transform()` method. The `suggest_recommendations()` method allows you to explore the suggested transforms using the notebook's UI and select the transformations that you would like it to make.\n",
    "\n",
    "All ADS datasets are immutable; any transforms that are applied result in a new dataset. In this example, the notebook performs the automatic transformations but leaves the class imbalance in place (`fix_imbalance=False`)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "transformed_ds = employees.auto_transform(fix_imbalance=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's split the dataset into train and test sets. If you call `train_test_split()` with no arguments, the split will be 90/10, train/test. Change the parameter `test_size` to change the size of the test dataset. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train, test = transformed_ds.train_test_split()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training a `scikit-learn` Random Forest Model \n",
    "\n",
    "Below, we create our own label encoder for the categorical features found in our dataset. We use `category_encoders` to achieve this and apply it to all columns of type `object` or `category`. This is a preprocessing step we go through before training the model.\n",
    "\n",
    "The class will be written locally as a Python module (`dataframelabelencoder.py`). We will capture that file as part of the model artifact."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile dataframelabelencoder.py \n",
    "\n",
    "from category_encoders.ordinal import OrdinalEncoder\n",
    "from collections import defaultdict\n",
    "\n",
    "from sklearn.base import TransformerMixin\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "class DataFrameLabelEncoder(TransformerMixin):\n",
    "    def __init__(self):\n",
    "        self.label_encoders = defaultdict(LabelEncoder)\n",
    "    \n",
    "    def fit(self, X):\n",
    "        # Fit one ordinal encoder per categorical column.\n",
    "        for column in X.columns:\n",
    "            if X[column].dtype.name in [\"object\", \"category\"]:\n",
    "                self.label_encoders[column] = OrdinalEncoder()\n",
    "                self.label_encoders[column].fit(X[column])\n",
    "        return self\n",
    "    \n",
    "    def transform(self, X):\n",
    "        for column, label_encoder in self.label_encoders.items():\n",
    "            X[column] = label_encoder.transform(X[column])\n",
    "        return X"
   ]
  },
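  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Before using the encoder on the real data, a quick sanity check on a toy frame can help (an editor's sketch; the `toy` frame is illustrative and not part of the original lab). Categorical columns should come back as integer codes, while numeric columns pass through untouched."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from dataframelabelencoder import DataFrameLabelEncoder\n",
    "\n",
    "# Toy frame: one categorical column, one numeric column.\n",
    "toy = pd.DataFrame({\"dept\": [\"Sales\", \"R&D\", \"Sales\"], \"age\": [41, 33, 27]})\n",
    "\n",
    "# 'dept' should be replaced by integer codes; 'age' should be unchanged.\n",
    "print(DataFrameLabelEncoder().fit(toy).transform(toy))"
   ]
  },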
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from dataframelabelencoder import DataFrameLabelEncoder\n", + "\n", + "X = train.X.copy()\n", + "y = train.y.copy()\n", + "\n", + "le = DataFrameLabelEncoder()\n", + "X = le.fit_transform(X)\n", + "\n", + "sk_clf = RandomForestClassifier(random_state=42)\n", + "sk_clf.fit(X, y)\n", + "\n", + "sk_model = make_pipeline(le, sk_clf)\n", + "\n", + "# Build an ads model from the SVM classifier\n", + "my_model = ADSModel.from_estimator(sk_model, \n", + " name=sk_clf.__class__.__name__)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "## Evaluate The Model using `ADSEvaluator`\n", + "\n", + "One of the key advantages of `ADS` is the ability to quickly evaluate any models. ADS supports evaluating:\n", + "\n", + "- regression\n", + "- binary classification\n", + "- multiclass classification\n", + "\n", + "`ADS` supports the ability for you to provide your own evaluation function (given `y_true` and `y_pred` series) for any esoteric calculation that you would like to run.\n", + "\n", + "Below, we examine the plots that are commonly used to evaluate model performance. These include the precision-recall, ROC, lift, and gain plots. Each model under study is plotted together, allowing for easy comparison. In addition, the normalized confusion matrices are provided." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluator = ADSEvaluator(test, models=[my_model], \n", + " training_data=train)\n", + "evaluator.show_in_notebook()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are a number of common metrics that are used to assess the quality of a model. `ADS` provides a convenient method to compare the models and highlights the model with the highest score in each metric." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluator.metrics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A binary classification model can have one of four outcomes for each prediction. A true-negative is an outcome where the model correctly predicts the negative case, and a false-negative is an outcome where when the model incorrectly predicts the negative case. A false-positive is when the model incorrectly predicts the positive case, and a true-positive is when the model correctly predicts the positive case. However, not all false-positive and false-negatives have the same importance. For example, a cancer test has a higher cost when it incorrectly says that a patient does not have cancer when they do. The `calculate_cost` method allows the cost to be computed for each model based on the cost of each class of prediction." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "evaluator.calculate_cost(tn_weight=1, fp_weight=3, fn_weight=2, tp_weight=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# Saving the model to the model catalog " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can save the simple random forest model in the model catalog, using the very flexible `prepare_generic_model()` function to save my model. That function creates an editable template artifact. 
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\n",
    "# Saving the model to the model catalog "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Now, we can save the simple random forest model in the model catalog using the very flexible `prepare_generic_model()` function. That function creates an editable template artifact. The function `prepare_generic_model()` can support **any** model and **should always be the preferred way to save models from open source libraries**. \n",
    "\n",
    "`prepare_generic_model()` gives you complete control over the structure of the artifact and the definition of the functions in `score.py`.\n",
    "\n",
    "Note in the cell below that we specify an `inference_conda_env` value. This parameter corresponds to the conda environment we want to use for model deployment. A reference to that environment is written to `runtime.yaml` when you run `prepare_generic_model()`. The path represents where the conda environment is stored in object storage. You can find that information in the Environment Explorer. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from ads.common.model_artifact import ModelArtifact\n",
    "from ads.common.model_export_util import prepare_generic_model\n",
    "import joblib \n",
    "import os\n",
    "\n",
    "# Path to the artifact directory for the sklearn model: \n",
    "model_artifact_location = os.path.expanduser('./model-artifact/')\n",
    "os.makedirs(model_artifact_location, exist_ok=True)\n",
    "\n",
    "# Create a joblib pickle of the random forest pipeline: \n",
    "joblib.dump(sk_model, os.path.join(model_artifact_location, \"model.joblib\"))\n",
    "\n",
    "# Create the artifact template files in the directory: \n",
    "sklearn_artifact = prepare_generic_model(model_artifact_location, \n",
    "                                         inference_conda_env=\"oci://service-conda-packs@id19sfcrra6z/service_pack/cpu/TensorFlow 2.7 for CPU on Python 3.7/1.0/tensorflow27_p37_cpu_v1\",\n",
    "                                         force_overwrite=True,\n",
    "                                         model='model.joblib',\n",
    "                                         use_case_type='BINARY_CLASSIFICATION',\n",
    "                                         X_sample=train.X,\n",
    "                                         y_sample=train.y)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we copy the `dataframelabelencoder.py` module into the model artifact directory. The serialized pipeline object requires the module to be defined and available when you de-serialize and load the pipeline object into memory. \n",
    "\n",
    "We also tweak the `score.py` template that `prepare_generic_model()` created, ensuring that `load_model()` reads in the `model.joblib` file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Setting paths for the artifact files that need to be modified: \n",
    "\n",
    "encoder_path = os.path.join(model_artifact_location, \"dataframelabelencoder.py\")\n",
    "score_path = os.path.join(model_artifact_location, \"score.py\")\n",
    "!cp dataframelabelencoder.py {encoder_path}"
   ]
  },
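  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick check (an editor's sketch, not part of the original lab), you can list the artifact directory at this point. Assuming the defaults above, it should contain at least `score.py`, `runtime.yaml`, `model.joblib`, and the copied `dataframelabelencoder.py`; the exact contents can vary with the ADS version:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the files that prepare_generic_model() generated, plus the copied module.\n",
    "print(sorted(os.listdir(model_artifact_location)))"
   ]
  },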
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%writefile {score_path}\n",
    "\n",
    "\"\"\"\n",
    "   Inference script. This script is used for prediction by the scoring server when the schema is known.\n",
    "\"\"\"\n",
    "\n",
    "import json\n",
    "import os\n",
    "from joblib import load\n",
    "import io \n",
    "import pandas as pd\n",
    "import logging \n",
    "\n",
    "# logging configuration - OPTIONAL \n",
    "logging.basicConfig(format='%(name)s - %(levelname)s - %(message)s', level=logging.INFO)\n",
    "logger_pred = logging.getLogger('model-prediction')\n",
    "logger_pred.setLevel(logging.INFO)\n",
    "logger_feat = logging.getLogger('input-features')\n",
    "logger_feat.setLevel(logging.INFO)\n",
    "\n",
    "from dataframelabelencoder import DataFrameLabelEncoder\n",
    "\n",
    "def load_model():\n",
    "    \"\"\"\n",
    "    Loads the model from its serialized format.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    model: a model instance on which the predict API can be invoked\n",
    "    \"\"\"\n",
    "    model_dir = os.path.dirname(os.path.realpath(__file__))\n",
    "    contents = os.listdir(model_dir)\n",
    "    model_file_name = \"model.joblib\"\n",
    "    if model_file_name in contents:\n",
    "        with open(os.path.join(model_dir, model_file_name), \"rb\") as file:\n",
    "            model = load(file)  # Use the loader corresponding to your model file.\n",
    "    else:\n",
    "        raise Exception('{0} is not found in model directory {1}'.format(model_file_name, model_dir))\n",
    "    \n",
    "    return model\n",
    "\n",
    "\n",
    "def predict(data, model=load_model()) -> dict:\n",
    "    \"\"\"\n",
    "    Returns a prediction given the model and the data to predict on.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    model: Model instance returned by the load_model API\n",
    "    data: Data in the format expected by the predict API of the core estimator, e.g., for scikit-learn models a numpy array, a list of lists, or a Pandas DataFrame\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    predictions: Output from the scoring server\n",
    "        Format: { 'prediction': output from `model.predict` method }\n",
    "\n",
    "    \"\"\"\n",
    "    assert model is not None, \"Model is not loaded\"\n",
    "    X = pd.read_json(io.StringIO(data)) if isinstance(data, str) else pd.DataFrame.from_dict(data)\n",
    "    preds = model.predict(X).tolist()\n",
    "#     logger_pred.info(preds)\n",
    "#     logger_feat.info(X) \n",
    "    return { 'prediction': preds }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Testing the artifact before saving to the catalog\n",
    "\n",
    "It is always a good idea to test your model artifact before saving it to the catalog. Here we load the `score.py` module along with its `load_model` and `predict` functions. We test `predict()` by passing in the first rows of the training dataframe, do the same with the `predict()` method of the sklearn pipeline object, and then compare the two prediction arrays. These two should be identical."
   ]
  },
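  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "One more hedged sketch before the real test below: `predict()` in `score.py` accepts either a JSON string or a dict (see the `isinstance(data, str)` branch above), which is useful to know when you later invoke a deployed endpoint. The payloads here are illustrative only; run them after the cells below have put `score.py` on the path:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Two equivalent payload shapes for score.predict(); both are converted to a\n",
    "# DataFrame by pandas before reaching the pipeline. (Sketch only.)\n",
    "# predict(train.X[:2].to_json())   # JSON-string payload\n",
    "# predict(train.X[:2].to_dict())   # dict payload"
   ]
  },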
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "input_data = train.X[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys \n",
    "\n",
    "# add the path of score.py: \n",
    "sys.path.insert(0, model_artifact_location)\n",
    "\n",
    "from score import load_model, predict\n",
    "\n",
    "# Load the model to memory \n",
    "_ = load_model()\n",
    "# make predictions on the first five rows of the training dataset: \n",
    "predictions = predict(input_data.to_json()) \n",
    "\n",
    "# The two lists should match:\n",
    "print(f\"* * * score.predict() and the pipeline predict() return the same predictions \\\n",
    "on the same data: {sk_model.predict(input_data).tolist() == predictions['prediction']}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Saving the Model to the Model Catalog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "mc_model = sklearn_artifact.save(project_id=os.environ['PROJECT_OCID'], \n",
    "                                 compartment_id=os.environ['NB_SESSION_COMPARTMENT_OCID'], \n",
    "                                 training_id=os.environ['NB_SESSION_OCID'],\n",
    "                                 display_name=\"attrition-model\",\n",
    "                                 ignore_introspection=False,\n",
    "                                 description=\"simple sklearn model to predict employee attrition\", \n",
    "                                 training_script_path=\"1-model-training.ipynb\", \n",
    "                                 ignore_pending_changes=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is the model OCID of the newly created model: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(mc_model.id)"
   ]
  }
 ],
 "metadata": {
  "celltoolbar": "Raw Cell Format",
  "kernelspec": {
   "display_name": "Python [conda env:tensorflow28_p38_cpu_v1]",
   "language": "python",
   "name": "conda-env-tensorflow28_p38_cpu_v1-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.13"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "metadata": {
     "collapsed": false
    },
    "source": []
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}

From bb112886fd07a5315c99be2ca899516c8d36e5e9 Mon Sep 17 00:00:00 2001
From: Lu Peng <118394507+lu-ohai@users.noreply.github.com>
Date: Wed, 3 May 2023 15:21:09 -0700
Subject: [PATCH 4/9] Update 1-model-training.ipynb

---
 labs/MLSummit21/notebooks/1-model-training.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/labs/MLSummit21/notebooks/1-model-training.ipynb b/labs/MLSummit21/notebooks/1-model-training.ipynb
index a6412124..c91b1913 100644
--- a/labs/MLSummit21/notebooks/1-model-training.ipynb
+++ b/labs/MLSummit21/notebooks/1-model-training.ipynb
@@ -613,4 +613,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 4
-}
\ No newline at end of file
+}

From ec974e3b1d0a86b4f59507497bc54971a265c81b Mon Sep 17 00:00:00 2001
From: Lu Peng <118394507+lu-ohai@users.noreply.github.com>
Date: Thu, 11 May 2023 10:35:34 -0700
Subject: [PATCH 5/9] Update notebook_examples/dataset-data_load_and_model_training.ipynb

Co-authored-by: Qiu Qin
---
 notebook_examples/dataset-data_load_and_model_training.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git 
a/notebook_examples/dataset-data_load_and_model_training.ipynb b/notebook_examples/dataset-data_load_and_model_training.ipynb index 1f2b6232..47b5fedb 100644 --- a/notebook_examples/dataset-data_load_and_model_training.ipynb +++ b/notebook_examples/dataset-data_load_and_model_training.ipynb @@ -6,7 +6,7 @@ "source": [ "Oracle Cloud Infrastructure Data Science Demo Notebook\n", "\n", - "Copyright (c) 2021 Oracle, Inc.
\n", + "Copyright (c) 2021, 2023 Oracle, Inc.
\n", "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.\n", "
" ] From 168c8e8bf89f44586ccb368bb4ae11405ce54456 Mon Sep 17 00:00:00 2001 From: Lu Peng <118394507+lu-ohai@users.noreply.github.com> Date: Thu, 11 May 2023 10:35:40 -0700 Subject: [PATCH 6/9] Update notebook_examples/dataset-data_load_and_model_training.ipynb Co-authored-by: Qiu Qin --- notebook_examples/dataset-data_load_and_model_training.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebook_examples/dataset-data_load_and_model_training.ipynb b/notebook_examples/dataset-data_load_and_model_training.ipynb index 47b5fedb..9c736fb4 100644 --- a/notebook_examples/dataset-data_load_and_model_training.ipynb +++ b/notebook_examples/dataset-data_load_and_model_training.ipynb @@ -46,7 +46,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install oracle-ads==2.5.10\n", + "!pip install oracle-ads==2.8.4\n", "!pip install onnxconverter-common --upgrade" ] }, From 8319ef65a1b162e9ee7d5f16538de951b61b3d53 Mon Sep 17 00:00:00 2001 From: Lu Peng <118394507+lu-ohai@users.noreply.github.com> Date: Thu, 11 May 2023 10:36:41 -0700 Subject: [PATCH 7/9] Update dataset-data_load_and_model_training.ipynb --- notebook_examples/dataset-data_load_and_model_training.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebook_examples/dataset-data_load_and_model_training.ipynb b/notebook_examples/dataset-data_load_and_model_training.ipynb index 9c736fb4..67af6ca0 100644 --- a/notebook_examples/dataset-data_load_and_model_training.ipynb +++ b/notebook_examples/dataset-data_load_and_model_training.ipynb @@ -37,7 +37,7 @@ "source": [ "Let's do all of the imports necessary to get this notebook working up here.\n", "\n", - "**NOTE: This notebook was run in the TensorFlow 2.7 for CPU (slug: `tensorflow27_p37_cpu_v1`) conda environment with ADS version 2.5.10. Upgrade your version of ADS (see cell below) and restart your kernel.**" + "**NOTE: This notebook was run in the TensorFlow 2.7 for CPU (slug: `tensorflow27_p37_cpu_v1`) conda environment with ADS version 2.8.4. Upgrade your version of ADS (see cell below) and restart your kernel.**" ] }, { From 649ce8b28f4d379d9fdf993a1cbfdaf8771a5799 Mon Sep 17 00:00:00 2001 From: John DeSanto <202220+jdesanto@users.noreply.github.com> Date: Tue, 1 Aug 2023 08:05:12 -0700 Subject: [PATCH 8/9] Added bibliography --- ...dataset-data_load_and_model_training.ipynb | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/notebook_examples/dataset-data_load_and_model_training.ipynb b/notebook_examples/dataset-data_load_and_model_training.ipynb index 67af6ca0..0f8540d8 100644 --- a/notebook_examples/dataset-data_load_and_model_training.ipynb +++ b/notebook_examples/dataset-data_load_and_model_training.ipynb @@ -1,12 +1,25 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "@notebook{dataset-data_load_and_model_training.ipynb,\n", + " title: Predicting Employee Attrition with ADS,\n", + " summary: Predict employee attrition using exploratory data analysis, train a model, and save the model to the model catalog,\n", + " developed_on: tensorflow28_p38_cpu_v1,\n", + " keywords: data analysis, scikit-learn, \n", + " license: Universal Permissive License v 1.0\n", + "}" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "Oracle Cloud Infrastructure Data Science Demo Notebook\n", "\n", - "Copyright (c) 2021, 2023 Oracle, Inc.
\n", + "Copyright (c) 2021 Oracle, Inc.
\n", "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.\n", "
" ] @@ -37,7 +50,7 @@ "source": [ "Let's do all of the imports necessary to get this notebook working up here.\n", "\n", - "**NOTE: This notebook was run in the TensorFlow 2.7 for CPU (slug: `tensorflow27_p37_cpu_v1`) conda environment with ADS version 2.8.4. Upgrade your version of ADS (see cell below) and restart your kernel.**" + "**NOTE: This notebook was run in the TensorFlow 2.7 for CPU (slug: `tensorflow27_p37_cpu_v1`) conda environment with ADS version 2.5.10. Upgrade your version of ADS (see cell below) and restart your kernel.**" ] }, { @@ -46,8 +59,8 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install oracle-ads==2.8.4\n", - "!pip install onnxconverter-common --upgrade" + "!pip install oracle-ads --upgrade --quiet\n", + "!pip install onnxconverter-common --upgrade -quiet" ] }, { From 69d799c26f9a2ac29a7e6ff087785f545da869e8 Mon Sep 17 00:00:00 2001 From: Lu Peng Date: Thu, 14 Dec 2023 09:46:22 -0500 Subject: [PATCH 9/9] Updated copyright year. --- notebook_examples/dataset-data_load_and_model_training.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebook_examples/dataset-data_load_and_model_training.ipynb b/notebook_examples/dataset-data_load_and_model_training.ipynb index 0f8540d8..d739d413 100644 --- a/notebook_examples/dataset-data_load_and_model_training.ipynb +++ b/notebook_examples/dataset-data_load_and_model_training.ipynb @@ -19,7 +19,7 @@ "source": [ "Oracle Cloud Infrastructure Data Science Demo Notebook\n", "\n", - "Copyright (c) 2021 Oracle, Inc.
\n", + "Copyright (c) 2023 Oracle, Inc.
\n", "Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl.\n", "
" ]