diff --git a/data-platform/data-science/oracle-data-science/r-conda-oci-data-science/.ipynb_checkpoints/LICENSE-checkpoint b/data-platform/data-science/oracle-data-science/r-conda-oci-data-science/.ipynb_checkpoints/LICENSE-checkpoint new file mode 100644 index 000000000..8dc7c0703 --- /dev/null +++ b/data-platform/data-science/oracle-data-science/r-conda-oci-data-science/.ipynb_checkpoints/LICENSE-checkpoint @@ -0,0 +1,35 @@ +Copyright (c) 2025 Oracle and/or its affiliates. + +The Universal Permissive License (UPL), Version 1.0 + +Subject to the condition set forth below, permission is hereby granted to any +person obtaining a copy of this software, associated documentation and/or data +(collectively the "Software"), free of charge and under any and all copyright +rights in the Software, and any and all patent rights owned or freely +licensable by each licensor hereunder covering either (i) the unmodified +Software as contributed to or provided by such licensor, or (ii) the Larger +Works (as defined below), to deal in both + +(a) the Software, and +(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +one is included with the Software (each a "Larger Work" to which the Software +is contributed by such licensors), + +without restriction, including without limitation the rights to copy, create +derivative works of, display, perform, and distribute the Software and make, +use, sell, offer for sale, import, export, have made, and have sold the +Software and the Larger Work(s), and to sublicense the foregoing rights on +either these or other terms. + +This license is subject to the following condition: +The above copyright notice and either this complete permission notice or at +a minimum a reference to the UPL must be included in all copies or +substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/data-platform/data-science/oracle-data-science/your-first-data-science-project/Guide for Your First Data Science Project prerequisites.pdf b/data-platform/data-science/oracle-data-science/your-first-data-science-project/Guide for Your First Data Science Project prerequisites.pdf new file mode 100644 index 000000000..26650eeab Binary files /dev/null and b/data-platform/data-science/oracle-data-science/your-first-data-science-project/Guide for Your First Data Science Project prerequisites.pdf differ diff --git a/data-platform/data-science/oracle-data-science/your-first-data-science-project/LICENSE b/data-platform/data-science/oracle-data-science/your-first-data-science-project/LICENSE new file mode 100644 index 000000000..8dc7c0703 --- /dev/null +++ b/data-platform/data-science/oracle-data-science/your-first-data-science-project/LICENSE @@ -0,0 +1,35 @@ +Copyright (c) 2025 Oracle and/or its affiliates. 
+ +The Universal Permissive License (UPL), Version 1.0 + +Subject to the condition set forth below, permission is hereby granted to any +person obtaining a copy of this software, associated documentation and/or data +(collectively the "Software"), free of charge and under any and all copyright +rights in the Software, and any and all patent rights owned or freely +licensable by each licensor hereunder covering either (i) the unmodified +Software as contributed to or provided by such licensor, or (ii) the Larger +Works (as defined below), to deal in both + +(a) the Software, and +(b) any piece of software and/or hardware listed in the lrgrwrks.txt file if +one is included with the Software (each a "Larger Work" to which the Software +is contributed by such licensors), + +without restriction, including without limitation the rights to copy, create +derivative works of, display, perform, and distribute the Software and make, +use, sell, offer for sale, import, export, have made, and have sold the +Software and the Larger Work(s), and to sublicense the foregoing rights on +either these or other terms. + +This license is subject to the following condition: +The above copyright notice and either this complete permission notice or at +a minimum a reference to the UPL must be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/data-platform/data-science/oracle-data-science/your-first-data-science-project/README.md b/data-platform/data-science/oracle-data-science/your-first-data-science-project/README.md new file mode 100644 index 000000000..945f4f9df --- /dev/null +++ b/data-platform/data-science/oracle-data-science/your-first-data-science-project/README.md @@ -0,0 +1,47 @@ + +# Overview + +Your First Data Science Project demonstrates how to build a complete end-to-end data science workflow using Oracle Cloud Infrastructure (OCI) Data Science Platform. +The project walks through the main stages of a typical machine learning lifecycle — from data preparation to model deployment and inference — using practical examples. + +Reviewed: 2025.11.10 + +# What You’ll Learn + +This project covers the following steps: + +Data ingestion + +Data preprocessing and visualization + +Model training and validation + +Model explainability + +Model deployment + +Endpoint invocation for predictions + +# Prerequisites + +Access to OCI Data Science Platform + +Basic familiarity with Python and machine learning concepts + +A valid compartment, resource principal and policies configured for Data Science services. More details can be found in the Guide for Your First Data Science Project prerequisites.pdf + +# How to Use + +Open the provided notebook in your OCI Data Science Notebook Session. + +Select the following conda environment: automlx251_p311_cpu_x86_64_v2 + +Run the notebook cells in sequence to reproduce the complete workflow. + +# License + +Copyright (c) 2025 Oracle and/or its affiliates. + +Licensed under the Universal Permissive License (UPL), Version 1.0. + +See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details. 
diff --git a/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/Guide for Your First Data Science Project prerequisites.pdf b/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/Guide for Your First Data Science Project prerequisites.pdf new file mode 100644 index 000000000..26650eeab Binary files /dev/null and b/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/Guide for Your First Data Science Project prerequisites.pdf differ diff --git a/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/adult_income shared.ipynb b/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/adult_income shared.ipynb new file mode 100644 index 000000000..e2acb29c6 --- /dev/null +++ b/data-platform/data-science/oracle-data-science/your-first-data-science-project/files/adult_income shared.ipynb @@ -0,0 +1,1090 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a325c96e-756f-4628-9a3d-9c6976e340e6", + "metadata": {}, + "source": [ + "Conda environment: automlx251_p311_cpu_x86_64_v2\\\n", + "Created Date: 09/11/2025\\\n", + "By: Assaf Rabinowicz, EMEA Data Science Team" + ] + }, + { + "cell_type": "markdown", + "id": "a736e6a0-a81e-47c7-b4b0-14d6b19ac027", + "metadata": { + "tags": [] + }, + "source": [ + "# 1. 
Import Packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05af78e8-c25b-4d10-9d8d-1b1ab07ea30f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# third-party open-source packages\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.datasets import fetch_openml\n", + "from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score\n", + "from xgboost import XGBClassifier\n", + "import os\n", + "import requests\n", + "\n", + "# Oracle packages\n", + "import automlx\n", + "from automlx import init\n", + "import oci\n", + "from oci.object_storage import UploadManager\n", + "import ads\n", + "from ads.common.model_metadata import UseCaseType\n", + "from ads.model import GenericModel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de12f611-a70e-4a02-9e87-b97fc3fc1a94", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# hash symbol used for commenting\n", + "# Ctrl+ Enter for running the code\n", + "# Enter for a new line" + ] + }, + { + "cell_type": "markdown", + "id": "56cbba26-8ac5-472b-a644-2b18b2905306", + "metadata": { + "tags": [] + }, + "source": [ + "# 2. 
Data Import, Exploration and Pre-Processing" + ] + }, + { + "cell_type": "markdown", + "id": "88efbdc4-5e9b-47c2-9288-6a893351d6c9", + "metadata": { + "tags": [] + }, + "source": [ + "## 2.1 Data Import" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cba6a5ee-431f-423f-8a74-1ad122106941", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data = fetch_openml(name=\"adult\", version=2, as_frame=True) # https://www.openml.org/search?type=data&sort=version&status=any&order=asc&exact_name=adult\n", + "df = data.frame" + ] + }, + { + "cell_type": "markdown", + "id": "09607f12-8397-4851-b1f5-c55cb9602a8f", + "metadata": { + "tags": [] + }, + "source": [ + "### 2.1.1 Bonus: Importing from the attached block volume" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d922fe16-9240-4369-ac45-c20d74be1836", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#file_path=\"your_path\" # an example for a path: '/home/datascience/df_sample.csv'. 
Commonly you need to use /home/datascience before the visible path.\n", + "#df = pd.read_csv(file_path)" + ] + }, + { + "cell_type": "markdown", + "id": "5a7fd513-3096-4c9b-a543-f00600b869c9", + "metadata": {}, + "source": [ + "## 2.2 Data Structure" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "588a89f0-1b57-4b8a-bf9a-78501b5fe26a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72bcb542-cf3d-4d4b-82ec-052562f883c5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "23a72a8d-d408-46f3-aa86-3d5c4ac8e2eb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# We would like to create a formula that uses the features for predicting the target variable" + ] + }, + { + "cell_type": "markdown", + "id": "a77498e7-2ea6-4d16-ac2a-06fce64e3eb9", + "metadata": {}, + "source": [ + "## 2.3 Data Analysis and Processing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cf68aa72-abe3-45e7-b0f7-bfc072d8d0ee", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.drop(['fnlwgt'], axis=1,inplace=True) # dropping 'sampling weights' column for simplification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "47fec1cd-5592-41d0-ad6e-ed0172b7e220", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "round(df.describe(percentiles=[]),1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "70c6f89f-c6ff-457a-83d4-c280d3846453", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.describe(include=['category']).round(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa2d9ed8-3044-4f02-bee7-18fa5e26b981", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df['class'] = (df['class'] == 
'>50K').astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "13bd0e3c-6d12-4579-92c0-f7168257d9a6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe510568-98ef-4497-b9c9-7d83065aa69d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pd.plotting.scatter_matrix(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e479730e-0206-46a2-87ce-e2205f96dffa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pd.plotting.scatter_matrix(df[['education-num','capital-gain']])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acbaf252-8cea-440b-a619-c6de9758e70c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#!conda install seaborn -y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5388fd5b-b856-4b86-a369-57954c5be3e6", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "\n", + "sns.histplot(df, x='education-num', hue=\"class\",multiple=\"dodge\", bins=30)\n", + "plt.title(\"Distribution of Education-Num by Salery Class\")\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0885ecb5-da3f-4cb1-9c0a-fc3bb4847e06", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "sns.boxplot(data=df, x=\"class\", y=\"education-num\")\n", + "plt.title(\"Distribution of Education-Num by Salery Class\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "232c7ebc-dd80-4eca-b103-1f017bf1daa0", + "metadata": { + "tags": [] + }, + "source": [ + "# 3. 
Model Training" + ] + }, + { + "cell_type": "markdown", + "id": "f411e856-5827-449d-a622-1d739ce4369b", + "metadata": {}, + "source": [ + "## 3.1 Train and Test Split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca7bd821-27ba-495c-91ed-4683b3c541d4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "X = df.drop('class', axis=1)\n", + "y = df['class']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fe9fee8-0190-4639-a886-adcf6ec45bdd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3) # \n", + "\n", + "print(\"Train shape:\", X_train.shape)\n", + "print(\"Test shape:\", X_test.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "0c5adb4f-f2a5-4d4d-a66a-404396fd1242", + "metadata": {}, + "source": [ + "## 3.2 Using AutoML Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5429e219-640d-40e1-bd9e-1726c4ebd9cd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "init(engine='local')" + ] + }, + { + "cell_type": "markdown", + "id": "3b4f6e81-87e5-42bc-bf57-0333da5f4601", + "metadata": {}, + "source": [ + "Optional Tasks:\\\n", + "classification, regression, anomaly_detection, forecasting, recommendation\n", + "\n", + "Optional algorithms for classification are: \\\n", + "AdaBoostClassifier, DecisionTreeClassifier, ExtraTreesClassifier, TorchMLPClassifier\n", + "KNeighborsClassifier, LGBMClassifier\n", + "LinearSVC, LogisticRegression, RandomForestClassifier\n", + "SVC, XGBClassifier, GaussianNB" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab8b68fd-9c73-437f-81dd-108b68d047d5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pipeline1 = automlx.Pipeline(task='classification',model_list=['LogisticRegression', 'RandomForestClassifier','XGBClassifier'],max_tuning_trials =10)\n", + "# model_list and max_tuning_trials 
were added to reduce fitting time. Removing them allows training a potentially better model.\n", + "# The automl pipeline has a rich api: https://docs.oracle.com/en-us/iaas/tools/automlx/latest/latest/automl.html " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05d17ab5-125c-4806-8ff8-f02b347873ab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pipeline1.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "id": "d8231e29-8246-481b-9e51-8d4f7343a55f", + "metadata": {}, + "source": [ + "The pipeline includes several main steps:\n", + "1. Data pre-processing\n", + "2. Algorithm selection - based on existing data, predicting which algorithm is the best for your data \n", + "3. Sample size reduction try ('adaptive sampling')\n", + "4. Features reduction try ('feature selection')\n", + "5. Model hyperparameters selection ('model tuning')\n", + "6. Model fitting with the selected hyperparameters" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c11eac3-2c78-4d24-a846-7f9f9f2f1f72", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "y_train_pred = pipeline1.predict(X_train)\n", + "y_test_pred = pipeline1.predict(X_test)\n", + "print(y_test_pred[0:20])" + ] + }, + { + "cell_type": "markdown", + "id": "bde41405-8df0-48f8-b895-589e2bf5f083", + "metadata": {}, + "source": [ + "### 3.2.1 Understanding the Automl Pipeline Selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd90f60-07f0-4a96-bd75-b8e6a2370ace", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#pipeline1.completed_trials_summary_" + ] + }, + { + "cell_type": "markdown", + "id": "54aa7349-0a0e-4255-aa61-0bae1659ba86", + "metadata": {}, + "source": [ + "## 3.3 Modeling with other open-sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"f554c534-b949-48f5-bf58-1abbdca961a7", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "X_train_encoded = pd.get_dummies(X_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "04930ab0-5865-43a0-bbff-6f339179d5db", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "model = XGBClassifier(max_depth=5, n_estimators=200, learning_rate=0.01,eval_metric='logloss')\n", + "model.fit(X_train_encoded, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "432bcdcb-f54d-45da-8e76-565af87b5102", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "y_train_pred_xgboost = model.predict(X_train_encoded)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8c871ad-ed53-48de-bad1-249185657a26", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "np.bincount(y_train_pred_xgboost)" + ] + }, + { + "cell_type": "markdown", + "id": "aca66782-a828-42fb-8ba4-429b8507a511", + "metadata": { + "tags": [] + }, + "source": [ + "# 4. 
Model Validation and Explainability" + ] + }, + { + "cell_type": "markdown", + "id": "a6796d7b-1839-43d2-9f82-d944c2511d70", + "metadata": { + "tags": [] + }, + "source": [ + "## 4.1 Model Validation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24eb410e-6b33-4e9a-aea8-a865316d2e77", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "acc_test = accuracy_score(y_test, y_test_pred) * 100\n", + "print('Model Accuracy, test: ',acc_test.round(1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cdbbf74-570c-4d67-89c3-b80ce2d1d40d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cm_test = confusion_matrix(y_test, y_test_pred)\n", + "cm_test_pct = cm_test / cm_test.sum(axis=1, keepdims=True) * 100\n", + "\n", + "ConfusionMatrixDisplay(cm_test_pct, display_labels=['<=50K', '>50K']).plot(cmap='Blues', values_format=\".1f\")\n", + "plt.title('Confusion Matrix - Test Set [%]')\n", + "\n", + "plt.savefig('confusion_matrix.png', dpi=300)\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "4ad8baa1-1433-4408-9f80-5da52da656d3", + "metadata": {}, + "source": [ + "## 4.2 Saving the confusion matrices in the object storage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a300a62b-54c8-42a6-a6bc-ece51b63d631", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "signer = oci.auth.signers.get_resource_principals_signer()\n", + "object_storage = oci.object_storage.ObjectStorageClient({}, signer=signer)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "15df4394-f4c5-4cbe-9d66-21afcf3f575d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "namespace = object_storage.get_namespace().data\n", + "bucket_name = \"data-science-reports\"\n", + "file_name = \"confusion_matrix2\"\n", + "local_path = \"/home/datascience/confusion_matrix.png\" # make sure to add '/home/datascience/' to the path." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2c34668-7716-4c93-86b8-5461cf5dda1d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "upload_manager = UploadManager(object_storage, allow_parallel_uploads=True)\n", + "upload_manager.upload_file(\n", + " namespace_name=namespace,\n", + " bucket_name=bucket_name,\n", + " object_name=file_name,\n", + " file_path=local_path\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5d7f4c4b-08fc-4a13-8061-7590fd8217b1", + "metadata": {}, + "source": [ + "### 4.2.1 Bonus: Interacting with Object Storage and ADB" + ] + }, + { + "cell_type": "markdown", + "id": "4780a408-d303-4f12-b589-564ba8b1dc19", + "metadata": {}, + "source": [ + "#### 4.2.1.1 Reading a table from object storage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31350eaf-bfbc-46c8-8368-54e07f175ad4", + "metadata": {}, + "outputs": [], + "source": [ + "# import io\n", + "\n", + "# signer = oci.auth.signers.get_resource_principals_signer()\n", + "# object_storage = oci.object_storage.ObjectStorageClient({}, signer=signer)\n", + "\n", + "# namespace = object_storage.get_namespace().data\n", + "# bucket_name='data-science-reports'\n", + "# file_name= 'testagg_day_0.csv'\n", + "\n", + "# obj = object_storage.get_object(namespace, bucket_name, file_name)\n", + "# df = pd.read_csv(io.BytesIO(obj.data.content))" + ] + }, + { + "cell_type": "markdown", + "id": "1b22fd08-0d1e-4745-b64e-8e14df9ef0bd", + "metadata": {}, + "source": [ + "#### 4.2.1.2 Reading a table from the database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "33f6c85f-c42d-4057-9d8b-e9b9514f0a39", + "metadata": {}, + "outputs": [], + "source": [ + "# import ads\n", + "\n", + "# connection_parameters = {\n", + "# \"user_name\": \"\",\n", + "# \"password\": \"\",\n", + "# \"service_name\": \"\",\n", + "# \"wallet_location\": \"/full/path/to/my_wallet.zip\", # download the wallet file from the database\n", + "# 
}\n", + "\n", + "# df = pd.DataFrame.ads.read_sql(\n", + "# \"SELECT * FROM SH.SALES\",\n", + "# connection_parameters=connection_parameters,\n", + "# )\n" + ] + }, + { + "cell_type": "markdown", + "id": "2e2f9884-14ec-46e5-9a65-969d538f16f8", + "metadata": {}, + "source": [ + "## 4.3 Explainability" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ad2091ea-9107-406e-83b5-a676891d28f4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "explainer = automlx.MLExplainer(pipeline1,\n", + " X_train,\n", + " y_train,\n", + " target_names=[\"<=50K\", \">50K\"],\n", + " task=\"classification\")\n", + "\n", + "y_train = (y_train == \">50K\").astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b971fb2-e9c1-4091-a167-e3fe7db03574", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "result_explain_model_default = explainer.explain_model()" + ] + }, + { + "cell_type": "markdown", + "id": "43d9fb85-69a0-4e7f-bc5d-f47398650afa", + "metadata": {}, + "source": [ + "### 4.3.1 Global Explainability" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce86b25a-f49e-4ab2-b5a6-14f86136089d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "result_explain_model_default.show_in_notebook() # based on permutation" + ] + }, + { + "cell_type": "markdown", + "id": "67371780-8fbb-4c84-bf0e-21ccfe9ef9e8", + "metadata": { + "tags": [] + }, + "source": [ + "### 4.3.2 Local Explainability" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "da1c513f-5225-4415-93b8-19220f4ae0e0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "index = 0\n", + "X_train.iloc[[index]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a0c91f93-f905-4b95-9087-d3173fa455c4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "actual=y_train[index]\n", + "prediction=pipeline1.predict(X_train.iloc[[index]])[0]\n", + 
"print('actual: ',actual)\n", + "print('prediction: ',prediction)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f463c6b3-e9ad-42e9-aafe-5dd611562c32", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "explainer.configure_explain_prediction(tabulator_type=\"kernel_shap\",\n", + " sampling={'technique': 'random', 'n_samples': 2000})\n", + "result_explain_prediction_kernel_shap = explainer.explain_prediction(X_train.iloc[[index]])\n", + "result_explain_prediction_kernel_shap[0].show_in_notebook()" + ] + }, + { + "cell_type": "markdown", + "id": "04cf260e-3a3b-45f2-9047-f573c31d18b0", + "metadata": { + "tags": [] + }, + "source": [ + "## 4.4 Bonus: Notebook Explorer" + ] + }, + { + "cell_type": "markdown", + "id": "3e6ed89d-34a4-482b-8e70-ade333a81a22", + "metadata": { + "tags": [] + }, + "source": [ + "# 5 Deployment" + ] + }, + { + "cell_type": "markdown", + "id": "ace2e128-9673-4adb-873a-559771d741bb", + "metadata": {}, + "source": [ + "## 5.1 Prepare the Artifacts (Serialization) Using ADS" + ] + }, + { + "cell_type": "markdown", + "id": "6b05ef1e-ea60-4ead-9827-e8dd5dab1aa0", + "metadata": {}, + "source": [ + "* Create the files required for deployment and pack them together.\n", + "* Besides the model, the following required files are generated automatically: `score.py`, `runtime.yaml`, `input_schema.json`, `output_schema.json`\n", + "* Optional info can be added, such as: `inference_conda_env`, `training_conda_env`\n", + "\n", + "* The following frameworks have an automated prepare function: TensorFlow, PyTorch, scikit-learn, XGBoost, LightGBM, SparkPipelineModel, AutoMlx, transformers\n", + "* In addition" + ] + }, + { + "cell_type": "markdown", + "id": "cb7c0e6b-877b-43af-8b3e-6418c341cc9f", + "metadata": {}, + "source": [ + "ADS takes you through the deployment process in a simple way" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea8b8fa0-cdfa-4265-b659-017aef1f2b32", + "metadata": { + 
"tags": [] + }, + "outputs": [], + "source": [ + "ads.set_auth(\"resource_principal\") # a signer for all ads operations, managed automatically" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d05ef6e-a6d9-499f-bcf7-890c1fc39793", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "automl_model = GenericModel(estimator=pipeline1, artifact_dir=\"automl_model_artifact2\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "35cc92ab-6285-46c5-9be9-92bd19832847", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "automl_model.summary_status()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c70f5ed-0ace-4987-9970-830310df3734", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "conda_env=\"automlx251_p311_cpu_x86_64_v2\"\n", + "automl_model.prepare(inference_conda_env=conda_env,\n", + " training_conda_env=conda_env,\n", + " use_case_type=UseCaseType.BINARY_CLASSIFICATION,\n", + " X_sample=X_test,\n", + " force_overwrite=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e93a7a8e-bace-4d07-a0bc-1c3a029fcfec", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "automl_model.summary_status()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9573715-9015-44d0-b779-cfc005ec46ed", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "automl_model.verify(X_test.iloc[:20], auto_serialize_data=True)" + ] + }, + { + "cell_type": "markdown", + "id": "85a58e48-3cfa-4eac-86b6-ebcc0ac9545a", + "metadata": {}, + "source": [ + "## 5.2 Register" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4828b67-06ba-48e1-b425-6333b4dff9e8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "model_id = automl_model.save(display_name=\"Demo Adults Income Model 1\")" + ] + }, + { + "cell_type": "markdown", + "id": "a651d3c2-d540-45c9-82d7-42112445736e", + 
"metadata": {}, + "source": [ + "## 5.3 Deploy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1706b2d-73eb-4a46-992e-ea721ae5f81b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#automl_model.deploy(display_name=\"Demo Adults Income Model 1\")" + ] + }, + { + "cell_type": "markdown", + "id": "8d4757f6-7509-4516-b5c0-9eb635965a6b", + "metadata": { + "tags": [] + }, + "source": [ + "# 6. Inference " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "492f82fb-8090-497a-8a47-16324936dffb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "auth = oci.auth.signers.get_resource_principals_signer()\n", + "\n", + "endpoint = ''\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8c01a8f-5e9a-40e0-8d26-c71a4ca4e6aa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "body = {\n", + " \"data\": '''[\n", + " {\n", + " \"age\": 37,\n", + " \"workclass\": \"Private\",\n", + " \"education\": \"Bachelors\",\n", + " \"education-num\": 13,\n", + " \"marital-status\": \"Married-civ-spouse\",\n", + " \"occupation\": \"Exec-managerial\",\n", + " \"relationship\": \"Husband\",\n", + " \"race\": \"White\",\n", + " \"sex\": \"Male\",\n", + " \"capital-gain\": 500,\n", + " \"capital-loss\": 0,\n", + " \"hours-per-week\": 40,\n", + " \"native-country\": \"United-States\"\n", + " }\n", + " ]'''\n", + "}\n", + "# play with the capital-gain variable to see changes in prediction" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3acd4b19-f89c-4488-a631-f22183adc39b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "requests.post(endpoint, json=body, auth=auth).json()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c5f3379b-5885-490d-aa72-2fa29dd4ea10", + "metadata": { + "jupyter": { + "source_hidden": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "df_example = pd.DataFrame([{\n", + " 
\"age\": 37,\n", + " \"workclass\": \"Private\",\n", + " \"education\": \"Bachelors\",\n", + " \"education-num\": 13,\n", + " \"marital-status\": \"Married-civ-spouse\",\n", + " \"occupation\": \"Exec-managerial\",\n", + " \"relationship\": \"Husband\",\n", + " \"race\": \"White\",\n", + " \"sex\": \"Male\",\n", + " \"capital-gain\": 0,\n", + " \"capital-loss\": 0,\n", + " \"hours-per-week\": 40,\n", + " \"native-country\": \"United-States\"\n", + "}])\n", + "\n", + "# Convert DataFrame to JSON (orientation='records' creates a list of dicts)\n", + "body = {\n", + " \"data_type\": \"pandas.core.frame.DataFrame\",\n", + " \"data\": df_example.to_json(orient='records')\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:automlx251_p311_cpu_x86_64_v2]", + "language": "python", + "name": "conda-env-automlx251_p311_cpu_x86_64_v2-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}