Conda environment: automlx251_p311_cpu_x86_64_v2\
Creation Date: 09/11/2025\
By: Assaf Rabinowicz, EMEA Data Science Team

# 1. Import Packages


In [None]:
# third-party open-source packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from xgboost import XGBClassifier
import os
import requests

# Oracle packages
import automlx
from automlx import init
import oci
from oci.object_storage import UploadManager
import ads
from ads.common.model_metadata import UseCaseType
from ads.model import GenericModel

In [None]:
# Notebook basics:
# - A hash symbol (#) starts a comment.
# - Ctrl+Enter runs the current cell.
# - Enter inserts a new line inside a cell.

# 2. Data Import, Exploration and Pre-Processing

## 2.1 Data Import

In [None]:
# Download the "adult" dataset (version 2) from OpenML as a pandas-backed bunch:
# https://www.openml.org/search?type=data&sort=version&status=any&order=asc&exact_name=adult
data = fetch_openml(
    name="adult",
    version=2,
    as_frame=True,
)
# Work with the full table (features and target together) as one DataFrame.
df = data.frame

### 2.1.1 Bonus: Importing from the attached block volume

In [None]:
# Optional alternative: load the data from a CSV file stored on the attached block volume.
# Uncomment the two lines below and set the path to use it.
#file_path="your_path" # an example for a path: '/home/datascience/df_sample.csv'. Commonly you need to prepend /home/datascience to the visible path.
#df = pd.read_csv(file_path)

## 2.2 Data Structure

In [None]:
# Dimensions of the dataset as (number of rows, number of columns).
df.shape

In [None]:
# Preview the first rows of the imported data.
df.head()

In [None]:
# Goal: build a model that uses the feature columns to predict the target variable ('class').

## 2.3 Data Analysis and Processing

In [None]:
# Drop the 'sampling weights' column ('fnlwgt') for simplification.
# The frame is reassigned instead of mutated in place, and errors='ignore' is used,
# so the cell is safe to re-run: a second execution no longer raises KeyError
# when 'fnlwgt' has already been removed.
df = df.drop(columns=['fnlwgt'], errors='ignore')

In [None]:
# Summary statistics of the numeric columns (no extra percentiles requested), rounded to 1 decimal.
df.describe(percentiles=[]).round(1)

In [None]:
# Summary of the categorical columns, rounded to 1 decimal.
round(df.describe(include=['category']), 1)

In [None]:
# Encode the target as 1 (income '>50K') / 0 (otherwise).
# The guard makes the cell idempotent: without it, re-running would compare the
# already-encoded integers with the string '>50K' and silently set every label to 0.
if not pd.api.types.is_integer_dtype(df['class']):
    df['class'] = (df['class'] == '>50K').astype(int)

In [None]:
# Preview the data again after the processing steps above.
df.head()

In [None]:
# Scatter-matrix plot of the DataFrame to eyeball pairwise relationships.
pd.plotting.scatter_matrix(df)

In [None]:
# Same plot, restricted to two columns of interest for readability.
pd.plotting.scatter_matrix(df.loc[:, ['education-num', 'capital-gain']])

In [None]:
# Uncomment and run once if seaborn is not available in the current conda environment:
#!conda install seaborn -y

In [None]:
import seaborn as sns

# Histogram of 'education-num', with side-by-side bars for each value of 'class'.
sns.histplot(
    data=df,
    x='education-num',
    hue="class",
    multiple="dodge",
    bins=30,
)
plt.title("Distribution of Education-Num by Salery Class")
plt.show()

In [None]:
# Box plot of 'education-num' for each value of 'class'.
sns.boxplot(
    x="class",
    y="education-num",
    data=df,
)
plt.title("Distribution of Education-Num by Salery Class")
plt.show()

# 3. Model Training

## 3.1 Train and Test Split

In [None]:
# Target vector: the 'class' column.
y = df['class']
# Feature matrix: every column except the target.
X = df.drop(columns='class')

In [None]:
# Hold out 30% of the rows for testing.
# - random_state fixes the shuffle, so the split (and every metric computed from it)
#   is reproducible across kernel restarts and re-runs.
# - stratify=y keeps the class proportions the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

## 3.2 Using AutoML Pipeline

In [None]:
# Initialise the AutoMLx engine in 'local' mode before building any pipeline.
automlx.init(engine='local')

Optional tasks:\
classification, regression, anomaly_detection, forecasting, recommendation

Optional algorithms for classification are: \
AdaBoostClassifier, DecisionTreeClassifier, ExtraTreesClassifier, TorchMLPClassifier,
KNeighborsClassifier, LGBMClassifier,
LinearSVC, LogisticRegression, RandomForestClassifier,
SVC, XGBClassifier, GaussianNB

In [None]:
# Classification pipeline restricted to three candidate algorithms and 10 tuning trials.
# Both restrictions are here only to reduce fitting time; removing them allows
# training a potentially better model.
# Full pipeline API: https://docs.oracle.com/en-us/iaas/tools/automlx/latest/latest/automl.html
pipeline1 = automlx.Pipeline(
    task='classification',
    model_list=['LogisticRegression', 'RandomForestClassifier', 'XGBClassifier'],
    max_tuning_trials=10,
)

In [None]:
# Fit the AutoML pipeline on the training set.
pipeline1.fit(X_train, y_train)

The pipeline includes several main steps:
1. Data pre-processing
2. Algorithm selection - based on existing data, predicting which algorithm is the best for your data 
3. Adaptive sampling - attempts to reduce the sample size used for training
4. Feature selection - attempts to reduce the number of features
5. Model tuning - selection of the model hyperparameters
6. Model fitting with the selected hyperparameters

In [None]:
# Predict labels for both the training and the test set with the fitted pipeline.
y_train_pred, y_test_pred = pipeline1.predict(X_train), pipeline1.predict(X_test)
# Peek at the first 20 test predictions.
print(y_test_pred[:20])

### 3.2.1 Understanding the Automl Pipeline Selection

In [None]:
# Uncomment to inspect the pipeline's trial summary (presumably one entry per trial it evaluated - verify in the AutoMLx docs):
#pipeline1.completed_trials_summary_

## 3.3 Modeling with Other Open-Source Libraries

In [None]:
# Convert the categorical feature columns into dummy/indicator columns for the XGBoost model below.
X_train_encoded = pd.get_dummies(X_train)

In [None]:
# Stand-alone XGBoost classifier with hand-picked hyperparameters.
model = XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.01,
    eval_metric='logloss',
)
# Train it on the encoded training features.
model.fit(X_train_encoded, y_train)

In [None]:
# Predictions of the XGBoost model on the (encoded) training set.
y_train_pred_xgboost = model.predict(X_train_encoded)

In [None]:
# Count how many training rows were predicted for each class value.
np.bincount(y_train_pred_xgboost)

# 4. Model Validation and Explainability

## 4.1 Model Validation

In [None]:
# Accuracy of the AutoML pipeline on the held-out test set, in percent.
acc_test = accuracy_score(y_test, y_test_pred) * 100
# Use the built-in round() rather than the .round() method: the method exists only on
# NumPy scalars, so it fails when accuracy_score returns a plain Python float.
# round() works for both.
print('Model Accuracy, test: ', round(acc_test, 1))

In [None]:
# Raw confusion-matrix counts on the test set.
cm_test = confusion_matrix(y_test, y_test_pred)
# Normalise each row by its total (axis=1 sum) and express it in percent.
cm_test_pct = np.divide(cm_test, cm_test.sum(axis=1, keepdims=True)) * 100

ConfusionMatrixDisplay(
    confusion_matrix=cm_test_pct,
    display_labels=['<=50K', '>50K'],
).plot(values_format=".1f", cmap='Blues')
plt.title('Confusion Matrix - Test Set [%]')

# Save the figure before showing it, so the file on disk contains the rendered plot.
plt.savefig('confusion_matrix.png', dpi=300)
plt.show()

## 4.2 Saving the confusion matrices in the object storage

In [None]:
# Authenticate with the resource principal signer.
signer = oci.auth.signers.get_resource_principals_signer()
# An empty config is passed because authentication is delegated to the signer.
object_storage = oci.object_storage.ObjectStorageClient(config={}, signer=signer)

In [None]:
# Local file to upload - make sure to add '/home/datascience/' to the path.
local_path = "/home/datascience/confusion_matrix.png"
# Destination bucket and the object name the file will get inside it.
bucket_name = "data-science-reports"
file_name = "confusion_matrix2"
# Object Storage namespace, looked up through the client.
namespace = object_storage.get_namespace().data

In [None]:
# Upload the saved figure to the bucket through an UploadManager (parallel uploads enabled).
upload_manager = UploadManager(object_storage, allow_parallel_uploads=True)
upload_manager.upload_file(
    namespace_name=namespace,
    bucket_name=bucket_name,
    object_name=file_name,  # name of the object inside the bucket
    file_path=local_path    # local file to send
)

### 4.2.1 Bonus: Interacting with Object Storage and ADB

#### 4.2.1.1 Reading a table from object storage

In [None]:
# Optional example (uncomment to use): read a CSV object from Object Storage into a DataFrame.
# The object's bytes are wrapped in an in-memory buffer so pd.read_csv can parse them.

# import io

# signer = oci.auth.signers.get_resource_principals_signer()
# object_storage = oci.object_storage.ObjectStorageClient({}, signer=signer)

# namespace = object_storage.get_namespace().data
# bucket_name='data-science-reports'
# file_name= 'testagg_day_0.csv'

# obj = object_storage.get_object(namespace, bucket_name, file_name)
# df = pd.read_csv(io.BytesIO(obj.data.content))

#### 4.2.1.2 Reading a table from the database

In [None]:
# Optional example (uncomment to use): read a database table into a DataFrame through ADS.
# Replace the <...> placeholders with your own values; do not commit real credentials.

# import ads

# connection_parameters = {
#     "user_name": "<username>",
#     "password": "<password>",
#     "service_name": "<service_name_{high|med|low}>",
#     "wallet_location": "/full/path/to/my_wallet.zip", # download the wallet file from the database
# }

# df = pd.DataFrame.ads.read_sql(
#     "SELECT * FROM SH.SALES",
#     connection_parameters=connection_parameters,
# )


## 4.3 Explainability

In [None]:
# Build an explainer for the fitted pipeline from the training data.
# target_names gives human-readable labels for the encoded classes 0 and 1.
explainer = automlx.MLExplainer(pipeline1,
                               X_train,
                               y_train,
                               target_names=["<=50K", ">50K"],
                               task="classification")

# NOTE: y_train is deliberately NOT re-encoded here. The 'class' column was already
# converted to 0/1 before the train/test split, so comparing it with the string ">50K"
# again would turn every training label into 0 and corrupt the later cells that read y_train.

In [None]:
# Compute the model-level explanation with the explainer's default settings.
result_explain_model_default = explainer.explain_model()

### 4.3.1 Global Explainability

In [None]:
# Render the model-level explanation inline in the notebook.
result_explain_model_default.show_in_notebook() # based on permutation

### 4.3.2 Local Explainability

In [None]:
# Position (not index label) of the training row to explain.
index = 0
# Double brackets keep the selection as a one-row DataFrame rather than a Series.
X_train.iloc[[index]]

In [None]:
# Compare the true label with the pipeline's prediction for the selected row.
# .iloc is used so the label is taken from the same *position* as X_train.iloc[[index]].
# y_train keeps the shuffled original index after the split, so y_train[index] would be a
# label-based lookup that raises KeyError or returns a different row.
actual = y_train.iloc[index]
prediction = pipeline1.predict(X_train.iloc[[index]])[0]
print('actual: ', actual)
print('prediction: ', prediction)

In [None]:
# Configure the local (per-prediction) explainer: kernel SHAP with 2000 random samples.
explainer.configure_explain_prediction(
    tabulator_type="kernel_shap",
    sampling={'technique': 'random', 'n_samples': 2000},
)
# Explain the selected row and display the explanation for that single record.
result_explain_prediction_kernel_shap = explainer.explain_prediction(X_train.iloc[[index]])
result_explain_prediction_kernel_shap[0].show_in_notebook()

## 4.4 Bonus: Notebook Explorer

# 5. Deployment

## 5.1 Prepare the Artifacts (Serialization) Using ADS

* Create the files required for deployment and pack them together.
* Besides the model, the following required files are generated automatically: `score.py`, `runtime.yaml`, `input_schema.json`, `output_schema.json`
* Optional info can be added, such as: `inference_conda_env`, `training_conda_env`

* The following frameworks have an automated prepare function: TensorFlow, PyTorch, scikit-learn, XGBoost, LightGBM, SparkPipelineModel, AutoMlx, transformers
* In addition, `GenericModel` (used below) can wrap models from other frameworks

ADS takes you through the deployment process in a simple way

In [None]:
# Use resource-principal authentication for all ADS operations (the signer is managed automatically).
ads.set_auth("resource_principal") # a signer for all ads operations, managed automatically

In [None]:
# Wrap the fitted pipeline in an ADS GenericModel; artifacts are written to the given directory.
automl_model = GenericModel(estimator=pipeline1, artifact_dir="automl_model_artifact2")

In [None]:
# Show the status of the deployment workflow steps before anything has been prepared.
automl_model.summary_status()

In [None]:
# The same conda environment is declared for both training and inference.
conda_env = "automlx251_p311_cpu_x86_64_v2"
# Generate the model artifacts; X_test is passed as the input sample, and
# force_overwrite=True replaces any artifacts left by a previous run.
automl_model.prepare(
    inference_conda_env=conda_env,
    training_conda_env=conda_env,
    use_case_type=UseCaseType.BINARY_CLASSIFICATION,
    X_sample=X_test,
    force_overwrite=True,
)

In [None]:
# Check the workflow status again now that prepare() has run.
automl_model.summary_status()

In [None]:
# Test the prepared artifacts on the first 20 test rows before registering the model.
automl_model.verify(X_test.iloc[:20], auto_serialize_data=True)

## 5.2 Register

In [None]:
# Register (save) the model under a display name; the returned value is kept as model_id.
model_id = automl_model.save(display_name="Demo Adults Income Model 1")

## 5.3 Deploy

In [None]:
# Uncomment to deploy the registered model:
#automl_model.deploy(display_name="Demo Adults Income Model 1")

# 6. Inference 

In [None]:
# URL of the deployed model - replace the placeholder with your own endpoint.
endpoint = '<your_endpoint>'

# Resource principal signer, used to authenticate the HTTP request.
auth = oci.auth.signers.get_resource_principals_signer()


In [None]:
# Request payload: "data" holds a JSON string containing a list with one record.
body = {
    "data": '''[
        {
            "age": 37,
            "workclass": "Private",
            "education": "Bachelors",
            "education-num": 13,
            "marital-status": "Married-civ-spouse",
            "occupation": "Exec-managerial",
            "relationship": "Husband",
            "race": "White",
            "sex": "Male",
            "capital-gain": 500,
            "capital-loss": 0,
            "hours-per-week": 40,
            "native-country": "United-States"
        }
    ]'''
}
# Try different values of the capital-gain variable to see how the prediction changes.

In [None]:
# Send the payload to the model endpoint and show the decoded JSON response.
# An explicit timeout (in seconds) is set so that an unreachable or stalled endpoint
# raises an error instead of blocking the notebook cell indefinitely.
requests.post(endpoint, json=body, auth=auth, timeout=120).json()

In [None]:
# One example record, built as a single-row DataFrame (capital-gain set to 0 here).
df_example = pd.DataFrame(data=[
    {
        "age": 37,
        "workclass": "Private",
        "education": "Bachelors",
        "education-num": 13,
        "marital-status": "Married-civ-spouse",
        "occupation": "Exec-managerial",
        "relationship": "Husband",
        "race": "White",
        "sex": "Male",
        "capital-gain": 0,
        "capital-loss": 0,
        "hours-per-week": 40,
        "native-country": "United-States",
    },
])

# Serialize the DataFrame to JSON (orient='records' produces a list of row dicts)
# and declare its type alongside the data.
body = dict(
    data_type="pandas.core.frame.DataFrame",
    data=df_example.to_json(orient='records'),
)