In [1]:
#****************************************************************************
# (C) Cloudera, Inc. 2020-2023
#  All rights reserved.
#
#  Applicable Open Source License: GNU Affero General Public License v3.0
#
#  NOTE: Cloudera open source products are modular software products
#  made up of hundreds of individual components, each of which was
#  individually copyrighted.  Each Cloudera open source product is a
#  collective work under U.S. Copyright Law. Your license to use the
#  collective work is as provided in your written agreement with
#  Cloudera.  Used apart from the collective work, this file is
#  licensed for your use pursuant to the open source license
#  identified above.
#
#  This code is provided to you pursuant a written agreement with
#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
#  this code. If you do not have a written agreement with Cloudera nor
#  with an authorized and properly licensed third party, you do not
#  have any rights to access nor to use this code.
#
#  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
#  DATA.
#
# #  Author(s): Paul de Fusco
#***************************************************************************/

In [1]:
import os, warnings, sys, logging
import mlflow
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score
import mlflow.sklearn
from xgboost import XGBClassifier
from datetime import date
import cml.data_v1 as cmldata
import pyspark.pandas as ps

import onnxmltools
from urllib.parse import urlparse
from sklearn.preprocessing import FunctionTransformer
from mlflow.models import infer_signature
#from onnxconverter_common import FloatTensorType
from onnxmltools.convert.common.data_types import FloatTensorType



In [6]:
import mlflow
import onnxmltools
import xgboost

print(f"MLflow version: {mlflow.__version__}")
print(f"onnxmltools version: {onnxmltools.__version__}")
print(f"XGBoost version: {xgboost.__version__}")


MLflow version: 2.19.0
onnxmltools version: 1.14.0
XGBoost version: 2.1.4


In [2]:
USERNAME = os.environ["PROJECT_OWNER"]
DBNAME = "BNK_MLOPS_HOL_"+USERNAME
CONNECTION_NAME = "pdf-oct-aw-dl"


DATE = date.today()
EXPERIMENT_NAME = "xgb-cc-fraud-{0}".format(USERNAME)

mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='/home/cdsw/.experiments/24e5-urzf-bfzc-4r50', creation_time=None, experiment_id='24e5-urzf-bfzc-4r50', last_update_time=None, lifecycle_stage='active', name='xgb-cc-fraud-pauldefusco', tags={}>

In [3]:
conn = cmldata.get_connection(CONNECTION_NAME)
spark = conn.get_spark_session()

Setting spark.hadoop.yarn.resourcemanager.principal to pauldefusco


Spark Application Id:spark-ad79df940e1f45dca2b2705caced5fb5


In [4]:
df_from_sql = ps.read_table('{0}.CC_TRX_{1}'.format(DBNAME, USERNAME))
df = df_from_sql.to_pandas()

y = df["fraud_trx"]
df = df.drop("fraud_trx", axis=1)
df.columns = ['f' + str(i) for i in range(len(df.columns))]

test_size = 0.3
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=test_size)

Hive Session ID = 1eed56a7-24e1-4444-a058-757607e53d4c
                                                                                

In [5]:
with mlflow.start_run():

    model = XGBClassifier(use_label_encoder=False, eval_metric="logloss")

    # Step 1: cambiar test_size linea 69 y recorrer
    # Step 2: cambiar linea 74, agregar linea 97, y recorrer
      # linea 75: model = XGBClassifier(use_label_encoder=False, max_depth=4, eval_metric="logloss")
      # linea 97: mlflow.log_param("max_depth", 4)
    # Step 3: cambiar linea 74 y 97, agregar linea 98, y recorrer
      # linea 75: model = XGBClassifier(use_label_encoder=False, max_depth=2, max_leaf_nodes=5, eval_metric="logloss")
      # linea 97: mlflow.log_param("max_depth", 2)
      # linea 98: mlflow.log_param("max_leaf_nodes", 5)

    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    print("Accuracy: %.2f%%" % (accuracy * 100.0))
    print("Test Size: %.2f%%" % (test_size * 100.0))

    mlflow.log_param("accuracy", accuracy)
    mlflow.log_param("test_size", test_size)

    # Step 2:
    # Step 3:

    num_features = X_train.shape[1]
    initial_type = [("input", FloatTensorType([None, num_features]))]

    model_signature = infer_signature(X_train, y_pred)
    #onnx_model = onnxmltools.convert_xgboost(model, initial_types=initial_type)
    onnx_model = onnxmltools.convert_xgboost(model.get_booster(), initial_types=initial_type)

    #onnxmltools.utils.save_model(onnx_model, "fraud_classifier.onnx")
    mlflow.onnx.log_model(onnx_model, "fraud-clf-onnx-xgboost",
                          registered_model_name="fraud-detector-onnx-xgboost",
                          signature=model_signature)

Creating run for experiment_id: 24e5-urzf-bfzc-4r50, user_id: cdsw, run_name: None
Parameters: { "use_label_encoder" } are not used.



Accuracy: 94.85%
Test Size: 30.00%


Successfully registered model 'fraud-detector-onnx-xgboost'.
experiment id 24e5-urzf-bfzc-4r50 
2025/10/22 23:08:43 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: fraud-detector-onnx-xgboost, version 1
Created version '1' of model 'fraud-detector-onnx-xgboost'.
