In [1]:
#****************************************************************************
# (C) Cloudera, Inc. 2020-2023
#  All rights reserved.
#
#  Applicable Open Source License: GNU Affero General Public License v3.0
#
#  NOTE: Cloudera open source products are modular software products
#  made up of hundreds of individual components, each of which was
#  individually copyrighted.  Each Cloudera open source product is a
#  collective work under U.S. Copyright Law. Your license to use the
#  collective work is as provided in your written agreement with
#  Cloudera.  Used apart from the collective work, this file is
#  licensed for your use pursuant to the open source license
#  identified above.
#
#  This code is provided to you pursuant a written agreement with
#  (i) Cloudera, Inc. or (ii) a third-party authorized to distribute
#  this code. If you do not have a written agreement with Cloudera nor
#  with an authorized and properly licensed third party, you do not
#  have any rights to access nor to use this code.
#
#  Absent a written agreement with Cloudera, Inc. (“Cloudera”) to the
#  contrary, A) CLOUDERA PROVIDES THIS CODE TO YOU WITHOUT WARRANTIES OF ANY
#  KIND; (B) CLOUDERA DISCLAIMS ANY AND ALL EXPRESS AND IMPLIED
#  WARRANTIES WITH RESPECT TO THIS CODE, INCLUDING BUT NOT LIMITED TO
#  IMPLIED WARRANTIES OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY AND
#  FITNESS FOR A PARTICULAR PURPOSE; (C) CLOUDERA IS NOT LIABLE TO YOU,
#  AND WILL NOT DEFEND, INDEMNIFY, NOR HOLD YOU HARMLESS FOR ANY CLAIMS
#  ARISING FROM OR RELATED TO THE CODE; AND (D)WITH RESPECT TO YOUR EXERCISE
#  OF ANY RIGHTS GRANTED TO YOU FOR THE CODE, CLOUDERA IS NOT LIABLE FOR ANY
#  DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR
#  CONSEQUENTIAL DAMAGES INCLUDING, BUT NOT LIMITED TO, DAMAGES
#  RELATED TO LOST REVENUE, LOST PROFITS, LOSS OF INCOME, LOSS OF
#  BUSINESS ADVANTAGE OR UNAVAILABILITY, OR LOSS OR CORRUPTION OF
#  DATA.
#
# #  Author(s): Paul de Fusco
#***************************************************************************/

In [2]:
import mlflow.spark

In [3]:
import os, warnings, sys
import mlflow
import pandas as pd
import numpy as np

In [46]:
import logging, json, shutil, datetime
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler, StandardScaler, Imputer, StringIndexer, OneHotEncoder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.linalg import DenseVector
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from mlopsutils import *

In [47]:
import cml.data_v1 as cmldata
from pyspark import SparkContext

# Sample in-code customization of spark configurations:
# each (property, value) pair is applied before the Spark session is requested below.
for sparkProperty, sparkValue in (
    ('spark.executor.cores', '2'),
    ('spark.executor.memory', '2g'),
):
    SparkContext.setSystemProperty(sparkProperty, sparkValue)

# Name of the CML data connection used to obtain the Spark session
CONNECTION_NAME = "go01-aw-dl"
conn = cmldata.get_connection(CONNECTION_NAME)
spark = conn.get_spark_session()

In [48]:
import os

# Print a URL assembled from the engine id and domain of the current session
print("https://spark-{0}.{1}".format(os.environ["CDSW_ENGINE_ID"], os.environ["CDSW_DOMAIN"]))

https://spark-lgo3r5g8ngclymtm.ml-b74f8940-b97.go01-dem.ylcu-atmi.cloudera.site


In [49]:
# Value of the PROJECT_OWNER environment variable; later cells use it as a table-name suffix
username = os.environ["PROJECT_OWNER"]
# Database that the banking tables are read from and written to in later cells
dbname = "MLOPS"

In [50]:
# Load every column of the owner-specific banking transactions table
bankingDf = spark.sql(f"SELECT * FROM {dbname}.BANKING_TRANSACTIONS_{username}")

In [51]:
# Show the column names and types of the table that was just loaded
bankingDf.printSchema()

root
 |-- name: string (nullable = true)
 |-- address: string (nullable = true)
 |-- email: string (nullable = true)
 |-- age: decimal(10,0) (nullable = true)
 |-- credit_card_balance: decimal(10,0) (nullable = true)
 |-- bank_account_balance: decimal(10,0) (nullable = true)
 |-- mortgage_balance: decimal(10,0) (nullable = true)
 |-- sec_bank_account_balance: decimal(10,0) (nullable = true)
 |-- savings_account_balance: decimal(10,0) (nullable = true)
 |-- sec_savings_account_balance: decimal(10,0) (nullable = true)
 |-- total_est_nworth: decimal(10,0) (nullable = true)
 |-- primary_loan_balance: decimal(10,0) (nullable = true)
 |-- secondary_loan_balance: decimal(10,0) (nullable = true)
 |-- college_loan_balance: decimal(10,0) (nullable = true)
 |-- aba_routing: string (nullable = true)
 |-- bank_country: string (nullable = true)
 |-- account_no: string (nullable = true)
 |-- int_account_no: string (nullable = true)
 |-- swift11: string (nullable = true)
 |-- credit_card_number: strin

In [52]:
# Columns kept from the source table. Note that "fraud" is the target column:
# it is listed here so that it is retained when bankingDf is narrowed to these columns.
features = [
    "age",
    "credit_card_balance",
    "bank_account_balance",
    "mortgage_balance",
    "primary_loan_balance",
    "sec_bank_account_balance",
    "savings_account_balance",
    "sec_savings_account_balance",
    "secondary_loan_balance",
    "total_est_nworth",
    "college_loan_balance",
    "transaction_amount",
    "latitude",
    "longitude",
    "fraud",
]

In [53]:
# Keep only the columns named in `features` (in that order)
bankingDf = bankingDf.select(*features)

In [54]:
class ModelFactory():
    """
    Class to build a cross-validated Spark MLlib classification pipeline whose results can be logged via mlflow

    Constructor inputs:
        sparkDf: Spark DataFrame associated with the factory (stored only; the methods below take their data explicitly)
        sparkMlClf: Spark ML estimator appended as the last pipeline stage
        hyperParamGrid: parameter maps handed to CrossValidator as estimatorParamMaps
        modelEvaluator: evaluator CrossValidator uses to compare parameter maps
    Output: makeCv() returns an unfitted CrossValidator; call .fit() on it to train
    """

    def __init__(self, sparkDf, sparkMlClf, hyperParamGrid, modelEvaluator):
        self.sparkDf = sparkDf
        self.sparkMlClf = sparkMlClf
        self.hyperParamGrid = hyperParamGrid
        self.modelEvaluator = modelEvaluator

    def trainTestSplit(self, sparkDf, trainPercentage, seed=200):
        """
        Method to split data into train and test sets
        Requires inputs: DF to split; percentage of train set (number between 0 and 1)
        Optional input: seed passed to randomSplit (defaults to 200, the value previously hard coded)
        Returns train and test sets as spark df's
        Raises ValueError when trainPercentage is outside [0, 1]
        """

        # Fail early with a clear message instead of handing a negative weight to Spark
        if not 0 <= trainPercentage <= 1:
            raise ValueError("trainPercentage must be between 0 and 1, got {0}".format(trainPercentage))

        testPercentage = 1 - trainPercentage
        train, test = sparkDf.randomSplit(weights=[trainPercentage, testPercentage], seed=seed)

        return train, test

    def makeCv(self, inputFeatures, numFolds=3):
        """
        Method to create the Spark MLlib cross validator
        Pipeline: VectorAssembler -> StandardScaler -> the estimator passed to the constructor
        Requires inputs: names of the columns to assemble into the feature vector
        Optional input: number of cross validation folds (defaults to 3)
        Returns an unfitted CrossValidator; calling .fit() on it runs the search

        The assembled vector is written to "features" and the standardized vector to
        "scaledFeatures". The estimator reads whichever column its own featuresCol names,
        so it must be configured with featuresCol="scaledFeatures" to consume the scaled values.
        """
        #Assembling the input columns into a single vector column
        assembler = VectorAssembler(inputCols=list(inputFeatures), outputCol="features")

        #Scaling features
        scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures", withStd=True, withMean=True)

        #Estimator under test goes last
        pipeline = Pipeline(stages=[assembler, scaler, self.sparkMlClf])

        #Cross validate the whole pipeline over the supplied parameter grid
        cv = CrossValidator(estimator=pipeline,
                            evaluator=self.modelEvaluator,
                            estimatorParamMaps=self.hyperParamGrid,
                            numFolds=numFolds)
        return cv

In [55]:
# NOTE(review): defineColTypes is not defined in this notebook; presumably it comes from
# `from mlopsutils import *` and returns the numeric and categorical column groups — confirm.
numeric, cat = defineColTypes(bankingDf)

In [56]:
# NOTE(review): convertToFloat is not defined in this notebook; presumably it comes from mlopsutils — confirm.
# The schema printed below shows the numeric columns as float afterwards, while fraud remains a string.
bankingDf = convertToFloat(bankingDf, numeric)
bankingDf.printSchema()

root
 |-- age: float (nullable = true)
 |-- credit_card_balance: float (nullable = true)
 |-- bank_account_balance: float (nullable = true)
 |-- mortgage_balance: float (nullable = true)
 |-- primary_loan_balance: float (nullable = true)
 |-- sec_bank_account_balance: float (nullable = true)
 |-- savings_account_balance: float (nullable = true)
 |-- sec_savings_account_balance: float (nullable = true)
 |-- secondary_loan_balance: float (nullable = true)
 |-- total_est_nworth: float (nullable = true)
 |-- college_loan_balance: float (nullable = true)
 |-- transaction_amount: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- fraud: string (nullable = true)



In [57]:
def labelEncoder(df, labelCol):
    """
    Function to replace a categorical label column with a numeric one
    Requires inputs: DF holding the label; name of the label column
    Returns the DF with a new "label" column produced by StringIndexer and without the original labelCol
    """
    labelIndexer = StringIndexer(inputCol=labelCol, outputCol="label")
    fittedIndexer = labelIndexer.fit(df)
    # Add the indexed column, then discard the original categorical one
    return fittedIndexer.transform(df).drop(labelCol)

In [58]:
# Replace the string "fraud" column with a numeric "label" column and confirm the new schema
bankingDf = labelEncoder(bankingDf, "fraud")
bankingDf.printSchema()

[Stage 10:>                                                         (0 + 1) / 1]

root
 |-- age: float (nullable = true)
 |-- credit_card_balance: float (nullable = true)
 |-- bank_account_balance: float (nullable = true)
 |-- mortgage_balance: float (nullable = true)
 |-- primary_loan_balance: float (nullable = true)
 |-- sec_bank_account_balance: float (nullable = true)
 |-- savings_account_balance: float (nullable = true)
 |-- sec_savings_account_balance: float (nullable = true)
 |-- secondary_loan_balance: float (nullable = true)
 |-- total_est_nworth: float (nullable = true)
 |-- college_loan_balance: float (nullable = true)
 |-- transaction_amount: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- label: double (nullable = false)



                                                                                

In [59]:
#features = [feature for feature in train.columns if feature != "label"]

In [63]:
mlflow.set_experiment('MySparkMlClf')


def _latestIcebergSnapshot(tableName):
    """
    Look up the most recently committed snapshot of an Iceberg table
    Requires inputs: fully qualified table name (catalog.database.table)
    Returns a tuple: snapshot id, commit date formatted as MM/DD/YYYY, parent snapshot id
    """
    # Columns are selected by name: the first column of the snapshots metadata table is
    # committed_at, so positional access would not return the snapshot id.
    # Ordering explicitly avoids depending on the row order of the metadata table.
    latest = (spark.read.format("iceberg")
              .load("{0}.snapshots".format(tableName))
              .select("snapshot_id", "committed_at", "parent_id")
              .orderBy("committed_at", ascending=False)
              .first())
    return latest["snapshot_id"], latest["committed_at"].strftime('%m/%d/%Y'), latest["parent_id"]


# The context manager ends the run on exit, so no explicit mlflow.end_run() is needed
with mlflow.start_run():

    # Create Logistic Regression Object.
    # It reads "scaledFeatures", the column the StandardScaler stage of ModelFactory.makeCv writes;
    # pointing it at "features" would silently bypass the scaler.
    lr = LogisticRegression(labelCol="label", featuresCol="scaledFeatures")

    lrevaluator = MulticlassClassificationEvaluator(labelCol="label")

    # Create ParamGrid for Cross Validation
    lrparamGrid = (ParamGridBuilder()
                .addGrid(lr.regParam, [0.01, 0.3, 0.6])
                .addGrid(lr.elasticNetParam, [0.0, 0.25, 0.5, 0.75, 1.0])
                .addGrid(lr.maxIter, [1, 5, 10])
                .build())

    # Instantiate the model factory class
    myLogisticRegressionTest = ModelFactory(bankingDf,
                             lr,
                             lrparamGrid,
                             lrevaluator)

    train, test = myLogisticRegressionTest.trainTestSplit(bankingDf, .80)

    # Persist both splits as Iceberg tables so the run can be tied to exact data snapshots
    trainTable = "spark_catalog.{0}.BANKING_TRANSACTIONS_TRAIN_{1}".format(dbname, username)
    testTable = "spark_catalog.{0}.BANKING_TRANSACTIONS_TEST_{1}".format(dbname, username)
    train.writeTo(trainTable).using("iceberg").createOrReplace()
    test.writeTo(testTable).using("iceberg").createOrReplace()

    # One metadata read per table instead of three
    train_snapshot_id, train_committed_at, train_parent_id = _latestIcebergSnapshot(trainTable)
    test_snapshot_id, test_committed_at, test_parent_id = _latestIcebergSnapshot(testTable)

    # The target must not be assembled into the feature vector, otherwise the model
    # is trained on its own answer and the test metrics are meaningless
    inputFeatures = [column for column in bankingDf.columns if column != "label"]
    cv = myLogisticRegressionTest.makeCv(inputFeatures)

    # Run the cross validation on the training dataset. The fitted result exposes the best model as cvModel.bestModel.
    cvModel = cv.fit(train)

    # Score the test set once and reuse the predictions for every metric
    predictions = cvModel.transform(test).cache()
    accuracy = lrevaluator.evaluate(predictions, {lrevaluator.metricName: "accuracy"})
    precision = lrevaluator.evaluate(predictions, {lrevaluator.metricName: "weightedPrecision"})
    recall = lrevaluator.evaluate(predictions, {lrevaluator.metricName: "weightedRecall"})
    predictions.unpersist()

    tags = {
      "train_iceberg_snapshot_id": train_snapshot_id,
      "train_iceberg_snapshot_committed_at": train_committed_at,
      "train_iceberg_parent_id": train_parent_id,
      "train_row_count": train.count(),
      "test_iceberg_snapshot_id": test_snapshot_id,
      "test_iceberg_snapshot_committed_at": test_committed_at,
      "test_iceberg_parent_id": test_parent_id,
      "test_row_count": test.count()
    }

    # Log the best model, its test metrics and the data lineage tags.
    mlflow.spark.log_model(spark_model=cvModel.bestModel, artifact_path='best-model')
    mlflow.log_metric("test_accuracy", accuracy)
    mlflow.log_metric("test_precision", precision)
    mlflow.log_metric("test_recall", recall)
    mlflow.set_tags(tags)



In [None]:
#mlflow.end_run()