### Launch Spark Session via CML Data

In [37]:
import cml.data_v1 as cmldata

# Sample in-code customization of spark configurations
#from pyspark import SparkContext
#SparkContext.setSystemProperty('spark.executor.cores', '1')
#SparkContext.setSystemProperty('spark.executor.memory', '2g')

CONNECTION_NAME = "se-aw-mdl"
conn = cmldata.get_connection(CONNECTION_NAME)
spark = conn.get_spark_session()

# Sample usage to run query through spark
EXAMPLE_SQL_QUERY = "show databases"
spark.sql(EXAMPLE_SQL_QUERY).show()


+------------------+
|         namespace|
+------------------+
|           default|
|information_schema|
|               sys|
+------------------+



In [38]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from pyspark.sql.types import LongType, IntegerType, StringType, FloatType
from pyspark.sql import functions as F
import dbldatagen as dg
import dbldatagen.distributions as dist
from dbldatagen import FakerTextFactory, DataGenerator, fakerText

class DataGen:

    '''Class to Generate Text Data'''

    def __init__(self, spark):
        self.spark = spark

    def dataGen(self, shuffle_partitions_requested = 8, partitions_requested = 8, data_rows = 10000):

        # partition parameters etc.
        self.spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)

        fakerDataspec = (DataGenerator(self.spark, rows=data_rows, partitions=partitions_requested)
                    .withColumn("col1", IntegerType(), minValue=1, maxValue=data_rows)
                    .withColumn("col2", IntegerType(), minValue=1, maxValue=data_rows)
                    .withColumn("label", "string", values=["0", "1"],random=True)
                    )
        df = fakerDataspec.build()
     
        df = df.withColumn("labelStr", F.col("label").cast(FloatType()))\
            .drop("label")\
            .withColumnRenamed("labelStr", "label")
        
        return df

In [39]:
dg = DataGen(spark)

training_df = dg.dataGen()

### Create and Run Experiment

In [40]:
import logging
import json
import shutil
import datetime
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import VectorAssembler

In [41]:
import mlflow.spark

In [42]:
def experimentRun(df):

    mlflow.set_experiment("inference-simple")
    
    ### MLFLOW EXPERIMENT RUN
    with mlflow.start_run() as run:

        maxIter=8
        regParam=0.01

        assembler = VectorAssembler(inputCols=['col1','col2'], outputCol='features')
        lr = LogisticRegression(maxIter=maxIter, regParam=regParam)
        
        pipeline = Pipeline(stages=[assembler, lr])
        model = pipeline.fit(df)

        mlflow.log_param("maxIter", maxIter)
        mlflow.log_param("regParam", regParam)

        #prediction = model.transform(test)
        mlflow.spark.log_model(model, artifact_path="artifacts")

    mlflow.end_run()
    
    experiment_id = mlflow.get_experiment_by_name("inference-simple").experiment_id
    runs_df = mlflow.search_runs(experiment_id, run_view_type=1)
    
    return runs_df

In [43]:
experimentRun(training_df)

24/02/26 22:59:02 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
24/02/26 22:59:03 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.
24/02/26 22:59:03 WARN SparkConf: The configuration key 'spark.yarn.access.hadoopFileSystems' has been deprecated as of Spark 3.0 and may be removed in the future. Please use the new key 'spark.kerberos.access.hadoopFileSystems' instead.


Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,params.maxIter,params.regParam,tags.mlflow.source.git.commit,tags.mlflow.source.type,tags.mlflow.user,tags.engineID,tags.mlflow.source.name,tags.mlflow.log-model.history
0,1be2-gab8-84g9-fe5e,638k-6cau-p4tw-vq6t,EXPERIMENT_RUN_FAILED,/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/1b...,2024-02-26 22:14:54.138494976+00:00,2024-02-26 22:14:54.276999936+00:00,,,b105e0e52fce135147f398b7f132b9a5abadabb7,LOCAL,pauldefusco,dpcneccqtnsprhdi,/usr/local/lib/python3.10/site-packages/ipyker...,
1,y8na-mdz6-lvf0-edz0,638k-6cau-p4tw-vq6t,EXPERIMENT_RUN_FAILED,/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/y8...,2024-02-26 22:23:29.904854016+00:00,2024-02-26 22:23:29.908000+00:00,,,b105e0e52fce135147f398b7f132b9a5abadabb7,LOCAL,pauldefusco,dpcneccqtnsprhdi,/usr/local/lib/python3.10/site-packages/ipyker...,
2,4ymb-6808-x13p-7pue,638k-6cau-p4tw-vq6t,EXPERIMENT_RUN_FAILED,/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/4y...,2024-02-26 22:23:35.184467968+00:00,2024-02-26 22:23:35.212000+00:00,,,b105e0e52fce135147f398b7f132b9a5abadabb7,LOCAL,pauldefusco,dpcneccqtnsprhdi,/usr/local/lib/python3.10/site-packages/ipyker...,
3,h2k1-qwzd-szhd-yc5i,638k-6cau-p4tw-vq6t,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/h2...,2024-02-26 22:24:06.357978112+00:00,2024-02-26 22:24:59.124999936+00:00,8.0,0.01,b105e0e52fce135147f398b7f132b9a5abadabb7,LOCAL,pauldefusco,dpcneccqtnsprhdi,/usr/local/lib/python3.10/site-packages/ipyker...,"[{""run_id"": ""h2k1-qwzd-szhd-yc5i"", ""artifact_p..."
4,lroe-ei98-363k-3761,638k-6cau-p4tw-vq6t,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/lr...,2024-02-26 22:25:53.482353920+00:00,2024-02-26 22:26:55.880999936+00:00,8.0,0.01,b105e0e52fce135147f398b7f132b9a5abadabb7,LOCAL,pauldefusco,dpcneccqtnsprhdi,/usr/local/lib/python3.10/site-packages/ipyker...,"[{""run_id"": ""lroe-ei98-363k-3761"", ""artifact_p..."
5,kaok-pb97-jmsz-85hr,638k-6cau-p4tw-vq6t,EXPERIMENT_RUN_FINISHED,/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/ka...,2024-02-26 22:58:36.159972864+00:00,2024-02-26 22:59:19.392999936+00:00,8.0,0.01,b105e0e52fce135147f398b7f132b9a5abadabb7,LOCAL,pauldefusco,dpcneccqtnsprhdi,/usr/local/lib/python3.10/site-packages/ipyker...,"[{""run_id"": ""kaok-pb97-jmsz-85hr"", ""artifact_p..."


### Create Inference Data

In [44]:
class InferenceGen:

    '''
    Class to Generate Text Data
    Same data as above but without lable column
    '''
    
    def __init__(self, spark):
        self.spark = spark
        
    def dataGen(self, shuffle_partitions_requested = 8, partitions_requested = 8, data_rows = 10000):

        # partition parameters etc.
        self.spark.conf.set("spark.sql.shuffle.partitions", shuffle_partitions_requested)

        fakerDataspec = (DataGenerator(self.spark, rows=data_rows, partitions=partitions_requested)
                    .withColumn("col1", IntegerType(), minValue=1, maxValue=data_rows)
                    .withColumn("col2", IntegerType(), minValue=1, maxValue=data_rows)
                    )
        df = fakerDataspec.build()
        
        return df

In [45]:
dg = InferenceGen(spark)

inference_df = dg.dataGen()

In [46]:
column_names = inference_df.columns
print(column_names)

['col1', 'col2']


In [47]:
import mlflow

logged_model = '/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/lroe-ei98-363k-3761/artifacts/artifacts'

# Load model as a Spark UDF.
loaded_model = mlflow.pyfunc.spark_udf(spark, model_uri=logged_model)

# Predict on a Spark DataFrame.
inference_df.withColumn('predictions', loaded_model(*column_names)).collect()

2024/02/26 23:00:08 INFO mlflow.models.flavor_backend_registry: Selected backend for flavor 'python_function'
24/02/26 23:00:29 WARN TaskSetManager: Lost task 1.0 in stage 58.0 (TID 238) (100.100.114.237 executor 27): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/opt/cmladdons/python/site-packages/mlflow/pyfunc/__init__.py", line 1275, in udf
    loaded_model = mlflow.pyfunc.load_model(local_model_path)
  File "/home/cdsw/.local/lib/python3.10/site-packages/mlflow/pyfunc/spark_model_cache.py", line 47, in get_or_load
    SparkModelCache._models[archive_path] = (load_model(local_model_dir), local_model_dir)
  File "/home/cdsw/.local/lib/python3.10/site-packages/mlflow/pyfunc/__init__.py", line 597, in load_model
    model_impl = importlib.import_module(conf[MAIN])._load_pyfunc(data_path)
  File "/home/cdsw/.local/lib/python3.10/site-packages/mlflow/spark.py", line 842, in _load_pyfunc
    return _PyFuncModelWrapper(spark, _load_model(model_uri=

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/opt/cmladdons/python/site-packages/mlflow/pyfunc/__init__.py", line 1275, in udf
    loaded_model = mlflow.pyfunc.load_model(local_model_path)
  File "/home/cdsw/.local/lib/python3.10/site-packages/mlflow/pyfunc/spark_model_cache.py", line 47, in get_or_load
    SparkModelCache._models[archive_path] = (load_model(local_model_dir), local_model_dir)
  File "/home/cdsw/.local/lib/python3.10/site-packages/mlflow/pyfunc/__init__.py", line 597, in load_model
    model_impl = importlib.import_module(conf[MAIN])._load_pyfunc(data_path)
  File "/home/cdsw/.local/lib/python3.10/site-packages/mlflow/spark.py", line 842, in _load_pyfunc
    return _PyFuncModelWrapper(spark, _load_model(model_uri=path))
  File "/home/cdsw/.local/lib/python3.10/site-packages/mlflow/spark.py", line 748, in _load_model
    return PipelineModel.load(model_uri)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/ml/util.py", line 332, in load
    return cls.read().load(path)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/ml/pipeline.py", line 256, in load
    metadata = DefaultParamsReader.loadMetadata(path, self.sc)
  File "/opt/spark/python/lib/pyspark.zip/pyspark/ml/util.py", line 525, in loadMetadata
    metadataStr = sc.textFile(metadataPath, 1).first()
  File "/opt/spark/python/lib/pyspark.zip/pyspark/rdd.py", line 1591, in first
    raise ValueError("RDD is empty")
ValueError: RDD is empty


24/02/26 23:01:18 WARN TaskSetManager: Lost task 7.0 in stage 58.0 (TID 250) (100.100.114.249 executor 39): TaskKilled (Stage cancelled)
24/02/26 23:01:19 WARN TaskSetManager: Lost task 4.0 in stage 58.0 (TID 247) (100.100.114.246 executor 36): TaskKilled (Stage cancelled)
24/02/26 23:01:19 WARN TaskSetManager: Lost task 5.1 in stage 58.0 (TID 257) (100.100.114.247 executor 37): TaskKilled (Stage cancelled)
24/02/26 23:01:19 WARN TaskSetManager: Lost task 2.3 in stage 58.0 (TID 254) (100.100.114.237 executor 27): TaskKilled (Stage cancelled)
24/02/26 23:01:19 WARN TaskSetManager: Lost task 3.3 in stage 58.0 (TID 255) (100.100.114.239 executor 29): TaskKilled (Stage cancelled)
24/02/26 23:01:20 WARN TaskSetManager: Lost task 0.3 in stage 58.0 (TID 256) (100.100.114.245 executor 35): TaskKilled (Stage cancelled)


### Using Pandas Option

In [31]:
iDfPandas = inference_df.toPandas()

                                                                                

In [32]:
import mlflow

# Load model as a PyFuncModel.
loaded_model = mlflow.pyfunc.load_model(logged_model)

# Predict on a Pandas DataFrame.
import pandas as pd
loaded_model.predict(iDfPandas)

 - mlflow (current: 2.4.1, required: mlflow==2.4)
 - pandas (current: 2.1.3, required: pandas<2)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
2024/02/26 22:34:50 INFO mlflow.spark: File '/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/lroe-ei98-363k-3761/artifacts/artifacts/sparkml' is already on DFS, copy is not necessary.


ValueError: RDD is empty

### Loading Directly from /home/cdsw/.experiments with PySpark

In [34]:
# read pickled model via pipeline api
from pyspark.ml.pipeline import PipelineModel
persistedModel = PipelineModel.load(loaded_model)

# predict
predictionsDF = persistedModel.transform(inference_df)

TypeError: expected str, bytes or os.PathLike object, not function

Trying different folders 

In [36]:
mPath = '/home/cdsw/.experiments/638k-6cau-p4tw-vq6t/lroe-ei98-363k-3761/artifacts/artifacts/sparkml/'

persistedModel = PipelineModel.load(mPath)

# predict
predictionsDF = persistedModel.transform(inference_df)

ValueError: RDD is empty