# Convert a SparkML Model to ONNX Format for API Serving

#### Spark ML models don't naturally fit the API-serving use case because they require a Spark Session, which is heavyweight and a bit clunky.

#### As an alternative, you can convert your Spark ML model to ONNX format and then host it behind a REST endpoint.

#### ONNX Models are serialized, portable, and don't require a Spark Session.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml import PipelineModel

In [2]:
import random, os
import numpy as np

In [3]:
import onnxruntime
import onnxmltools
import onnx

In [4]:
from onnxmltools.convert.common.data_types import FloatTensorType
from onnxmltools.convert.sparkml.utils import buildInitialTypesSimple

## Import the Spark ML Model

In [5]:
# Build (or reuse) the Spark session for this notebook. The STORAGE environment
# variable must be set; its value is handed to Spark as
# spark.yarn.access.hadoopFileSystems.
spark = (
    SparkSession.builder
    .appName("ONNX_Model")
    .config("spark.hadoop.fs.s3a.s3guard.ddb.region", "us-east-2")
    .config("spark.yarn.access.hadoopFileSystems", os.environ["STORAGE"])
    .getOrCreate()
)

Setting spark.hadoop.yarn.resourcemanager.principal to pauldefusco


In [6]:
# A single row is enough: this frame is only used further down to derive the
# column names and types of the ONNX input signature.
sample_query = "SELECT * FROM default.lc_smote_subset LIMIT 1"
df = spark.sql(sample_query)

Hive Session ID = 840020aa-a80a-4042-a17e-75d4e18aea95


In [7]:
def load_pipeline():
    """Load the fitted Spark ML pipeline stored under the STORAGE location.

    Returns:
        The ``PipelineModel`` read from ``$STORAGE/pdefusco/pipeline``.

    Raises:
        KeyError: if the ``STORAGE`` environment variable is not set.
    """
    pipeline_path = os.environ["STORAGE"] + "/pdefusco/pipeline"
    return PipelineModel.load(pipeline_path)

In [8]:
def transform_pipeline(spark_df, pmodel, cols=None):
    """Select the feature columns, cast them to float, and run the pipeline.

    Args:
        spark_df: Spark DataFrame containing at least the feature columns.
        pmodel: fitted pipeline; any object exposing ``transform(df)``.
        cols: optional list of feature column names. Defaults to the five
            feature columns hard-coded for this notebook.

    Returns:
        Whatever ``pmodel.transform`` returns for the float-cast feature frame.
    """
    if cols is None:
        cols = ['acc_now_delinq', 'acc_open_past_24mths', 'annual_inc', 'avg_cur_bal', 'funded_amnt']

    # Cast every feature in a single select instead of one withColumn call per
    # column: each withColumn adds another projection to the query plan.
    # alias() keeps the original column name after the cast.
    float_cols = [spark_df[c].cast("float").alias(c) for c in cols]
    return pmodel.transform(spark_df.select(*float_cols))

In [9]:
# Load the fitted pipeline from storage (see load_pipeline above).
modelPipeline = load_pipeline()

                                                                                

In [10]:
# Run the one-row sample through the pipeline: float-cast feature columns in,
# transformed frame out.
sample_df = transform_pipeline(df, modelPipeline)

In [11]:
# Keep only the raw feature columns; the ONNX input types are built from this frame.
feature_cols = ['acc_now_delinq', 'acc_open_past_24mths', 'annual_inc', 'avg_cur_bal', 'funded_amnt']
sample_df = sample_df.select(*feature_cols)

## Convert the Model

In [12]:
# Derive one (column name, tensor type) pair per column of the sample frame.
initial_types = buildInitialTypesSimple(sample_df)

In [13]:
# Display the input signature that is passed to the converter.
initial_types

[('acc_now_delinq', FloatTensorType(shape=[1, 1])),
 ('acc_open_past_24mths', FloatTensorType(shape=[1, 1])),
 ('annual_inc', FloatTensorType(shape=[1, 1])),
 ('avg_cur_bal', FloatTensorType(shape=[1, 1])),
 ('funded_amnt', FloatTensorType(shape=[1, 1]))]

In [14]:
# Convert the fitted Spark ML pipeline to an ONNX model, using the input
# signature derived from the sample frame.
onnx_model = onnxmltools.convert.convert_sparkml(
    modelPipeline,
    name='My Sparkml Pipeline',
    initial_types=initial_types,
)

The maximum opset needed by this model is only 8.


{'classlabels_ints': [0, 1],
 'coefficients': [0.14303786136899024,
                  -0.27123271556878475,
                  3.742220431494416,
                  1.8575953342840612,
                  0.30977227348174174,
                  -0.14303786136899024,
                  0.27123271556878475,
                  -3.742220431494416,
                  -1.8575953342840612,
                  -0.30977227348174174],
 'intercepts': [-1.4336914041825082, 1.4336914041825082],
 'multi_class': 1,
 'name': 'LinearClassifier',
 'post_transform': 'NONE'}


In [15]:
# Serialize the converted model to disk.
# open() does not create missing parent directories, so make sure models/
# exists first; otherwise a fresh environment fails with FileNotFoundError.
onnx_path = "models/model.onnx"
os.makedirs(os.path.dirname(onnx_path), exist_ok=True)
with open(onnx_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

In [16]:
# Shut down the Spark session now that the model has been exported.
spark.stop()