/
plot_errors_pipeline.py
267 lines (212 loc) · 8.04 KB
/
plot_errors_pipeline.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
"""
.. _errors-pipeline:
Errors while converting a pipeline
==================================
A pipeline is a patchwork of many different pieces
and the probability that the first attempt to convert it fails
is quite high. This script gathers the most frequent errors
and suggests a solution for each of them.
.. contents::
:local:
Converter not registered
++++++++++++++++++++++++
*LightGBM* implements random forests which follow the
*scikit-learn* API. Because of that, they can be included in a
*scikit-learn* pipeline which can be used to optimize
hyperparameters in a grid search or to validate the model
with a cross validation. However, *sklearn-onnx* does not
implement a converter for an instance of
`LGBMClassifier
<https://lightgbm.readthedocs.io/en/latest/Python-API.html?
highlight=LGBMClassifier#lightgbm.LGBMClassifier>`_.
Let's see what happens when a simple pipeline is being converted.
"""
import skl2onnx
import onnxruntime
import onnx
import sklearn
from pandas import DataFrame
import matplotlib.pyplot as plt
import os
from onnx.tools.net_drawer import GetPydotGraph, GetOpNodeProducer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from skl2onnx.common.data_types import Int64TensorType, FloatTensorType
from skl2onnx.common.data_types import StringTensorType
from sklearn.linear_model import LogisticRegression
from skl2onnx import update_registered_converter
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm # noqa
from skl2onnx.common.data_types import DictionaryType, SequenceType
import numbers
from skl2onnx import convert_sklearn
import numpy
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
# Load iris and keep only its first two features.
data = load_iris()
X, y = data.data[:, :2], data.target

# Shuffle the rows: the same permuted index is applied to X and y.
ind = numpy.arange(X.shape[0])
numpy.random.shuffle(ind)
X, y = X[ind, :].copy(), y[ind].copy()

# A scaler followed by a LightGBM classifier made of a single depth-1 tree.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('lgbm', LGBMClassifier(n_estimators=1, max_depth=1)),
])
pipe.fit(X, y)

##################################
# The conversion happens here and fails.

try:
    model_onnx = convert_sklearn(
        pipe, 'pipeline', initial_types=[('input', FloatTensorType([None, 2]))])
except Exception as exc:
    # The error message is what this section wants to show.
    print(exc)
###################################
# *sklearn-onnx* needs to know the appropriate converter
# for class *LGBMClassifier*, the converter needs to be registered.
# The converter comes with two pieces: a shape calculator which
# computes output shapes based on input shapes and the converter
# itself which extracts the coefficients of the random forest
# and converts them into *ONNX* format.
# First, the shape calculator:
def lightgbm_classifier_shape_extractor(operator):
    """Set the types of the two outputs of a LightGBM classifier operator.

    The class labels are read from ``operator.raw_operator.classes_``:

    * all labels are ``str``: the first output is a string tensor and the
      second a sequence of dictionaries mapping a string to a float;
    * all labels are real numbers or booleans: the first output is an
      int64 tensor and the second a sequence of dictionaries mapping an
      int64 to a float.

    :param operator: operator being converted; the first dimension of its
        first input is reused as the first dimension of both outputs
    :raises ValueError: if the labels fit neither of the two cases above
    """
    batch_size = operator.inputs[0].type.shape[0]
    labels = operator.raw_operator.classes_

    # When every entry of classes_ is itself an array, merge them all
    # into one flat array before looking at the label types.
    if all(isinstance(label, numpy.ndarray) for label in labels):
        labels = numpy.concatenate(labels)

    # Pick the tensor class used for labels and for dictionary keys.
    if all(isinstance(label, str) for label in labels):
        tensor_type = StringTensorType
    elif all(isinstance(label, (numbers.Real, bool, numpy.bool_))
             for label in labels):
        tensor_type = Int64TensorType
    else:
        raise ValueError('Unsupported or mixed label types.')

    operator.outputs[0].type = tensor_type(shape=[batch_size])
    operator.outputs[1].type = SequenceType(
        DictionaryType(tensor_type([]), FloatTensorType([])), batch_size)
###################################
# Then the converter itself: ``convert_lightgbm``, imported from
# *onnxmltools* at the top of this script.
###################################
# They are both registered with the following instruction.
# Tie the shape calculator and the converter to LGBMClassifier.
update_registered_converter(
    LGBMClassifier, 'LightGbmLGBMClassifier',
    lightgbm_classifier_shape_extractor, convert_lightgbm)

#################################
# Let's convert again.

model_onnx = convert_sklearn(
    pipe, 'pipeline', initial_types=[('input', FloatTensorType([None, 2]))])
# Only the beginning of the textual model is displayed.
preview = str(model_onnx)[:300]
print(preview + "\n...")
##################################
# .. _l-dataframe-initial-type:
#
# Working with dataframes
# +++++++++++++++++++++++
#
# *sklearn-onnx* converts a pipeline without knowing the training data,
# more specifically, it does not know the input variables. This is why
# it complains when the parameter *initial_types* is not filled
# when function :func:`skl2onnx.convert_sklearn`
# is called. Let's see what happens without it.
# Same two iris features as before, this time with a logistic regression.
data = load_iris()
X, y = data.data[:, :2], data.target

clf = LogisticRegression()
clf.fit(X, y)

# No initial types are given: the conversion is expected to complain.
try:
    model_onnx = convert_sklearn(clf)
except Exception as exc:
    print(exc)
################################
# We need to define the initial type.
# Let's write some code to automatically
# fill that parameter from a dataframe.
def convert_dataframe_schema(df, drop=None):
    """Build an *initial_types* list from the columns of a dataframe.

    Every kept column becomes one ``(name, type)`` pair whose type has
    shape ``[None, 1]``: ``int64`` columns map to ``Int64TensorType``,
    ``float64`` columns to ``FloatTensorType`` and any other dtype to
    ``StringTensorType``.

    :param df: dataframe whose ``columns`` and ``dtypes`` are read
    :param drop: optional container of column names to leave out
    :return: list of ``(column name, tensor type)`` tuples, in column order
    """
    skipped = () if drop is None else drop

    def tensor_class(dtype):
        # Anything that is neither int64 nor float64 is handled as a string.
        if dtype == 'int64':
            return Int64TensorType
        if dtype == 'float64':
            return FloatTensorType
        return StringTensorType

    return [(name, tensor_class(dtype)([None, 1]))
            for name, dtype in zip(df.columns, df.dtypes)
            if name not in skipped]
# One named column per feature, then one initial type per column.
data = DataFrame(X, columns=["X1", "X2"])
inputs = convert_dataframe_schema(data)
print(inputs)

##################################
# Let's convert again.

try:
    model_onnx = convert_sklearn(clf, initial_types=inputs)
except Exception as exc:
    print(exc)

##################################
# *sklearn-onnx* tells it cannot match two single inputs
# with one input vector of dimension 2.
# Let's try it that way:

model_onnx = convert_sklearn(
    clf, initial_types=[('X', FloatTensorType([None, 2]))])
# Only the beginning of the textual model is displayed.
preview = str(model_onnx)[:300]
print(preview + "\n...")
##################################
# What if now this model is included in a pipeline
# with a `ColumnTransformer
# <https://scikit-learn.org/stable/modules/generated/
# sklearn.compose.ColumnTransformer.html>`_.
# The following pipeline is a way to concatenate multiple
# columns into a single one with a
# `FunctionTransformer
# <https://scikit-learn.org/stable/modules/generated/
# sklearn.preprocessing.FunctionTransformer.html>`_
# with the identity function.
# A FunctionTransformer without arguments leaves its input unchanged,
# the ColumnTransformer therefore only gathers X1 and X2 side by side.
select = ColumnTransformer(
    [('id', FunctionTransformer(), ['X1', 'X2'])])
pipe = Pipeline(steps=[('select', select), ('logreg', clf)])
pipe.fit(data[['X1', 'X2']], y)

# This time the one-type-per-column inputs are what the pipeline gets.
pipe_onnx = convert_sklearn(pipe, initial_types=inputs)
preview = str(pipe_onnx)[:300]
print(preview + "\n...")
#########################################
# Let's draw the pipeline for a better understanding.
# The drawing is named after the graph which is actually rendered
# (the converted pipeline, not the standalone logistic regression).
pydot_graph = GetPydotGraph(
    pipe_onnx.graph, name=pipe_onnx.graph.name, rankdir="TB",
    node_producer=GetOpNodeProducer(
        "docstring", color="orange", fillcolor="orange", style="filled"))
pydot_graph.write_dot("pipeline_concat.dot")

# Graphviz renders the dot file into pipeline_concat.dot.png.
# A failure is reported right away instead of surfacing later as a
# missing (or outdated) image file.
exit_status = os.system('dot -O -Gdpi=300 -Tpng pipeline_concat.dot')
if exit_status != 0:
    raise RuntimeError(
        "Graphviz 'dot' failed with exit status %r while rendering "
        "pipeline_concat.dot; is Graphviz installed and on the PATH?"
        % exit_status)

image = plt.imread("pipeline_concat.dot.png")
fig, ax = plt.subplots(figsize=(40, 20))
ax.imshow(image)
ax.axis('off')
##################################
# Unused inputs
# +++++++++++++
#
# *sklearn-onnx* converts a model into an ONNX graph
# and this graph is then used to compute predictions
# with a backend. The smaller the graph is, the faster
# the computation is. That's why *sklearn-onnx* raises an
# exception when it detects that something can be optimized.
# That's the case when more inputs than needed are declared.
# Let's reuse the previous example with a new dummy feature.
# A constant column the fitted pipeline never reads.
data["dummy"] = 4.5
inputs = convert_dataframe_schema(data)
print(inputs)

####################################
# The new *initial_types* makes the conversion fail.

try:
    pipe_onnx = convert_sklearn(pipe, initial_types=inputs)
except Exception as exc:
    # The error message is what this section wants to show.
    print(exc)
#################################
# **Versions used for this example**
# Report the version of every library this example relies on.
for label, module in (
        ("numpy:", numpy),
        ("scikit-learn:", sklearn),
        ("onnx: ", onnx),
        ("onnxruntime: ", onnxruntime),
        ("skl2onnx: ", skl2onnx)):
    print(label, module.__version__)