-
Notifications
You must be signed in to change notification settings - Fork 4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
initpy and test_transformers_model_export #10538
Changes from 37 commits
cfebd44
1b899c2
82d011d
97ae2f4
515c9f2
fc6ce63
155fd99
b8c51c2
9710901
167d836
37a00b8
63de361
a3d807f
b8b55d7
d745eee
e486eb9
67db66b
a104e85
0e06964
a546040
3b66134
fc6f28b
fdcefea
37c841b
4c95261
95ed7b7
603a4d4
f3bbe7c
abd8c9f
98cff65
2b84af6
5adc77d
d0db696
e5e89bd
7c555d3
ab9fee7
4dbbaf8
a309e45
be250a6
a44da12
0b5ab75
e73a548
2c54870
3973ffb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1344,7 +1344,6 @@ def _should_add_pyfunc_to_model(pipeline) -> bool: | |
"DocumentQuestionAnsweringPipeline", | ||
"ImageToTextPipeline", | ||
"VisualQuestionAnsweringPipeline", | ||
"ImageClassificationPipeline", | ||
"ImageSegmentationPipeline", | ||
"DepthEstimationPipeline", | ||
"ObjectDetectionPipeline", | ||
|
@@ -1354,11 +1353,6 @@ def _should_add_pyfunc_to_model(pipeline) -> bool: | |
"ZeroShotAudioClassificationPipeline", | ||
] | ||
|
||
impermissible_attrs = {"image_processor"} | ||
|
||
for attr in impermissible_attrs: | ||
if getattr(pipeline, attr, None) is not None: | ||
return False | ||
for model_type in exclusion_model_types: | ||
if hasattr(transformers, model_type): | ||
if isinstance(pipeline.model, getattr(transformers, model_type)): | ||
|
@@ -1426,7 +1420,13 @@ def _get_default_pipeline_signature(pipeline, example=None, model_config=None) - | |
return ModelSignature( | ||
inputs=Schema([ColSpec("string")]), outputs=Schema([ColSpec("string")]) | ||
) | ||
elif isinstance(pipeline, transformers.TextClassificationPipeline): | ||
elif isinstance( | ||
pipeline, | ||
( | ||
transformers.TextClassificationPipeline, | ||
transformers.ImageClassificationPipeline, | ||
), | ||
): | ||
return ModelSignature( | ||
inputs=Schema([ColSpec("string")]), | ||
outputs=Schema([ColSpec("string", name="label"), ColSpec("double", name="score")]), | ||
|
@@ -1816,6 +1816,9 @@ def _predict(self, data): | |
output_key = "token_str" | ||
elif isinstance(self.pipeline, transformers.TextClassificationPipeline): | ||
output_key = "label" | ||
elif isinstance(self.pipeline, transformers.ImageClassificationPipeline): | ||
data = self._convert_image_input(data) | ||
output_key = "label" | ||
elif isinstance(self.pipeline, transformers.ZeroShotClassificationPipeline): | ||
output_key = "labels" | ||
data = self._parse_json_encoded_list(data, "candidate_labels") | ||
|
@@ -1894,7 +1897,11 @@ def _predict(self, data): | |
output = json.dumps(raw_output) | ||
elif isinstance( | ||
self.pipeline, | ||
(transformers.AudioClassificationPipeline, transformers.TextClassificationPipeline), | ||
( | ||
transformers.AudioClassificationPipeline, | ||
transformers.TextClassificationPipeline, | ||
transformers.ImageClassificationPipeline, | ||
), | ||
): | ||
return pd.DataFrame(raw_output) | ||
else: | ||
|
@@ -2581,6 +2588,65 @@ def _convert_cast_lists_from_np_back_to_list(data): | |
parsed_data.append(entry) | ||
return parsed_data | ||
|
||
@staticmethod | ||
def is_base64_image(image): | ||
"""Check whether input image is a base64 encoded""" | ||
|
||
try: | ||
return base64.b64encode(base64.b64decode(image)).decode("utf-8") == image | ||
except binascii.Error: | ||
return False | ||
|
||
def _convert_image_input(self, input_data): | ||
""" | ||
Conversion utility for decoding the base64 encoded bytes data of a raw image file when | ||
parsed through model serving, if applicable. Direct usage of the pyfunc implementation | ||
outside of model serving will treat this utility as a noop. | ||
|
||
For reference, the expected encoding for input to Model Serving will be: | ||
|
||
import requests | ||
import base64 | ||
|
||
response = requests.get("https://www.my.images/a/sound/file.jpg") | ||
encoded_image = base64.b64encode(response.content).decode("utf-8") | ||
|
||
inference_data = json.dumps({"inputs": [encoded_image]}) | ||
|
||
or | ||
|
||
inference_df = pd.DataFrame( | ||
pd.Series([encoded_image], name="image_file") | ||
) | ||
split_dict = {"dataframe_split": inference_df.to_dict(orient="split")} | ||
split_json = json.dumps(split_dict) | ||
|
||
or | ||
|
||
records_dict = {"dataframe_records": inference_df.to_dict(orient="records")} | ||
records_json = json.dumps(records_dict) | ||
|
||
This utility will convert this JSON encoded, base64 encoded text back into bytes for | ||
input into the Image pipelines for inference. | ||
""" | ||
|
||
def process_input_element(input_element): | ||
input_value = next(iter(input_element.values())) | ||
if isinstance(input_value, str) and not self.is_base64_image(input_value): | ||
self._validate_str_input_uri_or_file(input_value) | ||
return input_value | ||
|
||
if isinstance(input_data, list) and all( | ||
isinstance(element, dict) for element in input_data | ||
): | ||
# Use a list comprehension for readability | ||
# the elimination of empty collection declarations | ||
return [process_input_element(element) for element in input_data] | ||
elif isinstance(input_data, str) and not self.is_base64_image(input_data): | ||
self._validate_str_input_uri_or_file(input_data) | ||
|
||
return input_data | ||
|
||
def _convert_audio_input(self, data): | ||
""" | ||
Conversion utility for decoding the base64 encoded bytes data of a raw soundfile when | ||
|
@@ -2669,7 +2735,8 @@ def decode_audio(encoded): | |
@staticmethod | ||
def _validate_str_input_uri_or_file(input_str): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could we keep this method as it is and move your check on the data into a new function? This function's name suggests that it only checks a single input string. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Fixed, thanks. |
||
""" | ||
Validation of blob references to audio files, if a string is input to the ``predict`` | ||
Validation of blob references to either audio or image files, | ||
if a string is input to the ``predict`` | ||
method, perform validation of the string contents by checking for a valid uri or | ||
filesystem reference instead of surfacing the cryptic stack trace that is otherwise raised | ||
for an invalid uri input. | ||
|
@@ -2687,7 +2754,7 @@ def is_uri(s): | |
if not valid_uri: | ||
raise MlflowException( | ||
"An invalid string input was provided. String inputs to " | ||
"audio files must be either a file location or a uri.", | ||
"audio or image files must be either a file location or a uri.", | ||
KonakanchiSwathi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
error_code=BAD_REQUEST, | ||
) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,6 +50,7 @@ | |
_record_pipeline_components, | ||
_should_add_pyfunc_to_model, | ||
_TransformersModel, | ||
_TransformersWrapper, | ||
_validate_transformers_task_type, | ||
_write_card_data, | ||
get_default_conda_env, | ||
|
@@ -80,7 +81,8 @@ | |
# runners#supported-runners-and-hardware-resources for instance specs. | ||
RUNNING_IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" | ||
GITHUB_ACTIONS_SKIP_REASON = "Test consumes too much memory" | ||
|
||
image_url = "https://raw.githubusercontent.com/mlflow/mlflow/master/tests/datasets/cat.png" | ||
image_file_path = pathlib.Path(pathlib.Path(__file__).parent.parent, "datasets", "cat.png") | ||
# Test that can only be run locally: | ||
# - Summarization pipeline tests | ||
# - TextClassifier pipeline tests | ||
|
@@ -479,7 +481,7 @@ def test_instance_extraction(small_qa_pipeline): | |
("small_qa_pipeline", True), | ||
("small_seq2seq_pipeline", True), | ||
("small_multi_modal_pipeline", False), | ||
("small_vision_model", False), | ||
("small_vision_model", True), | ||
], | ||
) | ||
def test_pipeline_eligibility_for_pyfunc_registration(model, result, request): | ||
|
@@ -599,8 +601,7 @@ def test_model_card_acquisition_vision_model(small_vision_model): | |
def test_vision_model_save_pipeline_with_defaults(small_vision_model, model_path): | ||
mlflow.transformers.save_model(transformers_model=small_vision_model, path=model_path) | ||
# validate inferred pip requirements | ||
with model_path.joinpath("requirements.txt").open() as file: | ||
requirements = file.read() | ||
requirements = model_path.joinpath("requirements.txt").read_text() | ||
reqs = {req.split("==")[0] for req in requirements.split("\n")} | ||
expected_requirements = {"torch", "torchvision", "transformers"} | ||
assert reqs.intersection(expected_requirements) == expected_requirements | ||
|
@@ -625,6 +626,29 @@ def test_vision_model_save_pipeline_with_defaults(small_vision_model, model_path | |
assert flavor_config["source_model_name"] == "google/mobilenet_v2_1.0_224" | ||
|
||
|
||
def test_vision_model_save_model_for_task_and_card_inference(small_vision_model, model_path):
    """Saving a vision pipeline writes the inferred requirements, model card and MLmodel."""
    mlflow.transformers.save_model(transformers_model=small_vision_model, path=model_path)

    # The inferred pip requirements must cover the vision stack.
    requirements_text = model_path.joinpath("requirements.txt").read_text()
    package_names = {line.split("==")[0] for line in requirements_text.split("\n")}
    required_packages = {"torch", "torchvision", "transformers"}
    assert package_names.intersection(required_packages) == required_packages

    # The inferred model card data carries the vision tags.
    card_yaml = model_path.joinpath("model_card_data.yaml").read_bytes()
    assert yaml.safe_load(card_yaml)["tags"] == ["vision", "image-classification"]

    # The inferred model card text must not be empty.
    assert len(model_path.joinpath("model_card.md").read_text(encoding="utf-8")) > 0

    # The MLmodel file records the pipeline, model type, task and source model.
    mlmodel_yaml = model_path.joinpath("MLmodel").read_bytes()
    transformers_flavor = yaml.safe_load(mlmodel_yaml)["flavors"]["transformers"]
    assert transformers_flavor["instance_type"] == "ImageClassificationPipeline"
    assert transformers_flavor["pipeline_model_type"] == "MobileNetV2ForImageClassification"
    assert transformers_flavor["task"] == "image-classification"
    assert transformers_flavor["source_model_name"] == "google/mobilenet_v2_1.0_224"
|
||
|
||
def test_qa_model_save_model_for_task_and_card_inference(small_seq2seq_pipeline, model_path): | ||
mlflow.transformers.save_model( | ||
transformers_model={ | ||
|
@@ -971,11 +995,6 @@ def test_transformers_log_model_with_no_registered_model_name(small_vision_model | |
conda_env=str(conda_env), | ||
) | ||
mlflow.tracking._model_registry.fluent._register_model.assert_not_called() | ||
model_uri = f"runs:/{mlflow.active_run().info.run_id}/{artifact_path}" | ||
model_path = pathlib.Path(_download_artifact_from_uri(artifact_uri=model_uri)) | ||
model_config = Model.load(str(model_path.joinpath("MLmodel"))) | ||
# Vision models can't be loaded as pyfunc currently. | ||
assert pyfunc.FLAVOR_NAME not in model_config.flavors | ||
|
||
|
||
def test_transformers_save_persists_requirements_in_mlflow_directory( | ||
|
@@ -1341,6 +1360,40 @@ def test_qa_pipeline_pyfunc_load_and_infer(small_qa_pipeline, model_path, infere | |
assert all(isinstance(element, str) for element in inference) | ||
|
||
|
||
@pytest.mark.parametrize(
    "inference_payload",
    [
        image_url,
        str(image_file_path),
        pytest.param(
            "base64",
            marks=pytest.mark.skipif(
                Version(transformers.__version__) < Version("4.33"),
                reason="base64 feature not present",
            ),
        ),
    ],
)
def test_vision_pipeline_pyfunc_load_and_infer(small_vision_model, model_path, inference_payload):
    """The pyfunc flavor of a saved vision pipeline agrees with the native flavor."""
    # "base64" is a sentinel: the encoded image is built here rather than in the parameter
    # list so the lengthy base64 text does not end up in the test id.
    if inference_payload == "base64":
        raw_image = image_file_path.read_bytes()
        inference_payload = base64.b64encode(raw_image).decode("utf-8")

    signature_output = mlflow.transformers.generate_signature_output(
        small_vision_model, inference_payload
    )
    mlflow.transformers.save_model(
        transformers_model=small_vision_model,
        path=model_path,
        signature=infer_signature(inference_payload, signature_output),
    )

    # Score through the pyfunc flavor ...
    predictions = mlflow.pyfunc.load_model(model_path).predict(inference_payload)

    # ... and through the native transformers flavor; the first record must match.
    expected_predictions = mlflow.transformers.load_model(model_path).predict(inference_payload)
    first_record = predictions.to_dict("records")[0]
    assert list(first_record.values()) == expected_predictions
|
||
|
||
@pytest.mark.parametrize( | ||
("data", "result"), | ||
[ | ||
|
@@ -2062,6 +2115,65 @@ def test_qa_pipeline_pyfunc_predict(small_qa_pipeline): | |
assert values.to_dict(orient="records") == [{0: "Run"}] | ||
|
||
|
||
@pytest.mark.parametrize(
    ("input_image", "result"),
    [
        (str(image_file_path), False),
        (image_url, False),
        ("base64", True),
        ("random string", False),
    ],
)
def test_vision_is_base64_image(input_image, result):
    """Only a genuinely base64 encoded image is recognised as base64."""
    # "base64" is a sentinel that is swapped for the encoded test image at run time.
    candidate = (
        base64.b64encode(image_file_path.read_bytes()).decode("utf-8")
        if input_image == "base64"
        else input_image
    )
    assert _TransformersWrapper.is_base64_image(candidate) == result
|
||
|
||
@pytest.mark.parametrize( | ||
"inference_payload", | ||
[ | ||
[str(image_file_path)], | ||
[image_url], | ||
pytest.param( | ||
"base64", | ||
marks=pytest.mark.skipif( | ||
Version(transformers.__version__) < Version("4.33"), | ||
reason="base64 feature not present", | ||
), | ||
), | ||
], | ||
) | ||
def test_vision_pipeline_pyfunc_predict(small_vision_model, inference_payload): | ||
if inference_payload == "base64": | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's not embed complex logic like this that mutates the parameter based on string matching. Just create another test explicitly for this condition. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well, I moved this logic inside the test because passing the encoded image as an input parameter prints a lengthy base64 string in the test suite output. We would now have to duplicate three tests if we want to create a new test for base64. |
||
inference_payload = [ | ||
base64.b64encode(image_file_path.read_bytes()).decode("utf-8"), | ||
] | ||
artifact_path = "image_classification_model" | ||
|
||
# Log the image classification model | ||
with mlflow.start_run(): | ||
mlflow.transformers.log_model( | ||
transformers_model=small_vision_model, | ||
artifact_path=artifact_path, | ||
) | ||
model_uri = mlflow.get_artifact_uri(artifact_path) | ||
pyfunc_inference_payload = json.dumps({"inputs": inference_payload}) | ||
response = pyfunc_serve_and_score_model( | ||
model_uri, | ||
data=pyfunc_inference_payload, | ||
content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, | ||
extra_args=["--env-manager", "local"], | ||
) | ||
KonakanchiSwathi marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
predictions = PredictionsResponse.from_json(response.content.decode("utf-8")).get_predictions() | ||
|
||
transformers_loaded_model = mlflow.transformers.load_model(model_uri) | ||
expected_predictions = transformers_loaded_model.predict(inference_payload) | ||
|
||
assert [list(pred.values()) for pred in predictions.to_dict("records")] == expected_predictions | ||
|
||
|
||
def test_classifier_pipeline_pyfunc_predict(text_classification_pipeline): | ||
artifact_path = "text_classifier_model" | ||
with mlflow.start_run(): | ||
|
@@ -3486,6 +3598,49 @@ def test_save_model_card_with_non_utf_characters(tmp_path, model_name): | |
assert data == card_data.data.to_dict() | ||
|
||
|
||
def test_vision_pipeline_pyfunc_predict_with_kwargs(small_vision_model): | ||
artifact_path = "image_classification_model" | ||
|
||
parameters = { | ||
"top_k": 2, | ||
} | ||
inference_payload = json.dumps( | ||
{ | ||
"inputs": [image_url], | ||
"params": parameters, | ||
} | ||
) | ||
|
||
with mlflow.start_run(): | ||
mlflow.transformers.log_model( | ||
transformers_model=small_vision_model, | ||
artifact_path=artifact_path, | ||
signature=infer_signature( | ||
image_url, | ||
mlflow.transformers.generate_signature_output(small_vision_model, image_url), | ||
params=parameters, | ||
), | ||
) | ||
model_uri = mlflow.get_artifact_uri(artifact_path) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Similar here, let's use model_info.model_uri |
||
|
||
transformers_loaded_model = mlflow.transformers.load_model(model_uri) | ||
expected_predictions = transformers_loaded_model.predict(image_url) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You could use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. transformers_loaded_model.predict(image_url, params=parameters) -> this is not supported. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah yes, you could use mlflow.pyfunc.load_model(model_uri) and then predict with parameters. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Here I compared mlflow.pyfunc.load_model(model_uri).predict(data, params) == mlflow.transformers.load_model(model_uri).predict(data)[: top_k] |
||
|
||
response = pyfunc_serve_and_score_model( | ||
model_uri, | ||
data=inference_payload, | ||
content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON, | ||
extra_args=["--env-manager", "local"], | ||
) | ||
|
||
predictions = PredictionsResponse.from_json(response.content.decode("utf-8")).get_predictions() | ||
|
||
assert ( | ||
list(predictions.to_dict("records")[0].values()) | ||
== expected_predictions[: parameters["top_k"]] | ||
) | ||
|
||
|
||
def test_qa_pipeline_pyfunc_predict_with_kwargs(small_qa_pipeline): | ||
artifact_path = "qa_model" | ||
data = { | ||
|
@@ -3875,9 +4030,7 @@ def test_basic_model_with_accelerate_homogeneous_mapping_works(tmp_path): | |
mlflow.transformers.save_model(transformers_model=pipeline, path=str(tmp_path / "model")) | ||
|
||
loaded = mlflow.transformers.load_model(str(tmp_path / "model")) | ||
|
||
text = "Apples are delicious" | ||
|
||
assert loaded(text) == pipeline(text) | ||
|
||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is decode("utf-8") necessary? If not, could we reuse is_base64?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Without the decode part, they do not match.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think if you don't decode the image when you read it, then it matches:
image = base64.b64encode(read_image("cat_image.jpg"))
base64.b64encode(base64.b64decode(image)) == image
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
transformers accepts the base64 image in a string format. It's not accepting direct b64 encoded image format which is in bytes.