Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

pandas read_json: don't infer data types #916

Merged
merged 6 commits into from Mar 8, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 1 addition & 1 deletion mlflow/pyfunc/scoring_server.py
Expand Up @@ -57,7 +57,7 @@ def parse_json_input(json_input, orient="split"):
"""
# pylint: disable=broad-except
try:
return pd.read_json(json_input, orient=orient)
return pd.read_json(json_input, orient=orient, dtype=False)
except Exception:
_handle_serving_error(
error_message=(
Expand Down
47 changes: 46 additions & 1 deletion tests/pyfunc/test_scoring_server.py
Expand Up @@ -12,7 +12,7 @@
import mlflow.sklearn
from mlflow.protos.databricks_pb2 import ErrorCode, MALFORMED_REQUEST, BAD_REQUEST

from tests.helper_functions import pyfunc_serve_and_score_model
from tests.helper_functions import pyfunc_serve_and_score_model, random_int, random_str


# Lightweight record type with two fields: ``model`` and ``inference_data``.
ModelWithData = namedtuple("ModelWithData", ["model", "inference_data"])
Expand Down Expand Up @@ -156,3 +156,48 @@ def test_scoring_server_responds_to_invalid_content_type_request_with_unsupporte
data=pandas_split_content,
content_type="not_a_supported_content_type")
assert response.status_code == 415


def test_parse_json_input_records_oriented():
    """Round-trip a frame through "records"-oriented JSON and compare columns."""
    num_rows = 20
    data = {
        "col_m": [random_int(0, 1000) for _ in range(num_rows)],
        "col_z": [random_str(4) for _ in range(num_rows)],
        "col_a": [random_int() for _ in range(num_rows)],
    }
    expected = pd.DataFrame.from_dict(data)
    records_json = expected.to_json(orient="records")
    parsed = pyfunc_scoring_server.parse_json_input(records_json, orient="records")
    # The "records" orient does not guarantee column order, so each column is
    # compared on its own rather than comparing the frames wholesale.
    for column_name in data:
        matches = expected[column_name] == parsed[column_name]
        assert matches.all()


def test_parse_json_input_split_oriented():
    """Round-trip a frame through "split"-oriented JSON and compare every cell."""
    size = 200
    data = {
        "col_m": [random_int(0, 1000) for _ in range(size)],
        "col_z": [random_str(4) for _ in range(size)],
        "col_a": [random_int() for _ in range(size)],
    }
    p1 = pd.DataFrame.from_dict(data)
    p2 = pyfunc_scoring_server.parse_json_input(p1.to_json(orient="split"), orient="split")
    # Iterating a DataFrame yields its column labels, so the builtin
    # ``all(p1 == p2)`` only tests that the labels are truthy and can never
    # fail. Reduce the element-wise comparison over both axes instead.
    assert (p1 == p2).all().all()


def test_records_oriented_json_to_df():
    """Check that "records"-oriented JSON keeps the string "zip" column as object.

    The numeric-looking strings in "zip" must not be converted to "int64".
    """
    jstr = (
        '['
        '{"zip":"95120","cost":10.45,"score":8},'
        '{"zip":"95128","cost":23.0,"score":0},'
        '{"zip":"95128","cost":12.1,"score":10}'
        ']'
    )
    df = pyfunc_scoring_server.parse_json_input(jstr, orient="records")

    assert set(df.columns) == {'zip', 'cost', 'score'}
    # Compare dtypes per column: an unordered set of dtype names would still
    # match if the dtypes were assigned to the wrong columns.
    dtype_by_column = {col: str(dt) for col, dt in df.dtypes.items()}
    assert dtype_by_column == {'zip': 'object', 'cost': 'float64', 'score': 'int64'}


def test_split_oriented_json_to_df():
    """Check that "split"-oriented JSON keeps the string "zip" column as object.

    The numeric-looking strings in "zip" must not be converted to "int64".
    """
    jstr = (
        '{"columns":["zip","cost","count"],"index":[0,1,2],'
        '"data":[["95120",10.45,-8],["95128",23.0,-1],["95128",12.1,1000]]}'
    )
    df = pyfunc_scoring_server.parse_json_input(jstr, orient="split")

    assert set(df.columns) == {'zip', 'cost', 'count'}
    # Compare dtypes per column: an unordered set of dtype names would still
    # match if the dtypes were assigned to the wrong columns.
    dtype_by_column = {col: str(dt) for col, dt in df.dtypes.items()}
    assert dtype_by_column == {'zip': 'object', 'cost': 'float64', 'count': 'int64'}