Skip to content

Commit

Permalink
pandas read from json don't infer data types (#916)
Browse files Browse the repository at this point in the history
* pandas read from json don't infer data types
* added more tests
* Adding int64 columns for json -> pandas
  • Loading branch information
mparkhe committed Mar 8, 2019
1 parent 8b9e727 commit e20e712
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 2 deletions.
2 changes: 1 addition & 1 deletion mlflow/pyfunc/scoring_server.py
Expand Up @@ -57,7 +57,7 @@ def parse_json_input(json_input, orient="split"):
"""
# pylint: disable=broad-except
try:
return pd.read_json(json_input, orient=orient)
return pd.read_json(json_input, orient=orient, dtype=False)
except Exception:
_handle_serving_error(
error_message=(
Expand Down
47 changes: 46 additions & 1 deletion tests/pyfunc/test_scoring_server.py
Expand Up @@ -12,7 +12,7 @@
import mlflow.sklearn
from mlflow.protos.databricks_pb2 import ErrorCode, MALFORMED_REQUEST, BAD_REQUEST

from tests.helper_functions import pyfunc_serve_and_score_model
from tests.helper_functions import pyfunc_serve_and_score_model, random_int, random_str


ModelWithData = namedtuple("ModelWithData", ["model", "inference_data"])
Expand Down Expand Up @@ -156,3 +156,48 @@ def test_scoring_server_responds_to_invalid_content_type_request_with_unsupporte
data=pandas_split_content,
content_type="not_a_supported_content_type")
assert response.status_code == 415


def test_parse_json_input_records_oriented():
    """Round-trip a mixed int/str frame through "records"-oriented JSON."""
    num_rows = 20
    row_range = range(num_rows)
    columns = {
        "col_m": [random_int(0, 1000) for _ in row_range],
        "col_z": [random_str(4) for _ in row_range],
        "col_a": [random_int() for _ in row_range],
    }
    expected = pd.DataFrame.from_dict(columns)
    payload = expected.to_json(orient="records")
    parsed = pyfunc_scoring_server.parse_json_input(payload, orient="records")
    # The "records" orient does not guarantee column order, so the frames are
    # compared one column Series at a time instead of as a whole.
    for name in columns:
        matches = expected[name] == parsed[name]
        assert all(matches)


def test_parse_json_input_split_oriented():
    """Round-trip a mixed int/str frame through "split"-oriented JSON.

    The "split" orient carries the column order and the index explicitly, so
    the parsed frame can be compared against the original as a whole.
    """
    size = 200
    data = {"col_m": [random_int(0, 1000) for _ in range(size)],
            "col_z": [random_str(4) for _ in range(size)],
            "col_a": [random_int() for _ in range(size)]}
    p1 = pd.DataFrame.from_dict(data)
    p2 = pyfunc_scoring_server.parse_json_input(p1.to_json(orient="split"), orient="split")
    # NOTE: the builtin all() applied to a DataFrame iterates its column
    # labels (always-truthy strings), so `all(p1 == p2)` could never fail.
    # Reduce the element-wise comparison over rows, then over columns.
    assert (p1 == p2).all().all()


def test_records_oriented_json_to_df():
    """Check per-column dtypes after parsing "records"-oriented JSON.

    The "zip" values are numeric-looking strings; they must stay strings
    (dtype "object") rather than being converted to "int64".
    """
    jstr = (
        '['
        '{"zip":"95120","cost":10.45,"score":8},'
        '{"zip":"95128","cost":23.0,"score":0},'
        '{"zip":"95128","cost":12.1,"score":10}'
        ']'
    )
    df = pyfunc_scoring_server.parse_json_input(jstr, orient="records")

    assert set(df.columns) == {'zip', 'cost', 'score'}
    # Compare dtypes column by column: an unordered set of dtype names would
    # not tell us *which* column ended up with which dtype.
    dtype_by_column = {col: str(dt) for col, dt in df.dtypes.items()}
    assert dtype_by_column == {'zip': 'object', 'cost': 'float64', 'score': 'int64'}


def test_split_oriented_json_to_df():
    """Check per-column dtypes after parsing "split"-oriented JSON.

    The "zip" values are numeric-looking strings; they must stay strings
    (dtype "object") rather than being converted to "int64".
    """
    jstr = (
        '{"columns":["zip","cost","count"],"index":[0,1,2],'
        '"data":[["95120",10.45,-8],["95128",23.0,-1],["95128",12.1,1000]]}'
    )
    df = pyfunc_scoring_server.parse_json_input(jstr, orient="split")

    assert set(df.columns) == {'zip', 'cost', 'count'}
    # Compare dtypes column by column: an unordered set of dtype names would
    # not tell us *which* column ended up with which dtype.
    dtype_by_column = {col: str(dt) for col, dt in df.dtypes.items()}
    assert dtype_by_column == {'zip': 'object', 'cost': 'float64', 'count': 'int64'}

0 comments on commit e20e712

Please sign in to comment.