Skip to content
This repository has been archived by the owner on May 18, 2023. It is now read-only.

Commit

Permalink
improve array function
Browse files: browse the repository at this point in the history
  • Loading branch information
RobinL committed Jul 1, 2020
1 parent 054e2e3 commit 148f60f
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 5 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "splink_data_normalisation"
-version = "0.1.1"
+version = "0.1.2"
description = ""
authors = ["Robin Linacre <robin.linacre@digital.justice.gov.uk>"]
license = "MIT"
Expand Down
2 changes: 1 addition & 1 deletion splink_data_normalisation/arrays.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ def fix_zero_length_arrays(df:DataFrame):

stmt = """
case
-when size({c}) > 0 then {c}
+when size(filter({c}, x -> x is not null and trim(x) != '')) > 0 then filter({c}, x -> x is not null and trim(x) != '')
else null
end
"""
Expand Down
13 changes: 10 additions & 3 deletions tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,26 @@ def test_fix_1(spark):
names_list = [
{"id": 1, "my_arr1": ["a", "b", "c"], "other_arr": [ ],"my_str": "a"},
{"id": 2, "my_arr1": [ ], "other_arr": [1],"my_str": "a"},

]
{"id": 3, "my_arr1": [None, '', 'c'], "other_arr": [1],"my_str": "a"},
{"id": 4, "my_arr1": [None, '' ], "other_arr": [1],"my_str": "a"},
{"id": 5, "my_arr1": ['' ], "other_arr": [1],"my_str": "a"},
{"id": 6, "my_arr1": [None, None ], "other_arr": [1],"my_str": "a"},
]

df = spark.createDataFrame(Row(**x) for x in names_list)
df = df.select(list(names_list[0].keys()))

df = fix_zero_length_arrays(df)

df_result = df.toPandas()

df_expected = [
{"id": 1, "my_arr1": ["a", "b", "c"], "other_arr": None,"my_str": "a"},
{"id": 2, "my_arr1": None, "other_arr": [1] ,"my_str": "a"},
{"id": 3, "my_arr1": ['c'], "other_arr": [1] ,"my_str": "a"},
{"id": 4, "my_arr1": None, "other_arr": [1] ,"my_str": "a"},
{"id": 5, "my_arr1": None, "other_arr": [1] ,"my_str": "a"},
{"id": 6, "my_arr1": None, "other_arr": [1] ,"my_str": "a"},

]

df_expected = pd.DataFrame(df_expected)
Expand Down

0 comments on commit 148f60f

Please sign in to comment.