diff --git a/splink_data_normalisation/arrays.py b/splink_data_normalisation/arrays.py index aeb6f5e..37663f9 100644 --- a/splink_data_normalisation/arrays.py +++ b/splink_data_normalisation/arrays.py @@ -1,9 +1,10 @@ from pyspark.sql.dataframe import DataFrame from pyspark.sql.functions import expr, regexp_replace, col -# One of the original motivations for this was problems with Athena handling arrays containing only a null (i.e. [None,], as opposed to None) +# This fixes a problem where Athena can't handle a Parquet file with a zero-length array +# so [None] is fine, and so is None, but [] is not # See here: https://forums.aws.amazon.com/thread.jspa?messageID=874178&tstart=0 -# This no longer seems to be a problem: https://gist.github.com/RobinL/0692e2cd266483b3088646206aa8be62 +# A reprex is here: https://gist.github.com/RobinL/0692e2cd266483b3088646206aa8be62 def fix_zero_length_arrays(df:DataFrame): """For every field of type array, turn zero length arrays into true nulls