Update arrays.py

moj-analytical-services · Oct 6, 2020 · a7087be · a7087be
1 parent 6e57f0e
commit a7087be
Showing 1 changed file with 3 additions and 2 deletions.
diff --git a/splink_data_normalisation/arrays.py b/splink_data_normalisation/arrays.py
@@ -1,9 +1,10 @@
 from pyspark.sql.dataframe import DataFrame
 from pyspark.sql.functions import expr, regexp_replace, col
 
-# One of the original motivations for this was problems with Athena handling arrays containing only a null (i.e. [None,], as opposed to None)
+# This fixes a problem where Athena can't handle a Parquet file containing a zero-length array,
+# so [None] is fine, and so is None, but [] is not.
 # See here: https://forums.aws.amazon.com/thread.jspa?messageID=874178&tstart=0
-# This no longer seems to be a problem:  https://gist.github.com/RobinL/0692e2cd266483b3088646206aa8be62
+# A reprex is here: https://gist.github.com/RobinL/0692e2cd266483b3088646206aa8be62
 def fix_zero_length_arrays(df:DataFrame):
     """For every field of type array, turn zero length arrays into true nulls