Skip to content
This repository has been archived by the owner on May 18, 2023. It is now read-only.

Commit

Permalink
arr fix
Browse files · Browse the repository at this point in the history
  • Loading branch information
RobinL committed Feb 24, 2021
1 parent 07c4707 commit e068f04
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "splink_data_standardisation"
version = "0.2.5"
version = "0.2.6"
description = ""
authors = ["Robin Linacre <robin.linacre@digital.justice.gov.uk>"]
license = "MIT"
Expand Down
14 changes: 7 additions & 7 deletions splink_data_standardisation/arrays.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.functions import expr, regexp_replace, col
from pyspark.sql.functions import expr
import pyspark.sql.functions as f

# This fixes a problem where athena can't handle a parquet file with a zero length array
Expand All @@ -17,7 +17,7 @@ def fix_zero_length_arrays(df: DataFrame):
"""

array_cols = [
(item[0], not(item[1].startswith("array<array")))
(item[0], item[1] == "array<string>")
for item in df.dtypes
if item[1].startswith("array")
]
Expand Down Expand Up @@ -56,13 +56,13 @@ def remove_leading_zeros_array(df: DataFrame, array_colname: str):
Args:
df (DataFrame): Input Spark dataframe
array_colname (str): Column name of array column to remove leading zeros from values
"""

stmt = f"""
TRANSFORM({array_colname}, x -> regexp_replace(x, "^0+", ""))
stmt = f"""
TRANSFORM({array_colname}, x -> regexp_replace(x, "^0+", ""))
"""

df = df.withColumn(array_colname, f.expr(stmt))
Expand Down

0 comments on commit e068f04

Please sign in to comment.