Skip to content
This repository has been archived by the owner on May 18, 2023. It is now read-only.

Commit

Permalink
Merge pull request #13 from moj-analytical-services/trim_names
Browse files (browse the repository at this point in the history)
trim names before split
  • Loading branch information
samnlindsay committed Jul 26, 2021
2 parents 6276a8d + d4d8ecb commit 8f7c69a
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion splink_data_standardisation/names.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,6 +20,7 @@ def standardise_names(df:DataFrame, name_cols: list, drop_orig:bool=True):
df = df.withColumn('name_concat', expr(f"concat_ws(' ', {name_col_joined})"))
df = df.withColumn('name_concat', expr('lower(name_concat)'))
df = df.withColumn('name_concat', expr("regexp_replace(name_concat, '[\\-\\.]', ' ')"))
df = df.withColumn('name_concat', expr("trim(name_concat)"))
df = df.withColumn('name_arr', expr("split(name_concat, ' ')"))
df = df.withColumn('surname_std', expr(f"case when {surname_col_name} is not null then element_at(name_arr,-1) else null end"))
df = df.withColumn('forename1_std', expr("case when size(name_arr) > 1 then element_at(name_arr,1) else null end"))
Expand All @@ -31,4 +32,4 @@ def standardise_names(df:DataFrame, name_cols: list, drop_orig:bool=True):
if drop_orig:
for n in name_cols:
df = df.drop(n)
return df
return df

0 comments on commit 8f7c69a

Please sign in to comment.