Skip to content
This repository has been archived by the owner on May 18, 2023. It is now read-only.

Commit

Permalink
Merge pull request #14 from moj-analytical-services/name_options
Browse files Browse the repository at this point in the history
Name options
  • Loading branch information
samnlindsay committed Aug 12, 2021
2 parents 7fedd2c + 4be5a24 commit 3ff6894
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 13 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "splink_data_standardisation"
version = "0.2.7"
version = "0.2.8"
description = ""
authors = ["Robin Linacre <robin.linacre@digital.justice.gov.uk>"]
license = "MIT"
Expand Down
32 changes: 24 additions & 8 deletions splink_data_standardisation/names.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,43 @@
from pyspark.sql.functions import expr
from pyspark.sql.dataframe import DataFrame

def standardise_names(df:DataFrame, name_cols: list, drop_orig:bool=True):
def standardise_names(df:DataFrame, name_cols: list, drop_orig:bool=True, retain_surname:bool=False, split_hyphens:bool=True):
    """Take one or more name columns in a list and standardise the names
    so that one name appears in each column consistently
Args:
df (DataFrame): Spark DataFrame
name_cols (list): A list of columns that contain names, in order from first name to last name
drop_orig (bool, optional): Drop the original columns after standardisation. Defaults to True.
retain_surname (bool, optional): Maintain separation of surname and forename columns. Defaults to False.
split_hyphens (bool, optional): Split hyphenated names into two words. Defaults to True.
(Note: surnames are standardised with hyphens removed if retain_surname=True, regardless of split_hyphens)
Returns:
DataFrame: A Spark DataFrame with standardised name columns
"""

if split_hyphens:
split = " *- *| "
else:
split = " "


name_col_joined = ", ".join(name_cols)
surname_col_name = name_cols[-1]
if retain_surname:
name_col_joined = ", ".join(name_cols[:-1])
else:
name_col_joined = ", ".join(name_cols)
df = df.withColumn('name_concat', expr(f"concat_ws(' ', {name_col_joined})"))
df = df.withColumn('name_concat', expr('lower(name_concat)'))
df = df.withColumn('name_concat', expr("regexp_replace(name_concat, '[\\-\\.]', ' ')"))
df = df.withColumn('name_concat', expr("regexp_replace(name_concat, '[\\.]', ' ')"))
df = df.withColumn('name_concat', expr("regexp_replace(name_concat, ' *\\- *', '-')"))
df = df.withColumn('name_concat', expr("trim(name_concat)"))
df = df.withColumn('name_arr', expr("split(name_concat, ' ')"))
df = df.withColumn('name_arr', expr(f"split(name_concat, '{split}')"))
if retain_surname:
df = df.withColumn('surname_std', expr(f"regexp_replace(lower({surname_col_name}), '[\\-\\.]', ' ')"))
df = df.withColumn('surname_std', expr(f"trim(regexp_replace(surname_std, ' +', ' '))"))
if split_hyphens:
df = df.withColumn('name_arr', expr("array_union(name_arr, split(surname_std, '-'))"))
else:
df = df.withColumn('name_arr', expr("array_union(name_arr, array(surname_std))"))
df = df.withColumn('surname_std', expr(f"case when {surname_col_name} is not null then element_at(name_arr,-1) else null end"))
df = df.withColumn('forename1_std', expr("case when size(name_arr) > 1 then element_at(name_arr,1) else null end"))
df = df.withColumn('forename2_std', expr("case when size(name_arr) > 2 then element_at(name_arr,2) else null end"))
Expand All @@ -32,4 +48,4 @@ def standardise_names(df:DataFrame, name_cols: list, drop_orig:bool=True):
if drop_orig:
for n in name_cols:
df = df.drop(n)
return df
return df
8 changes: 4 additions & 4 deletions tests/test_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
def test_names_1(spark):

names_list = [
{"id":1, "first_name":"John","surname":"smith jones"},
{"id":1, "first_name":"John-paul","surname":"smith jones"},
{"id":2, "first_name":"john","surname":"Smith-Jones"},
{"id":3, "first_name":"john.smith","surname":"jones"}
]
Expand All @@ -19,7 +19,7 @@ def test_names_1(spark):
df_result = df.toPandas()

df_expected = [
{'id': 1, 'surname_std': 'jones', 'forename1_std': 'john', 'forename2_std': 'smith', 'forename3_std': None, 'forename4_std': None, 'forename5_std': None},
{'id': 1, 'surname_std': 'jones', 'forename1_std': 'john', 'forename2_std': 'paul', 'forename3_std': 'smith', 'forename4_std': None, 'forename5_std': None},
{'id': 2, 'surname_std': 'jones', 'forename1_std': 'john', 'forename2_std': 'smith', 'forename3_std': None, 'forename4_std': None, 'forename5_std': None},
{'id': 3, 'surname_std': 'jones', 'forename1_std': 'john', 'forename2_std': 'smith', 'forename3_std': None, 'forename4_std': None, 'forename5_std': None}
]
Expand All @@ -35,12 +35,12 @@ def test_names_1(spark):
]

df = spark.createDataFrame(Row(**x) for x in names_list)
df = standardise_names(df, ["first_name", "middle_name", "surname"])
df = standardise_names(df, ["first_name", "middle_name", "surname"], retain_surname=True)

df_result = df.toPandas()

df_expected = [
{'id': 1, 'surname_std': 'jones', 'forename1_std': 'john', 'forename2_std': 'james', 'forename3_std': "peter", 'forename4_std': "smith", 'forename5_std': None},
{'id': 1, 'surname_std': 'smith jones', 'forename1_std': 'john', 'forename2_std': 'james', 'forename3_std': "peter", 'forename4_std': None, 'forename5_std': None},
{'id': 2, 'surname_std': 'jones', 'forename1_std': 'john', 'forename2_std': 'james', 'forename3_std': "peter", 'forename4_std': "smith", 'forename5_std': None},

]
Expand Down

0 comments on commit 3ff6894

Please sign in to comment.