Skip to content
This repository has been archived by the owner on May 18, 2023. It is now read-only.

Commit

Permalink
Merge pull request #16 from moj-analytical-services/fixissue15
Browse files Browse the repository at this point in the history
Fix issue15
  • Loading branch information
mamonu committed Nov 18, 2021
2 parents 3ff6894 + 9125c72 commit 486d702
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 5 deletions.
8 changes: 4 additions & 4 deletions splink_data_standardisation/date_of_birth.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,11 @@ def null_suspicious_dob_std(df: DataFrame, dob_col: str = "dob_std"):
DataFrame: Original dataframe with suspicious dates of birth nulled out
"""

case_stmt = """
case_stmt = f"""
case
when dob_std = "1900-01-01" then null
when dob_std = "1970-01-01" then null
else dob_std end
when {dob_col} = "1900-01-01" then null
when {dob_col} = "1970-01-01" then null
else {dob_col} end
"""
df = df.withColumn(dob_col, expr(case_stmt))
Expand Down
33 changes: 32 additions & 1 deletion tests/test_dobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
import pandas as pd

from splink_data_standardisation.date_of_birth import standardise_dob
from splink_data_standardisation.date_of_birth import standardise_dob,null_suspicious_dob_std
from pyspark.sql import Row

def test_dob_1(spark):
Expand Down Expand Up @@ -67,3 +67,34 @@ def test_dob_1(spark):
df_result = df2.toPandas()

pd.testing.assert_frame_equal(df_result,df_expected)

def test_null_suspicious_dob_std(spark):
    """Check that null_suspicious_dob_std honours a custom column name.

    The column is deliberately not called ``dob_std`` so that the test
    fails if the function ignores its ``dob_col`` argument. The placeholder
    dates 1900-01-01 and 1970-01-01 must be nulled out, an existing null
    must stay null, and a genuine date must pass through unchanged.
    """
    dob_col = "dob_lol"

    input_rows = [
        {dob_col: "1900-01-01"},
        {dob_col: "1970-01-01"},
        # A non-suspicious value: without it the test would also pass for
        # an implementation that simply nulls the entire column.
        {dob_col: "1985-03-12"},
        {dob_col: None},
    ]
    df = spark.createDataFrame([Row(**row) for row in input_rows])

    expected_rows = [
        {dob_col: None},
        {dob_col: None},
        {dob_col: "1985-03-12"},
        {dob_col: None},
    ]
    df_expected = pd.DataFrame(expected_rows)

    df_result = null_suspicious_dob_std(df.select(dob_col), dob_col).toPandas()

    pd.testing.assert_frame_equal(df_result, df_expected)



0 comments on commit 486d702

Please sign in to comment.