Commit
null_out_values tests added
mamonu committed Dec 22, 2020
1 parent a60b229 commit cb55a89
Showing 1 changed file with 67 additions and 0 deletions.
67 changes: 67 additions & 0 deletions tests/test_remove_anomalies.py
@@ -0,0 +1,67 @@
import pytest
import pandas as pd

from splink_data_standardisation.remove_anomalies import null_out_values
from pyspark.sql import Row


def test_null_out_vals_0(spark):
    # With an empty garbage-value list, the dataframe should be returned unchanged
    data_list = [
        {"id": 1, "mycol": "A"},
        {"id": 2, "mycol": "B"},
        {"id": 3, "mycol": "B"},
        {"id": 4, "mycol": "C"},
        {"id": 5, "mycol": "C"},
    ]

    garbagevals = []

    df = spark.createDataFrame(Row(**x) for x in data_list)

    df = null_out_values(df, "mycol", garbagevals)

    df_result = df.toPandas()

    df_expected = [
        {"id": 1, "mycol": "A"},
        {"id": 2, "mycol": "B"},
        {"id": 3, "mycol": "B"},
        {"id": 4, "mycol": "C"},
        {"id": 5, "mycol": "C"},
    ]

    df_expected = pd.DataFrame(df_expected)

    pd.testing.assert_frame_equal(df_result, df_expected)


def test_null_out_vals_1(spark):
    # Values listed in garbagevals ("C") should be replaced with nulls in mycol
    data_list = [
        {"id": 1, "mycol": "A"},
        {"id": 2, "mycol": "B"},
        {"id": 3, "mycol": "B"},
        {"id": 4, "mycol": "C"},
        {"id": 5, "mycol": "C"},
    ]

    garbagevals = ["C"]

    df = spark.createDataFrame(Row(**x) for x in data_list)

    df = null_out_values(df, "mycol", garbagevals)

    df_result = df.toPandas()

    df_expected = [
        {"id": 1, "mycol": "A"},
        {"id": 2, "mycol": "B"},
        {"id": 3, "mycol": "B"},
        {"id": 4, "mycol": None},
        {"id": 5, "mycol": None},
    ]

    df_expected = pd.DataFrame(df_expected)

    pd.testing.assert_frame_equal(df_result, df_expected)
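
For context, a minimal sketch of the behaviour these tests assume from null_out_values: any value in the named column that appears in the garbage list is replaced with a null, and everything else is left untouched. This is an illustrative implementation inferred from the tests, not the function actually shipped in splink_data_standardisation.

```python
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def null_out_values(df: DataFrame, colname: str, garbage_values: list) -> DataFrame:
    # Sketch only (assumed behaviour): replace listed garbage values with nulls,
    # leaving other values and all other columns unchanged.
    if not garbage_values:
        return df
    return df.withColumn(
        colname,
        F.when(F.col(colname).isin(garbage_values), F.lit(None)).otherwise(
            F.col(colname)
        ),
    )
```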
