In [1]:
#| default_exp tests_data

In [2]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [3]:
#| export
from fastcore.utils import *
import pandas as pd
from pandas import DataFrame
import numpy as np
from typing import Dict, List, Tuple, Any

from data_harmonising.data import *
from data_harmonising.transforms import *

import raine_tools as rn

# Tests

## Data

In [4]:
INPUT = "../data/test/input.sav"
OUTPUT = "../data/test/output.sav"

Create test data

In [5]:
data = {
    "ID":   [0, 1, 2, 3, 4],
    "ABC1": [1, 2, 3, 4, 5],
    "ABC2": [1, 2, 3, 4, 5],
    "CBA3": [1, 2, 3, 4, 5],
    "ABC4": [1, 2, 3, 4, 5],
    "ABC5": [1, 2, 3, 4, 5], 
}

df = DataFrame(data)
write_sav(INPUT, df, meta=None)

Alter the data for comparison

In [6]:
df, meta = read_sav(INPUT, index="ID")

In [7]:
# conditions = [
#     {"condition": 1, "value": 10},
#     {"condition": lambda x: x > 3, "value": 1},
#     {"condition": lambda x: 1 < x < 4, "value": 5}
# ]

# CREATE = [
#     {
#         "target_col": "ABC6",
#         "condition_col": "ABC1",
#         "conditions": conditions
#     },
#     {
#         "target_col": "ABC7",
#         "condition_col": "ABC1",
#         "conditions": conditions
#     }
# ]

# df, meta = create_variables(df, meta, CREATE)

In [8]:
CREATE = ["ABC6", "ABC7"]
df, meta = create_variables(df, meta, CREATE)

In [9]:
DELETE = ["ABC4", "ABC5"]
df, meta = delete_variables(df, meta, DELETE)

In [10]:
RENAME = {"CBA3": "ABC3"}
df, meta = rename_variables(df, meta, RENAME)

Next, introduce changes to the raw data

In [11]:
# Case 1: changing one value for one variable
df.loc[0, ["ABC1"]] = 2
# Case 2: changing multiple values for one variable
df.loc[[0, 1], ["ABC2"]] = 0
# Case 3: changing one value for multiple variables
df.loc[[4], ["ABC1", "ABC2"]] = 9
# Case 4: changing multiples values for multiple variables
df.loc[[2, 3], ["ABC1", "ABC2"]] = -88
# Case 5: changing values for a renamed variable
df.loc[0, ["ABC3"]] = 0

In [12]:
write_sav(OUTPUT, df, meta)

Read and compare the files

In [13]:
df_old, _ = read_sav(INPUT)
df_new, _ = read_sav(OUTPUT)

Check that the differences in variable names can all be attributed to the intentional changes; by creating, deleting, and renaming specified variables.

In [14]:
ADDED, REMOVED = reformat_crud(CREATE, RENAME, DELETE)

In [15]:
#| export
def test_columns(df1: DataFrame, # updated raw data
                 df2: DataFrame, # original raw data
                 added: List[str], # variables to be created
                 removed: List[str], # old variables that were renamed
                 ) -> None:
    """Verify only intended changes were made to columns."""
    unchanged = [col for col in df2.columns if col not in removed]

    # Check all columns to be added exist
    assert all([col in df1.columns for col in added])
    # Check all columns to be removed do not exist
    assert all([col not in df1.columns for col in removed])
    # Check all other columns exist
    assert all([col in df1.columns for col in unchanged])

In [16]:
test_columns(df_new, df_old, ADDED, REMOVED)

Next, we'll write tests that validate the actual raw data, comparing the original and updated data to ensure only intended changes were made.

In [18]:
#| export
def verify_condition_changes(df: pd.DataFrame, #
                             cols: List[str], #
                             conds: Dict[str, Any], #
                             val: int|str #
                             ) -> None:
    condition = pd.Series([True] * len(df), index=df.index)
    for col, cond in conds.items():
        if callable(cond):
            condition &= df[col].apply(cond)
        else:
            condition &= (df[col] == cond)
    assert np.all(df.loc[condition, cols].eq(val)), f"Change not applied: {cols}, {conds}"

def verify_index_changes(df: pd.DataFrame, 
                         cols: List[str], #
                         idx: List[int], #
                         val: int|str #
                         ) -> None:
    assert np.all(df.loc[idx, cols].eq(val)), f"Change not applied: {cols}, {idx}"

def verify_changes(df: pd.DataFrame, 
                   changes: List[Dict[str, Any]]
                   ) -> None:
    """Process all explicitly defined changes to ensure they have been correctly implemented."""
    for change in changes:
        match change:
            case {"columns": cols, "index": idx, "value": val}:
                verify_index_changes(df, cols, idx, val)
            case {"columns": cols, "conditions": conds, "value": val}:
                verify_condition_changes(df, cols, conds, val)
            case _:
                raise ValueError(f"Unknown change format: {change}")

Verify that for corresponding columns and IDs specified, these changes were implemented.  
Furthermore, verify that there are no other differences between the original and updated datasets.

In [20]:
CHANGES = [
    {"columns": ["ABC1"], "index": [0], "value": 2},
    {"columns": ["ABC2"], "index": [0, 1], "value": 0},
    {"columns": ["ABC1", "ABC2"], "index": [4], "value": 9},
    {"columns": ["ABC1", "ABC2"], "index": [2, 3], "value": -88},
    {"columns": ["ABC3"], "index": [0], "value": 0},
]

verify_changes(df, CHANGES)

We'll then ensure that this fails when attempting to verify incorrect changes.

In [19]:
FALSE_CHANGES = [
    {
        "columns": ["ABC1"],
        "idx": [0],
        "value": -88,
    },
]

try:
    verify_changes(df, FALSE_CHANGES)
    raise ValueError("Error: incorrect changes passed.")
except AssertionError:
    print("Incorrect changes were succesfully detected.")

Incorrect changes were succesfully detected.


In [34]:
CHANGES3 = [
    {
        "columns": ["G201_HOSP_N_1"],
        "conditions": (df["G201_HOSP_N_1"] == 0),
        "value": 88
    },
    {
        "columns": ["G201_HOSP_N_2", "G201_HOSP_N_3", "G201_HOSP_N_4", "G201_HOSP_N_5", "G201_HOSP_N_6"],
        "conditions": (df["G201_HOSP_N_2"] == 0) & (df["G201_HOSP_ICD9_2"] == "88888"),
        "value": 88
    }
]

KeyError: 'G201_HOSP_N_1'

### Testing datasets are equal where no changes are expected

Firstly, let's test broadly by verifying that for all columns where no changes were made, the values should be equal.

In [22]:
#| export
def filter_updated_for_comparison(df: DataFrame, # updated raw data
                                  create: List[Dict[str, Any]], # variables to be created
                                  ) -> DataFrame:
    """Filter out newly added variables."""
    return df[df.columns.difference(create, sort=False)]

def filter_original_for_comparison(df: DataFrame, # original raw data
                                   rename: Dict[str, str], # variables to be renamed
                                   delete: List[str], # variables to be deleted
                                   ) -> DataFrame:
    """Filter out deleted variables, and rename old variables for comparison."""
    return df[df.columns.difference(delete, sort=False)].rename(columns=rename)

In [23]:
#| export
def columns_with_data_changed(changes: List[Dict[str, Any]] # explicit changes
                              ) -> List[str]:
    """Return a list of unique columns where data was changed."""
    return list({col for change in changes for col in change['columns']})

In [24]:
#| export
def test_data_eq(df1: DataFrame, # updated raw data
                 df2: DataFrame, # original raw data
                 ) -> None:
    """Verify two datasets are the same."""
    pd.testing.assert_frame_equal(df1, df2)

def verify_data_unchanged_for_unchanged_columns(df1: DataFrame, # updated raw data
                                                df2: DataFrame, # original raw data
                                                changes: List[Dict[str, Any]] # explicit changes
                                                ) -> None:
    """Verify the updated and original datasets remain unchanged except for all columns where no changes were implemented."""
    changed_columns = columns_with_data_changed(changes)
    df1 = df1.drop(columns=changed_columns)
    df2 = df2.drop(columns=changed_columns)
    test_data_eq(df1, df2)

In [25]:
df_new_filtered = filter_updated_for_comparison(df_new, CREATE)
df_old_filtered = filter_original_for_comparison(df_old, RENAME, DELETE)

In [26]:
verify_data_unchanged_for_unchanged_columns(df_new_filtered, df_old_filtered, CHANGES)

Next, we'll investigate the columns where changes *were* applied, excluding for the IDs were the changes were applied; all columns besides those IDs should be equal.

In [27]:
#| export
def verify_data_unchanged_for_changed_columns(df1: DataFrame, # updated raw data
                                              df2: DataFrame, # original raw data
                                              changes: List[Dict[str, Any]] # explicit changes
                                              ) -> None:
    """Verify data remains identical for columns impacted by recoding for all IDs where no changes were implemented."""
    # Create a mask for changed cells
    mask = pd.DataFrame(False, index=df1.index, columns=df1.columns)

    for change in changes:
        for col in change['columns']:
            mask.loc[change['idx'], col] = True

    test_data_eq(df1[~mask], df2[~mask])

In [28]:
verify_data_unchanged_for_changed_columns(df_new_filtered, df_old_filtered, CHANGES)

Now introduce a new change that's unrecorded, and verify the test fails

In [29]:
df_new_filtered_changed = df_new_filtered.copy()
df_new_filtered_changed.at[3, "ABC3"] = 0
test_fail(verify_data_unchanged_for_changed_columns, args=(df_new_filtered_changed, df_old_filtered, CHANGES))

In [30]:
#| export
def test_data_created():
    """Verify that columns that were created..."""
    pass

In [31]:
#| export
def run_full_test_suite(df1: DataFrame, # updated raw data
                        df2: DataFrame, # original raw data
                        create: List[Dict[str, Any]], # variables to be created
                        rename: Dict[str, str], # variables to be renamed
                        delete: List[str], # variables to be deleted
                        changes: List[Dict[str, Any]] # explicit changes
                        ) -> None:
    """Run all tests for validating data."""
    # Filter for matching columns to compare
    df1_filtered = filter_updated_for_comparison(df1, create)
    df2_filtered = filter_original_for_comparison(df2, rename, delete)

    added, removed = reformat_crud(create, rename, delete)

    # Validate columns
    test_columns(df1, df2, added, removed)

    # Validate raw data, not including variables that were created or deleted, and renaming the old variables to match for comparison
    verify_changes(df1, changes)

    # Confirm all data remains identical for columns that were not intended for change
    verify_data_unchanged_for_unchanged_columns(df1_filtered, df2_filtered, changes)

    # Confirm that for columns where changes were implemented, data remains the same for all IDs where no changes were intended
    verify_data_unchanged_for_changed_columns(df1_filtered, df2_filtered, changes)

    print("All tests run successfully.")

In [32]:
#| hide
import nbdev; nbdev.nbdev_export()