In [3]:
#| default_exp tests

In [4]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [5]:
#| export
from fastcore.utils import *
import pandas as pd
from pandas import DataFrame
import numpy as np
from typing import Dict, List, Tuple, Any

from data_harmonising.data import *
from data_harmonising.transforms import *

import raine_tools as rn

## Tests

In [6]:
# Locations of the two .sav fixtures used throughout this notebook:
# INPUT receives the baseline data, OUTPUT the edited copy.
INPUT = "../data/test/input.sav"
OUTPUT = "../data/test/output.sav"

Create test data

In [7]:
# Baseline fixture: five rows (ID 0-4) and five variables that each hold 1-5.
# "CBA3" is the one name that does not follow the ABC<n> pattern.
data = {
    "ID":   [0, 1, 2, 3, 4],
    "ABC1": [1, 2, 3, 4, 5],
    "ABC2": [1, 2, 3, 4, 5],
    "CBA3": [1, 2, 3, 4, 5],
    "ABC4": [1, 2, 3, 4, 5],
    "ABC5": [1, 2, 3, 4, 5], 
}

df = DataFrame(data)
# Write the baseline to INPUT so it can be read back and compared later.
write_sav(INPUT, df, meta=None)

Alter the data for comparison

In [8]:
# Read the baseline back from INPUT, passing index="ID"; read_sav returns a
# pair and the second element is discarded here.
df, _ = read_sav(INPUT, index="ID")

In [9]:
# Rules shared by both new variables. Each rule pairs a "condition" (a literal
# or a one-argument predicate) with a "value".
# NOTE(review): presumably create_variables writes "value" to the target where
# the condition column matches "condition" — confirm against its documentation.
conditions = [
    {"condition": 1, "value": 10},
    {"condition": lambda x: x > 3, "value": 1},
    {"condition": lambda x: 1 < x < 4, "value": 5}
]

# Specs for two new columns, ABC6 and ABC7. Both name ABC1 as the condition
# column and both reference the same `conditions` list object.
CREATE = [
    {
        "target_col": "ABC6",
        "condition_col": "ABC1",
        "conditions": conditions
    },
    {
        "target_col": "ABC7",
        "condition_col": "ABC1",
        "conditions": conditions
    }
]

df = create_variables(df, CREATE)

In [10]:
# Names of the variables handed to delete_variables.
DELETE = ["ABC4", "ABC5"]
df = delete_variables(df, DELETE)

In [11]:
# Rename mapping handed to rename_variables: old name -> new name.
RENAME = {"CBA3": "ABC3"}
df = rename_variables(df, RENAME)

Next, introduce changes to the raw data

In [12]:
# Case 1: changing one value for one variable
df.loc[0, ["ABC1"]] = 2
# Case 2: changing multiple values for one variable
df.loc[[0, 1], ["ABC2"]] = 0
# Case 3: changing one value for multiple variables
df.loc[[4], ["ABC1", "ABC2"]] = 9
# Case 4: changing multiples values for multiple variables
df.loc[[2, 3], ["ABC1", "ABC2"]] = -88
# Case 5: changing values for a renamed variable
df.loc[0, ["ABC3"]] = 0

In [13]:
# Write the edited working copy to OUTPUT.
write_sav(OUTPUT, df, meta=None)

Read and compare the files

In [14]:
# Load both files from disk; only the first element of each returned pair is kept.
# NOTE(review): unlike the earlier read of INPUT, no index= argument is passed
# here — confirm the resulting row labels are what the .loc-based checks expect.
df_old, _ = read_sav(INPUT)
df_new, _ = read_sav(OUTPUT)

Check that every difference in variable names can be attributed to an intentional change: creating, deleting, or renaming the specified variables.

In [15]:
#| export
def reformat_crud(create: List[Dict[str, Any]], # variables to be created
                  rename: Dict[str, str], # variables to be renamed (old name -> new name)
                  delete: List[str], # variables to be deleted
                  ) -> Tuple[List[str], List[str]]: # (added, removed) column names
    """Collapse the create/rename/delete specs into the column names expected to appear and to disappear."""
    created = [spec["target_col"] for spec in create]
    # A rename retires the old name (key) and introduces the new name (value).
    added = [*created, *rename.values()]
    # Unpacking accepts any sequence for `delete`, not only a list.
    removed = [*delete, *rename.keys()]
    return added, removed

In [16]:
# Column names expected to appear in / disappear from the updated data.
ADDED, REMOVED = reformat_crud(CREATE, RENAME, DELETE)

In [17]:
#| export
def test_columns(df1: DataFrame, # updated raw data
                 df2: DataFrame, # original raw data
                 added: List[str], # variables to be created
                 removed: List[str], # old variables that were renamed
                 ) -> None:
    """Verify only intended changes were made to columns."""
    unchanged = [col for col in df2.columns if col not in removed]

    # Check all columns to be added exist
    assert all([col in df1.columns for col in added])
    # Check all columns to be removed do not exist
    assert all([col not in df1.columns for col in removed])
    # Check all other columns exist
    assert all([col in df1.columns for col in unchanged])

In [18]:
# Column-level check: updated data first, original data second.
test_columns(df_new, df_old, ADDED, REMOVED)

Next, we'll write tests that validate the actual raw data, comparing the original and updated data to ensure only intended changes were made.

In [19]:
#| export
def test_changes_correctly_implemented(df1: DataFrame, # updated data
                                       df2: DataFrame, # original data
                                       columns: List[str], # columns impacted
                                       idx: List[int], # IDs impacted
                                       new_value: str|int, # new value
                                       old_value: str|int # value that was replaced
                                       ) -> bool: # Output True if all expected changes pass
    """Validate expected change correctly implemented."""
    dfx = df1.loc[idx, columns].compare(df2.loc[idx, columns], result_names=("new", "old"))
    dfx = dfx.stack(future_stack=True).swaplevel()
    assert (dfx.loc["new"] == new_value).all().all()
    assert (dfx.loc["old"] == old_value).all().all()

def validate_changes(changes: List[Dict[str, Any]], # specified changes in an expected format
                     df1: DataFrame, # updated data
                     df2: DataFrame, # original data
                     ) -> None:
    """Check each specified change in turn, announcing its 1-based position before it is validated."""
    position = 0
    for spec in changes:
        position += 1
        print(f"Change {position}")
        # Look the fields up in a fixed order and forward them positionally.
        fields = [spec[key] for key in ("columns", "idx", "new_value", "old_value")]
        test_changes_correctly_implemented(df1, df2, *fields)

In [21]:
# Recap of the edits applied to the working copy earlier in the notebook:
# df.loc[0, ["ABC1"]] = 2
# df.loc[[0, 1], ["ABC2"]] = 0
# df.loc[[4], ["ABC1", "ABC2"]] = 9
# df.loc[[2, 3], ["ABC1", "ABC2"]] = -88
# df.loc[0, ["ABC3"]] = 0

# One entry per group of cells that share a single old value and a single new
# value; edits that touched rows with different original values are therefore
# split into one entry per row.
CHANGES = [
    {
        "columns": ["ABC1"],
        "idx": [0],
        "old_value": 1,
        "new_value": 2 
    },
    {
        "columns": ["ABC2"],
        "idx": [0],
        "old_value": 1,
        "new_value": 0
    },
    {
        "columns": ["ABC2"],
        "idx": [1],
        "old_value": 2,
        "new_value": 0
    },
    {
        "columns": ["ABC1", "ABC2"],
        "idx": [4],
        "old_value": 5,
        "new_value": 9
    },
    {
        "columns": ["ABC1", "ABC2"],
        "idx": [2],
        "old_value": 3,
        "new_value": -88
    },
    {
        "columns": ["ABC1", "ABC2"],
        "idx": [3],
        "old_value": 4,
        "new_value": -88
    },
    # The renamed-variable case is disabled.
    # NOTE(review): presumably because the original data only has this column
    # under its old name "CBA3" (see RENAME), so "ABC3" cannot be looked up in
    # both frames — TODO handle renamed columns.
    # {
    #     "columns": ["ABC3"],
    #     "idx": [0],
    #     "old_value": 1,
    #     "new_value": 0
    # },
]

# Updated data first, original data second.
validate_changes(CHANGES, df_new, df_old)

Change 1
Change 2
Change 3
Change 4
Change 5
Change 6


In [None]:
# Recap of the edits applied to the working copy earlier in the notebook:
# df.loc[0, ["ABC1"]] = 2
# df.loc[[0, 1], ["ABC2"]] = 0
# df.loc[[4], ["ABC1", "ABC2"]] = 9
# df.loc[[2, 3], ["ABC1", "ABC2"]] = -88
# df.loc[0, ["ABC3"]] = 0

# Alternative spec format: one entry per edit, recording only the new value
# (no old value), so several IDs can share an entry.
CHANGES2 = [
    {
        "columns": ["ABC1"],
        "idx": [0],
        "value": 2,
    },
    {
        "columns": ["ABC2"],
        "idx": [0, 1],
        "value": 0,
    },
    {
        "columns": ["ABC1", "ABC2"],
        # NOTE(review): scalar ID here, whereas every other entry uses a list.
        "idx": 4,
        "value": 9,
    },
    {
        "columns": ["ABC1", "ABC2"],
        "idx": [2, 3],
        "value": -88,
    },
    # Case 5: changing values for a renamed variable
    # (disabled; note it uses a different key, "idx_to_values", than the entries above)
    # {
    #     "columns": ["ABC3"],
    #     "idx_to_values": {0: 0}
    # },
]

# A deliberately wrong expectation: ABC1 at ID 0 was set to 2, not -88.
# Not referenced by any other cell shown in this notebook — presumably meant
# for a negative test; TODO confirm.
FALSE_CHANGES = [
    {
        "columns": ["ABC1"],
        "idx": [0],
        "value": -88,
    },
]

Verify that for corresponding columns and IDs specified, these changes were implemented.  
Furthermore, verify that there are no other differences between the original and updated datasets.

In [None]:
#| export
def verify_change(df: DataFrame, # updated dataset
                  columns: List[str], # columns impacted
                  idx: List[int], # IDs impacted
                  value: int|str # new value
                   ) -> None:
    """Verify that for corresponding columns and IDs specified, these changes were implemented."""
    assert np.all(df.loc[idx, columns] == value)

# def read_changes(changes: List[Dict[str, Any]]
#                  ) -> Tuple[List[str], List[str], int|str]:
#     columns = change['columns']
#     [idx] = change["value_to_idx"].values()
#     [value] = change["value_to_idx"].keys()
#     return (columns, idx, value)

In [31]:
# Check every CHANGES2 entry, printing its 1-based position first.
# NOTE(review): this passes the in-memory working copy `df`, not `df_new`
# re-read from OUTPUT — confirm which one is meant to be verified.
for n, change in enumerate(CHANGES2, start=1):
    columns = change["columns"]
    idx = change["idx"]
    value = change["value"]
    print("Change", n)
    verify_change(df, columns, idx, value)

Change 1
Change 2
Change 3
Change 4


This variation is potentially more efficient: all changes for a given column are defined in one place (the same could be done per ID, but that makes less sense). However, it may be nicer to keep each change separate and grouped by theme, so that changes made for different reasons can be investigated independently.

In [25]:
# Per-column layout: for each column, map every new value to the list of IDs
# that should hold it. Not consumed by any cell shown in this notebook.
CHANGES3 = [
    {
        "column": "ABC1",
        "value_to_idx": {
            2: [0],
            9: [4],
            -88: [2, 3]
        }
    },
    {
        "column": "ABC2",
        "value_to_idx": {
            0: [0, 1],
            9: [4],
            -88: [2, 3]
        }
    },
]

In [26]:
#| export

def test_data_unchanged(df1: DataFrame, # updated raw data
                        df2: DataFrame, # original raw data
                        added: List[str] = [], # columns which were added
                        removed: List[str] = [], # columns which were removed
                        ) -> bool:
    """Verify that columns without changes specified remain unchanged."""
    # new_cols = [col for col in df1.columns if col not in added]
    # old_cols = [col for col in df2.columns if col not in removed]
    # return pd.testing.assert_frame_equal(df1[new_cols], df2[old_cols])
    pass


def test_data_created():
    """Placeholder for verifying created columns; performs no checks yet."""
    return None

def test_data_renamed(df1: DataFrame, # updated raw data
                      df2: DataFrame, # original raw data
                      renamed: Dict[str, str] = {}, # columns which were renamed
                      ) -> bool:
    """Verify that columns that were renamed..."""
    pass

In [27]:
# Currently a no-op: test_data_unchanged has no implementation yet (its body is `pass`).
test_data_unchanged(df_new, df_old, ADDED, REMOVED)

In [28]:
#| hide
# Invoke nbdev's export entry point.
import nbdev; nbdev.nbdev_export()