In [1]:
#| default_exp tests

In [2]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [3]:
#| export
from fastcore.utils import *
import pandas as pd
from pandas import DataFrame
import numpy as np
from typing import Dict, List, Tuple, Any

from data_harmonising.data import *
from data_harmonising.transforms import *

import raine_tools as rn

## Tests

In [4]:
# Paths of the two .sav fixtures used throughout this notebook:
# INPUT receives the original test data, OUTPUT the altered copy.
INPUT = "../data/test/input.sav"
OUTPUT = "../data/test/output.sav"

Create test data

In [5]:
# Five-row fixture: an ID column plus five variables that all hold the values 1-5.
# "CBA3" is the column that the RENAME step further down turns into "ABC3".
data = {
    "ID":   [0, 1, 2, 3, 4],
    "ABC1": [1, 2, 3, 4, 5],
    "ABC2": [1, 2, 3, 4, 5],
    "CBA3": [1, 2, 3, 4, 5],
    "ABC4": [1, 2, 3, 4, 5],
    "ABC5": [1, 2, 3, 4, 5], 
}

df = DataFrame(data)
# Persist the fixture as the "original" file; no metadata is supplied.
write_sav(INPUT, df, meta=None)

Alter the data for comparison

In [6]:
# Re-read the fixture from disk with index="ID" (presumably making ID the row index — confirm
# against read_sav); the second return value is discarded.
df, _ = read_sav(INPUT, index="ID")

In [7]:
# Recoding rules shared by both new variables. Each rule pairs a "condition"
# (either a literal or a one-argument predicate) with the "value" to assign.
# NOTE(review): how create_variables matches and prioritises these rules is not
# visible in this notebook — confirm against its implementation.
conditions = [
    {"condition": 1, "value": 10},
    {"condition": lambda x: x > 3, "value": 1},
    {"condition": lambda x: 1 < x < 4, "value": 5}
]

# Specs for two new columns, ABC6 and ABC7; both use ABC1 as their condition column
# and share the same rule list.
CREATE = [
    {
        "target_col": "ABC6",
        "condition_col": "ABC1",
        "conditions": conditions
    },
    {
        "target_col": "ABC7",
        "condition_col": "ABC1",
        "conditions": conditions
    }
]

df = create_variables(df, CREATE)

In [8]:
# Names of the variables to drop from the updated data.
DELETE = ["ABC4", "ABC5"]
df = delete_variables(df, DELETE)

In [9]:
# Mapping of old variable name -> new variable name.
RENAME = {"CBA3": "ABC3"}
df = rename_variables(df, RENAME)

Next, introduce changes to the raw data

In [10]:
# Overwrite selected cells via `.loc` (row labels, column names) so that the updated
# file differs from the original in known places. The same edits are recorded in CHANGES below.
# Case 1: changing one value for one variable
df.loc[0, ["ABC1"]] = 2
# Case 2: changing multiple values for one variable
df.loc[[0, 1], ["ABC2"]] = 0
# Case 3: changing one value for multiple variables
df.loc[[4], ["ABC1", "ABC2"]] = 9
# Case 4: changing multiple values for multiple variables
df.loc[[2, 3], ["ABC1", "ABC2"]] = -88
# Case 5: changing values for a renamed variable
df.loc[0, ["ABC3"]] = 0

In [11]:
# Write the altered frame out as the "updated" file; no metadata is supplied.
write_sav(OUTPUT, df, meta=None)

Read and compare the files

In [12]:
# Load both files back from disk. Unlike the earlier read, no `index` argument is passed here.
df_old, _ = read_sav(INPUT)
df_new, _ = read_sav(OUTPUT)

Check that the differences in variable names can all be attributed to the intentional changes: creating, deleting, and renaming the specified variables.

In [13]:
#| export
def reformat_create(create: List[Dict[str, Any]] # variables to be created
                    ) -> List[str]:
    """Return the `target_col` of every creation spec, in the order given."""
    return [spec['target_col'] for spec in create]

def reformat_crud(create: List[Dict[str, Any]], # variables to be created
                  rename: Dict[str, str], # variables to be renamed (old name -> new name)
                  delete: List[str], # variables to be deleted
                  ) -> Tuple[List[str], List[str]]:
    """Summarise the planned column changes as an `(added, removed)` pair of name lists.

    `added` holds the created target columns followed by the new names of renamed
    columns; `removed` holds the deleted columns followed by the old names of
    renamed columns. The inputs are not modified.
    """
    added = reformat_create(create) + list(rename.values())
    # `list(delete)` copies the input, so any sequence of names is accepted, not only a list.
    removed = list(delete) + list(rename.keys())
    return added, removed

In [14]:
# Planned column additions and removals, derived from the CREATE / RENAME / DELETE specs.
ADDED, REMOVED = reformat_crud(CREATE, RENAME, DELETE)

In [15]:
#| export
def test_columns(df1: DataFrame, # updated raw data
                 df2: DataFrame, # original raw data
                 added: List[str], # variables to be created
                 removed: List[str], # old variables that were renamed
                 ) -> None:
    """Verify only intended changes were made to columns."""
    unchanged = [col for col in df2.columns if col not in removed]

    # Check all columns to be added exist
    assert all([col in df1.columns for col in added])
    # Check all columns to be removed do not exist
    assert all([col not in df1.columns for col in removed])
    # Check all other columns exist
    assert all([col in df1.columns for col in unchanged])

In [16]:
# Check the reloaded column names against the planned additions and removals.
test_columns(df_new, df_old, ADDED, REMOVED)

Next, we'll write tests that validate the actual raw data, comparing the original and updated data to ensure only intended changes were made.

In [17]:
#| export
def verify_change(df: DataFrame, # updated dataset
                  columns: List[str], # columns impacted
                  idx: List[int], # IDs impacted
                  value: int|str # new value
                   ) -> None:
    """Verify that for corresponding columns and IDs specified, these changes were implemented."""
    assert np.all(df.loc[idx, columns] == value)

def verify_changes(df: DataFrame, # updated dataset
                   changes: List[Dict[str, Any]], # explicit changes
                   ) -> None: 
    """Run `verify_change` for each change specified in `changes` for the given dataframe."""
    for change in changes:
        columns = change["columns"]
        idx = change["idx"]
        value = change["value"]
        verify_change(df, columns, idx, value)

Verify that for corresponding columns and IDs specified, these changes were implemented.  
Furthermore, verify that there are no other differences between the original and updated datasets.

In [18]:
# Record of the intentional cell edits: each entry lists the columns, the IDs and the
# value those cells were set to. The five entries mirror cases 1-5 applied earlier.
CHANGES = [
    {
        "columns": ["ABC1"],
        "idx": [0],
        "value": 2,
    },
    {
        "columns": ["ABC2"],
        "idx": [0, 1],
        "value": 0,
    },
    {
        "columns": ["ABC1", "ABC2"],
        "idx": [4],
        "value": 9,
    },
    {
        "columns": ["ABC1", "ABC2"],
        "idx": [2, 3],
        "value": -88,
    },
    {
        "columns": ["ABC3"],
        "idx": [0],
        "value": 0
    },
]

# NOTE(review): this checks the in-memory `df`, not `df_new` as reloaded from OUTPUT —
# confirm that is the frame intended here.
verify_changes(df, CHANGES)

We'll then ensure that this fails when attempting to verify incorrect changes.

In [19]:
# A deliberately wrong expectation: ABC1 for ID 0 was set to 2 above, not -88.
FALSE_CHANGES = [
    {
        "columns": ["ABC1"],
        "idx": [0],
        "value": -88,
    },
]

try:
    verify_changes(df, FALSE_CHANGES)
    # Only reached if the wrong expectation slipped through. ValueError is not caught
    # by the handler below, so the cell fails loudly in that case.
    raise ValueError("Error: incorrect changes passed.")
except AssertionError:
    print("Incorrect changes were succesfully detected.")

Incorrect changes were succesfully detected.


This is a variation that's potentially more efficient: defining ALL changes for a given column once (it could also be done per ID, but that makes less sense). However, it may be clearer to keep each change separate by theme, so that changes made for different reasons can be investigated independently.

In [20]:
# Alternative layout: one entry per column, mapping each new value to the IDs that receive it.
# Illustrative only: verify_changes reads the keys "columns" / "idx" / "value", so this
# structure cannot be passed to it as-is, and no cell shown in this notebook consumes it.
CHANGES2 = [
    {
        "column": "ABC1",
        "value_to_idx": {
            2: [0],
            9: [4],
            -88: [2, 3]
        }
    },
    {
        "column": "ABC2",
        "value_to_idx": {
            0: [0, 1],
            9: [4],
            -88: [2, 3]
        }
    },
]

### Testing datasets are equal where no changes are expected

Firstly, let's test broadly by verifying that for all columns where no changes were made, the values should be equal.

In [21]:
#| export
def filter_updated_for_comparison(df: DataFrame, # updated raw data
                                  create: List[Dict[str, Any]], # variables to be created
                                  ) -> DataFrame:
    """Return `df` restricted to the columns that are not a `target_col` of `create`."""
    new_names = reformat_create(create)
    # sort=False: do not sort the remaining column labels
    kept = df.columns.difference(new_names, sort=False)
    return df[kept]

def filter_original_for_comparison(df: DataFrame, # original raw data
                                   rename: Dict[str, str], # variables to be renamed
                                   delete: List[str], # variables to be deleted
                                   ) -> DataFrame:
    """Return `df` without the `delete` columns and with the `rename` mapping applied to the rest."""
    # sort=False: do not sort the remaining column labels
    surviving = df.columns.difference(delete, sort=False)
    trimmed = df[surviving]
    return trimmed.rename(columns=rename)

In [22]:
#| export
def columns_with_data_changed(changes: List[Dict[str, Any]] # explicit changes
                              ) -> List[str]:
    """Return a list of unique columns where data was changed.

    Columns appear in the order they are first mentioned across `changes`, so the
    result is the same on every run.
    """
    # dict keys de-duplicate while keeping insertion order, unlike a set whose
    # iteration order for strings can differ between interpreter runs.
    return list(dict.fromkeys(col for change in changes for col in change['columns']))

In [None]:
#| export
def test_data_eq(df1: DataFrame, # updated raw data
                 df2: DataFrame, # original raw data
                 ) -> None:
    """Verify two datasets are the same."""
    pd.testing.assert_frame_equal(df1, df2)

def verify_data_unchanged_for_unchanged_columns(df1: DataFrame, # updated raw data
                                                df2: DataFrame, # original raw data
                                                changes: List[Dict[str, Any]] # explicit changes
                                                ) -> None:
    """Verify the updated and original datasets are identical in every column that has no recorded change.

    Columns named in `changes` are dropped from both frames before the comparison.
    """
    touched = columns_with_data_changed(changes)
    test_data_eq(df1.drop(columns=touched), df2.drop(columns=touched))

In [24]:
# Bring both frames to a comparable set of columns: drop the created columns from the
# updated data, and drop the deleted columns / apply the renames on the original data.
df_new_filtered = filter_updated_for_comparison(df_new, CREATE)
df_old_filtered = filter_original_for_comparison(df_old, RENAME, DELETE)

In [25]:
# Columns that have no entry in CHANGES must be identical in both filtered frames.
verify_data_unchanged_for_unchanged_columns(df_new_filtered, df_old_filtered, CHANGES)

Next, we'll investigate the columns where changes *were* applied, excluding the IDs where the changes were applied; every other cell in those columns should be equal.

In [26]:
def verify_data_unchanged_for_changed_columns(df1: DataFrame, # updated raw data
                                              df2: DataFrame, # original raw data
                                              changes: List[Dict[str, Any]] # explicit changes, each with 'columns' and 'idx'
                                              ) -> None:
    """Verify that every cell not covered by a recorded change is identical in both datasets.

    A boolean mask shaped like `df1` marks the (ID, column) cells listed in `changes`;
    those cells are blanked in both frames and the remainder is compared with `test_data_eq`.
    """
    # Create a mask for changed cells
    mask = pd.DataFrame(False, index=df1.index, columns=df1.columns)

    for change in changes:
        for col in change['columns']:
            mask.loc[change['idx'], col] = True

    # Indexing with the inverted mask keeps the untouched cells and replaces the recorded ones with NaN
    test_data_eq(df1[~mask], df2[~mask])

Now introduce a new change that's unrecorded, and verify the test fails

In [35]:
# Work on a copy so the filtered frame used by other cells stays intact.
df_new_filtered_changed = df_new_filtered.copy()
# ABC3 for ID 3 has no entry in CHANGES (only ID 0 is recorded for ABC3), so this edit is unrecorded.
df_new_filtered_changed.at[3, "ABC3"] = 0
# test_fail (from fastcore.test) passes only if the wrapped call raises.
test_fail(verify_data_unchanged_for_changed_columns, args=(df_new_filtered_changed, df_old_filtered, CHANGES))

In [None]:
#| export

def test_data_unchanged(df1: DataFrame, # updated raw data
                        df2: DataFrame, # original raw data
                        added: List[str] = [], # columns which were added
                        removed: List[str] = [], # columns which were removed
                        ) -> bool:
    """Verify that columns without changes specified remain unchanged.

    Placeholder: the body is currently `pass`, so nothing is checked and `None` is
    returned despite the `bool` annotation.
    """
    # Draft implementation, left disabled:
    # new_cols = [col for col in df1.columns if col not in added]
    # old_cols = [col for col in df2.columns if col not in removed]
    # return pd.testing.assert_frame_equal(df1[new_cols], df2[old_cols])
    # NOTE(review): the list defaults are shared between calls; harmless while unused,
    # but they must not be mutated once this is implemented.
    pass


def test_data_created():
    """Verify that columns that were created...

    Placeholder: not implemented yet; the body is `pass` and the intended check is
    still to be specified.
    """
    pass

def test_data_renamed(df1: DataFrame, # updated raw data
                      df2: DataFrame, # original raw data
                      renamed: Dict[str, str] = {}, # columns which were renamed
                      ) -> bool:
    """Verify that columns that were renamed...

    Placeholder: not implemented yet; the body is `pass`, so `None` is returned
    despite the `bool` annotation.
    """
    pass

In [None]:
# NOTE: test_data_unchanged is still a stub whose body is `pass`, so this call checks nothing yet.
test_data_unchanged(df_new, df_old, ADDED, REMOVED)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()