In [37]:
#| default_exp transforms

In [38]:
#| hide
from nbdev.showdoc import *
from fastcore.test import *

In [39]:
#| export
from fastcore.utils import *
import pandas as pd
from pandas import DataFrame
import numpy as np
from typing import Dict, List, Tuple, Any

from data_harmonising.data import *

In [40]:
# Example .sav file; the `df` loaded from it is the frame the hidden test cell
# further down passes to create_variables.
file = '../data/G201_Quest_PA.sav'
# read_sav is not defined in this notebook (presumably it comes from the
# data_harmonising.data star import — confirm). Its result is unpacked as
# (df, meta), with "ID" passed as the `index` argument.
df, meta = read_sav(file, index="ID")
# Dataset is likewise defined elsewhere; it is constructed from the frame and its metadata.
data = Dataset(df, meta)

## Create

Add new variables to the dataset by recoding the values of an existing column through an ordered list of conditions.

In [41]:
#| export
# TODO: broaden it so it also accepts multiple condition_cols; responds differently depending on list or str?
def apply_conditions(df: DataFrame, 
                     condition_col: str, 
                     conditions: List[Dict[str, Any]],
                     default = np.nan
                     ) -> np.ndarray:
    """Map the values of `condition_col` to new values using an ordered list of rules.

    Each rule in `conditions` is a dict with two keys:

    - ``'condition'``: either a callable, applied to every element of the column
      and interpreted as a truth value, or a plain value compared to the column
      with ``==``. A missing-value scalar (``None``, ``np.nan``, ...) matches the
      missing entries of the column instead, because ``NaN == NaN`` is never true.
    - ``'value'``: the result assigned to the rows the rule matches.

    Rules are evaluated in order and the first matching rule wins; rows matched
    by no rule receive `default`. An empty `conditions` list yields an array
    filled with `default`.

    Returns a NumPy array with one entry per row of `df`.
    Raises `KeyError` if `condition_col` is not a column of `df` or if a rule
    lacks one of its two keys.
    """
    # Look the column up once; this also fails fast on an unknown column name.
    column = df[condition_col]

    # np.select refuses an empty condition list, so answer that case directly.
    if not conditions:
        return np.full(len(df), default)

    masks = []
    choices = []

    for rule in conditions:
        cond = rule['condition']
        choices.append(rule['value'])

        if callable(cond):
            mask = column.apply(cond)
        elif pd.api.types.is_scalar(cond) and pd.isna(cond):
            # Equality can never match a missing value; test for missingness instead.
            mask = column.isna()
        else:
            mask = column == cond

        # np.select only accepts boolean arrays. A callable may return other
        # truthy values, and Series.apply on an empty column keeps the column's
        # own dtype, so coerce every mask explicitly.
        masks.append(np.asarray(mask, dtype=bool))

    return np.select(masks, choices, default=default)

In [42]:
#| hide

# Tests - refer to Injuries and Incidents G201

In [43]:
#| export
# TODO: decouple from apply_conditions; make separate/orthogonal such that functions can be composed sequentially
def create_variables(df: DataFrame, 
                     transformations: list
                     ) -> DataFrame:
    """Derive new columns from existing ones and return them attached to a copy of `df`.

    Each item of `transformations` is a dict with three keys:

    - ``'target_col'``: name of the column to create.
    - ``'condition_col'``: name of the existing column the rules are evaluated on.
    - ``'conditions'``: the rule list handed to `apply_conditions`.

    Every new column is computed from the original `df`, so one transformation
    cannot read a column produced by another in the same call. If a target name
    already exists in `df`, the existing column is replaced rather than
    duplicated, so applying the same transformations twice gives the same result
    as applying them once. The input frame itself is not modified.
    """
    new_columns = {
        spec['target_col']: apply_conditions(df, spec['condition_col'], spec['conditions'])
        for spec in transformations
    }

    # Build all new columns in one frame aligned on the original index.
    new_df = DataFrame(new_columns, index=df.index)

    # Concatenating onto a frame that already holds a target name would leave
    # two columns with the same label; drop the old copy first.
    stale = [col for col in new_df.columns if col in df.columns]

    return pd.concat([df.drop(columns=stale), new_df], axis=1)

In [44]:
#| hide

# Tests

# Recode rules, evaluated in order (first match wins): 999 and 88 are mapped
# explicitly and every other value becomes 1.
conditions = [
    {"condition": 999, "value": 999},
    {"condition": 88, "value": 0},
    {"condition": lambda x: x not in [88, 999], "value": 1}
]

CREATE = [
    {
        "target_col": "G201_IL",
        "condition_col": "G201_IF1",
        "conditions": conditions
    }
]

# Bind the result to a new name instead of overwriting `df`, so re-running this
# cell always starts from the frame as loaded and stays idempotent.
df_created = create_variables(df, CREATE)

# Maps each created column to the source column it was derived from.
TEST = {
    "G201_IL": "G201_IF1", 
}

for target_col, condition_col in TEST.items():
    source = df_created[condition_col]
    created = df_created[target_col]
    assert (created[~source.isin([88, 999])] == 1).all()
    assert (created[source == 88] == 0).all()
    assert (created[source == 999] == 999).all()

## Delete

Remove variables from the dataset.

In [45]:
#| export
def delete_variables(df: DataFrame, # data
                     vars: List[str] # list of variables to remove
                     ) -> DataFrame:
    """Return a copy of `df` without the columns named in `vars`.

    The input frame is left untouched. `DataFrame.drop` raises `KeyError` when a
    requested column does not exist.
    """
    trimmed = df.drop(columns=vars)
    # Post-condition: none of the requested labels may survive the drop.
    survivors = [name for name in vars if name in trimmed.columns]
    assert not survivors
    return trimmed

## Rename

Rename variables in the data or in its metadata.

In [46]:
#| export
def rename_variables(data: DataFrame, # data or metadata
                     vars: Dict[str, str]
                     ) -> DataFrame:
    """Rename columns of `data` according to `vars` (old name -> new name).

    Returns a new DataFrame; the input is not modified. All renames happen
    simultaneously, so swaps (``{'a': 'b', 'b': 'a'}``), chains
    (``{'a': 'b', 'b': 'c'}``) and identity entries are valid.

    Raises `AssertionError` if a column to rename is absent from `data`, or if
    the renamed frame does not end up with the expected labels.
    """
    # Check the sources up front: DataFrame.rename silently skips unknown labels,
    # and the post-check alone cannot notice that when the new name already exists.
    missing = [name for name in vars if name not in data.columns]
    assert not missing, f"columns to rename not found: {missing}"

    data = data.rename(columns=vars)

    # Test changes successful
    new_names = set(vars.values())
    for original, updated in vars.items():
        # An old name legitimately remains when it is also one of the new names
        # (swap, chain or identity rename); only otherwise must it be gone.
        assert original in new_names or original not in data.columns, \
            f"column {original!r} is still present after renaming"
        assert updated in data.columns, \
            f"column {updated!r} is missing after renaming"
    return data

## Update

In [47]:
#| hide
# Write the cells marked `#| export` out to the package's Python module.
import nbdev
nbdev.nbdev_export()