In [4]:
# Load the `rich` IPython extension for nicer rendering of cell output.
%load_ext rich
# Enable autoreload in mode 2 so edited modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

In [5]:
import polars as pl
import pointblank as pb

from odyssey.core import *
from odyssey.explore import *



In [6]:
from config import INTERIM_DATA

In [7]:
# Load the G220 questionnaire file from the interim data directory.
# NOTE(review): `Dataset` comes from one of the `odyssey` star imports; `load_data()` is unpacked
# here as (frame, metadata) and the frame is presumably a polars LazyFrame — confirm.
g220 = Dataset("G220_Q.sav", INTERIM_DATA)
lf_g220, _meta = g220.load_data()
# Collect into `df_g220`, the frame handed to the validation functions further down.
df_g220 = lf_g220.collect()

In [8]:
# Load the G222 questionnaire file the same way as G220.
# NOTE(review): `_meta` is reassigned here, so the metadata unpacked for G220 is no longer
# reachable under that name — use distinct names if either is needed later.
g222 = Dataset("G222_Q.sav", INTERIM_DATA)
lf_g222, _meta = g222.load_data()
# Collect into `df_g222`, the frame handed to the validation functions further down.
df_g222 = lf_g222.collect()

In [9]:
from typing import Callable

def check_total_mins(
    hpd_column: str,
    mpd_column: str
    ) -> Callable:
    """
    Build a preprocessing function that recomputes the total minutes per day for one activity.

    The returned function adds a "check" column equal to `hours * 60 + minutes`, capped at
    180 minutes. A null in either input column is treated as 0, so "check" itself is never null.

    Args:
        hpd_column: name of the hours-per-day column.
        mpd_column: name of the minutes-per-day column.

    Returns:
        A function taking a polars DataFrame and returning it with the extra "check" column.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        hours = pl.col(hpd_column).fill_null(0)
        minutes = pl.col(mpd_column).fill_null(0)
        total_minutes = hours * 60 + minutes
        # Anything above the 180-minute ceiling is replaced by the ceiling itself.
        capped_minutes = pl.when(total_minutes > 180).then(180).otherwise(total_minutes)
        return df.with_columns(capped_minutes.alias("check"))
    return preprocessor

def check_met(
    mins_column: str,
    n_days_column: str,
    met_column: str,
    factor: int|float # the corresponding factor for the activity (Vig: 8, Mod: 4, Walk: 3.3)
    ) -> Callable:
    """
    Build a preprocessing function that recomputes the MET value for one activity.

    The returned function adds a "check" column equal to `minutes * days * factor`, with nulls
    in the minutes and days columns treated as 0. It also replaces nulls in `met_column` with 0
    so both sides of the comparison are always populated.

    Args:
        mins_column: name of the minutes-per-day column.
        n_days_column: name of the days-per-week column.
        met_column: name of the stored MET column being verified.
        factor: multiplier applied to `minutes * days`.

    Returns:
        A function taking a polars DataFrame and returning it with "check" added and
        `met_column` null-filled.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        minutes = pl.col(mins_column).fill_null(0)
        days = pl.col(n_days_column).fill_null(0)
        # NOTE(review): with a non-integer factor (e.g. 3.3) an exact-equality comparison
        # against this column may be sensitive to floating-point rounding.
        expected_met = (minutes * days * factor).alias("check")
        stored_met = pl.col(met_column).fill_null(0)
        return df.with_columns(expected_met, stored_met)
    return preprocessor

def check_tot_met(
    met_columns: list[str],
    tot_met_column: str
    ) -> Callable:
    """
    Build a preprocessing function that recomputes the total MET value.

    The returned function adds a "check" column holding the row-wise sum of `met_columns`
    (nulls counted as 0) and replaces nulls in `tot_met_column` with 0.

    Args:
        met_columns: names of the per-activity MET columns to add together.
        tot_met_column: name of the stored total MET column being verified.

    Returns:
        A function taking a polars DataFrame and returning it with "check" added and
        `tot_met_column` null-filled.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        filled_parts = [pl.col(name).fill_null(0) for name in met_columns]
        expected_total = sum(filled_parts)
        # Nulls become 0 because the validation skips a comparison when either value is null.
        stored_total = pl.col(tot_met_column).fill_null(0)
        return df.with_columns(expected_total.alias("check"), stored_total)
    return preprocessor

In [None]:
#TODO: flesh this out and run validations after initial harmonisation

def check_ipaq_cat(
    vig_d: str, # days of vigorous exercise per week
    mod_d: str,
    walk_d: str,
    vig_m: str, # mins of vigorous exercise per day
    mod_m: str,
    walk_m: str,
    tot_met: str, # total MET minutes per week
    cat: str # IPAQ category (low: 0, moderate: 1, high: 2)
    ) -> Callable:
    """
    Build a preprocessing function that recomputes the IPAQ category.

    Intended criteria:

    HIGH: 2
    Vigorous exercise on 3+ days AND >= 1500 MET mins per week
    OR any exercise on 7 days AND >= 3000 MET mins per week

    MODERATE: 1
    Vig exercise 3+ days AND/OR walking 30 mins/day
    OR Mod exercise 5+ days AND/OR walking 30 mins/day
    OR any exercise on 5+ days AND >= 600 MET mins per week

    LOW: 0
    None of the above criteria

    What the code actually evaluates, in order (first match wins):
      1. 2 if (vig days >= 3 and total MET >= 1500)
         or (vig + mod + walk days >= 7 and total MET >= 3000)
      2. 1 if (vig days >= 3 and vig mins >= 30) or (mod days >= 5 and mod mins >= 30)
         or (vig + mod + walk days >= 5 and total MET >= 600)
      3. null if total MET is null
      4. 0 otherwise

    The result is written to a "check" column, and nulls in `cat` are replaced with 0.

    NOTE(review): `walk_m` is accepted but never read, so the "walking 30 mins/day" part of
    the MODERATE rule is not implemented yet.
    NOTE(review): the published IPAQ scoring protocol appears to use 20 mins/day for the
    vigorous MODERATE rule rather than 30 — confirm against the study codebook.

    Returns:
        A function taking a polars DataFrame and returning it with "check" added and
        `cat` null-filled.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        weekly_met = pl.col(tot_met)
        vig_days = pl.col(vig_d)

        # The data cannot tell us directly whether someone exercised "on 7 days".
        # Working assumption: days are summed across activity types, so two kinds of
        # exercise on the same day count as two days.
        active_days = sum([pl.col(name).fill_null(0) for name in (vig_d, mod_d, walk_d)])

        is_high = (
            ((vig_days >= 3) & (weekly_met >= 1500))
            | ((active_days >= 7) & (weekly_met >= 3000))
        )

        # The "AND/OR walking 30 mins/day" wording is ambiguous; only the vigorous and
        # moderate minute thresholds are tested here.
        is_moderate = (
            ((vig_days >= 3) & (pl.col(vig_m) >= 30))
            | ((pl.col(mod_d) >= 5) & (pl.col(mod_m) >= 30))
            | ((active_days >= 5) & (weekly_met >= 600))
        )

        expected_category = (
            pl.when(is_high).then(2)
            .when(is_moderate).then(1)
            .when(weekly_met.is_null()).then(None)  # missing data stays missing, not 0
            .otherwise(0)
        )

        # Nulls become 0 because the validation skips a comparison when either value is null.
        stored_category = pl.col(cat).fill_null(0)

        return df.with_columns(expected_category.alias("check"), stored_category)
    return preprocessor

In [16]:
def test_validate_ipaq(
    prefix: str, # prefix for the dataset
    df: pl.DataFrame
    ) -> pb.Validate:
    """
    Run only the IPAQ category check for one dataset and return the interrogated result.

    Args:
        prefix: dataset prefix used to build column names such as `{prefix}_IPAQ_CAT`.
        df: the frame holding that dataset's IPAQ columns.

    Returns:
        The Pointblank validation after `interrogate()` has been called.
    """
    def ipaq(suffix: str) -> str:
        # Column names follow the pattern `<prefix>_IPAQ_<suffix>`.
        return f"{prefix}_IPAQ_{suffix}"

    plan = pb.Validate(data=df)

    recompute_category = check_ipaq_cat(
        ipaq("VIG_D"), ipaq("MOD_D"), ipaq("WALK_D"),
        ipaq("VIG_MINS"), ipaq("MOD_MINS"), ipaq("WALK_MINS"),
        ipaq("TOT_MET"), ipaq("CAT"),
    )

    plan = plan.col_vals_eq(
        columns=ipaq("CAT"),
        value=pb.col("check"),
        pre=recompute_category,
        brief="Check `IPAQ_CAT` is correctly calculated.",
    )

    return plan.interrogate()

In [18]:
# Run the IPAQ category check on the G220 data; the function returns an already-interrogated
# validation, and the bare `v` on the last line renders its report as the cell output.
v = test_validate_ipaq("G220", df_g220)
v

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars,2025-06-18|08:29:37Polars
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C66,1,col_vals_eq  col_vals_eq()  Check IPAQ_CAT is correctly calculated.,G220_IPAQ_CAT,check,,✓,1487,1100 0.74,87 0.06,—,—,—,CSV
2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC,2025-06-18 08:29:37 UTC< 1 s2025-06-18 08:29:37 UTC


In [20]:
# Inspect validation step 1, showing only columns whose names match ID, _MET or IPAQ_CAT.
v.get_step_report(i=1, columns_subset=pb.matches(r"ID|_MET|IPAQ_CAT"))

Report for Validation Step 1ASSERTION G220_IPAQ_CAT = check87 / 1487 TEST UNIT FAILURES IN COLUMN 93 EXTRACT OF FIRST 10 ROWS (WITH TEST UNIT FAILURES IN RED):,Report for Validation Step 1ASSERTION G220_IPAQ_CAT = check87 / 1487 TEST UNIT FAILURES IN COLUMN 93 EXTRACT OF FIRST 10 ROWS (WITH TEST UNIT FAILURES IN RED):,Report for Validation Step 1ASSERTION G220_IPAQ_CAT = check87 / 1487 TEST UNIT FAILURES IN COLUMN 93 EXTRACT OF FIRST 10 ROWS (WITH TEST UNIT FAILURES IN RED):,Report for Validation Step 1ASSERTION G220_IPAQ_CAT = check87 / 1487 TEST UNIT FAILURES IN COLUMN 93 EXTRACT OF FIRST 10 ROWS (WITH TEST UNIT FAILURES IN RED):,Report for Validation Step 1ASSERTION G220_IPAQ_CAT = check87 / 1487 TEST UNIT FAILURES IN COLUMN 93 EXTRACT OF FIRST 10 ROWS (WITH TEST UNIT FAILURES IN RED):,Report for Validation Step 1ASSERTION G220_IPAQ_CAT = check87 / 1487 TEST UNIT FAILURES IN COLUMN 93 EXTRACT OF FIRST 10 ROWS (WITH TEST UNIT FAILURES IN RED):,Report for Validation Step 1ASSERTION G220_IPAQ_CAT = check87 / 1487 TEST UNIT FAILURES IN COLUMN 93 EXTRACT OF FIRST 10 ROWS (WITH TEST UNIT FAILURES IN RED):
Unnamed: 0_level_1,IDFloat64,G220_IPAQ_VIG_METFloat64,G220_IPAQ_MOD_METFloat64,G220_IPAQ_WALK_METFloat64,G220_IPAQ_TOT_METFloat64,G220_IPAQ_CATFloat64
3,10030.0,1920.0,0.0,,,0.0
39,10720.0,7200.0,,,,0.0
58,11180.0,3840.0,2160.0,4158.0,10158.0,0.0
61,11230.0,10080.0,2520.0,4158.0,16758.0,0.0
106,12160.0,2880.0,,4158.0,,0.0
134,12790.0,8640.0,5040.0,4158.0,17838.0,0.0
137,12820.0,0.0,0.0,495.0,495.0,1.0
139,12900.0,7200.0,,,,0.0
181,13810.0,1920.0,,1188.0,,0.0
198,14110.0,3600.0,0.0,,,0.0


In [None]:
def validate_ipaq(
    prefix: str, # prefix for the dataset
    df: pl.DataFrame
    ) -> pb.Validate:
    """
    Build the Pointblank validation plan for one dataset's IPAQ columns.

    Active steps:
      * `{prefix}_IPAQ_{VIG,MOD,WALK}_MINS` equals `HPD*60 + MPD` (see `check_total_mins`),
        with `na_pass=True` so null comparisons count as passing.
      * `{prefix}_IPAQ_CAT` equals the category recomputed by `check_ipaq_cat`.

    The remaining steps are commented out pending the initial harmonisation.

    Args:
        prefix: dataset prefix used to build column names such as `{prefix}_IPAQ_CAT`.
        df: the frame holding that dataset's IPAQ columns.

    Returns:
        The validation plan. It is NOT interrogated here; the caller must call
        `.interrogate()` on the result.
    """

    validation = (
        pb.Validate(
            data=df,
        )
        .col_vals_eq(
            columns=f"{prefix}_IPAQ_VIG_MINS",
            value=pb.col("check"),
            pre=check_total_mins(f"{prefix}_IPAQ_VIG_HPD", f"{prefix}_IPAQ_VIG_MPD"),
            brief="Check total mins/day equals `HPD*60 + MPD`",
            na_pass=True,
        )
        .col_vals_eq(
            columns=f"{prefix}_IPAQ_MOD_MINS",
            value=pb.col("check"),
            pre=check_total_mins(f"{prefix}_IPAQ_MOD_HPD", f"{prefix}_IPAQ_MOD_MPD"),
            brief="Check total mins/day equals `HPD*60 + MPD`",
            na_pass=True,
        )
        .col_vals_eq(
            columns=f"{prefix}_IPAQ_WALK_MINS",
            value=pb.col("check"),
            pre=check_total_mins(f"{prefix}_IPAQ_WALK_HPD", f"{prefix}_IPAQ_WALK_MPD"),
            brief="Check total mins/day equals `HPD*60 + MPD`",
            na_pass=True,
        )
        # .col_vals_eq(
        #     columns=f"{prefix}_IPAQ_VIG_MET",
        #     value=pb.col("check"),
        #     pre=check_met(f"{prefix}_IPAQ_VIG_MINS", f"{prefix}_IPAQ_VIG_D", f"{prefix}_IPAQ_VIG_MET", factor=8),
        # )
        # .col_vals_eq(
        #     columns=f"{prefix}_IPAQ_MOD_MET",
        #     value=pb.col("check"),
        #     pre=check_met(f"{prefix}_IPAQ_MOD_MINS", f"{prefix}_IPAQ_MOD_D", f"{prefix}_IPAQ_MOD_MET", factor=4),
        # )
        # .col_vals_eq(
        #     columns=f"{prefix}_IPAQ_WALK_MET",
        #     value=pb.col("check"),
        #     pre=check_met(f"{prefix}_IPAQ_WALK_MINS", f"{prefix}_IPAQ_WALK_D", f"{prefix}_IPAQ_WALK_MET", factor=3.3),
        # )
        # .col_vals_eq(
        #     columns=f"{prefix}_IPAQ_TOT_MET",
        #     value=pb.col("check"),
        #     pre=check_tot_met([f"{prefix}_IPAQ_VIG_MET", f"{prefix}_IPAQ_MOD_MET", f"{prefix}_IPAQ_WALK_MET"], f"{prefix}_IPAQ_TOT_MET"),
        #     brief="Check `TOT_MET` equals the sum of `VIG_MET`, `MOD_MET`, and `WALK_MET`"
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_VIG_D",
        #     left=1,
        #     right=7,
        #     segments=(f"{prefix}_IPAQ_VIG_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_VIG_HPD",
        #     left=0,
        #     right=18, # unrealistic to do more than 18 hours of exercise per day (even that is a stretch!!)
        #     segments=(f"{prefix}_IPAQ_VIG_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_VIG_MPD",
        #     left=0,
        #     right=59,
        #     segments=(f"{prefix}_IPAQ_VIG_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_VIG_MINS",
        #     left=0,
        #     right=180, # total mins per category is capped at 180 mins
        #     segments=(f"{prefix}_IPAQ_VIG_W", 1),
        # )
        # .col_vals_eq(
        #     columns=[f"{prefix}_IPAQ_VIG_D", f"{prefix}_IPAQ_VIG_HPD", f"{prefix}_IPAQ_VIG_MPD", f"{prefix}_IPAQ_VIG_MINS", f"{prefix}_IPAQ_VIG_MET"],
        #     value=0,
        #     segments=(f"{prefix}_IPAQ_VIG_W", 0)
        # )
        # .col_vals_null(
        #     columns=[f"{prefix}_IPAQ_VIG_D", f"{prefix}_IPAQ_VIG_HPD", f"{prefix}_IPAQ_VIG_MPD", f"{prefix}_IPAQ_VIG_MINS", f"{prefix}_IPAQ_VIG_MET"],
        #     pre=lambda df: df.with_columns(pl.col(f"{prefix}_IPAQ_VIG_W").fill_null(-1)), # Pointblank doesn't seem to like segmenting values with null, so transform null to -1 and segment by that
        #     segments=(f"{prefix}_IPAQ_VIG_W", -1)
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_MOD_D",
        #     left=1,
        #     right=7,
        #     segments=(f"{prefix}_IPAQ_MOD_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_MOD_HPD",
        #     left=0,
        #     right=18, # unrealistic to do more than 18 hours of exercise per day (even that is a stretch!!)
        #     segments=(f"{prefix}_IPAQ_MOD_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_MOD_MPD",
        #     left=0,
        #     right=59,
        #     segments=(f"{prefix}_IPAQ_MOD_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_MOD_MINS",
        #     left=0,
        #     right=180, # total mins per category is capped at 180 mins
        #     segments=(f"{prefix}_IPAQ_MOD_W", 1),
        # )
        # .col_vals_eq(
        #     columns=[f"{prefix}_IPAQ_MOD_D", f"{prefix}_IPAQ_MOD_HPD", f"{prefix}_IPAQ_MOD_MPD", f"{prefix}_IPAQ_MOD_MINS", f"{prefix}_IPAQ_MOD_MET"],
        #     value=0,
        #     segments=(f"{prefix}_IPAQ_MOD_W", 0)
        # )
        # .col_vals_null(
        #     columns=[f"{prefix}_IPAQ_MOD_D", f"{prefix}_IPAQ_MOD_HPD", f"{prefix}_IPAQ_MOD_MPD", f"{prefix}_IPAQ_MOD_MINS", f"{prefix}_IPAQ_MOD_MET"],
        #     pre=lambda df: df.with_columns(pl.col(f"{prefix}_IPAQ_MOD_W").fill_null(-1)), # Pointblank doesn't seem to like segmenting values with null, so transform null to -1 and segment by that
        #     segments=(f"{prefix}_IPAQ_MOD_W", -1)
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_WALK_D",
        #     left=1,
        #     right=7,
        #     segments=(f"{prefix}_IPAQ_WALK_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_WALK_HPD",
        #     left=0,
        #     right=18, # unrealistic to do more than 18 hours of exercise per day (even that is a stretch!!)
        #     segments=(f"{prefix}_IPAQ_WALK_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_WALK_MPD",
        #     left=0,
        #     right=59,
        #     segments=(f"{prefix}_IPAQ_WALK_W", 1),
        # )
        # .col_vals_between(
        #     columns=f"{prefix}_IPAQ_WALK_MINS",
        #     left=0,
        #     right=180, # total mins per category is capped at 180 mins
        #     segments=(f"{prefix}_IPAQ_WALK_W", 1),
        # )
        # .col_vals_eq(
        #     columns=[f"{prefix}_IPAQ_WALK_D", f"{prefix}_IPAQ_WALK_HPD", f"{prefix}_IPAQ_WALK_MPD", f"{prefix}_IPAQ_WALK_MINS", f"{prefix}_IPAQ_WALK_MET"],
        #     value=0,
        #     segments=(f"{prefix}_IPAQ_WALK_W", 0)
        # )
        # .col_vals_null(
        #     columns=[f"{prefix}_IPAQ_WALK_D", f"{prefix}_IPAQ_WALK_HPD", f"{prefix}_IPAQ_WALK_MPD", f"{prefix}_IPAQ_WALK_MINS", f"{prefix}_IPAQ_WALK_MET"],
        #     pre=lambda df: df.with_columns(pl.col(f"{prefix}_IPAQ_WALK_W").fill_null(-1)), # Pointblank doesn't seem to like segmenting values with null, so transform null to -1 and segment by that
        #     segments=(f"{prefix}_IPAQ_WALK_W", -1)
        # )
        .col_vals_eq(
            columns=f"{prefix}_IPAQ_CAT",
            value=pb.col("check"),
            # check_ipaq_cat takes eight column names (days and minutes per activity, total
            # MET, category). Keywords are used so none of them can be silently transposed.
            pre=check_ipaq_cat(
                vig_d=f"{prefix}_IPAQ_VIG_D",
                mod_d=f"{prefix}_IPAQ_MOD_D",
                walk_d=f"{prefix}_IPAQ_WALK_D",
                vig_m=f"{prefix}_IPAQ_VIG_MINS",
                mod_m=f"{prefix}_IPAQ_MOD_MINS",
                walk_m=f"{prefix}_IPAQ_WALK_MINS",
                tot_met=f"{prefix}_IPAQ_TOT_MET",
                cat=f"{prefix}_IPAQ_CAT",
            ),
            brief="Check `IPAQ_CAT` is correctly calculated."
        )
    )

    return validation

In [None]:
# validate_ipaq returns the plan without interrogating it, so interrogation is triggered here.
validation = validate_ipaq(prefix="G222", df=df_g222).interrogate()