In [29]:
%load_ext rich
%load_ext autoreload
%autoreload 2

The rich extension is already loaded. To reload it, use:
  %reload_ext rich
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
from pathlib import Path
import polars as pl
import pointblank as pb
import numpy as np
from rich import print as rprint

from odyssey.core import *
from odyssey.explore import *

In [31]:
from config.paths import RAW_DATA

In [None]:
# Columns pulled from the G220 file, in the order they should appear in df_g220.
cols_g220 = [
    "ID",
    # IPAQ_MOD_* items
    "G220_IPAQ_MOD_D",
    "G220_IPAQ_MOD_HPD",
    "G220_IPAQ_MOD_MPD",
    "G220_IPAQ_MOD_W",
    # IPAQ_VIG_* items
    "G220_IPAQ_VIG_D",
    "G220_IPAQ_VIG_HPD",
    "G220_IPAQ_VIG_MPD",
    "G220_IPAQ_VIG_W",
    # IPAQ_WALK_* items
    "G220_IPAQ_WALK_D",
    "G220_IPAQ_WALK_HPD",
    "G220_IPAQ_WALK_MPD",
    "G220_IPAQ_WALK_W",
    # SIT_* items
    "G220_IPAQ_SIT_WD_HPD",
    "G220_IPAQ_SIT_WD_MPD",
    "G220_SIT_WD_TRUNC",
    "G220_IPAQ_SIT_COM",
    # MET / MINS columns that the validation cells re-compute and compare
    "G220_VIG_MET",
    "G220_VIG_MINS",
    "G220_MOD_MET",
    "G220_MOD_MINS",
    "G220_WALK_MET",
    "G220_WALK_MINS",
    # Summary columns
    "G220_IPAQ_CAT",
    "G220_TOT_MET",
]

In [47]:
# Columns pulled from the G222 file, in the order they should appear in df_g222.
# NOTE(review): unlike cols_g220 this list has no "ID" column - confirm that is intentional.
cols_g222 = [
    # IPAQ_VIG_* items
    "G222_IPAQ_VIG_W",
    "G222_IPAQ_VIG_D",
    "G222_IPAQ_VIG_HPD",
    "G222_IPAQ_VIG_MPD",
    "G222_IPAQ_VIG_UNK",
    # IPAQ_MOD_* items
    "G222_IPAQ_MOD_W",
    "G222_IPAQ_MOD_D",
    "G222_IPAQ_MOD_HPD",
    "G222_IPAQ_MOD_MPD",
    "G222_IPAQ_MOD_UNK",
    # IPAQ_WALK_* items
    "G222_IPAQ_WALK_W",
    "G222_IPAQ_WALK_D",
    "G222_IPAQ_WALK_HPD",
    "G222_IPAQ_WALK_MPD",
    "G222_IPAQ_WALK_UNK",
    # SIT_WD_* items
    "G222_IPAQ_SIT_WD_HPD",
    "G222_IPAQ_SIT_WD_MPD",
    "G222_SIT_WD_TRUNC",
    "G222_IPAQ_SIT_WD_UNK",
    # SIT_WE_* items
    "G222_IPAQ_SIT_WE_HPD",
    "G222_IPAQ_SIT_WE_MPD",
    "G222_SIT_WE_TRUNC",
    "G222_IPAQ_SIT_WE_UNK",
    # MINS columns that the validation cells re-compute and compare
    "G222_VIG_MINS",
    "G222_MOD_MINS",
    "G222_WALK_MINS",
    # MET columns and summary columns
    "G222_VIG_MET",
    "G222_MOD_MET",
    "G222_WALK_MET",
    "G222_TOT_MET",
    "G222_IPAQ_CAT",
]

In [None]:
# Point a Dataset at "G220_Q.sav" under RAW_DATA.
# (Dataset is not defined in this notebook; presumably it comes from the odyssey star-imports.)
g220 = Dataset("G220_Q.sav", RAW_DATA)
# load_data() returns a pair; the second element is bound to `_meta` and is not used
# by any of the visible cells.
lf_g220, _meta = g220.load_data()
# Keep only the columns listed in cols_g220, then collect the result
# (presumably lf_g220 is a polars LazyFrame - confirm against Dataset.load_data).
df_g220 = lf_g220.select(cols_g220).collect()

In [48]:
# Point a Dataset at "G222_Q.sav" under RAW_DATA.
# (Dataset is not defined in this notebook; presumably it comes from the odyssey star-imports.)
g222 = Dataset("G222_Q.sav", RAW_DATA)
# load_data() returns a pair; the second element is bound to `_meta` and is not used
# by any of the visible cells.
lf_g222, _meta = g222.load_data()
# Keep only the columns listed in cols_g222, then collect the result
# (presumably lf_g222 is a polars LazyFrame - confirm against Dataset.load_data).
df_g222 = lf_g222.select(cols_g222).collect()

In [35]:
from typing import Callable

def check_total_mins(
    hpd_column: str,
    mpd_column: str
    ) -> Callable:
    """
    Build a preprocessing function that recomputes a category's total minutes per day.

    The returned function adds a ``check`` column equal to
    ``hpd_column * 60 + mpd_column``, capped at 180. Nulls in either input column
    are replaced by 0 before the arithmetic, so a row with both inputs null gets
    ``check == 0`` rather than null.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        hours_as_minutes = pl.col(hpd_column).fill_null(0) * 60
        extra_minutes = pl.col(mpd_column).fill_null(0)
        total = hours_as_minutes + extra_minutes
        # Anything above the cap is reported as exactly 180.
        capped = pl.when(total > 180).then(180).otherwise(total)
        return df.with_columns(capped.alias("check"))
    return preprocessor

def check_met(
    mins_column: str, 
    n_days_column: str, 
    met_column: str,
    factor: int|float # the corresponding factor for the activity (Vig: 8, Mod: 4, Walk: 3.3)
    ) -> Callable:
    """
    Build a preprocessing function for verifying one activity category's MET value.

    The returned function adds a ``check`` column equal to
    ``mins_column * n_days_column * factor`` (nulls in either input count as 0)
    and replaces nulls in ``met_column`` with 0, so the stored value and the
    recomputed value can be compared on every row.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        minutes = pl.col(mins_column).fill_null(0)
        days = pl.col(n_days_column).fill_null(0)
        expected_met = (minutes * days * factor).alias("check")
        # Overwrites met_column in place (same name) with its null-free version.
        stored_met = pl.col(met_column).fill_null(0)
        return df.with_columns(expected_met, stored_met)
    return preprocessor

def check_tot_met(
    met_columns: list[str],
    tot_met_column: str
    ) -> Callable:
    """
    Build a preprocessing function to verify the calculated total MET value.

    The returned function adds a ``check`` column holding the row-wise sum of
    ``met_columns`` (nulls count as 0) and replaces nulls in ``tot_met_column``
    with 0.

    Raises:
        ValueError: if ``met_columns`` is empty. Previously this surfaced only when
            the preprocessor ran, as an ``AttributeError`` on the int ``0`` that
            ``sum()`` returns for an empty iterable.
    """
    # Snapshot the names so the preprocessor can run more than once even if the
    # caller passed a one-shot iterable or later mutates its list.
    component_columns = list(met_columns)
    if not component_columns:
        raise ValueError("met_columns must contain at least one column name")

    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        expected_total = sum(pl.col(name).fill_null(0) for name in component_columns)

        return df.with_columns(
            expected_total.alias("check"),
            pl.col(tot_met_column).fill_null(0) # Fill nulls with 0; otherwise the validation skips if one value in a comparison is null
        )
    return preprocessor

In [36]:
# TODO: WIP - create abstract and reusable functions to be used for any given dataset

from functools import reduce

def make_total_minutes_validator(prefix: str) -> Callable[[pb.Validate, str], pb.Validate]:
    """
    Factory function that returns a validator function with `prefix` baked in.

    The returned function takes a validation object and a category name (e.g. "VIG")
    and adds one `col_vals_eq` step comparing `{prefix}_{category}_MINS` with the
    `check` column produced by `check_total_mins`.

    `na_pass=True` is passed so the step behaves like the hand-written
    total-minutes steps in the validation cells, which all set it.
    """
    def add_total_minutes_validation(validator: pb.Validate, category: str) -> pb.Validate:
        return validator.col_vals_eq(
            columns=f"{prefix}_{category}_MINS",
            value=pb.col("check"),
            pre=check_total_mins(f"{prefix}_IPAQ_{category}_HPD", f"{prefix}_IPAQ_{category}_MPD"),
            brief="Check total mins/day equals `HPD*60 + MPD`",
            na_pass=True,
        )
    return add_total_minutes_validation

def apply_total_minutes_validation(
    validator: pb.Validate,
    prefix: str, # prefix of the dataset (e.g. "G220")
    categories: list[str]
) -> pb.Validate:
    """
    Add one total-minutes check per category to a Validation object.

    Categories are processed in the order given; each one is passed, together with
    the validation built so far, to the step function returned by
    `make_total_minutes_validator(prefix)`. With no categories the validator is
    returned unchanged.
    """
    add_step = make_total_minutes_validator(prefix)
    result = validator
    for category in categories:
        result = add_step(result, category)
    return result

In [None]:
# Build the IPAQ validation plan for the G220 data.
# Steps are added in a fixed order so the step numbers in the report stay stable:
#   1. total minutes per category, 2. MET per category, 3. total MET,
#   4. per-category range / zero / null checks segmented by the *_W column.
prefix = "G220"

# MET multiplier per activity category; the dict order (VIG, MOD, WALK) is the step order.
met_factors = {"VIG": 8, "MOD": 4, "WALK": 3.3}

validation_g220 = pb.Validate(data=df_g220)

# Stored *_MINS must equal the recomputed minutes; na_pass=True lets null rows through.
for category in met_factors:
    validation_g220 = validation_g220.col_vals_eq(
        columns=f"{prefix}_{category}_MINS",
        value=pb.col("check"),
        pre=check_total_mins(f"{prefix}_IPAQ_{category}_HPD", f"{prefix}_IPAQ_{category}_MPD"),
        brief="Check total mins/day equals `HPD*60 + MPD`",
        na_pass=True,
    )

# Stored *_MET must equal minutes * days * factor.
for category, factor in met_factors.items():
    validation_g220 = validation_g220.col_vals_eq(
        columns=f"{prefix}_{category}_MET",
        value=pb.col("check"),
        pre=check_met(
            f"{prefix}_{category}_MINS",
            f"{prefix}_IPAQ_{category}_D",
            f"{prefix}_{category}_MET",
            factor=factor,
        ),
    )

validation_g220 = validation_g220.col_vals_eq(
    columns=f"{prefix}_TOT_MET",
    value=pb.col("check"),
    pre=check_tot_met([f"{prefix}_{name}_MET" for name in met_factors], f"{prefix}_TOT_MET"),
    brief="Check `TOT_MET` equals the sum of `VIG_MET`, `MOD_MET`, and `WALK_MET`"
)

# Per-category checks, segmented by the value of the category's *_W column (1, 0 or null).
for category in met_factors:
    week_column = f"{prefix}_IPAQ_{category}_W"
    days_column = f"{prefix}_IPAQ_{category}_D"
    hours_column = f"{prefix}_IPAQ_{category}_HPD"
    minutes_column = f"{prefix}_IPAQ_{category}_MPD"
    total_column = f"{prefix}_{category}_MINS"
    detail_columns = [days_column, hours_column, minutes_column, total_column, f"{prefix}_{category}_MET"]

    validation_g220 = (
        validation_g220
        .col_vals_between(
            columns=days_column,
            left=1,
            right=7,
            segments=(week_column, 1),
        )
        .col_vals_between(
            columns=hours_column,
            left=0,
            right=18, # unrealistic to do more than 18 hours of exercise per day (even that is a stretch!!)
            segments=(week_column, 1),
        )
        .col_vals_between(
            columns=minutes_column,
            left=0,
            right=59,
            segments=(week_column, 1),
        )
        .col_vals_between(
            columns=total_column,
            left=0,
            right=180, # total mins per category is capped at 180 mins
            segments=(week_column, 1),
        )
        .col_vals_eq(
            columns=list(detail_columns),
            value=0,
            segments=(week_column, 0),
        )
        .col_vals_null(
            columns=list(detail_columns),
            # Pointblank doesn't seem to like segmenting values with null, so transform null to -1 and segment by that.
            # The column name is bound as a default argument: `pre` only runs at interrogate() time,
            # and a plain closure would read whatever the notebook globals hold by then.
            pre=lambda df, week_column=week_column: df.with_columns(pl.col(week_column).fill_null(-1)),
            segments=(week_column, -1),
        )
    )

# `validation` is kept as an alias because the interrogate cell refers to that name;
# the dataset-specific name survives when another cell rebinds `validation`.
validation = validation_g220

In [49]:
# Build the IPAQ validation plan for the G222 data.
# Steps are added in a fixed order so the step numbers in the report stay stable:
#   1. total minutes per category, 2. MET per category, 3. total MET,
#   4. per-category range / zero / null checks segmented by the *_W column.
prefix = "G222"

# MET multiplier per activity category; the dict order (VIG, MOD, WALK) is the step order.
met_factors = {"VIG": 8, "MOD": 4, "WALK": 3.3}

validation_g222 = pb.Validate(data=df_g222)

# Stored *_MINS must equal the recomputed minutes; na_pass=True lets null rows through.
for category in met_factors:
    validation_g222 = validation_g222.col_vals_eq(
        columns=f"{prefix}_{category}_MINS",
        value=pb.col("check"),
        pre=check_total_mins(f"{prefix}_IPAQ_{category}_HPD", f"{prefix}_IPAQ_{category}_MPD"),
        brief="Check total mins/day equals `HPD*60 + MPD`",
        na_pass=True,
    )

# Stored *_MET must equal minutes * days * factor.
for category, factor in met_factors.items():
    validation_g222 = validation_g222.col_vals_eq(
        columns=f"{prefix}_{category}_MET",
        value=pb.col("check"),
        pre=check_met(
            f"{prefix}_{category}_MINS",
            f"{prefix}_IPAQ_{category}_D",
            f"{prefix}_{category}_MET",
            factor=factor,
        ),
    )

validation_g222 = validation_g222.col_vals_eq(
    columns=f"{prefix}_TOT_MET",
    value=pb.col("check"),
    pre=check_tot_met([f"{prefix}_{name}_MET" for name in met_factors], f"{prefix}_TOT_MET"),
    brief="Check `TOT_MET` equals the sum of `VIG_MET`, `MOD_MET`, and `WALK_MET`"
)

# Per-category checks, segmented by the value of the category's *_W column (1, 0 or null).
for category in met_factors:
    week_column = f"{prefix}_IPAQ_{category}_W"
    days_column = f"{prefix}_IPAQ_{category}_D"
    hours_column = f"{prefix}_IPAQ_{category}_HPD"
    minutes_column = f"{prefix}_IPAQ_{category}_MPD"
    total_column = f"{prefix}_{category}_MINS"
    detail_columns = [days_column, hours_column, minutes_column, total_column, f"{prefix}_{category}_MET"]

    validation_g222 = (
        validation_g222
        .col_vals_between(
            columns=days_column,
            left=1,
            right=7,
            segments=(week_column, 1),
        )
        .col_vals_between(
            columns=hours_column,
            left=0,
            right=18, # unrealistic to do more than 18 hours of exercise per day (even that is a stretch!!)
            segments=(week_column, 1),
        )
        .col_vals_between(
            columns=minutes_column,
            left=0,
            right=59,
            segments=(week_column, 1),
        )
        .col_vals_between(
            columns=total_column,
            left=0,
            right=180, # total mins per category is capped at 180 mins
            segments=(week_column, 1),
        )
        .col_vals_eq(
            columns=list(detail_columns),
            value=0,
            segments=(week_column, 0),
        )
        .col_vals_null(
            columns=list(detail_columns),
            # Pointblank doesn't seem to like segmenting values with null, so transform null to -1 and segment by that.
            # The column name is bound as a default argument: `pre` only runs at interrogate() time,
            # and a plain closure would read whatever the notebook globals hold by then.
            pre=lambda df, week_column=week_column: df.with_columns(pl.col(week_column).fill_null(-1)),
            segments=(week_column, -1),
        )
    )

# `validation` is kept as an alias because the interrogate cell refers to that name;
# the dataset-specific name survives when another cell rebinds `validation`.
validation = validation_g222

In [50]:
# Run the validation plan and display the report as the cell output.
# `validation` is whichever validation cell was executed most recently
# (the report shown below is for the G222 columns).
validation.interrogate()

Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation,Pointblank Validation
2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars,2025-05-26|09:00:14Polars
Unnamed: 0_level_2,Unnamed: 1_level_2,STEP,COLUMNS,VALUES,TBL,EVAL,UNITS,PASS,FAIL,W,E,C,EXT
#4CA64C66,1,col_vals_eq  col_vals_eq()  Check total mins/day equals HPD*60 + MPD,G222_VIG_MINS,check,,✓,1146,1131 0.99,15 0.01,—,—,—,CSV
#4CA64C66,2,col_vals_eq  col_vals_eq()  Check total mins/day equals HPD*60 + MPD,G222_MOD_MINS,check,,✓,1146,1130 0.99,16 0.01,—,—,—,CSV
#4CA64C66,3,col_vals_eq  col_vals_eq()  Check total mins/day equals HPD*60 + MPD,G222_WALK_MINS,check,,✓,1146,1131 0.99,15 0.01,—,—,—,CSV
#4CA64C66,4,col_vals_eq  col_vals_eq(),G222_VIG_MET,check,,✓,1146,1132 0.99,14 0.01,—,—,—,CSV
#4CA64C66,5,col_vals_eq  col_vals_eq(),G222_MOD_MET,check,,✓,1146,1132 0.99,14 0.01,—,—,—,CSV
#4CA64C66,6,col_vals_eq  col_vals_eq(),G222_WALK_MET,check,,✓,1146,1132 0.99,14 0.01,—,—,—,CSV
#4CA64C66,7,"col_vals_eq  col_vals_eq()  Check TOT_MET equals the sum of VIG_MET, MOD_MET, and WALK_MET",G222_TOT_MET,check,,✓,1146,1104 0.96,42 0.04,—,—,—,CSV
#4CA64C,8,SEGMENT G222_IPAQ_VIG_W / 1 col_vals_between  col_vals_between(),G222_IPAQ_VIG_D,"[1, 7]",,✓,756,756 1.00,0 0.00,—,—,—,—
#4CA64C66,9,SEGMENT G222_IPAQ_VIG_W / 1 col_vals_between  col_vals_between(),G222_IPAQ_VIG_HPD,"[0, 18]",,✓,756,721 0.95,35 0.05,—,—,—,CSV
#4CA64C66,10,SEGMENT G222_IPAQ_VIG_W / 1 col_vals_between  col_vals_between(),G222_IPAQ_VIG_MPD,"[0, 59]",,✓,756,721 0.95,35 0.05,—,—,—,CSV
