In [None]:
%load_ext rich
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import polars as pl
import pointblank as pb
import numpy as np
from rich import print as rprint

from odyssey.core import *
from odyssey.explore import *

In [None]:
from config.paths import RAW_DATA

In [None]:
# Open the raw G220 questionnaire file (`.sav`) located under RAW_DATA.
# NOTE(review): `Dataset` is not imported by name in this notebook; presumably it
# comes from one of the `odyssey` star imports — confirm.
g220 = Dataset("G220_Q.sav", RAW_DATA)
# `lf` is later queried with `.select(...).collect()`, so it is presumably a polars
# LazyFrame; `meta` is presumably the file's metadata — confirm against `load_data`.
lf, meta = g220.load_data()

In [None]:
# Columns pulled from the G220 file for the IPAQ checks in this notebook.
# Suffixes, as the validation helpers use them: `_D` is the days count multiplied into
# the MET value, and `_HPD` / `_MPD` are combined as `HPD*60 + MPD` into minutes per day.
# NOTE(review): `_W` is only ever used as a 1 / 0 / null segmenting flag — presumably
# "did this activity in the reference week"; confirm against the codebook.
cols = [
    "ID",
    # Raw answers per activity category (MOD, VIG, WALK)
    "G220_IPAQ_MOD_D", "G220_IPAQ_MOD_HPD", "G220_IPAQ_MOD_MPD", "G220_IPAQ_MOD_W", 
    "G220_IPAQ_VIG_D", "G220_IPAQ_VIG_HPD", "G220_IPAQ_VIG_MPD", "G220_IPAQ_VIG_W", 
    "G220_IPAQ_WALK_D", "G220_IPAQ_WALK_HPD", "G220_IPAQ_WALK_MPD", "G220_IPAQ_WALK_W", 
    # Sitting items (`SIT_*`)
    "G220_IPAQ_SIT_WD_HPD", "G220_IPAQ_SIT_WD_MPD", 
    "G220_SIT_WD_TRUNC", "G220_IPAQ_SIT_COM",
    # Derived per-category MET values and minutes per day
    "G220_VIG_MET", "G220_VIG_MINS", 
    "G220_MOD_MET", "G220_MOD_MINS", 
    "G220_WALK_MET", "G220_WALK_MINS",
    # Derived overall category and total MET
    "G220_IPAQ_CAT", "G220_TOT_MET", 
]

In [None]:
# Keep only the columns listed in `cols` and collect the result; `df` is the frame
# handed to `pb.Validate` below.
df = lf.select(cols).collect()

In [None]:
from typing import Callable

def check_total_mins(hpd_column: str, mpd_column: str) -> Callable:
    """
    Build a preprocessing step that recomputes a category's minutes per day.

    The returned function adds a `check` column equal to
    `hpd_column * 60 + mpd_column`, limited to at most 180.
    Nulls in either input column are replaced by 0 before the sum, so a row with
    both inputs missing yields `check == 0` rather than null.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        hours_in_minutes = pl.col(hpd_column).fill_null(0) * 60
        uncapped_total = hours_in_minutes + pl.col(mpd_column).fill_null(0)
        # Values above 180 are replaced by 180; everything else passes through unchanged.
        capped_total = pl.when(uncapped_total > 180).then(180).otherwise(uncapped_total)
        return df.with_columns(capped_total.alias("check"))

    return preprocessor

def check_met(
    mins_column: str,
    n_days_column: str,
    met_column: str,
    factor: int|float,  # activity-specific multiplier (Vig: 8, Mod: 4, Walk: 3.3)
) -> Callable:
    """
    Build a preprocessing step that recomputes the MET value of one category.

    The returned function adds a `check` column equal to
    `mins_column * n_days_column * factor` (nulls in either input count as 0) and
    replaces nulls in `met_column` with 0, so both sides of the later comparison
    are non-null.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        minutes = pl.col(mins_column).fill_null(0)
        days = pl.col(n_days_column).fill_null(0)
        expected_met = (minutes * days * factor).alias("check")
        # Overwrites `met_column` in the preprocessed frame only; the source frame is untouched.
        reported_met = pl.col(met_column).fill_null(0)
        return df.with_columns(expected_met, reported_met)

    return preprocessor

def check_tot_met(met_columns: list[str], tot_met_column: str) -> Callable:
    """
    Build a preprocessing step that recomputes the total MET value.

    The returned function adds a `check` column holding the row-wise sum of
    `met_columns` (nulls count as 0) and replaces nulls in `tot_met_column` with 0.
    """
    def preprocessor(df: pl.DataFrame) -> pl.DataFrame:
        # Accumulate starting from 0, one null-filled column at a time.
        running_total = 0
        for name in met_columns:
            running_total = running_total + pl.col(name).fill_null(0)

        # The reported total is null-filled as well: per the original author's note,
        # the validation skips a row when one side of the comparison is null.
        reported_total = pl.col(tot_met_column).fill_null(0)
        return df.with_columns(running_total.alias("check"), reported_total)

    return preprocessor

In [None]:
# TODO: WIP - create abstract and reusable functions to be used for any given dataset

from functools import reduce

def make_total_minutes_validator(
    prefix: str,
    na_pass: bool = True,
) -> Callable[[pb.Validate, str], pb.Validate]:
    """
    Factory returning a step-adder for the "total minutes per day" check.

    Parameters
    ----------
    prefix
        Dataset prefix used in the column names (e.g. "G220").
    na_pass
        Forwarded to `col_vals_eq`. Defaults to True so that the generated step
        matches the hand-written G220 minutes checks in this notebook, which all
        pass `na_pass=True` (a null `*_MINS` value is then not counted as a failure).

    Returns
    -------
    A function `(validator, category) -> validator` that appends a check that
    `{prefix}_{category}_MINS` equals the `check` column produced by
    `check_total_mins` from `{prefix}_IPAQ_{category}_HPD` / `_MPD`.
    Its two-argument shape makes it usable as the reducer in `functools.reduce`.
    """
    def add_total_minutes_validation(validator: pb.Validate, category: str) -> pb.Validate:
        return validator.col_vals_eq(
            columns=f"{prefix}_{category}_MINS",
            value=pb.col("check"),
            pre=check_total_mins(f"{prefix}_IPAQ_{category}_HPD", f"{prefix}_IPAQ_{category}_MPD"),
            brief="Check total mins/day equals `HPD*60 + MPD`",
            na_pass=na_pass,
        )
    return add_total_minutes_validation

def apply_total_minutes_validation(
    validator: pb.Validate,
    prefix: str,  # prefix of the dataset (ie. "G220")
    categories: list[str],
) -> pb.Validate:
    """
    Append one "total minutes per day" check per category to `validator`.

    The step-adder is built once from `prefix`, then applied to each entry of
    `categories` in order; every call receives the validator returned by the
    previous one. The validator from the last call is returned (or `validator`
    itself when `categories` is empty).
    """
    add_step = make_total_minutes_validator(prefix)
    extended = validator
    for category in categories:
        extended = add_step(extended, category)
    return extended

In [None]:
# Validation plan for the derived G220 columns: start from the collected frame and append
# the steps in a fixed order (minutes per category, MET per category, then total MET).
validation = pb.Validate(data=df)

# Minutes per day: the stored `*_MINS` value must equal the recomputed `check` column.
for category in ("VIG", "MOD", "WALK"):
    validation = validation.col_vals_eq(
        columns=f"G220_{category}_MINS",
        value=pb.col("check"),
        pre=check_total_mins(f"G220_IPAQ_{category}_HPD", f"G220_IPAQ_{category}_MPD"),
        brief="Check total mins/day equals `HPD*60 + MPD`",
        na_pass=True,
    )

# MET per category: the stored `*_MET` value must equal minutes * days * factor.
# NOTE(review): the WALK factor is the float 3.3 and `col_vals_eq` is an equality check —
# confirm the stored values match exactly, or consider a tolerance-based comparison.
for category, factor in (("VIG", 8), ("MOD", 4), ("WALK", 3.3)):
    validation = validation.col_vals_eq(
        columns=f"G220_{category}_MET",
        value=pb.col("check"),
        pre=check_met(
            f"G220_{category}_MINS",
            f"G220_IPAQ_{category}_D",
            f"G220_{category}_MET",
            factor=factor,
        ),
    )

# Total MET: must equal the sum of the three per-category MET columns.
validation = validation.col_vals_eq(
    columns="G220_TOT_MET",
    value=pb.col("check"),
    pre=check_tot_met(["G220_VIG_MET", "G220_MOD_MET", "G220_WALK_MET"], "G220_TOT_MET"),
    brief="Check `TOT_MET` equals the sum of `VIG_MET`, `MOD_MET`, and `WALK_MET`"
)

In [None]:
def add_activity_range_checks(validator: pb.Validate, prefix: str, category: str) -> pb.Validate:
    """
    Append the range / zero / null consistency checks for one activity category.

    All steps are segmented on the `{prefix}_IPAQ_{category}_W` flag column:

    - flag == 1: days within [1, 7], hours/day within [0, 18], minutes/day within
      [0, 59], and total minutes within [0, 180];
    - flag == 0: days, hours/day, minutes/day, total minutes and MET must all equal 0;
    - flag is null: the same five columns must all be null.

    Parameters
    ----------
    validator
        The validation plan to extend.
    prefix
        Dataset prefix used in the column names (e.g. "G220").
    category
        Category token used in the column names (e.g. "VIG", "MOD", "WALK").

    Returns
    -------
    The validator returned by the last chained pointblank call.
    """
    flag = f"{prefix}_IPAQ_{category}_W"
    days = f"{prefix}_IPAQ_{category}_D"
    hours_per_day = f"{prefix}_IPAQ_{category}_HPD"
    minutes_per_day = f"{prefix}_IPAQ_{category}_MPD"
    total_minutes = f"{prefix}_{category}_MINS"
    met = f"{prefix}_{category}_MET"
    dependent_columns = [days, hours_per_day, minutes_per_day, total_minutes, met]

    return (
        validator
        .col_vals_between(
            columns=days,
            left=1,
            right=7,
            segments=(flag, 1),
        )
        .col_vals_between(
            columns=hours_per_day,
            left=0,
            right=18, # unrealistic to do more than 18 hours of exercise per day (even that is a stretch!!)
            segments=(flag, 1),
        )
        .col_vals_between(
            columns=minutes_per_day,
            left=0,
            right=59,
            segments=(flag, 1),
        )
        .col_vals_between(
            columns=total_minutes,
            left=0,
            right=180, # total mins per category is capped at 180 mins
            segments=(flag, 1),
        )
        .col_vals_eq(
            columns=dependent_columns,
            value=0,
            segments=(flag, 0),
        )
        .col_vals_null(
            columns=dependent_columns,
            # Pointblank doesn't seem to like segmenting on null values, so transform
            # null to -1 and segment by that.
            pre=lambda df: df.with_columns(pl.col(flag).fill_null(-1)),
            segments=(flag, -1),
        )
    )


# Extend the base plan one category at a time, feeding each result into the next call,
# so the final object explicitly carries the VIG, MOD and WALK checks (previously each
# category cell rebuilt `validation_continued` from `validation` on its own).
# NOTE(review): if pointblank's step methods mutate the plan in place, `validation` is
# extended as well and re-running this cell appends the same steps again — rebuild
# `validation` before re-running.
validation_continued = validation
for category in ("VIG", "MOD", "WALK"):
    validation_continued = add_activity_range_checks(validation_continued, "G220", category)

In [None]:
# Run the steps collected on `validation_continued`; as the last expression of the cell,
# the returned object is rendered as the cell output.
validation_continued.interrogate()