## TODO

- Add validation for metadata under its own heading: the metadata should be identical across ALL datasets, so validate that it aligns.
- Update the `Dataset` class so it no longer has to take `prefix` or `usecols`.

In [None]:
# Load the `rich` IPython extension.
%load_ext rich
# Load autoreload and select mode 2, which re-imports modules before each cell
# runs (presumably so edits to project code such as `pain.read` are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

The rich extension is already loaded. To reload it, use:
  %reload_ext rich
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
from pain.read import *
from pathlib import Path
import numpy as np

import polars as pl
import pandera.polars as pa
from pandera.typing import DataFrame, Series

In [None]:
# from config import METADATA
# Directory holding the raw input files, relative to the working directory.
data_dir = Path("..") / "data" / "raw"

## Data Schema

Define expected data structure for each variable

In [None]:
# Allowed codes per questionnaire item. PN17 accepts -99, 0 and 1; every other
# item additionally accepts -88. Values are coerced to the annotated dtype.
_CODES_WITH_NA = (-88, -99, 0, 1)

PN17 = pa.Field(isin=(-99, 0, 1), coerce=True)
PN25 = pa.Field(isin=_CODES_WITH_NA, coerce=True)
PN34 = pa.Field(isin=_CODES_WITH_NA, coerce=True)
# do not include when checking N/A when PN17 is 0
PN35 = pa.Field(isin=_CODES_WITH_NA, coerce=True)
PN36 = pa.Field(isin=_CODES_WITH_NA, coerce=True)
PN9 = pa.Field(isin=_CODES_WITH_NA, coerce=True)
PN38 = pa.Field(isin=_CODES_WITH_NA, coerce=True)

## Metadata Validation

In [None]:
# Schema to validate metadata
# TODO: create a MetadataClass class which contains a list of metadata containers? -> rename current Metadata class to VariableMetadata?
# TODO: Create these in the config file and read them in here

# Expected metadata for each variable. The names carry a `_META` suffix so they
# do not rebind PN17/PN25/PN34/PN35, which already hold the `pa.Field`
# definitions that the DataFrameModel schemas assign to their columns.
PN17_META = Metadata(
    label="Ever had back pain",
    field_values={-99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

PN25_META = Metadata(
    label="Sought professional advice/treatment",
    field_values={-88: "N/A", -99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

PN34_META = Metadata(
    label="Took medication to relieve pain",
    field_values={-88: "N/A", -99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

PN35_META = Metadata(
    label="Missed work due to pain",
    field_values={-88: "N/A", -99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

## G214_PQ

In [None]:
# Point a Dataset at G214_PQ.sav inside data_dir and load it.
# load_data() returns two values: the data (still lazy here, since later cells
# call .collect() on it) and a second object bound to `meta`.
G214_PQ = Dataset("G214_PQ.sav", data_dir)
df, meta = G214_PQ.load_data()

In [None]:
# Create df designed to return an error
fake_df = df.collect().to_pandas()
fake_df.iloc[0, 0] = 1
# fake_df.iloc[6, 1] = 1
fake_df = pl.from_pandas(fake_df)

In [None]:
# Keep only the pain items and recode the raw codes to the convention used by
# the PN* field definitions: 9 -> -99 for PN17, and 8 -> -88 / 9 -> -99 for
# PN25, PN34 and PN35. The frame is collected into an eager DataFrame.
df = df.select(
    pl.col("G214_PQ_PN17").replace({9: -99}),
    *(
        pl.col(f"G214_PQ_{item}").replace({8: -88, 9: -99})
        for item in ("PN25", "PN34", "PN35")
    ),
    # PN36 is passed through unrecoded; .replace({8: -88, 9: -99}) is disabled.
    pl.col("G214_PQ_PN36"),
).collect()

In [None]:
# Columns that must hold -88 on every row where PN17 is 0.
cols_to_check = pl.col("G214_PQ_PN25", "G214_PQ_PN34")


# Per-column value checks plus cross-column consistency checks for G214_PQ.
class G214PQDataSchema(pa.DataFrameModel):
    G214_PQ_PN17: Series[int] = PN17
    G214_PQ_PN25: Series[int] = PN25
    G214_PQ_PN34: Series[int] = PN34
    G214_PQ_PN35: Series[int] = PN35
    G214_PQ_PN36: Series[int] = PN36

    # TODO: this check does not apply for PN35
    @pa.dataframe_check
    def check_for_na(cls, data: pa.PolarsData) -> pl.LazyFrame:
        """Require -88 in each `cols_to_check` column on rows where PN17 is 0.

        Returns a LazyFrame with one boolean column per checked column,
        restricted to the rows where PN17 equals 0.
        """
        pn17_is_zero = pl.col("G214_PQ_PN17") == 0
        pn17_zero_rows = data.lazyframe.filter(pn17_is_zero)
        return pn17_zero_rows.select(cols_to_check == -88)

    @pa.dataframe_check
    def check_for_na2(cls, data: pa.PolarsData) -> pl.LazyFrame:
        """Require PN17 to be 0 on rows where PN25 is -88.

        Returns a LazyFrame with a single boolean column, restricted to the
        rows where PN25 equals -88.
        """
        pn25_is_na = pl.col("G214_PQ_PN25") == -88
        pn25_na_rows = data.lazyframe.filter(pn25_is_na)
        return pn25_na_rows.select(pl.col("G214_PQ_PN17") == 0)


# Validate the recoded frame and print the collected failures rather than
# letting the exception propagate.
try:
    G214PQDataSchema.validate(df, lazy=True)
except pa.errors.SchemaErrors as schema_errors:
    print(schema_errors)

{
    "DATA": {
        "DATAFRAME_CHECK": [
            {
                "schema": "G214PQDataSchema",
                "column": "G214_PQ_PN35",
                "check": "isin((-88, -99, 0, 1))",
                "error": "Column 'G214_PQ_PN35' failed validator number 0: <Check isin: isin((-88, -99, 0, 1))> failure case examples: [{'G214_PQ_PN35': 8}, {'G214_PQ_PN35': 8}, {'G214_PQ_PN35': 8}, {'G214_PQ_PN35': 8}, {'G214_PQ_PN35': 8}]"
            },
            {
                "schema": "G214PQDataSchema",
                "column": "G214_PQ_PN36",
                "check": "isin((-88, -99, 0, 1))",
                "error": "Column 'G214_PQ_PN36' failed validator number 0: <Check isin: isin((-88, -99, 0, 1))> failure case examples: [{'G214_PQ_PN36': 8}, {'G214_PQ_PN36': 8}, {'G214_PQ_PN36': 8}, {'G214_PQ_PN36': 8}, {'G214_PQ_PN36': 8}]"
            }
        ]
    }
}


## G214_SQ

In [None]:
# Point a Dataset at G214_SQ.sav inside data_dir and load it.
# Only the first return value of load_data() is kept; the second is discarded.
G214_SQ = Dataset("G214_SQ.sav", data_dir)
df, _ = G214_SQ.load_data()

In [None]:
# Keep only the pain items and recode the raw codes to the convention used by
# the PN* field definitions (8 -> -88, 9 -> -99).
df = (
    df
    .select(
        pl.col("G214_SQ_PN17").replace({9: -99}),
        pl.col("G214_SQ_PN25").replace({8: -88, 9: -99}),
        pl.col("G214_SQ_PN34").replace({8: -88, 9: -99}),
        pl.col("G214_SQ_PN35").replace({8: -88, 9: -99}),
        # PN36 is passed through unrecoded; its replace is disabled.
        pl.col("G214_SQ_PN36"),  # .replace({8: -88, 9: -99}),
    )
    # Materialise the frame, as the G214_PQ cell does. Pandera runs data-level
    # checks such as `isin` on an eager DataFrame, but by default only
    # schema-level checks on a LazyFrame, so without this the value checks
    # would be skipped. Assumes load_data() returns a LazyFrame, as in G214_PQ.
    .collect()
)

In [None]:
# Per-column value checks for the G214_SQ pain items; each column reuses the
# shared PN* field definition for its item.
class G214SQDataSchema(pa.DataFrameModel):
    G214_SQ_PN17: Series[int] = PN17
    G214_SQ_PN25: Series[int] = PN25
    G214_SQ_PN34: Series[int] = PN34
    G214_SQ_PN35: Series[int] = PN35
    G214_SQ_PN36: Series[int] = PN36


# Validate and keep the validated frame; on failure print the collected
# errors and leave `df` unchanged.
try:
    df = G214SQDataSchema.validate(df, lazy=True)
except pa.errors.SchemaErrors as schema_errors:
    print(schema_errors)

## G217_PQ

In [None]:
# Point a Dataset at G217_PQ.sav inside data_dir and load it.
# Only the first return value of load_data() is kept; the second is discarded.
G217_PQ = Dataset("G217_PQ.sav", data_dir)
df, _ = G217_PQ.load_data()

In [None]:
# Keep only the pain items and recode the raw codes to the convention used by
# the PN* field definitions. In this dataset both 7 and 9 are mapped to -99
# for PN25, PN34 and PN35.
df = (
    df
    .select(
        pl.col("G217_PQ_PN17").replace({9: -99}),
        # PN9, PN38 and PN36 are passed through unrecoded; their replaces are
        # disabled.
        pl.col("G217_PQ_PN9"),  # .replace({9: -99}),
        pl.col("G217_PQ_PN38"),  # .replace({9: -99}),
        pl.col("G217_PQ_PN25").replace({7: -99, 9: -99}),
        pl.col("G217_PQ_PN34").replace({7: -99, 9: -99}),
        pl.col("G217_PQ_PN35").replace({7: -99, 9: -99}),
        pl.col("G217_PQ_PN36"),  # .replace({7: -99, 9: -99})
    )
    # Materialise the frame, as the G214_PQ cell does. Pandera runs data-level
    # checks such as `isin` on an eager DataFrame, but by default only
    # schema-level checks on a LazyFrame, so without this the value checks
    # would be skipped. Assumes load_data() returns a LazyFrame, as in G214_PQ.
    .collect()
)

In [None]:
# Per-column value checks for the G217_PQ pain items; each column reuses the
# shared PN* field definition for its item.
class G217PQDataSchema(pa.DataFrameModel):
    G217_PQ_PN17: Series[int] = PN17
    G217_PQ_PN9: Series[int] = PN9
    G217_PQ_PN38: Series[int] = PN38
    G217_PQ_PN25: Series[int] = PN25
    G217_PQ_PN34: Series[int] = PN34
    G217_PQ_PN35: Series[int] = PN35
    G217_PQ_PN36: Series[int] = PN36


# Validate and keep the validated frame; on failure print the collected
# errors and leave `df` unchanged.
try:
    df = G217PQDataSchema.validate(df, lazy=True)
except pa.errors.SchemaErrors as schema_errors:
    print(schema_errors)

## G217_SQ

In [None]:
# Point a Dataset at G217_SQ.sav inside data_dir and load it.
# Only the first return value of load_data() is kept; the second is discarded.
G217_SQ = Dataset("G217_SQ.sav", data_dir)
df, _ = G217_SQ.load_data()

In [None]:
# Keep only the pain items and recode the raw code 9 to -99, the convention
# used by the PN* field definitions.
df = (
    df
    .select(
        pl.col("G217_SQ_PN17").replace({9: -99}),
        # PN9, PN38 and PN36 are passed through unrecoded; their replaces are
        # disabled.
        pl.col("G217_SQ_PN9"),  # .replace({9: -99}),
        pl.col("G217_SQ_PN38"),  # .replace({9: -99}),
        pl.col("G217_SQ_PN25").replace({9: -99}),
        pl.col("G217_SQ_PN34").replace({9: -99}),
        pl.col("G217_SQ_PN35").replace({9: -99}),
        pl.col("G217_SQ_PN36"),  # .replace({9: -99})
    )
    # Materialise the frame, as the G214_PQ cell does. Pandera runs data-level
    # checks such as `isin` on an eager DataFrame, but by default only
    # schema-level checks on a LazyFrame, so without this the value checks
    # would be skipped. Assumes load_data() returns a LazyFrame, as in G214_PQ.
    .collect()
)

In [None]:
# Per-column value checks for the G217_SQ pain items; each column reuses the
# shared PN* field definition for its item.
class G217SQDataSchema(pa.DataFrameModel):
    G217_SQ_PN17: Series[int] = PN17
    G217_SQ_PN9: Series[int] = PN9
    G217_SQ_PN38: Series[int] = PN38
    G217_SQ_PN25: Series[int] = PN25
    G217_SQ_PN34: Series[int] = PN34
    G217_SQ_PN35: Series[int] = PN35
    G217_SQ_PN36: Series[int] = PN36


# Validate and keep the validated frame; on failure print the collected
# errors and leave `df` unchanged.
try:
    df = G217SQDataSchema.validate(df, lazy=True)
except pa.errors.SchemaErrors as schema_errors:
    print(schema_errors)