In [None]:
#| default_exp explore

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#| hide
from nbdev.showdoc import *
from nbdev import nbdev_export

In [None]:
#| export
from pain.read import *
from pathlib import Path
import polars as pl
import regex as re

In [None]:
# Location of the raw .sav files, relative to this notebook.
data_dir = Path("../data/raw")

# Item codes requested from each file; the two G217 files also ask for PN38 and PN9.
shared_items = ["PN17", "PN25", "PN34", "PN35", "PN36"]
items_by_file = {
    "G214_PQ": shared_items,
    "G214_SQ": shared_items,
    "G217_PQ": shared_items + ["PN38", "PN9"],
    "G217_SQ": shared_items + ["PN38", "PN9"],
}

# One Dataset per file: "<stem>.sav", the "<stem>_" prefix, and "ID" plus the prefixed item columns.
datasets = [
    Dataset(f"{stem}.sav", data_dir, f"{stem}_", ["ID"] + [f"{stem}_{item}" for item in items])
    for stem, items in items_by_file.items()
]

# Read each file, then merge the per-file frames and metadata for the cells below.
dataframes, metadata = read_and_filter_data(datasets)
merged_df = combine_dataframes(dataframes)
merged_metadata = merge_dictionaries(metadata)

Define a function that takes a regex pattern and returns a dictionary mapping each column whose name matches the pattern to that column's unique values.

In [None]:
#| export
def filter_columns(pattern: str, # regex searched for anywhere in each column name
                   columns: list[str] # candidate column names
                   ) -> list[str]:
    "Keep the entries of `columns` in which `pattern` is found, preserving their order."
    matched = []
    for name in columns:
        # `search` (not `match`) so the pattern may hit anywhere in the name.
        if re.search(pattern, name) is None:
            continue
        matched.append(name)
    return matched

def unpack_unique_values(df: pl.LazyFrame, # lazy frame to query
                         col: str # name of the column to inspect
                         ) -> tuple[str|int|float|None, ...]:
    """Return a tuple of the unique values for a given column in a dataframe.

    Nulls come back as `None`. The values are returned in whatever order
    polars yields them; this function does not sort them.
    """
    # Only the distinct values of `col` are selected before collecting.
    unique_frame = df.select(pl.col(col).unique()).collect()
    # Unpacking into a 1-tuple asserts that exactly one column was selected.
    (vals,) = unique_frame.to_dict(as_series=False).values()
    return tuple(vals)

def unique_values(df: pl.LazyFrame, # lazy frame to inspect
                  pattern: str # regex matched against the column names
                  ) -> dict[str, tuple]:
    "Return a dict mapping each column whose name matches `pattern` to a tuple of its unique values."
    # Column names are taken from the lazy frame's schema, then narrowed by the pattern.
    filtered_columns = filter_columns(pattern, df.collect_schema().names())
    return {col: unpack_unique_values(df, col) for col in filtered_columns}

In [None]:
# Unique values of every column whose name matches "PN38".
unique_values(merged_df, "PN38")

{'G217_PQ_PN38': (None, 0.0, 1.0, 7.0, 9.0),
 'G217_SQ_PN38': (None, 0.0, 1.0, 9.0)}

Now define a function to explore the metadata for a particular variable across datasets.
It should again take a string, and return a nested dictionary.

In [None]:
#| export
from collections import defaultdict
from typing import Any
import pandas as pd

In [None]:
#| export
def _filter_metadata(m: dict[str, dict[str, Any]], # metadata nested dict
                     cols: list[str] # list of columns to filter metadata
                     ) -> dict[str, dict[str, Any]]:
    """Filter metadata from a dataset for the given columns.

    `m` is a two-level dict; an inner entry is kept only when its key is in
    `cols`. Outer keys left with no inner entries are omitted. The result is a
    plain `dict`, so looking up a missing key raises `KeyError` rather than
    silently inserting an empty entry.
    """
    # Set membership keeps the inner test O(1) however many columns are requested.
    wanted = set(cols)

    filtered = {}
    for key, nested_dicts in m.items():
        kept = {nested_key: value
                for nested_key, value in nested_dicts.items()
                if nested_key in wanted}
        # Skip outer keys that have nothing left after filtering.
        if kept:
            filtered[key] = kept

    return filtered

def filter_metadata(pattern: str, # string or regex to filter columns
                    df: pl.LazyFrame, # merged dataframe
                    m: dict[str, dict[str, Any]] # merged metadata
                    ) -> dict[str, dict[str, Any]]:
    "Filter metadata for given columns that match the provided pattern."
    # Columns are picked by name from the frame's schema; the metadata is then
    # narrowed to those columns.
    matching_columns = filter_columns(pattern, df.collect_schema().names())
    return _filter_metadata(m, matching_columns)

In [None]:
# Compare the metadata of every column whose name matches "PN25".
pattern = "PN25"
filtered_metadata = filter_metadata(pattern, merged_df, merged_metadata)
# Transposed so each metadata field is a row and each matching column is a column.
pd.DataFrame(filtered_metadata).T

Unnamed: 0,G214_PQ_PN25,G214_SQ_PN25,G217_PQ_PN25,G217_SQ_PN25
Label,Seek pro advice for back pain,Seek pro advice for back pain,Ever sought health professional advice/treatme...,Seek treatment for back pain?
Field Type,Numeric,Numeric,Numeric,Numeric
Field Width,8,8,8,8
Decimals,0,0,0,0
Variable Type,scale,scale,scale,scale
Field Values,"{0.0: 'No', 1.0: 'Yes', 8.0: 'Not applicable',...","{0.0: 'No', 1.0: 'Yes', 8.0: 'Not applicable',...","{0.0: 'No', 1.0: 'Yes', 7.0: 'Involved in inco...","{0.0: 'No', 1.0: 'Yes', 9.0: 'Not stated'}"


In [None]:
#| hide
# Write the `#| export` cells out to the module named by `#| default_exp` at the top of the notebook.
nbdev_export()