# TODO

Variables to explore:

- [x] PN17
- [x] PN25
- [x] PN34
- [x] PN35
- [x] PN36
- [x] PN9
- [x] PN38

## Set up

In [None]:
# Use rich to pretty-print cell outputs.
%load_ext rich
# Reload imported modules before each cell executes, so edits to imported
# code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
import polars as pl
import pandas as pd
import numpy as np
from rich import print as rprint

from pain.read import *
from pain.explore import *

In [None]:
data_dir = Path("../data/raw")

In [None]:
# One entry per follow-up/questionnaire file. The file is "<prefix>.sav"
# (prefix without its trailing underscore) and every requested column is
# "<prefix><variable>", alongside the shared "ID" column.
prefixes = ["G214_PQ_", "G214_SQ_", "G217_PQ_", "G217_SQ_"]

datasets = [
    Dataset(
        f"{p[:-1]}.sav",
        data_dir,
        p,
        [
            "ID",
            *(f"{p}{name}" for name in ("PN17", "PN25", "PN34", "PN35", "PN36")),
            # PN38 and PN9 are only requested from the G217 files.
            *(f"{p}{name}" for name in ("PN38", "PN9") if p.startswith("G217")),
        ],
    )
    for p in prefixes
]

In [None]:
# Read the datasets declared above; the helper returns the data and the
# metadata as two separate collections.
# NOTE(review): helpers come from the `pain` star imports — presumably one
# frame / one metadata mapping per Dataset; confirm against pain.read.
dataframes, metadata = read_and_filter_data(datasets)
# Single combined frame used by every section below (it is `.collect()`-ed
# before filtering, so presumably a polars LazyFrame — TODO confirm).
df = combine_dataframes(dataframes)
# Single metadata lookup, queried per variable via filter_metadata(var, df, meta).
meta = merge_dictionaries(metadata)

## PN17

In [None]:
var = "PN17"

### Data

The existing data options for PN17 are all the same:
- 0: No
- 1: Yes
- 9: Missing

Changes to make:
- Convert 9 to -99 in all cases

In [None]:
unique_values(df, var)


[1m{[0m
    [32m'G214_PQ_PN17'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G214_SQ_PN17'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_PQ_PN17'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_SQ_PN17'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m
[1m}[0m

### Metadata

The metadata is generally identical, with a minor discrepancy in the label for G217_SQ.
There are differences in value labels, but these values do not exist in the data, so they can be harmonised without changes to the raw data.

In [None]:
m = filter_metadata(var, df, meta)
pd.DataFrame(m).T

Unnamed: 0,G214_PQ_PN17,G214_SQ_PN17,G217_PQ_PN17,G217_SQ_PN17
Label,Ever had back pain,Ever had back pain,Ever had back pain,Ever had back pain?
Field Type,Numeric,Numeric,Numeric,Numeric
Field Width,8,8,8,8
Decimals,0,0,0,0
Variable Type,scale,scale,scale,scale
Field Values,"{0.0: 'No', 1.0: 'Yes', 8.0: 'Not applicable',...","{0.0: 'No', 1.0: 'Yes', 8.0: 'Not applicable',...","{0.0: 'No', 1.0: 'Yes', 7.0: 'Involved in inco...","{0.0: 'No', 1.0: 'Yes', 9.0: 'Not stated'}"


## PN25

In [None]:
var = "PN25"

### Data

Some of the existing options are the same across all datasets:
- 0: No
- 1: Yes
- 9: Missing

For G214_PQ and G214_SQ:
- 8: N/A

For G217_PQ:
- 7: Involved in incorrect skip - not answered

Values of 7 should be considered missing, as discussed with Alex D'Vauz.

Changes:
- Convert 8 to -88
- Convert 9 to -99
- Convert 7 to -99

In [None]:
unique_vals = unique_values(df, var)
rprint(unique_vals)

[1m{[0m
    [32m'G214_PQ_PN25'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G214_SQ_PN25'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_PQ_PN25'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_SQ_PN25'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m
[1m}[0m


In [None]:
# Pool the per-dataset unique values of PN25 into a single set, dropping nulls
value_options = set().union(*unique_vals.values())
value_options.discard(None)
value_options

[1m{[0m[1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m}[0m

#### Define properties to test, validate, and explore

- When PN17 is 0 (No), PN25 should be 8 (N/A)
- When PN17 is 1 (Yes), PN25 should be 0, 1, or 9
- When PN17 is 9 (Missing), PN25 should be 9

In [None]:
test = df.clone().collect()

In [None]:
# For each PN17 answer (0 = No, 1 = Yes, 9 = Missing), list the distinct
# values of the current variable observed in each dataset.
for value in (0, 1, 9):
    rprint(f"When PN17 == {value}")
    for p in prefixes:
        rows_for_value = test.filter(pl.col(f"{p}PN17") == value)
        var_seen = rows_for_value.select(f"{p}{var}").unique()
        rprint(var_seen.to_dict(as_series=False))

When PN17 == [1;36m0[0m
[1m{[0m[32m'G214_PQ_PN25'[0m: [1m[[0m[1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN25'[0m: [1m[[0m[1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN25'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN25'[0m: [1m[[0m[1;36m0.0[0m, [1;36m9.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
When PN17 == [1;36m1[0m
[1m{[0m[32m'G214_PQ_PN25'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN25'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m, [1;36m9.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN25'[0m: [1m[[0m[1;36m0.0[0m, [1;36m9.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN25'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
When PN17 == [1;36m9[0m
[1m{[0m[32m'G214_PQ_PN25'[0m: [1m[[0m[1;36m9.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN25'[0m: [1m[[0m[1;36m9.0[0m[1m]

Properties 2 and 3 were as expected.
Property 1 did not pass, as there were differences in the Y17 follow-ups.
On further investigation, it appears there were two additional questions, relating to neck and shoulder pain, which changed the skip logic of the questions that follow.
For Y14, if the participant responded "No" to PN17, all following questions were skipped.
For Y17, a participant may have responded "No" to PN17 but, if they answered "Yes" to having either neck or shoulder pain, they still answered the subsequent questions; hence the values of 0, 1 and 9 (and 7 due to incorrect skips).

In [None]:
# Reverse check: for every observed value of the current variable, list the
# PN17 answers it co-occurs with in each dataset.
for value in value_options:
    rprint(f"When {var} == {value}")
    for p in prefixes:
        rows_for_value = test.filter(pl.col(f"{p}{var}") == value)
        pn17_seen = rows_for_value.select(f"{p}PN17").unique()
        rprint(pn17_seen.to_dict(as_series=False))

When PN25 == [1;36m0.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
When PN25 == [1;36m1.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
When PN25 == [1;36m7.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m

No additional, unexpected values were found.

### Metadata

Clear discrepancies in labels and field values.

Changes:
- Harmonised label to "Sought professional advice/treatment"
- Updated field values to reflect changes in data

In [None]:
m = filter_metadata(var, df, meta)

In [None]:
rprint(m)

[1;35mdefaultdict[0m[1m([0m[1m<[0m[1;95mclass[0m[39m [0m[32m'dict'[0m[1m>[0m, [1m{[0m
    [32m'Label'[0m: [1m{[0m
        [32m'G214_PQ_PN25'[0m: [32m'Seek pro advice for back pain'[0m,
        [32m'G214_SQ_PN25'[0m: [32m'Seek pro advice for back pain'[0m,
        [32m'G217_PQ_PN25'[0m: [32m'Ever sought health professional advice/treatment for [0m
[32mback pain'[0m,
        [32m'G217_SQ_PN25'[0m: [32m'Seek treatment for back pain?'[0m
    [1m}[0m,
    [32m'Field Type'[0m: [1m{[0m
        [32m'G214_PQ_PN25'[0m: [32m'Numeric'[0m,
        [32m'G214_SQ_PN25'[0m: [32m'Numeric'[0m,
        [32m'G217_PQ_PN25'[0m: [32m'Numeric'[0m,
        [32m'G217_SQ_PN25'[0m: [32m'Numeric'[0m
    [1m}[0m,
    [32m'Field Width'[0m: [1m{[0m
        [32m'G214_PQ_PN25'[0m: [1;36m8[0m,
        [32m'G214_SQ_PN25'[0m: [1;36m8[0m,
        [32m'G217_PQ_PN25'[0m: [1;36m8[0m,
        [32m'G217_SQ_PN25'[0m: [1;36m8[0m
    [1m}[0m,
    [

In [None]:
rprint(m["Field Values"])

[1m{[0m
    [32m'G214_PQ_PN25'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m8.0[0m: [32m'Not applicable'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G214_SQ_PN25'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m8.0[0m: [32m'Not applicable'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_PQ_PN25'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m7.0[0m: [32m'Involved in incorrect skip - not answered'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_SQ_PN25'[0m: [1m{[0m[1;36m0.0[0m: [32m'No'[0m, [1;36m1.0[0m: [32m'Yes'[0m, [1;36m9.0[0m: [32m'Not stated'[0m[1m}[0m
[1m}[0m


In [None]:
# Harmonised metadata for PN25 (8 -> -88 "N/A"; 7 and 9 -> -99 "Missing").
# NOTE(review): the saved output of this cell is a traceback, i.e. the call
# failed when the notebook was last run. The error message is cut off in the
# saved output — confirm `Metadata` is in scope and accepts these keywords.
PN25 = Metadata(
    label="Sought professional advice/treatment",
    field_values={-88: "N/A", -99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

[31m┌─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m────────────────────[0m[31m─┐[0m
[31m│[0m in [92m<module>[0m:[94m1[0m                                                               [31m│[0m
[31m│[0m                                                                             [31m│[0m
[31m│[0m [31m> [0m1 PN25 = Metadata(                                                        [31m│[0m
[31m│[0m   [2m2 [0m[2m│   [0mlabel= [33m"[0m[33mSought professional advice/treatment[0m[33m"[0m,                      [31m│[0m
[31m│[0m   [2m3 [0m[2m│   [0mfield_values = {-[94m88[0m: [33m"[0m[33mN/A[0m[33m"[0m, -[94m99[0m: [33m"[0m[33mMissing[0m[33m"[0m, [94m0[0m: [33m"[0m[33mNo[0m[33m"[0m, [94m1[0m: [33m"[0m[33mYes[0m[33m"[0m},     [31m│[0m
[31m│[0m   [2m4 [0m[2m│   [0mfield_type = [33m"[0m[33mNumeric[0m[33m"[0m,                                     

## PN34

In [None]:
var = "PN34"

### Data

Some of the existing options are the same across all datasets:
- 0: No
- 1: Yes
- 9: Missing

For G214_PQ and G214_SQ:
- 8: N/A

For G217_PQ:
- 7: Involved in incorrect skip - not answered

Values of 7 should be considered missing, as discussed with Alex D'Vauz.

Changes:
- Convert 8 to -88
- Convert 9 to -99
- Convert 7 to -99

In [None]:
unique_vals = unique_values(df, var)
rprint(unique_vals)

[1m{[0m
    [32m'G214_PQ_PN34'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G214_SQ_PN34'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_PQ_PN34'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_SQ_PN34'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m
[1m}[0m


In [None]:
# Pool the per-dataset unique values of PN34 into a single set, dropping nulls
value_options = set().union(*unique_vals.values())
value_options.discard(None)
value_options

[1m{[0m[1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m}[0m

#### Define properties to test, validate, and explore

- When PN17 is 0 (No), PN34 should be 8 (N/A)
- When PN17 is 1 (Yes), PN34 should be 0, 1, or 9
- When PN17 is 9 (Missing), PN34 should be 9

In [None]:
test = df.clone().collect()

In [None]:
# For each PN17 answer (0 = No, 1 = Yes, 9 = Missing), list the distinct
# values of the current variable observed in each dataset.
for value in (0, 1, 9):
    rprint(f"When PN17 == {value}")
    for p in prefixes:
        rows_for_value = test.filter(pl.col(f"{p}PN17") == value)
        var_seen = rows_for_value.select(f"{p}{var}").unique()
        rprint(var_seen.to_dict(as_series=False))

When PN17 == [1;36m0[0m
[1m{[0m[32m'G214_PQ_PN34'[0m: [1m[[0m[1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN34'[0m: [1m[[0m[1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN34'[0m: [1m[[0m[1;36m9.0[0m, [1;36m7.0[0m, [1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN34'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m, [1;36m9.0[0m[1m][0m[1m}[0m
When PN17 == [1;36m1[0m
[1m{[0m[32m'G214_PQ_PN34'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN34'[0m: [1m[[0m[1;36m0.0[0m, [1;36m9.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN34'[0m: [1m[[0m[1;36m0.0[0m, [1;36m9.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN34'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
When PN17 == [1;36m9[0m
[1m{[0m[32m'G214_PQ_PN34'[0m: [1m[[0m[1;36m9.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN34'[0m: [1m[[0m[1;36m9.0[0m[1m]

As per PN25, properties 2 and 3 were as expected and property 1 did not pass due to differences in the Y17 follow-ups.

In [None]:
# Reverse check: for every observed value of the current variable, list the
# PN17 answers it co-occurs with in each dataset.
for value in value_options:
    rprint(f"When {var} == {value}")
    for p in prefixes:
        rows_for_value = test.filter(pl.col(f"{p}{var}") == value)
        pn17_seen = rows_for_value.select(f"{p}PN17").unique()
        rprint(pn17_seen.to_dict(as_series=False))

When PN34 == [1;36m0.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
When PN34 == [1;36m1.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
When PN34 == [1;36m7.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m

No additional, unexpected values were found.

### Metadata

Clear discrepancies in labels and field values.

Changes:
- Harmonised label to "Took medication to relieve pain"
- Updated field values to reflect changes in data

In [None]:
m = filter_metadata(var, df, meta)

In [None]:
rprint(m)

[1;35mdefaultdict[0m[1m([0m[1m<[0m[1;95mclass[0m[39m [0m[32m'dict'[0m[1m>[0m, [1m{[0m
    [32m'Label'[0m: [1m{[0m
        [32m'G214_PQ_PN34'[0m: [32m'Takes meds to relieve back pain'[0m,
        [32m'G214_SQ_PN34'[0m: [32m'Takes meds to relieve back pain'[0m,
        [32m'G217_PQ_PN34'[0m: [32m'Ever taken medication to relieve back pain'[0m,
        [32m'G217_SQ_PN34'[0m: [32m'Take any medication for back pain?'[0m
    [1m}[0m,
    [32m'Field Type'[0m: [1m{[0m
        [32m'G214_PQ_PN34'[0m: [32m'Numeric'[0m,
        [32m'G214_SQ_PN34'[0m: [32m'Numeric'[0m,
        [32m'G217_PQ_PN34'[0m: [32m'Numeric'[0m,
        [32m'G217_SQ_PN34'[0m: [32m'Numeric'[0m
    [1m}[0m,
    [32m'Field Width'[0m: [1m{[0m
        [32m'G214_PQ_PN34'[0m: [1;36m8[0m,
        [32m'G214_SQ_PN34'[0m: [1;36m8[0m,
        [32m'G217_PQ_PN34'[0m: [1;36m8[0m,
        [32m'G217_SQ_PN34'[0m: [1;36m8[0m
    [1m}[0m,
    [32m'Decimals'[0m: [

In [None]:
rprint(m["Field Values"])

[1m{[0m
    [32m'G214_PQ_PN34'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m8.0[0m: [32m'Not applicable'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G214_SQ_PN34'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m8.0[0m: [32m'Not applicable'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_PQ_PN34'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m7.0[0m: [32m'Involved in incorrect skip - not answered'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_SQ_PN34'[0m: [1m{[0m[1;36m0.0[0m: [32m'No'[0m, [1;36m1.0[0m: [32m'Yes'[0m, [1;36m9.0[0m: [32m'Not stated'[0m[1m}[0m
[1m}[0m


In [None]:
# Harmonised metadata for PN34 (8 -> -88 "N/A"; 7 and 9 -> -99 "Missing").
# NOTE(review): the saved output of this cell is a traceback, i.e. the call
# failed when the notebook was last run. The error message is cut off in the
# saved output — confirm `Metadata` is in scope and accepts these keywords.
PN34 = Metadata(
    label="Took medication to relieve pain",
    field_values={-88: "N/A", -99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

[31m┌─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m────────────────────[0m[31m─┐[0m
[31m│[0m in [92m<module>[0m:[94m1[0m                                                               [31m│[0m
[31m│[0m                                                                             [31m│[0m
[31m│[0m [31m> [0m1 PN34 = Metadata(                                                        [31m│[0m
[31m│[0m   [2m2 [0m[2m│   [0mlabel= [33m"[0m[33mTook medication to relieve pain[0m[33m"[0m,                           [31m│[0m
[31m│[0m   [2m3 [0m[2m│   [0mfield_values = {-[94m88[0m: [33m"[0m[33mN/A[0m[33m"[0m, -[94m99[0m: [33m"[0m[33mMissing[0m[33m"[0m, [94m0[0m: [33m"[0m[33mNo[0m[33m"[0m, [94m1[0m: [33m"[0m[33mYes[0m[33m"[0m},     [31m│[0m
[31m│[0m   [2m4 [0m[2m│   [0mfield_type = [33m"[0m[33mNumeric[0m[33m"[0m,                                     

## PN35

In [None]:
var = "PN35"

### Data

Some of the existing options are the same across all datasets:
- 0: No
- 1: Yes
- 9: Missing

For G214_PQ and G214_SQ:
- 8: N/A

For G217_PQ:
- 7: Involved in incorrect skip - not answered

Values of 7 should be considered missing, as discussed with Alex D'Vauz.

Changes:
- Convert 8 to -88
- Convert 9 to -99
- Convert 7 to -99

In [None]:
unique_vals = unique_values(df, var)
rprint(unique_vals)

[1m{[0m
    [32m'G214_PQ_PN35'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G214_SQ_PN35'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_PQ_PN35'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_SQ_PN35'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m
[1m}[0m


In [None]:
# Pool the per-dataset unique values of PN35 into a single set, dropping nulls
value_options = set().union(*unique_vals.values())
value_options.discard(None)
value_options

[1m{[0m[1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m}[0m

#### Define properties to test, validate, and explore

- When PN17 is 0 (No), PN35 should be 8 (N/A)
- When PN17 is 1 (Yes), PN35 should be 0, 1, or 9 [CORRECTION: as per discussion below, 8 is also valid for PN35]
- When PN17 is 9 (Missing), PN35 should be 9

In [None]:
test = df.clone().collect()

In [None]:
# For each PN17 answer (0 = No, 1 = Yes, 9 = Missing), list the distinct
# values of the current variable observed in each dataset.
for value in (0, 1, 9):
    rprint(f"When PN17 == {value}")
    for p in prefixes:
        rows_for_value = test.filter(pl.col(f"{p}PN17") == value)
        var_seen = rows_for_value.select(f"{p}{var}").unique()
        rprint(var_seen.to_dict(as_series=False))

When PN17 == [1;36m0[0m
[1m{[0m[32m'G214_PQ_PN35'[0m: [1m[[0m[1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN35'[0m: [1m[[0m[1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN35'[0m: [1m[[0m[1;36m1.0[0m, [1;36m7.0[0m, [1;36m0.0[0m, [1;36m9.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN35'[0m: [1m[[0m[1;36m9.0[0m, [1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
When PN17 == [1;36m1[0m
[1m{[0m[32m'G214_PQ_PN35'[0m: [1m[[0m[1;36m0.0[0m, [1;36m8.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN35'[0m: [1m[[0m[1;36m0.0[0m, [1;36m9.0[0m, [1;36m1.0[0m, [1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN35'[0m: [1m[[0m[1;36m0.0[0m, [1;36m9.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN35'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
When PN17 == [1;36m9[0m
[1m{[0m[32m'G214_PQ_PN35'[0m: [1m[[0m[1;36m9.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN35'[

As per PN25, property 3 was as expected and property 1 did not pass due to differences in the Y17 follow-ups.
However, property 2 failed unexpectedly; there are values of 8 (N/A) when PN17 is 1 (Yes).

Upon further exploration, it was confirmed that in ALL cases, this is because the participant was not employed at the time of the questionnaire, and thus the response of 8 is valid.
This was done by checking values of `G214_PQ_YWRK_1` and `G214_PQ_YWRK_YN` when PN17 == 1 and PN35 == 8.

In [None]:
# Reverse check: for every observed value of the current variable, list the
# PN17 answers it co-occurs with in each dataset.
for value in value_options:
    rprint(f"When {var} == {value}")
    for p in prefixes:
        rows_for_value = test.filter(pl.col(f"{p}{var}") == value)
        pn17_seen = rows_for_value.select(f"{p}PN17").unique()
        rprint(pn17_seen.to_dict(as_series=False))

When PN35 == [1;36m0.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
When PN35 == [1;36m1.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
When PN35 == [1;36m7.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m

No additional, unexpected values were found.

### Metadata

Clear discrepancies in labels and field values.

Changes:
- Harmonised label to "Missed work due to pain"
- Updated field values to reflect changes in data

In [None]:
m = filter_metadata(var, df, meta)

In [None]:
rprint(m)

[1;35mdefaultdict[0m[1m([0m[1m<[0m[1;95mclass[0m[39m [0m[32m'dict'[0m[1m>[0m, [1m{[0m
    [32m'Label'[0m: [1m{[0m
        [32m'G214_PQ_PN35'[0m: [32m'Miss work due to back pain'[0m,
        [32m'G214_SQ_PN35'[0m: [32m'Miss work due to back pain'[0m,
        [32m'G217_PQ_PN35'[0m: [32m'Back pain stopped you from going to work'[0m,
        [32m'G217_SQ_PN35'[0m: [32m'Did back pain stop you going to work?'[0m
    [1m}[0m,
    [32m'Field Type'[0m: [1m{[0m
        [32m'G214_PQ_PN35'[0m: [32m'Numeric'[0m,
        [32m'G214_SQ_PN35'[0m: [32m'Numeric'[0m,
        [32m'G217_PQ_PN35'[0m: [32m'Numeric'[0m,
        [32m'G217_SQ_PN35'[0m: [32m'Numeric'[0m
    [1m}[0m,
    [32m'Field Width'[0m: [1m{[0m
        [32m'G214_PQ_PN35'[0m: [1;36m8[0m,
        [32m'G214_SQ_PN35'[0m: [1;36m8[0m,
        [32m'G217_PQ_PN35'[0m: [1;36m8[0m,
        [32m'G217_SQ_PN35'[0m: [1;36m8[0m
    [1m}[0m,
    [32m'Decimals'[0m: [1m{[0m
 

In [None]:
rprint(m["Field Values"])

[1m{[0m
    [32m'G214_PQ_PN35'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m8.0[0m: [32m'Not applicable'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G214_SQ_PN35'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m8.0[0m: [32m'Not applicable'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_PQ_PN35'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m7.0[0m: [32m'Involved in incorrect skip - not answered'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_SQ_PN35'[0m: [1m{[0m[1;36m0.0[0m: [32m'No'[0m, [1;36m1.0[0m: [32m'Yes'[0m, [1;36m9.0[0m: [32m'Not stated'[0m[1m}[0m
[1m}[0m


In [None]:
# Harmonised metadata for PN35 (8 -> -88 "N/A"; 7 and 9 -> -99 "Missing").
# NOTE(review): the saved output of this cell is a traceback, i.e. the call
# failed when the notebook was last run. The error message is cut off in the
# saved output — confirm `Metadata` is in scope and accepts these keywords.
PN35 = Metadata(
    label="Missed work due to pain",
    field_values={-88: "N/A", -99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

[31m┌─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m────────────────────[0m[31m─┐[0m
[31m│[0m in [92m<module>[0m:[94m1[0m                                                               [31m│[0m
[31m│[0m                                                                             [31m│[0m
[31m│[0m [31m> [0m1 PN35 = Metadata(                                                        [31m│[0m
[31m│[0m   [2m2 [0m[2m│   [0mlabel= [33m"[0m[33mMissed work due to pain[0m[33m"[0m,                                   [31m│[0m
[31m│[0m   [2m3 [0m[2m│   [0mfield_values = {-[94m88[0m: [33m"[0m[33mN/A[0m[33m"[0m, -[94m99[0m: [33m"[0m[33mMissing[0m[33m"[0m, [94m0[0m: [33m"[0m[33mNo[0m[33m"[0m, [94m1[0m: [33m"[0m[33mYes[0m[33m"[0m},     [31m│[0m
[31m│[0m   [2m4 [0m[2m│   [0mfield_type = [33m"[0m[33mNumeric[0m[33m"[0m,                                     

## PN36

In [None]:
var = "PN36"

### Data

Some of the existing options are the same across all datasets:
- 0: No
- 1: Yes
- 9: Missing

For G214_PQ and G214_SQ:
- 8: N/A

For G217_PQ:
- 7: Involved in incorrect skip - not answered

Values of 7 should be considered missing, as discussed with Alex D'Vauz.

Changes:
- Convert 8 to -88
- Convert 9 to -99
- Convert 7 to -99

In [None]:
unique_vals = unique_values(df, var)
rprint(unique_vals)

[1m{[0m
    [32m'G214_PQ_PN36'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G214_SQ_PN36'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_PQ_PN36'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_SQ_PN36'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m
[1m}[0m


In [None]:
# Pool the per-dataset unique values of PN36 into a single set, dropping nulls
value_options = set().union(*unique_vals.values())
value_options.discard(None)
value_options

[1m{[0m[1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m8.0[0m, [1;36m9.0[0m[1m}[0m

#### Define properties to test, validate, and explore

- When PN17 is 0 (No), PN36 should be 8 (N/A)
- When PN17 is 1 (Yes), PN36 should be 0, 1, or 9
- When PN17 is 9 (Missing), PN36 should be 9

In [None]:
test = df.clone().collect()

In [None]:
# For each PN17 answer (0 = No, 1 = Yes, 9 = Missing), list the distinct
# values of the current variable observed in each dataset.
for value in (0, 1, 9):
    rprint(f"When PN17 == {value}")
    for p in prefixes:
        rows_for_value = test.filter(pl.col(f"{p}PN17") == value)
        var_seen = rows_for_value.select(f"{p}{var}").unique()
        rprint(var_seen.to_dict(as_series=False))

When PN17 == [1;36m0[0m
[1m{[0m[32m'G214_PQ_PN36'[0m: [1m[[0m[1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN36'[0m: [1m[[0m[1;36m8.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN36'[0m: [1m[[0m[1;36m9.0[0m, [1;36m1.0[0m, [1;36m0.0[0m, [1;36m7.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN36'[0m: [1m[[0m[1;36m0.0[0m, [1;36m9.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
When PN17 == [1;36m1[0m
[1m{[0m[32m'G214_PQ_PN36'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN36'[0m: [1m[[0m[1;36m9.0[0m, [1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN36'[0m: [1m[[0m[1;36m9.0[0m, [1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN36'[0m: [1m[[0m[1;36m1.0[0m, [1;36m9.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
When PN17 == [1;36m9[0m
[1m{[0m[32m'G214_PQ_PN36'[0m: [1m[[0m[1;36m9.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN36'[0m: [1m[[0m[1

As per PN25, properties 2 and 3 were as expected and property 1 did not pass due to differences in the Y17 follow-ups.

In [None]:
# Reverse check: for every observed value of the current variable, list the
# PN17 answers it co-occurs with in each dataset.
for value in value_options:
    rprint(f"When {var} == {value}")
    for p in prefixes:
        rows_for_value = test.filter(pl.col(f"{p}{var}") == value)
        pn17_seen = rows_for_value.select(f"{p}PN17").unique()
        rprint(pn17_seen.to_dict(as_series=False))

When PN36 == [1;36m0.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m[1;36m0.0[0m, [1;36m1.0[0m[1m][0m[1m}[0m
When PN36 == [1;36m1.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m[1;36m1.0[0m, [1;36m0.0[0m[1m][0m[1m}[0m
When PN36 == [1;36m7.0[0m
[1m{[0m[32m'G214_PQ_PN17'[0m: [1m[[0m[1m][0m[1m}[0m
[1m{[0m[32m'G214_SQ_PN17'[0m: [1m[[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_PQ_PN17'[0m: [1m[[0m[1;36m0.0[0m[1m][0m[1m}[0m
[1m{[0m[32m'G217_SQ_PN17'[0m: [1m[[0m

No additional, unexpected values were found.

### Metadata

Clear discrepancies in labels and field values.

Changes:
- Harmonised label to "Pain interfered with normal activities"
- Updated field values to reflect changes in data

In [None]:
m = filter_metadata(var, df, meta)

In [None]:
rprint(m)

[1;35mdefaultdict[0m[1m([0m[1m<[0m[1;95mclass[0m[39m [0m[32m'dict'[0m[1m>[0m, [1m{[0m
    [32m'Label'[0m: [1m{[0m
        [32m'G214_PQ_PN36'[0m: [32m'Back pain interferes with daily activities'[0m,
        [32m'G214_SQ_PN36'[0m: [32m'Back pain interferes with daily activities'[0m,
        [32m'G217_PQ_PN36'[0m: [32m'Back pain interfered with normal activities'[0m,
        [32m'G217_SQ_PN36'[0m: [32m'Did the back pain interfere with normal activities?'[0m
    [1m}[0m,
    [32m'Field Type'[0m: [1m{[0m
        [32m'G214_PQ_PN36'[0m: [32m'Numeric'[0m,
        [32m'G214_SQ_PN36'[0m: [32m'Numeric'[0m,
        [32m'G217_PQ_PN36'[0m: [32m'Numeric'[0m,
        [32m'G217_SQ_PN36'[0m: [32m'Numeric'[0m
    [1m}[0m,
    [32m'Field Width'[0m: [1m{[0m
        [32m'G214_PQ_PN36'[0m: [1;36m8[0m,
        [32m'G214_SQ_PN36'[0m: [1;36m8[0m,
        [32m'G217_PQ_PN36'[0m: [1;36m8[0m,
        [32m'G217_SQ_PN36'[0m: [1;36m8[0m
  

In [None]:
rprint(m["Field Values"])

[1m{[0m
    [32m'G214_PQ_PN36'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m8.0[0m: [32m'Not applicable'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G214_SQ_PN36'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m8.0[0m: [32m'Not applicable'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_PQ_PN36'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m7.0[0m: [32m'Involved in incorrect skip - not answered'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_SQ_PN36'[0m: [1m{[0m[1;36m0.0[0m: [32m'No'[0m, [1;36m1.0[0m: [32m'Yes'[0m, [1;36m9.0[0m: [32m'Not stated'[0m[1m}[0m
[1m}[0m


In [None]:
# Harmonised metadata for PN36 (8 -> -88 "N/A"; 7 and 9 -> -99 "Missing").
# NOTE(review): the saved output of this cell is a traceback, i.e. the call
# failed when the notebook was last run. The error message is cut off in the
# saved output — confirm `Metadata` is in scope and accepts these keywords.
PN36 = Metadata(
    label="Pain interfered with normal activities",
    field_values={-88: "N/A", -99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

[31m┌─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m────────────────────[0m[31m─┐[0m
[31m│[0m in [92m<module>[0m:[94m1[0m                                                               [31m│[0m
[31m│[0m                                                                             [31m│[0m
[31m│[0m [31m> [0m1 PN36 = Metadata(                                                        [31m│[0m
[31m│[0m   [2m2 [0m[2m│   [0mlabel= [33m"[0m[33mPain interfered with normal activities[0m[33m"[0m,                    [31m│[0m
[31m│[0m   [2m3 [0m[2m│   [0mfield_values = {-[94m88[0m: [33m"[0m[33mN/A[0m[33m"[0m, -[94m99[0m: [33m"[0m[33mMissing[0m[33m"[0m, [94m0[0m: [33m"[0m[33mNo[0m[33m"[0m, [94m1[0m: [33m"[0m[33mYes[0m[33m"[0m},     [31m│[0m
[31m│[0m   [2m4 [0m[2m│   [0mfield_type = [33m"[0m[33mNumeric[0m[33m"[0m,                                     

## PN9

In [None]:
# Shared variable name for this section; the dataset-specific columns shown
# below are G217_PQ_PN9 and G217_SQ_PN9.
var = "PN9"

### Data

Some of the existing options are the same across all datasets:
- 0: No
- 1: Yes
- 9: Missing

For G217_PQ:
- 7: Involved in incorrect skip - not answered

Values of 7 should be considered missing, as discussed with Alex D'Vauz.

Changes:
- Convert 9 to -99
- Convert 7 to -99

In [None]:
# Distinct observed values for each dataset-specific PN9 column,
# pretty-printed so the codings can be compared side by side.
unique_vals = unique_values(df, var)
rprint(unique_vals)

[1m{[0m
    [32m'G217_PQ_PN9'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_SQ_PN9'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m
[1m}[0m


In [None]:
# Pool the observed PN9 codes from every dataset into a single set, leaving out nulls
value_options = set().union(*unique_vals.values()) - {None}
value_options

[1m{[0m[1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m}[0m

#### Define properties to test, validate, and explore

PN9 is an independent variable, and as such, has no key properties to test.

In [None]:
# Materialise a copy of the combined lazy frame for ad-hoc checks.
# NOTE(review): nothing in this PN9 section reads `test`; it is reassigned in
# the PN38 section.
test = df.clone().collect()

### Metadata

Clear discrepancies in labels and field values.

Changes:
- Harmonised label to "Ever had neck/shoulder pain"
- Updated field values to reflect changes in data

In [None]:
# Gather the metadata recorded for each dataset's PN9 column, grouped by
# property (label, field type, field width, decimals, variable type, field values).
m = filter_metadata(var, df, meta)

In [None]:
# Pretty-print the full per-dataset metadata comparison.
rprint(m)

[1;35mdefaultdict[0m[1m([0m[1m<[0m[1;95mclass[0m[39m [0m[32m'dict'[0m[1m>[0m, [1m{[0m
    [32m'Label'[0m: [1m{[0m
        [32m'G217_PQ_PN9'[0m: [32m'Ever had neck/shoulder pain'[0m,
        [32m'G217_SQ_PN9'[0m: [32m'Ever had neck/shoulder pain?'[0m
    [1m}[0m,
    [32m'Field Type'[0m: [1m{[0m[32m'G217_PQ_PN9'[0m: [32m'Numeric'[0m, [32m'G217_SQ_PN9'[0m: [32m'Numeric'[0m[1m}[0m,
    [32m'Field Width'[0m: [1m{[0m[32m'G217_PQ_PN9'[0m: [1;36m8[0m, [32m'G217_SQ_PN9'[0m: [1;36m8[0m[1m}[0m,
    [32m'Decimals'[0m: [1m{[0m[32m'G217_PQ_PN9'[0m: [1;36m0[0m, [32m'G217_SQ_PN9'[0m: [1;36m0[0m[1m}[0m,
    [32m'Variable Type'[0m: [1m{[0m[32m'G217_PQ_PN9'[0m: [32m'scale'[0m, [32m'G217_SQ_PN9'[0m: [32m'scale'[0m[1m}[0m,
    [32m'Field Values'[0m: [1m{[0m
        [32m'G217_PQ_PN9'[0m: [1m{[0m
            [1;36m0.0[0m: [32m'No'[0m,
            [1;36m1.0[0m: [32m'Yes'[0m,
            [1;36m7.0[0m: [32m

In [None]:
# Print only the value-label mappings so the coding differences between
# datasets are easy to compare.
rprint(m["Field Values"])

[1m{[0m
    [32m'G217_PQ_PN9'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m7.0[0m: [32m'Involved in incorrect skip - not answered'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_SQ_PN9'[0m: [1m{[0m[1;36m0.0[0m: [32m'No'[0m, [1;36m1.0[0m: [32m'Yes'[0m, [1;36m9.0[0m: [32m'Not stated'[0m[1m}[0m
[1m}[0m


In [None]:
# Harmonised PN9 metadata: raw codes 7 and 9 are both folded into -99 ("Missing").
PN9 = Metadata(
    label="Ever had neck/shoulder pain",
    field_values={-99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

[31m┌─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m────────────────────[0m[31m─┐[0m
[31m│[0m in [92m<module>[0m:[94m1[0m                                                               [31m│[0m
[31m│[0m                                                                             [31m│[0m
[31m│[0m [31m> [0m1 PN9 = Metadata(                                                         [31m│[0m
[31m│[0m   [2m2 [0m[2m│   [0mlabel= [33m"[0m[33mEver had neck/shoulder pain[0m[33m"[0m,                               [31m│[0m
[31m│[0m   [2m3 [0m[2m│   [0mfield_values = {-[94m99[0m: [33m"[0m[33mMissing[0m[33m"[0m, [94m0[0m: [33m"[0m[33mNo[0m[33m"[0m, [94m1[0m: [33m"[0m[33mYes[0m[33m"[0m},                 [31m│[0m
[31m│[0m   [2m4 [0m[2m│   [0mfield_type = [33m"[0m[33mNumeric[0m[33m"[0m,                                             [31m│[0m
[31m│[0m       

## PN38

In [None]:
# Shared variable name for this section; the dataset-specific columns shown
# below are G217_PQ_PN38 and G217_SQ_PN38.
var = "PN38"

### Data

Some of the existing options are the same across all datasets:
- 0: No
- 1: Yes
- 9: Missing

For G217_PQ:
- 7: Involved in incorrect skip - not answered

Values of 7 should be considered missing, as discussed with Alex D'Vauz.

Changes:
- Convert 9 to -99
- Convert 7 to -99

In [None]:
# Distinct observed values for each dataset-specific PN38 column,
# pretty-printed so the codings can be compared side by side.
unique_vals = unique_values(df, var)
rprint(unique_vals)

[1m{[0m
    [32m'G217_PQ_PN38'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m)[0m,
    [32m'G217_SQ_PN38'[0m: [1m([0m[3;35mNone[0m, [1;36m0.0[0m, [1;36m1.0[0m, [1;36m9.0[0m[1m)[0m
[1m}[0m


In [None]:
# Pool the observed PN38 codes from every dataset into a single set, leaving out nulls
value_options = set().union(*unique_vals.values()) - {None}
value_options

[1m{[0m[1;36m0.0[0m, [1;36m1.0[0m, [1;36m7.0[0m, [1;36m9.0[0m[1m}[0m

#### Define properties to test, validate, and explore

Given PN38 asks "have you ever had lower back pain", one would assume it's a subset of PN17 ("have you ever had back pain").
Therefore, one would expect when PN17 is 0 (No), PN38 would be 8 (N/A).
Likewise, when PN17 is 9 (Missing), PN38 would likely (but not necessarily) also be 9.

In [None]:
# Materialise a copy of the combined lazy frame once so the cross-checks
# below run on collected data.
test = df.clone().collect()

In [None]:
# Distribution of G217_PQ_PN38 among rows where G217_PQ_PN17 is 0 ("No")
test.filter(pl.col("G217_PQ_PN17") == 0).select(
    pl.col("G217_PQ_PN38").value_counts(sort=True)
)

G217_PQ_PN38
struct[2]
"{0.0,254}"
"{1.0,63}"
"{7.0,12}"
"{9.0,4}"


In [None]:
# Distribution of G217_SQ_PN38 among rows where G217_SQ_PN17 is 0 ("No")
test.filter(pl.col("G217_SQ_PN17") == 0).select(
    pl.col("G217_SQ_PN38").value_counts(sort=True)
)

G217_SQ_PN38
struct[2]
"{0.0,165}"
"{1.0,34}"
"{9.0,7}"


It's true when PN17 is missing, PN38 is also missing.
However, when PN17 is 0, there are numerous instances where PN38 is 1; it appears the logic somehow differs, such that people have said "Yes, I have had lower back pain" after saying "No, I've never had back pain".
This may be a result of a misunderstanding on the participants' part, an error in the data, or the questionnaire intentionally distinguishing back pain (perhaps meaning upper back pain) from lower back pain.
Without further clarification, no changes to the data will be made, and this distinction can be raised and clarified by the researchers.

### Metadata

Clear discrepancies in labels and field values.

Changes:
- Harmonised label to "Ever had low back pain"
- Updated field values to reflect changes in data

In [None]:
# Gather the metadata recorded for each dataset's PN38 column, grouped by
# property (label, field type, field width, decimals, variable type, field values).
m = filter_metadata(var, df, meta)

In [None]:
# Pretty-print the full per-dataset metadata comparison.
rprint(m)

[1;35mdefaultdict[0m[1m([0m[1m<[0m[1;95mclass[0m[39m [0m[32m'dict'[0m[1m>[0m, [1m{[0m
    [32m'Label'[0m: [1m{[0m
        [32m'G217_PQ_PN38'[0m: [32m'Ever had low back pain'[0m,
        [32m'G217_SQ_PN38'[0m: [32m'Ever had low back pain?'[0m
    [1m}[0m,
    [32m'Field Type'[0m: [1m{[0m[32m'G217_PQ_PN38'[0m: [32m'Numeric'[0m, [32m'G217_SQ_PN38'[0m: [32m'Numeric'[0m[1m}[0m,
    [32m'Field Width'[0m: [1m{[0m[32m'G217_PQ_PN38'[0m: [1;36m8[0m, [32m'G217_SQ_PN38'[0m: [1;36m8[0m[1m}[0m,
    [32m'Decimals'[0m: [1m{[0m[32m'G217_PQ_PN38'[0m: [1;36m0[0m, [32m'G217_SQ_PN38'[0m: [1;36m0[0m[1m}[0m,
    [32m'Variable Type'[0m: [1m{[0m[32m'G217_PQ_PN38'[0m: [32m'scale'[0m, [32m'G217_SQ_PN38'[0m: [32m'scale'[0m[1m}[0m,
    [32m'Field Values'[0m: [1m{[0m
        [32m'G217_PQ_PN38'[0m: [1m{[0m
            [1;36m0.0[0m: [32m'No'[0m,
            [1;36m1.0[0m: [32m'Yes'[0m,
            [1;36m7.0[0m: [32

In [None]:
# Print only the value-label mappings so the coding differences between
# datasets are easy to compare.
rprint(m["Field Values"])

[1m{[0m
    [32m'G217_PQ_PN38'[0m: [1m{[0m
        [1;36m0.0[0m: [32m'No'[0m,
        [1;36m1.0[0m: [32m'Yes'[0m,
        [1;36m7.0[0m: [32m'Involved in incorrect skip - not answered'[0m,
        [1;36m9.0[0m: [32m'Not stated'[0m
    [1m}[0m,
    [32m'G217_SQ_PN38'[0m: [1m{[0m[1;36m0.0[0m: [32m'No'[0m, [1;36m1.0[0m: [32m'Yes'[0m, [1;36m9.0[0m: [32m'Not stated'[0m[1m}[0m
[1m}[0m


In [None]:
# Harmonised PN38 metadata: raw codes 7 and 9 are both folded into -99 ("Missing").
PN38 = Metadata(
    label="Ever had low back pain",
    field_values={-99: "Missing", 0: "No", 1: "Yes"},
    field_type="Numeric",
    field_width=3,
    decimals=0,
    variable_type="Nominal",
)

[31m┌─[0m[31m────────────────────[0m[31m [0m[1;31mTraceback [0m[1;2;31m(most recent call last)[0m[31m [0m[31m────────────────────[0m[31m─┐[0m
[31m│[0m in [92m<module>[0m:[94m1[0m                                                               [31m│[0m
[31m│[0m                                                                             [31m│[0m
[31m│[0m [31m> [0m1 PN38 = Metadata(                                                        [31m│[0m
[31m│[0m   [2m2 [0m[2m│   [0mlabel= [33m"[0m[33mEver had low back pain[0m[33m"[0m,                                    [31m│[0m
[31m│[0m   [2m3 [0m[2m│   [0mfield_values = {-[94m99[0m: [33m"[0m[33mMissing[0m[33m"[0m, [94m0[0m: [33m"[0m[33mNo[0m[33m"[0m, [94m1[0m: [33m"[0m[33mYes[0m[33m"[0m},                 [31m│[0m
[31m│[0m   [2m4 [0m[2m│   [0mfield_type = [33m"[0m[33mNumeric[0m[33m"[0m,                                             [31m│[0m
[31m│[0m       

## Further investigation

What values exist for Y17 follow-ups in subsequent variables when PN17, PN9 and PN38 are 0?

In [None]:
# Distinct combinations of the G217_SQ follow-up answers for rows where
# PN17, PN9 and PN38 are all 0 ("No").
(
    df
    .filter(
        (pl.col("G217_SQ_PN17") == 0)
        & (pl.col("G217_SQ_PN9") == 0)
        & (pl.col("G217_SQ_PN38") == 0)
    )
    .select(['G217_SQ_PN25', 'G217_SQ_PN34', 'G217_SQ_PN35', 'G217_SQ_PN36'])
    .unique()
    .collect()
)

G217_SQ_PN25,G217_SQ_PN34,G217_SQ_PN35,G217_SQ_PN36
f64,f64,f64,f64
0.0,0.0,0.0,0.0
0.0,0.0,0.0,9.0


In [None]:
# Distinct combinations of the G217_PQ follow-up answers for rows where
# PN17 is 0 ("No"); PN9 and PN38 are not constrained here.
(
    df
    .filter(pl.col("G217_PQ_PN17") == 0)
    .select(['G217_PQ_PN25', 'G217_PQ_PN34', 'G217_PQ_PN35', 'G217_PQ_PN36'])
    .unique()
    .collect()
)

G217_PQ_PN25,G217_PQ_PN34,G217_PQ_PN35,G217_PQ_PN36
f64,f64,f64,f64
1.0,1.0,0.0,1.0
9.0,9.0,9.0,9.0
0.0,0.0,9.0,0.0
7.0,7.0,7.0,7.0
1.0,0.0,0.0,0.0
…,…,…,…
1.0,1.0,1.0,1.0
0.0,9.0,0.0,0.0
1.0,1.0,0.0,0.0
1.0,0.0,0.0,1.0


In [None]:
# Distinct (PN9, PN38) answer pairs in G217_PQ for rows where PN17 is 0 ("No").
(
    df
    .filter(pl.col("G217_PQ_PN17") == 0)
    .select(['G217_PQ_PN9', 'G217_PQ_PN38'])
    .unique()
    .collect()
)

G217_PQ_PN9,G217_PQ_PN38
f64,f64
0.0,9.0
9.0,9.0
1.0,1.0
0.0,0.0
1.0,0.0
1.0,9.0
9.0,0.0
0.0,1.0
7.0,7.0


### Which rows have only missing values for G214_SQ

In [None]:
# Load G214_SQ on its own, passing only the file name and data directory
# (no prefix or column list, unlike the dataset definitions at the top of the notebook).
G214_SQ = Dataset("G214_SQ.sav", data_dir)
# load_data() returns a pair; only the first item (a lazy frame, collected below) is kept.
lf, _ = G214_SQ.load_data()

In [None]:
# Convert to pandas for label-based row lookup.
# NOTE: this rebinds `df`, which earlier cells use as the combined lazy frame
# (they call `.collect()` on it); re-run the loading cells before re-running
# any earlier section.
df = lf.collect().to_pandas()

In [None]:
# For participants 10020 and 54570, keep only the columns that have a value in
# both rows (dropna drops every column containing a NaN).
df.set_index("ID").loc[[10020, 54570]].dropna(axis="columns")

Unnamed: 0_level_0,SEX,G214_SQ_YWRK_YN,G214_SQ_YHRS_CAT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10020.0,1.0,999.0,999.0
54570.0,1.0,999.0,999.0


In [None]:
# Keep rows that have at least one non-null value outside ID, SEX,
# G214_SQ_YWRK_YN and G214_SQ_YHRS_CAT; rows missing from this result are the
# ones whose remaining columns are all null.
lf.filter(
    pl.any_horizontal(
        pl.exclude(["ID", "SEX", "G214_SQ_YWRK_YN", "G214_SQ_YHRS_CAT"]).is_not_null()
    )
).collect()

ID,SEX,G214_SQ_MDECFLAG,G214_SQ_DDECFLAG,G214_SQ_FHOM,G214_SQ_FSOC,G214_SQ_FMON,G214_SQ_PTNR,G214_SQ_BNF,G214_SQ_BNF2,G214_SQ_BNF3,G214_SQ_BNF4,G214_SQ_BNF5,G214_SQ_BNF6,G214_SQ_BNF7,G214_SQ_BNF8,G214_SQ_BNF9,G214_SQ_YWRK_1,G214_SQ_YWRK_YN,G214_SQ_YEMP,G214_SQ_YJOB_CODE,G214_SQ_YHRS,G214_SQ_YHRS_CAT,G214_SQ_YWK1,G214_SQ_YWK2,G214_SQ_YWK3,G214_SQ_YWK4,G214_SQ_YWK5,G214_SQ_YWK6,G214_SQ_YWK7,G214_SQ_YWK8,G214_SQ_YWK9_1,G214_SQ_PWRK_1,G214_SQ_PEMP,G214_SQ_PJOB_CODE,G214_SQ_PHRS,G214_SQ_PWK1,…,G214_SQ_DNWN,G214_SQ_HEMI,G214_SQ_AH1,G214_SQ_AH2,G214_SQ_AH3,G214_SQ_AH19,G214_SQ_AH20,G214_SQ_AH21,G214_SQ_AH22,G214_SQ_AH23,G214_SQ_AH24,G214_SQ_AH25,G214_SQ_AH26,G214_SQ_AH27,G214_SQ_AH28,G214_SQ_AH29,G214_SQ_AH30,G214_SQ_AH31,G214_SQ_AH32,G214_SQ_AH33,G214_SQ_AH34,G214_SQ_AH35,G214_SQ_AH36,G214_SQ_AH37,G214_SQ_AH38,G214_SQ_AH39,G214_SQ_SM26,G214_SQ_SM27,G214_SQ_SM28,G214_SQ_SM29,G214_SQ_VAL2,G214_SQ_QCO1,G214_SQ_DNBY,G214_SQ_CTRY,G214_SQ_Q_DONE,G214_SQ_AGE,G214_SQ_PCOD
f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,date,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
10010.0,0.0,,,0.0,1.0,1.0,0.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,9.0,1.0,0.0,8.0,88.0,888.0,888.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-88.0,-88.0,-88.0,-88.0,…,2003-08-01,1.0,-99.0,4.0,-88.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,8.0,8.0,8.0,8.0,1.0,0.0,4.0,1105.0,1.0,13.7,6107.0
10030.0,1.0,,,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,88.0,888.0,888.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,-88.0,-88.0,-88.0,-88.0,…,2003-06-26,1.0,0.0,-99.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,0.0,0.0,1.0,0.0,2.0,0.0,3.0,1105.0,1.0,13.8,6230.0
10040.0,1.0,,,1.0,-88.0,-88.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,88.0,888.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,-88.0,-88.0,-88.0,-88.0,-88.0,…,2003-06-28,8.0,-99.0,3.0,-88.0,-99.0,-99.0,-99.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,8.0,8.0,8.0,8.0,1.0,0.0,2.0,8888.0,1.0,13.6,8888.0
10050.0,0.0,,,1.0,-88.0,-88.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,88.0,888.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,-88.0,-88.0,-88.0,-88.0,-88.0,…,2003-06-26,8.0,-99.0,3.0,-88.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,8.0,8.0,8.0,8.0,1.0,1.0,2.0,8888.0,1.0,13.7,8888.0
10090.0,1.0,,,1.0,-88.0,-88.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,88.0,888.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,-88.0,-88.0,-88.0,-88.0,-88.0,…,2003-08-27,8.0,-99.0,-99.0,-88.0,0.0,1.0,0.0,-88.0,-88.0,-88.0,0.0,1.0,0.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,0.0,1.0,0.0,8.0,8.0,8.0,8.0,1.0,0.0,6.0,8888.0,1.0,13.7,8888.0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
54490.0,0.0,,,1.0,-88.0,-88.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,88.0,888.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,-88.0,-88.0,-88.0,-88.0,-88.0,…,2006-03-02,8.0,-99.0,2.0,-88.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,8.0,8.0,8.0,8.0,1.0,0.0,2.0,8888.0,1.0,14.0,8888.0
54520.0,0.0,,,1.0,-88.0,-88.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,88.0,888.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,-88.0,-88.0,-88.0,-88.0,-88.0,…,2006-05-01,8.0,-99.0,1.0,-88.0,0.0,1.0,0.0,0.0,1.0,0.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,8.0,8.0,8.0,8.0,1.0,0.0,2.0,8888.0,1.0,14.1,8888.0
54540.0,0.0,,,1.0,-88.0,-88.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,88.0,888.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,-88.0,-88.0,-88.0,-88.0,-88.0,…,2006-04-12,8.0,-99.0,1.0,-88.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,-99.0,0.0,0.0,1.0,-99.0,-99.0,-99.0,8.0,8.0,8.0,8.0,1.0,2.0,2.0,8888.0,1.0,14.0,8888.0
54560.0,1.0,,,1.0,-88.0,-88.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,888.0,8.0,88.0,888.0,888.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0,-88.0,-88.0,-88.0,-88.0,-88.0,…,2006-03-21,8.0,-99.0,-99.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,-88.0,1.0,0.0,0.0,-88.0,-88.0,-88.0,8.0,8.0,8.0,8.0,1.0,3.0,6.0,8888.0,1.0,14.0,8888.0
