# G201

## Set up

In [1]:
from fastcore.test import *
from fastcore.utils import *
import pandas as pd
from pandas import DataFrame
import numpy as np
import pyspssio

import raine_tools as rn
from data_harmonising.data import *
from data_harmonising.transforms import *

In [2]:
# Raw .sav file that is loaded into `df` / `meta` below.
INPUT = "../data/raw/G201_Quest_PA.sav"
# Destination of the cleaned copy written by `write_sav` near the end of the notebook.
OUTPUT = "../data/interim/G201_Quest_PA.sav"

In [3]:
# Load the raw file: `df` is the data, `meta` the accompanying metadata (both are edited further down).
# NOTE(review): `read_sav` is not defined in this notebook; presumably it comes from one of the
# star imports above and `index="ID"` makes the participant ID the row index — confirm.
df, meta = read_sav(INPUT, index="ID")

In [None]:
# wb = pd.read_excel(WORKBOOK, sheet_name=SHEETNAME)
# wb['vn_old'] = wb['Dataset'] + '_' + wb['Variable Name']
# wb['vn_new'] = wb['Dataset'] + '_' + wb['Harmonised Variable Name']
# VARS = wb.loc[wb['Dataset'] == 'G201', "Variable Name"].to_list()

## Data exploration

### The strange case of `G201_IF13`

Why does `G201_IF13` exist, especially when there's no corresponding `G201_IC13`?  
Furthermore, the values don't align with the previous IC? variables.

In [None]:
# Frequency of each code recorded in G201_IF13
df[["G201_IF13"]].value_counts()

For all other cases of `G201_IF1` to `G201_IF12`, a 0 means that the incident is related to the incident in the previous IF? variable.  
When `G201_IF13` is 0, the majority of cases were N/A, and values range across the board; this doesn't fit with the existing logic.

In [None]:
# IF12/IF13 combinations among rows where IF13 is 0
df.loc[df["G201_IF13"].eq(0), "G201_IF12":"G201_IF13"].value_counts()

At least when `G201_IF13` is missing (9), the previous value was also missing (999).

In [None]:
# IF12/IF13 combinations among rows where IF13 is 9
df.loc[df["G201_IF13"].eq(9), "G201_IF12":"G201_IF13"].value_counts()

Finally, when `G201_IF13` is 1 or 2, the previous values are 0 or 1; this doesn't specifically highlight any errors.

In [None]:
# IF12/IF13 combinations among rows where IF13 lies in the closed range 1..2
df.loc[df["G201_IF13"].between(1, 2, inclusive="both"), "G201_IF12":"G201_IF13"].value_counts()

`G201_IF13` does not align with the previous values, and there are clear logical discrepancies.  
Given there is no corresponding `G201_IC13` to explain the few instances of a 1 or 2, and the 0s appear to be unreliable, I plan to drop it.

### Checking that IF? and IC? variables are correctly aligned

In [None]:
# Column names G201_IF1..G201_IF12 and G201_IC1..G201_IC12, in numeric order
IF_ROWS = [f"G201_IF{n}" for n in range(1, 13)]

IC_ROWS = [f"G201_IC{n}" for n in range(1, 13)]

If `G201_IC?` is N/A ("88888"), the corresponding `G201_IF?` should also be N/A (88).  
Else if `G201_IC?` is Missing ("99999"), the corresponding `G201_IF?` should also be Missing (999).    
Otherwise, the corresponding `G201_IF?` should not be N/A or Missing.

In [None]:
# For each of the 12 IC/IF pairs, check that the IF code agrees with the IC code.
for n in range(1, 13):
    assert (df.loc[df[f"G201_IC{n}"] == "88888", f"G201_IF{n}"] == 88).all(), \
        f"G201_IF{n} is not N/A (88) on every row where G201_IC{n} is N/A"
    assert (df.loc[df[f"G201_IC{n}"] == "99999", f"G201_IF{n}"] == 999).all(), \
        f"G201_IF{n} is not Missing (999) on every row where G201_IC{n} is Missing"
    # No row with a real IC code may carry an N/A or Missing IF code.
    # (`not ....all()` would pass as soon as a single row was fine, so use `.any()`.)
    assert not df.loc[~df[f"G201_IC{n}"].isin(["88888", "99999"]), f"G201_IF{n}"].isin([88, 999]).any(), \
        f"G201_IF{n} is N/A or Missing on a row where G201_IC{n} holds a real code"

The reverse of this should also be true, such that if `G201_IF?` is N/A, the corresponding `G201_IC?` value should also be N/A, and so on.

In [None]:
# Mirror check: for each of the 12 IF/IC pairs, the IC code must agree with the IF code.
for n in range(1, 13):
    assert (df.loc[df[f"G201_IF{n}"] == 88, f"G201_IC{n}"] == "88888").all(), \
        f"G201_IC{n} is not N/A (88888) on every row where G201_IF{n} is N/A"
    assert (df.loc[df[f"G201_IF{n}"] == 999, f"G201_IC{n}"] == "99999").all(), \
        f"G201_IC{n} is not Missing (99999) on every row where G201_IF{n} is Missing"
    # No row with a real IF code may carry an N/A or Missing IC code.
    # (`not ....all()` would pass as soon as a single row was fine, so use `.any()`.)
    assert not df.loc[~df[f"G201_IF{n}"].isin([88, 999]), f"G201_IC{n}"].isin(["88888", "99999"]).any(), \
        f"G201_IC{n} is N/A or Missing on a row where G201_IF{n} holds a real code"

If `G201_IF1` is No (0), then remaining `G201_IF?` values should be N/A (88).  
However, a single violation of this rule was found, at ID 10080. 

In [None]:
# Rows whose first IF value is 0, showing only the IC/IF columns numbered 1-5
df[df["G201_IF1"].eq(0)].filter(regex=r'I(C|F)[1-5]\b')

For ID 10080, given both conditions are interrelated (dermatitis), I believe it's likely the participant would have seen the doctor once for both cases.  
Therefore, it makes sense to change IF1 to 1 and IF2 to 0.

In [None]:
# ID 10080: record the first entry as 1 and the second as 0 (see reasoning above)
idx = [10080]
for column, corrected in (("G201_IF1", 1), ("G201_IF2", 0)):
    df.loc[idx, column] = corrected

Participants with IDs 17890, 19270, and 20370 had Missing values in place of N/A values; this was detected during the testing process.

In [None]:
idx = [17890, 19270, 20370]
# Swap the Missing codes ("99999" / 999) for the N/A codes ("88888" / 88) on these rows only.
# NOTE(review): the label slice covers every column positioned between G201_IC1 and G201_IF12 in `df`;
# presumably that range holds only the IC/IF variables — confirm against the file's column order.
df.loc[idx, "G201_IC1":"G201_IF12"] = df.loc[idx, "G201_IC1":"G201_IF12"].replace({"99999": "88888", 999: 88})

In [None]:
# Within a participant, Missing is all-or-nothing across the IF columns, and likewise across the IC columns
for index, row in df.iterrows():
    if_missing = row[IF_ROWS].isin([999])
    assert if_missing.all() or not if_missing.any(), f"ID {index}: Not all IF values are missing when one is missing."
    ic_missing = row[IC_ROWS].isin(['99999'])
    assert ic_missing.all() or not ic_missing.any(), f"ID {index}: Not all IC values are missing when one is missing."

ID 11642: two values for IF and IC that follow an N/A value.  
It appears a couple of cells of data were accidentally copied down a line, for IF3, IF4, IC3 and IC4.

ID 23180: one value for IF and IC that follows an N/A value.  
Given IF4 and IC4 match IF2 and IC2, and there's an N/A value in between, it seems likely they've been accidentally copied over.

In [None]:
# Overwrite the cells judged to be copy errors with the N/A codes (88 for IF, "88888" for IC)
for participant, if_cols, ic_cols in (
    (11642, ["G201_IF3", "G201_IF4"], ["G201_IC3", "G201_IC4"]),
    (23180, ["G201_IF4"], ["G201_IC4"]),
):
    df.loc[participant, if_cols] = 88
    df.loc[participant, ic_cols] = "88888"

In [None]:
# Once an N/A code appears in a participant's IF (or IC) columns, every later column must be N/A too
for index, row in df.iterrows():
    if_na = row[IF_ROWS].isin([88])
    if if_na.any():
        # idxmax() on the boolean mask gives the label of the first N/A column
        assert if_na.loc[if_na.idxmax():].all(), f"Row {index}: Not all following IF values are N/A."
    ic_na = row[IC_ROWS].isin(['88888'])
    if ic_na.any():
        assert ic_na.loc[ic_na.idxmax():].all(), f"Row {index}: Not all following IC values are N/A."

Finally, confirm that whenever the first variable of a series (`G201_IF1`, `G201_INF1`, or `G201_HOH1`) is 88, all subsequent variables in that series are also 88.

In [None]:
# When the first variable of a series is N/A (88), every variable of that series must be 88.
# The builtin all() applied to a DataFrame iterates over its column labels (always truthy), so the
# check could never fail; reduce the boolean frame with DataFrame.all() over both axes instead.
for prefix, last in (("G201_IF", 12), ("G201_INF", 5), ("G201_HOH", 6)):
    first, final = f"{prefix}1", f"{prefix}{last}"
    # Keep only the same-prefix columns from the slice, in case other variables
    # (e.g. the string-coded IC columns) sit between `first` and `final` in the file.
    series_values = df.loc[df[first] == 88, first:final].filter(regex=rf"^{prefix}\d+$")
    assert (series_values == 88).all().all(), f"{first} is 88 but not every value in {first}..{final} is 88"

## Create, Delete, and Rename Variables

### Create

Create the following binary variables (for all cases: in the past 12 months that required a visit to hospital)
- `G201_IL`: illnesses or problems (Yes if `G201_IF1` is Yes); 
- `G201_INJ`: any accidents or injuries (Yes if `G201_INF1` is Yes); and
- `G201_HO`: admitted to hospital (Yes if `G201_HOH1` is Yes)

In [None]:
# Rules shared by all three derived variables, keyed on the value of the conditional column
conditions = [
    dict(condition=999, value=999),  # Missing (999) stays Missing
    dict(condition=88, value=0),     # N/A (88) becomes 0 (No)
    dict(condition=lambda x: x not in (88, 999), value=1),  # anything else becomes 1 (Yes)
]

# One entry per new variable: (target column, column the rules are evaluated on)
transformations = [
    {"target_col": target, "condition_col": source, "conditions": conditions}
    for target, source in (
        ("G201_IL", "G201_IF1"),
        ("G201_INJ", "G201_INF1"),
        ("G201_HO", "G201_HOH1"),
    )
]

In [5]:
# Apply the `transformations` spec defined above; the result replaces `df`.
# NOTE(review): `create_variables` is not defined in this notebook (presumably one of the star
# imports) — confirm how it interprets the condition/value rules.
df = create_variables(df, transformations)

Test to ensure changes were correctly implemented.

In [None]:
# New variable -> the column it was derived from
CREATE = dict(G201_IL="G201_IF1", G201_INJ="G201_INF1", G201_HO="G201_HOH1")

for target_col, condition_col in CREATE.items():
    # real code -> 1, N/A (88) -> 0, Missing (999) -> 999
    assert (df.loc[~df[condition_col].isin([88, 999]), target_col] == 1).all()
    assert (df.loc[df[condition_col].eq(88), target_col] == 0).all()
    assert (df.loc[df[condition_col].eq(999), target_col] == 999).all()

### Delete

In [None]:
DELETE = [
    "DOB",        # should not be included in the dataset; captured in Longitudinal Data/Gen2_Ages.sav
    "G201_IF13",  # as stated above, appears unreliable
]

# Remove the columns from the data and from its metadata, in place
for frame in (df, meta):
    frame.drop(columns=DELETE, inplace=True)

In [None]:
# None of the deleted variables may remain in the data or the metadata
for var in DELETE:
    assert var not in df.columns and var not in meta.columns

### Rename Variables

In [None]:
RENAME = dict()  # no vars to rename

# Apply the old -> new column mapping to the data and to its metadata, in place
for frame in (df, meta):
    frame.rename(columns=RENAME, inplace=True)

In [None]:
# Each old name must be gone, and each new name present, in both the data and the metadata
for original, updated in RENAME.items():
    assert original not in df.columns and original not in meta.columns
    assert updated in df.columns and updated in meta.columns

## Save Interim Data

In [None]:
# Write the edited data and its metadata to the interim path.
write_sav(OUTPUT, df, meta)

### Compare interim with raw data

In [None]:
# Re-read both files from disk so the checks below compare what was actually written
# against the untouched raw file, rather than the in-memory frames.
df_old, meta_old = read_sav(INPUT, index="ID")
df_new, meta_new = read_sav(OUTPUT, index="ID")

In [None]:
# Ensure all columns were correctly created, deleted, and/or renamed

# NOTE(review): `rn.diff` presumably returns (columns only in the raw file, columns only in the
# interim file) — confirm against raine_tools.
old, new = rn.diff(df_old.columns, df_new.columns)

for expected_old in (DELETE, RENAME.keys()):
    assert all(var in old for var in expected_old)
for expected_new in (RENAME.values(), CREATE):
    assert all(var in new for var in expected_new)

In [None]:
# Compare raw data, not including variables that were created or deleted, and renaming the old variables to match for comparison

old_cols = [col for col in df_old.columns if col not in DELETE]
new_cols = [col for col in df_new.columns if col not in CREATE]
# `columns=` is required: DataFrame.rename(mapping) on its own relabels the index (row IDs), not the columns.
dfx = df_new[new_cols].compare(df_old[old_cols].rename(columns=RENAME))
dfx

In [None]:
from pandas import DataFrame
from typing import List

def compare(df1: DataFrame, # updated data
            df2: DataFrame, # original data
            columns: List[str], # columns impact
            idx: List[int], # IDs impacted
            new_value: str|int, # new value
            old_value: str|int # value that was replaced
            ) -> bool: # Output True if all expected changes pass
    "Check that expected changes were correctly implemented."
    dfx = df1.loc[idx, columns].compare(df2.loc[idx, columns], result_names=("new", "old"))
    dfx = dfx.stack(future_stack=True).swaplevel()
    assert (dfx.loc["new"] == new_value).all().all()
    assert (dfx.loc["old"] == old_value).all().all()
    print("All changes correctly updated.")

In [None]:
# Each entry captures one specific change: the column(s), the IDs affected, and the old -> new value
# NOTE(review): the idx values below differ from the IDs edited earlier in this notebook
# (10080, 11642, 23180, 17890, 19270, 20370) — confirm which set of IDs is correct.

CHANGES = [
    dict(columns=["G201_IF1"], idx=[135], old_value=0, new_value=1),
    dict(columns=["G201_IF2"], idx=[135], old_value=1, new_value=0),
    dict(columns=["G201_IF3", "G201_IF4"], idx=[1750], old_value=1, new_value=88),
    dict(columns=["G201_IC3"], idx=[1750], old_value="00900", new_value="88888"),
    dict(columns=["G201_IC4"], idx=[1750], old_value="05290", new_value="88888"),
    dict(columns=["G201_IF4"], idx=[2034], old_value=1, new_value=88),
    dict(columns=["G201_IC4"], idx=[2034], old_value="V7200", new_value="88888"),
    # IC2..IC12 and IF2..IF12
    dict(columns=[f"G201_IC{n}" for n in range(2, 13)], idx=[856, 1648], old_value="99999", new_value="88888"),
    dict(columns=[f"G201_IF{n}" for n in range(2, 13)], idx=[856, 1648], old_value=999, new_value=88),
    # IC4..IC12 and IF4..IF12
    dict(columns=[f"G201_IC{n}" for n in range(4, 13)], idx=[1945], old_value="99999", new_value="88888"),
    dict(columns=[f"G201_IF{n}" for n in range(4, 13)], idx=[1945], old_value=999, new_value=88),
]

In [None]:
# Verify every recorded change; the keys of each entry match compare()'s keyword parameters
for n, change in enumerate(CHANGES, start=1):
    print(f"Change {n}...")
    compare(df_new, df_old, **change)

### Compare metadata

In [None]:
# Compare metadata, not including variables that were created or deleted, and renaming the old variables to match for comparison

old_cols = [col for col in meta_old.columns if col not in DELETE]
new_cols = [col for col in meta_new.columns if col not in CREATE]
# `columns=` is required: DataFrame.rename(mapping) on its own relabels the index, not the columns.
metax = meta_new[new_cols].compare(meta_old[old_cols].rename(columns=RENAME))
metax