# Preliminaries

In [1]:
using CSV, DataFrames, Statistics, Random;

In [2]:
# load the raw patient extract from disk into a DataFrame
input_path = "data/input/diabetes_patients_20231205.csv"
df = CSV.read(input_path, DataFrame);

In [3]:
# names of the treatment column and the outcome column in `df`
treatment_var, outcomes_var = "diabetes_course", "phys_health_status";

# Split Train and Test

In [4]:
# copy the treatment and outcome columns out of the raw data
treatments, outcomes = df[:, treatment_var], df[:, outcomes_var]

# the feature table is every remaining column
X = select(df, Not([treatment_var, outcomes_var]));

In [5]:
# split features, treatments and outcomes into two halves with a fixed seed
train_split, test_split = IAI.split_data(
    :policy_maximize, X, treatments, outcomes;
    seed = 123, train_proportion = 0.5,
)

# unpack each split into its features, treatments and outcomes
train_X, train_treatments, train_outcomes = train_split
test_X, test_treatments, test_outcomes = test_split;

# Identify Variables with Missing Data

In [6]:
# per-variable counts of missing and non-missing entries in the feature table
# NOTE(review): this is computed on the full `X`, i.e. train and test rows
# together — confirm that screening variables with test rows included is intended.
var_desc = describe(X, :nmissing, :nnonmissing)
n_observed = var_desc.nmissing .+ var_desc.nnonmissing
var_desc[!, :propmissing] = var_desc.nmissing ./ n_observed

# variables whose share of missing entries exceeds the threshold get removed
threshold = 0.25
too_sparse = var_desc.propmissing .> threshold
vars_to_remove = var_desc.variable[too_sparse]

# variables with some missing entries, but no more than the threshold
partly_missing = (var_desc.propmissing .> 0) .& .!too_sparse
var_to_impute = Symbol.(var_desc.variable[partly_missing]);

In [7]:
# drop the flagged variables from both splits, in place
for split_X in (train_X, test_X)
    select!(split_X, Not(vars_to_remove))
end

In [8]:
# imputation learner, constructed with a fixed random seed
lnr = IAI.OptKNNImputationLearner(; random_seed = 15095)

# fit the learner on the training features and impute them in the same call
train_X = IAI.fit_transform!(lnr, train_X)

# impute the test features with the learner fitted on the training features
test_X = IAI.transform(lnr, test_X);

[33m[1m└ [22m[39m61a69b87e52c12e8ab8b3e441abe4229bbb733e16cf47bdd9b99756507901d19


# Re-compile Data

In [9]:
# Mark which split each row came from: 1 = train, 0 = test.
train_X[!, :train_flag] .= 1
test_X[!, :train_flag] .= 0

# Re-attach the outcome and treatment columns under the names stored in
# `outcomes_var` / `treatment_var`, rather than repeating the literal column
# names, so the re-attached columns always match the ones that were split off.
train_X[!, outcomes_var] = train_outcomes
test_X[!, outcomes_var] = test_outcomes
train_X[!, treatment_var] = train_treatments
test_X[!, treatment_var] = test_treatments

# Stack the two splits back into a single table (train rows first).
patients = vcat(train_X, test_X);

# Create New Variables

In [10]:
# Create a yes/no flag for an un-safe household from four household indicators.
household_risks = [:depressed_household, :alcohol_household, :drugs_household, :prison_household]

# conditions[1]: at least one indicator equals "yes"
# conditions[2]: every indicator equals "no"
conditions = [
    mapreduce(col -> patients[!, col] .== "yes", (a, b) -> a .| b, household_risks),
    mapreduce(col -> patients[!, col] .== "no", (a, b) -> a .& b, household_risks),
]

# Labels paired with `conditions`. Not named `values`, because a global of that
# name shadows `Base.values` for the rest of the session.
unsafe_labels = ["yes", "no"]

# Rows matching neither condition receive "".
# The column is assigned with `patients[!, :col] = ...`, which creates it
# unconditionally; `patients.col .= ...` only creates a missing column on
# Julia >= 1.7.
patients[!, :unsafe_household] =
    ifelse.(conditions[1], unsafe_labels[1], ifelse.(conditions[2], unsafe_labels[2], ""));

In [11]:
# per-row count of how many of the columns `cols` in `data` equal "yes"
count_yes(data, cols) = sum(data[!, col] .== "yes" for col in cols)

# columns feeding each count variable
physical_cols = [
    :ever_any_cancer, :cvd, :chd, :stroke, :ever_asthma,
    :copd, :arthritis, :kidney_disease, :deaf, :blind,
]
mental_cols = [:depression, :cognitive_decline]
daily_living_cols = [:mobility, :dressing_bathing, :errands, :concentration]

# create the count variables
patients[!, :physical_conditions] = count_yes(patients, physical_cols)
patients[!, :mental_conditions] = count_yes(patients, mental_cols)
patients[!, :any_conditions] = patients.mental_conditions .+ patients.physical_conditions
patients[!, :activities_daily_living] = count_yes(patients, daily_living_cols);

# Save Data

In [12]:
# write the combined, imputed table to disk
output_path = "data/input/imputed_diabetes_patients_20231205.csv"
CSV.write(output_path, patients)

"data/input/imputed_diabetes_patients_20231205.csv"