# Preliminaries

In [1]:
using CSV, DataFrames, Statistics, Random, Clustering, Distances, StatsPlots, JuMP, Gurobi;

ArgumentError: ArgumentError: Package StatsPlots not found in current path.
- Run `import Pkg; Pkg.add("StatsPlots")` to install the StatsPlots package.

In [2]:
# Load the patient-level CSV into a DataFrame (trailing `;` suppresses the cell output).
df = CSV.read("data/input/cancer_patients_20231122.csv", DataFrame);

In [3]:
# Column names of the treatment and the outcome, referenced by the cells below.
treatment_var, outcomes_var = "treatment_professional", "health_status";

# Remove Rows with Missing Treatments or Outcomes

In [4]:
# Keep only the rows in which both the treatment and the outcome are observed.
# `completecases(df, cols)` builds the row mask from just these columns.
columns_with_missing = [Symbol(treatment_var), Symbol(outcomes_var)]
df = df[completecases(df, columns_with_missing), :]

Row,health_status,age,average_alcohol_month,sex,language,race,veteran,bmi,metro,metro_granular,education,employment,income,partner,children,depressed_household,alcohol_household,drugs_household,prison_household,unsafe_household,exercise_past_month,sleep,ever_smoked_100,current_smoker,ever_e_smoked,current_e_smoker,ever_any_cancer,ever_skin_cancer,cvd,chd,stroke,ever_asthma,copd,arthritis,depression,kidney_disease,deaf,blind,concentration,mobility,dressing_bathing,errands,current_asthma,diabetes,cognitive_decline,pregnant,number_cancers,current_treatment,cancer_type,treatment_professional,health_coverage
Unnamed: 0_level_1,Float64,Float64,Float64?,String7,String7,String15,String7,String15?,String3?,String15?,String31?,String15?,String7?,String7?,String15?,String3?,String3?,String3?,String3?,String3?,String3?,String15?,String3?,String15?,String3?,String15,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,Float64?,String15?,Float64?,Float64?,Float64,String15,String15?,String31?,String7?
1,3.0,55.0,1.0,female,english,white,no,missing,yes,city,college_graduate,self_employed,missing,couple,none,no,no,no,no,no,yes,healthy,yes,none,no,none,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no,0.0,0.0,1.0,completed,breast,oncologist,yes
2,1.0,57.0,3.0,female,english,white,no,normal,yes,city_center,college_graduate,employed,middle,single,none,no,no,no,no,no,no,low,yes,none,no,none,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no,0.0,0.0,1.0,completed,breast,family_practitioner,yes
3,2.0,80.0,0.0,male,english,white,yes,overweight,yes,city,college_graduate,retired,missing,single,none,no,no,no,no,no,no,low,no,none,no,none,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,no,0.0,0.0,2.0,completed,missing,general_practitioner,over_65
4,2.0,72.0,1.0,female,english,white,no,overweight,yes,city,college_graduate,retired,middle,couple,none,no,no,no,no,no,yes,healthy,yes,none,no,none,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no,0.0,0.0,1.0,completed,skin,family_practitioner,over_65
5,1.0,80.0,1.0,female,english,white,no,normal,yes,city,college_graduate,self_employed,missing,single,none,no,no,no,no,no,yes,healthy,yes,none,no,none,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,no,0.0,0.0,1.0,completed,other,general_practitioner,over_65
6,3.0,71.0,missing,male,english,white,no,overweight,yes,city,college_graduate,missing,missing,couple,one,no,no,no,no,no,no,healthy,yes,none,no,none,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,no,0.0,0.0,1.0,completed,prostate,general_practitioner,over_65
7,2.0,77.0,1.0,female,english,white,no,obese,yes,city,college_graduate,retired,high,couple,none,no,no,no,no,no,yes,healthy,yes,none,no,none,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,no,0.0,0.0,1.0,completed,other,general_practitioner,over_65
8,2.0,67.0,0.0,male,english,white,no,overweight,yes,city_center,college_graduate,retired,high,couple,none,no,no,no,no,no,yes,low,no,none,yes,none,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no,0.0,0.0,2.0,completed,skin,family_practitioner,over_65
9,1.0,64.0,2.0,male,english,white,no,overweight,yes,city,college_graduate,self_employed,high,couple,none,no,no,no,no,no,yes,healthy,no,none,no,none,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no,0.0,0.0,1.0,completed,other,surgeon,yes
10,2.0,69.0,1.0,female,english,white,no,underweight,yes,city,college_graduate,retired,missing,single,none,no,no,no,no,no,yes,healthy,no,none,no,none,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,no,0.0,0.0,1.0,completed,breast,other,over_65


# Identify Treatment, Outcomes, and X Data

In [5]:
# Separate the covariates from the treatment and outcome columns.
X = select(df, Not([treatment_var, outcomes_var]))
# `df[:, col]` returns a copy of the column, independent of `df`.
treatments = df[:, treatment_var]
outcomes = df[:, outcomes_var];

# Split Train and Test

In [6]:
# Split covariates, treatments and outcomes into train and test sets
# (fixed seed, half of the data in each split).
(train_X, train_treatments, train_outcomes), (test_X, test_treatments, test_outcomes) =
    IAI.split_data(
        :policy_maximize,
        X,
        treatments,
        outcomes;
        seed=123,
        train_proportion=0.5,
    );

UndefVarError: UndefVarError: `IAI` not defined

# Identify Variables with Missing Data

In [7]:
# Per-variable counts of missing and non-missing entries in X,
# plus the resulting share of missing values.
var_desc = describe(X, :nmissing, :nnonmissing)
var_desc.propmissing = @. var_desc.nmissing / (var_desc.nmissing + var_desc.nnonmissing)

# Variables whose missing share exceeds this threshold are dropped.
threshold = 0.25
vars_to_remove = var_desc.variable[var_desc.propmissing .> threshold]

# Variables with some, but at most `threshold`, missing values are imputed.
var_to_impute = var_desc.variable[0 .< var_desc.propmissing .<= threshold];

In [8]:
# Drop the too-sparse variables in place from both splits.
for split_X in (train_X, test_X)
    select!(split_X, Not(vars_to_remove))
end

UndefVarError: UndefVarError: `train_X` not defined

In [9]:
# Imputation learner with a fixed seed.
lnr = IAI.OptKNNImputationLearner(; random_seed=15095)

# Fit the imputer on the training covariates and impute them;
# the test covariates are then imputed with the already-fitted learner.
train_X = IAI.fit_transform!(lnr, train_X)
test_X = IAI.transform(lnr, test_X);

UndefVarError: UndefVarError: `IAI` not defined