In [66]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [67]:
import numpy as np
import pandas as pd
from inference import inf_model, policy
from evaluate.evaluation import Evaluatemethod 
from miss_fill.context_fill import cluster_fill, KNN_fill, DR_fill
from miss_fill.mean_fill import mean_fill, tr_mean_fill

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns


## Scenario: 
Data with out context with block length 1

### Data

In [68]:
# Load the simulated two-treatment dataset; the first CSV column becomes the index.
df1 = pd.read_csv(
    'data/two_treatment/dt1_2024_11_11_with_context.csv',
    index_col=0,
)
df1.head(10)

Unnamed: 0,c,t,patient_id,treatment,outcome
0,0.12573,0,0,1,-0.132105
1,0.640423,1,0,0,2.1049
2,-0.535669,2,0,0,2.361595
3,1.304,3,0,0,2.947081
4,-0.703735,4,0,0,0.734579
5,-0.623274,5,0,0,2.041326
6,0.345584,0,1,1,0.821618
7,0.330437,1,1,1,-1.303157
8,0.905356,2,1,0,2.446375
9,-0.536953,3,1,0,2.581118


#### Creating random NaN values

In [69]:
# Work on a copy so the complete data in df1 stays available for comparison.
df = df1.copy()
column = 'outcome'
# Parameters
thr_t = 4  # time cycle at which outcomes are made missing (rows with t == thr_t)
num_nans = 10    # number of eligible rows whose `column` value will be set to NaN

# Indices of the rows recorded at time cycle thr_t; only these may become NaN
eligible_indices = df.index[df['t'] == thr_t]

# seed for reproducibility
np.random.seed(0)
# Draw num_nans distinct row indices (no replacement) from the eligible ones
random_indices = np.random.choice(eligible_indices, size=num_nans, replace=False)

# Set NaN at those random indices
df.loc[random_indices, column] = np.nan

In [70]:
# ids of the patients that have a missing (NaN) outcome
missing_patient_id = df.loc[df['outcome'].isnull(), 'patient_id'].values

In [71]:
## keep only the rows up to and including the time cycle of the first missing value
dt = df.loc[df['t'] <= thr_t].copy()
dt.head(10)

Unnamed: 0,c,t,patient_id,treatment,outcome
0,0.12573,0,0,1,-0.132105
1,0.640423,1,0,0,2.1049
2,-0.535669,2,0,0,2.361595
3,1.304,3,0,0,2.947081
4,-0.703735,4,0,0,0.734579
6,0.345584,0,1,1,0.821618
7,0.330437,1,1,1,-1.303157
8,0.905356,2,1,0,2.446375
9,-0.536953,3,1,0,2.581118
10,0.364572,4,1,0,2.294132


### Simulating the next action choice

In [72]:
# Inference model, and the Thompson-sampling policy built on top of it
inference_model = inf_model.NormalKnownVariance(prior_mean=0, prior_variance=1, variance=1)
tmps = policy.ThompsonSampling(inference_model, number_of_treatments=2)

In [73]:
# Initial parameters
block_length = 1  # passed to Evaluatemethod and embedded in the output file names
length = 6 * block_length  # passed to Evaluatemethod; presumably 6 time cycles per patient per block -- TODO confirm
number_of_actions = 2  # passed to inference_model.update_posterior
number_of_patients = 100  # NOTE(review): not referenced by the cells visible here

In [74]:
# Display the ids of the patients whose outcome was set to NaN.
missing_patient_id

array([ 2, 16, 26, 54, 55, 73, 75, 86, 93, 95], dtype=int64)

In [75]:
## initialization for running all the methods
t = 5 # for evaluation method (first argument of Evaluatemethod; also used in output file names)
k = 3  # third argument of KNN_fill -- presumably the number of neighbours; TODO confirm
N = 2  # passed to cluster_fill together with m; meaning not visible in this notebook
m = 1
lamb = 2  # passed to DR_fill
context_cols = ['c','treatment', 'patient_id']  # columns handed to KNN_fill, cluster_fill and DR_fill


In [76]:
## create a dictionary for all the methods
## (key: label used in result columns / progress messages, value: fill function)

methods = {
    'mean_fill': mean_fill,
    'tr_mean_fill': tr_mean_fill,
    'KNN_fill': KNN_fill,
    'cluster_fill': cluster_fill,
    'DR_fill': DR_fill,
    # Sentinel, not a callable: it matches none of the fill functions, so the
    # loops over `methods` treat it in their final `else` branch.
    'original': 0
}

In [77]:
# Evaluator used for the action-selection comparison; it receives the complete data df1.
eva = Evaluatemethod(t, block_length, length, df1)

In [78]:
# One column per fill variant; filled column by column in the loop below.
result = pd.DataFrame()


def _filled_variants(method_name, method):
    """Lazily yield ``(column_label, filled_frame)`` for one entry of ``methods``.

    Being a generator, each fill is only computed when the consumer asks for
    the next variant, so fills and evaluations stay interleaved (fill,
    evaluate, fill, evaluate, ...) exactly as in a hand-written if/elif chain.

    The original chain contained a second, unreachable ``elif method ==
    DR_fill`` branch; it has been dropped.
    """
    if method == KNN_fill:
        yield f'{method_name}', method(dt, context_cols, k)
    elif method == cluster_fill:
        yield f'{method_name}', method(dt, context_cols, N, m)
    elif method == DR_fill:
        yield f'{method_name}', method(dt, lamb, context_cols)
    elif method == mean_fill or method == tr_mean_fill:
        # the mean-based fills are run twice: default (global) and per individual
        yield f'{method_name} (global)', method(dt)
        yield f'{method_name} (individual)', method(dt, method='individual')
    else:
        # anything else (the 'original' sentinel): the complete data, restricted
        # to the same time window as dt
        yield 'original_data', df1[df1['t'] <= thr_t]


for method_name, method in methods.items():
    for column_label, filled_dt in _filled_variants(method_name, method):
        ## GET THE EVALUATION
        n_correct_tr, index = eva.get_action_result(filled_dt)
        result[column_label] = n_correct_tr

    print(f'done for {method_name}')

# `index` is the second value returned by the last evaluation; it is used as the
# patient id of every row.
result['patient_id'] = index

result = result.set_index('patient_id')

done for mean_fill
done for tr_mean_fill
done for KNN_fill
done for cluster_fill
done for DR_fill
done for original


In [79]:
# Persist the action-selection results, keeping the patient_id index as a column.
result.to_csv(
    f'result_output/result_block{block_length}wct{t}_action_selection.csv',
    index=True,
)

#### Getting the posterior mean

In [80]:
# Fresh evaluator instance for the posterior-mean section, built on the complete data df1.
eva = Evaluatemethod(t, block_length, length, df1)

In [81]:

## getting the posterior mean and variance of the simulated data
# One entry is appended to each list per fill variant, in the iteration order of
# `methods` (mean-based fills contribute two entries: global, then individual).
mean_0 = []
mean_1 = []
lower_bound_tr0 = []
upper_bound_tr0 = []
lower_bound_tr1 = []
upper_bound_tr1 = []
original_data_mean_0 = []
original_data_mean_1 = []


def _record_posterior_summary(filled_dt):
    """Evaluate one filled frame, append its summary to the lists, return the raw output.

    Runs ``eva.get_mean_var`` on ``filled_dt`` for the patients with missing
    outcomes, takes the interval returned by ``eva.confidence_interval`` for
    each treatment, and appends the column means and interval bounds to the
    module-level lists. The mean returned by ``inference_model.update_posterior``
    on the complete data ``df1`` is appended as the reference value.

    The posterior update is deliberately repeated on every call (as the
    original code did) in case it is stateful -- NOTE(review): hoist it out if
    it is confirmed to be a pure function.
    """
    out = eva.get_mean_var(filled_dt, missing_patient_id)
    lower_bound0, upper_bound0 = eva.confidence_interval(out, 'mean of treatment 0')
    lower_bound1, upper_bound1 = eva.confidence_interval(out, 'mean of treatment 1')
    mean_a, _var_a = inference_model.update_posterior(df1, number_of_actions)

    ## append all data
    mean_0.append(out['mean of treatment 0'].mean())
    mean_1.append(out['mean of treatment 1'].mean())
    lower_bound_tr0.append(lower_bound0)
    upper_bound_tr0.append(upper_bound0)
    lower_bound_tr1.append(lower_bound1)
    upper_bound_tr1.append(upper_bound1)
    original_data_mean_0.append(mean_a[0])
    original_data_mean_1.append(mean_a[1])
    return out


# The per-variant outputs keep their own names because a later cell collects
# them into `sim_outputs`.
for method_name, method in methods.items():
    if method == KNN_fill:
        KNN_out = _record_posterior_summary(method(dt, context_cols, k))
    elif method == cluster_fill:
        cluster_out = _record_posterior_summary(method(dt, context_cols, N, m))
    elif method == DR_fill:
        DR_out = _record_posterior_summary(method(dt, lamb, context_cols))
    elif method == mean_fill:
        meanG_out = _record_posterior_summary(method(dt))
        # for individual
        meanI_out = _record_posterior_summary(method(dt, method='individual'))
    elif method == tr_mean_fill:
        tr_meanG_out = _record_posterior_summary(method(dt))
        # for individual
        tr_meanI_out = _record_posterior_summary(method(dt, method='individual'))
    # any other entry (the 'original' sentinel) has nothing to fill or summarise
    print(f'done for {method_name}')


done for mean_fill
done for tr_mean_fill
done for KNN_fill
done for cluster_fill
done for DR_fill
done for original


In [82]:
# Column-wise table of the estimation results. The 'method' labels are written in
# the same order in which the loop over `methods` appended to the lists, so the
# two must be kept in sync.
par_estimation_result = {
    'method': [
        "mean_fill (global)",
        "mean_fill (individual)",
        "tr_mean_fill (global)",
        "tr_mean_fill (individual)",
        "KNN_fill",
        "cluster_fill",
        "DR_fill",
    ],
    # treatment 0
    'posterior mean of treatment 0': mean_0,
    'treatment 0 lower bound (95%)': lower_bound_tr0,
    'treatment 0 upper bound (95%)': upper_bound_tr0,
    'posterior mean of treatment 0 (original data )': original_data_mean_0,
    # treatment 1
    'posterior mean of treatment 1': mean_1,
    'treatment 1 lower bound (95%)': lower_bound_tr1,
    'treatment 1 upper bound (95%)': upper_bound_tr1,
    'posterior mean of treatment 1 (original data )': original_data_mean_1,
}

In [83]:
# Turn the collected columns into a table and persist it (row index included).
estimation_result = pd.DataFrame(par_estimation_result)
estimation_result.to_csv(
    f'result_output/result_block{block_length}wct{t}_estimation.csv',
    index=True,
)

In [84]:
# Raw per-variant outputs of eva.get_mean_var, keyed by the variant label.
sim_outputs = {
    "mean_fill (global)": meanG_out,
    "mean_fill (individual)": meanI_out,
    "tr_mean_fill (global)": tr_meanG_out,
    "tr_mean_fill (individual)": tr_meanI_out,
    "KNN_fill": KNN_out,
    "cluster_fill": cluster_out,
    "DR_fill": DR_out,
}

In [85]:
# Place the per-variant outputs side by side (the dict is passed straight to
# pd.concat along axis 1) and store the combined frame as a pickle.
combined_df = pd.concat(sim_outputs, axis=1)
combined_df.to_pickle(
    f'result_output/result_block{block_length}wct{t}_raw.pkl'
)