In [3]:
# Reload imported modules automatically before each cell runs, so edits to the
# project's own modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import numpy as np
import pandas as pd
from inference import inf_model, policy
from evaluate.evaluation import Evaluatemethod 
from miss_fill.context_fill import cluster_fill, KNN_fill, DR_fill
from miss_fill.mean_fill import mean_fill, tr_mean_fill

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns

In this notebook we compute the next treatment selection. This lets us check whether, after filling in the missing values, the policy selects the same treatment as it does on the original data frame. The inference model used for Thompson sampling is the same as the adaptive_n_of_1 model we used for the data simulation.

### Data

In [None]:
# Load the two-treatment data set; the first CSV column is used as the row index.
df1 = pd.read_csv('data/two_treatment/dt3_2024_11_11_with_context.csv' ,index_col=0)
# Preview the first 30 rows.
df1.head(30)

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,2.12573
1,1,0,0,1.867895
2,2,0,0,2.640423
3,3,0,0,2.1049
4,4,0,0,1.464331
5,5,0,0,2.361595
6,6,0,0,3.304
7,7,0,0,2.947081
8,8,0,0,1.296265
9,9,0,0,0.734579


#### Creating random NaN values

In [None]:
# Work on a copy so that df1 keeps the outcomes as loaded.
df = df1.copy()
column = 'outcome'

# Define the range of time steps 't' in which outcomes may be removed
start = 0
end = 2

# Find indices of rows whose 't' lies in [start, end] (between() includes both ends)
in_range_indices = df[df['t'].between(start, end)].index

# seed for reproducibility
np.random.seed(0)
# Select 10% of the eligible indices at random, without replacement
random_indices = np.random.choice(in_range_indices, size=int(len(in_range_indices) * 0.1), replace=False)

# Set NaN at those random indices
df.loc[random_indices, column] = np.nan


In [24]:
## separate the dataframe at the time step of the first missing value
# 't' of every row that contains at least one NaN
nan_t = df[df.isna().any(axis=1)]['t']
nan_t = nan_t.sort_values(ascending=True).unique() ## sort ascending so that the first element is the earliest time step with a missing value
ts = nan_t[0]
# keep every row up to and including that time step
dt = df[df['t'] <= ts].copy()
dt.head(30)

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,2.12573
1,1,0,0,1.867895
2,2,0,0,2.640423
3,3,0,0,2.1049
4,4,0,0,1.464331
5,5,0,0,2.361595
6,6,0,0,3.304
18,0,1,0,2.345584
19,1,1,0,2.821618
20,2,1,0,2.330437


Here, the simulation design has a block length of 5; thus, we fill the missing values for one block and then run the simulation.

### making simulation for the next action choice 

In [25]:
# Inference model (NormalKnownVariance) and the Thompson-sampling policy built on it
inference_model = inf_model.NormalKnownVariance(prior_mean=0, prior_variance=1, variance=1)
tmps = policy.ThompsonSampling(inference_model, number_of_treatments=2)

In [26]:
# IDs of the patients that have at least one missing outcome
# (np.unique returns them sorted and without repeats)
has_missing_outcome = df['outcome'].isna()
missing_patient_id = np.unique(df.loc[has_missing_outcome, 'patient_id'].values)

In [27]:
missing_patient_id

array([ 4,  5,  7, 17, 24, 30, 34, 36, 38, 41, 43, 45, 54, 61, 62, 63, 64,
       68, 69, 73, 74, 76, 78, 79, 82, 90, 95, 96, 98], dtype=int64)

In [None]:
## initialization for running all the methods
block_length = 3
length = 6 * block_length
number_of_actions = 2
t = 3 ## NOTE(review): presumably the time step at which the next treatment is selected (first step after a block of length 3) - confirm against Evaluatemethod
k = 3  # extra argument passed to KNN_fill
N = 2  # extra argument passed to cluster_fill (together with m)
m = 1
lamb = 2  # extra argument passed to DR_fill
context_cols = ['treatment', 'patient_id']


In [29]:
## create a dictionary for all the methods
## Maps a display name to its fill function. 'original' maps to the placeholder 0
## (not a function): it matches none of the fill-function checks in the loops below.

methods = {
    'mean_fill': mean_fill,
    'tr_mean_fill': tr_mean_fill,
    'KNN_fill': KNN_fill,
    'cluster_fill': cluster_fill,
    'DR_fill': DR_fill,
    'original': 0
}

In [30]:
# Evaluation helper, constructed from t, block_length, length and the unmodified data df1
eva = Evaluatemethod(t,block_length, length, df1)

In [None]:
def _roll_forward_fill(fill_fn, *args, **kwargs):
    """Fill `dt`, then append one later time step of `df` at a time and refill.

    Parameters
    ----------
    fill_fn : callable
        Fill function; called as ``fill_fn(frame, *args, **kwargs)``.
    *args, **kwargs
        Extra arguments forwarded unchanged to every ``fill_fn`` call.

    Returns
    -------
    The value returned by the last ``fill_fn`` call, i.e. the filled data
    covering the time steps up to ``end``.

    NOTE(review): the cycles start at ``start + 1``, which assumes ``dt`` ends
    at ``t == start``. If the first NaN occurred later, the rows with ``t`` in
    ``(start, ts]`` would be concatenated a second time - confirm.
    """
    filled = fill_fn(dt, *args, **kwargs)
    for i in range(start + 1, end + 1):
        next_cycle = df[(df['t'] > i - 1) & (df['t'] <= i)].copy()
        # add the next cycle to the data and restore the original row order
        next_dt = pd.concat([filled, next_cycle], axis=0, ignore_index=False).sort_index()
        filled = fill_fn(next_dt, *args, **kwargs)
    return filled


# For every fill function: the variants to run, each given as
# (suffix appended to the result column name, extra positional args, extra keyword args).
# The previous if/elif chain repeated the same roll-forward loop per method and
# contained a second, unreachable `DR_fill` branch; both are replaced by this table.
action_fill_variants = {
    mean_fill: [(' (global)', (), {}), (' (individual)', (), {'method': 'individual'})],
    tr_mean_fill: [(' (global)', (), {}), (' (individual)', (), {'method': 'individual'})],
    KNN_fill: [('', (context_cols, k), {})],
    cluster_fill: [('', (context_cols, N, m), {})],
    DR_fill: [('', (lamb, context_cols), {})],
}

result = pd.DataFrame()

for method_name, method in methods.items():
    variants = action_fill_variants.get(method)
    if variants is None:
        # not a fill function (the 'original' placeholder): evaluate the unmodified data
        filled_dt = df1[df1['t'] <= end] ## for the original data
        n_correct_tr, index = eva.get_action_result(filled_dt)
        result['original_data'] = n_correct_tr
    else:
        for suffix, args, kwargs in variants:
            filled_dt = _roll_forward_fill(method, *args, **kwargs)
            ## GET THE EVALUATION
            n_correct_tr, index = eva.get_action_result(filled_dt)
            result[f'{method_name}{suffix}'] = n_correct_tr

    print(f'done for {method_name}')

# `index` is the second value returned by the last evaluation; use it as the row label
result['patient_id'] = index
result = result.set_index('patient_id')

In [None]:
# Save the action-selection results; index=True writes the patient_id index as the first column.
result.to_csv(f'result_output/result_block{block_length}wct{t}_action_selection.csv', index = True)

#### Getting the posterior mean

In [None]:
# Fresh evaluation helper for the posterior estimates (same arguments as for the action-selection run)
eva = Evaluatemethod(t,block_length, length, df1)

In [None]:
## getting the posterior means and interval bounds for the filled data
mean_0 = []
mean_1 = []
lower_bound_tr0 = []
upper_bound_tr0 = []
lower_bound_tr1 = []
upper_bound_tr1 = []
original_data_mean_0 = []
original_data_mean_1 = []

# label -> value returned by eva.get_mean_var for that fill variant
posterior_outputs = {}


def _fill_through_end(fill_fn, *args, **kwargs):
    """Fill `dt`, then append one later time step of `df` at a time and refill.

    ``fill_fn`` is called as ``fill_fn(frame, *args, **kwargs)``; the value of
    the last call (data filled up to ``t == end``) is returned.

    The frame that is rolled forward is the one that is returned. Previously
    the "global" mean_fill / tr_mean_fill variants rolled forward a stale
    ``filled_dt`` left over from another branch or cell and then evaluated only
    the initial fill of ``dt``.

    NOTE(review): the cycles start at ``start + 1``, which assumes ``dt`` ends
    at ``t == start`` - confirm.
    """
    filled = fill_fn(dt, *args, **kwargs)
    for i in range(start + 1, end + 1):
        next_cycle = df[(df['t'] > i - 1) & (df['t'] <= i)].copy()
        # add the next cycle to the data and restore the original row order
        next_dt = pd.concat([filled, next_cycle], axis=0, ignore_index=False).sort_index()
        filled = fill_fn(next_dt, *args, **kwargs)
    return filled


def _record_posterior(label, filled):
    """Evaluate one filled data set and append its summary to the result lists.

    Stores the output of ``eva.get_mean_var`` under ``label`` in
    ``posterior_outputs`` and appends, in the same position of every list, the
    column means, the interval bounds and the posterior means obtained from the
    unmodified data ``df1``.
    """
    out = eva.get_mean_var(filled, missing_patient_id)
    lower_bound0, upper_bound0 = eva.confidence_interval(out, 'mean of treatment 0')
    lower_bound1, upper_bound1 = eva.confidence_interval(out, 'mean of treatment 1')
    # Called once per variant, as before: whether update_posterior changes the
    # model's state is not visible here, so the call is not hoisted.
    mean_a, _var_a = inference_model.update_posterior(df1, number_of_actions)

    ## append all data
    mean_0.append(out['mean of treatment 0'].mean())
    mean_1.append(out['mean of treatment 1'].mean())
    lower_bound_tr0.append(lower_bound0)
    upper_bound_tr0.append(upper_bound0)
    lower_bound_tr1.append(lower_bound1)
    upper_bound_tr1.append(upper_bound1)
    original_data_mean_0.append(mean_a[0])
    original_data_mean_1.append(mean_a[1])
    posterior_outputs[label] = out


# For every fill function: the variants to run, each given as
# (suffix appended to the method name, extra positional args, extra keyword args).
estimation_fill_variants = {
    mean_fill: [(' (global)', (), {}), (' (individual)', (), {'method': 'individual'})],
    tr_mean_fill: [(' (global)', (), {}), (' (individual)', (), {'method': 'individual'})],
    KNN_fill: [('', (context_cols, k), {})],
    cluster_fill: [('', (context_cols, N, m), {})],
    DR_fill: [('', (lamb, context_cols), {})],
}

for method_name, method in methods.items():
    # entries that are not fill functions (the 'original' placeholder) have no variants
    for suffix, args, kwargs in estimation_fill_variants.get(method, ()):
        filled_dt = _fill_through_end(method, *args, **kwargs)
        ## GET THE EVALUATION
        _record_posterior(f'{method_name}{suffix}', filled_dt)
    print(f'done for {method_name}')

# Expose the per-variant outputs under the names used when the raw outputs are combined.
meanG_out = posterior_outputs['mean_fill (global)']
meanI_out = posterior_outputs['mean_fill (individual)']
tr_meanG_out = posterior_outputs['tr_mean_fill (global)']
tr_meanI_out = posterior_outputs['tr_mean_fill (individual)']
KNN_out = posterior_outputs['KNN_fill']
cluster_out = posterior_outputs['cluster_fill']
DR_out = posterior_outputs['DR_fill']


done for mean_fill
done for tr_mean_fill
done for KNN_fill




done for cluster_fill
done for DR_fill
done for original


In [None]:
# Assemble the per-method estimates into one table.
# The 'method' labels are hard-coded: their order must match the order in which the
# loop above appended to the lists (insertion order of `methods`, global before individual).
par_estimation_result= {
    'method': ["mean_fill (global)", "mean_fill (individual)", "tr_mean_fill (global)", "tr_mean_fill (individual)", "KNN_fill", "cluster_fill",
               "DR_fill"],
    'posterior mean of treatment 0': mean_0,
    'treatment 0 lower bound (95%)': lower_bound_tr0,
    'treatment 0 upper bound (95%)': upper_bound_tr0,
    'posterior mean of treatment 0 (original data )': original_data_mean_0,
    'posterior mean of treatment 1': mean_1,
    'treatment 1 lower bound (95%)': lower_bound_tr1,
    'treatment 1 upper bound (95%)': upper_bound_tr1,
    'posterior mean of treatment 1 (original data )': original_data_mean_1
}
estimation_result = pd.DataFrame(par_estimation_result)

In [None]:
# Save the estimation table; index=True also writes the default row index as the first column.
estimation_result.to_csv(f'result_output/result_block{block_length}wct{t}_estimation.csv', index = True)

In [None]:
# Collect the per-method outputs and place them side by side; the dict keys are
# passed to pd.concat as the group labels of the combined columns.
sim_outputs = {
    "mean_fill (global)": meanG_out,
    "mean_fill (individual)": meanI_out,
    "tr_mean_fill (global)": tr_meanG_out,
    "tr_mean_fill (individual)": tr_meanI_out,
    "KNN_fill": KNN_out,
    "cluster_fill": cluster_out,
    "DR_fill": DR_out,
}
combined_df = pd.concat(sim_outputs, axis=1)

In [None]:

# Persist the combined raw per-method outputs as a pickle file.
combined_df.to_pickle(f'result_output/result_block{block_length}wct{t}_raw.pkl')