In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two-treatment trial data; the first CSV column is used as the row index.
data_path = 'data/two_treatment/dt0_without_context.csv'
df = pd.read_csv(data_path, index_col=0)
df.head()

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.12573
1,1,0,0,-0.132105
2,2,0,0,0.640423
3,3,0,0,0.1049
4,4,0,0,-0.535669


#### Creating random NaN values

In [4]:
# Simulate missing data by blanking out a random subset of one column.
column = 'outcome'
nan_fraction = 0.1  # fraction of rows whose `column` value becomes NaN (0.1 = 10%)

total_rows = len(df)
nan_count = int(nan_fraction * total_rows)

# Fix the seed so the same rows are selected on every run.
np.random.seed(0)
nan_indices = np.random.choice(df.index, size=nan_count, replace=False)

# Overwrite the selected rows of the target column with NaN.
df.loc[nan_indices, column] = np.nan


#### Separating the dataframe to calculate missing data
In a Bayesian adaptive N-of-1 trial, we impute the missing data as we prepare to update the joint posterior distribution. Thus, the first step is to split the dataframe at the time cycle where the first missing value appears.

In [5]:
## Split the dataframe at the time cycle where the first missing value occurs.
# Time cycles that contain at least one NaN, sorted ascending so that the
# first element is the earliest affected cycle.
has_nan = df.isna().any(axis=1)
nan_t = df.loc[has_nan, 't'].sort_values(ascending=True).unique()
ts = nan_t[0]
# Keep every row up to and including that cycle.
dt = df.loc[df['t'] <= ts].copy()
dt.head()
     

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.12573
30,0,1,0,
60,0,2,0,0.189053
90,0,3,0,2.040919
120,0,4,0,-0.651791


Now we have the dataset up to the point where the first missing value was encountered in the experiment. Let's fill the missing values with various methods.
#### Mean filling method
For the first method, we simply fill the missing value with the individual mean and/or the global mean. Here we use the global mean if the missing value appears in the first cycle [t = 0].

In [1]:
def mean_fill(dt, method = 'global'):
    """Fill missing ``outcome`` values with a mean of the observed outcomes.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with at least the columns ``t``, ``patient_id`` and ``outcome``.
    method : {'global', 'individual'}, default 'global'
        ``'global'`` fills every missing outcome with the mean of all observed
        outcomes. ``'individual'`` fills each missing outcome with the mean of
        that patient's observed outcomes; a patient without any observed
        outcome falls back to the global mean.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``dt``; the input frame is not modified. If ``dt``
        has no missing outcome, an unchanged copy is returned.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'global'`` nor ``'individual'``.

    Notes
    -----
    The global mean is always used when the first missing row (in row order)
    belongs to time cycle ``t == 0``, regardless of ``method``.
    """
    if method not in ('global', 'individual'):
        raise ValueError(
            f"method must be 'global' or 'individual', got {method!r}"
        )

    dt_fill = dt.copy()
    is_missing = dt_fill['outcome'].isna()

    # Nothing to impute: return the copy instead of failing on an empty lookup.
    if not is_missing.any():
        return dt_fill

    # Time cycle of the first missing row, in row order.
    ts = dt_fill.loc[is_missing, 't'].iloc[0]
    global_mean = dt_fill['outcome'].mean()

    if ts == 0 or method == 'global':
        dt_fill['outcome'] = dt_fill['outcome'].fillna(global_mean)
    else:
        # Per-row mean of the owning patient's observed outcomes (NaN is
        # skipped by ``mean``); NaN where the patient has no observation.
        patient_mean = dt_fill.groupby('patient_id')['outcome'].transform('mean')
        dt_fill['outcome'] = (
            dt_fill['outcome']
            .fillna(patient_mean)
            .fillna(global_mean)  # fallback for patients with no observed outcome
        )

    return dt_fill

In [88]:
# Missing-value pattern before and after the (default) global mean fill.
print(dt.isna().value_counts())
filled_dt = mean_fill(dt, method='global')
print(filled_dt.isna().value_counts())

t      patient_id  treatment  outcome
False  False       False      False      87
                              True       13
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      100
Name: count, dtype: int64


When the time cycle is not 0 (t != 0)

In [89]:
# Rows of the following time cycle(s): after the first cycle with missing
# values, up to and including the second one.
next_cycle = df.loc[df['t'].gt(nan_t[0]) & df['t'].le(nan_t[1])].copy()
# Append them to the already-filled data and restore the original row order.
next_dt = pd.concat([filled_dt, next_cycle]).sort_index()
next_dt

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.125730
1,1,0,0,-0.132105
30,0,1,0,0.345584
31,1,1,0,0.821618
60,0,2,0,0.189053
...,...,...,...,...
2911,1,97,1,1.160014
2940,0,98,0,-1.087591
2941,1,98,0,-1.244621
2970,0,99,0,0.082494


In [90]:
# Fill the next cycle using each patient's own mean.
print(next_dt.isna().value_counts())
filled_next_dt = mean_fill(next_dt, method='individual')
print(filled_next_dt.isna().value_counts())

t      patient_id  treatment  outcome
False  False       False      False      191
                              True         9
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      200
Name: count, dtype: int64


In [91]:
# Fill the next cycle using the global mean.
print(next_dt.isna().value_counts())
filled_next_dt = mean_fill(next_dt, method='global')
print(filled_next_dt.isna().value_counts())

t      patient_id  treatment  outcome
False  False       False      False      191
                              True         9
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      200
Name: count, dtype: int64


#### mean filling with consideration of treatment

Here, we will also fill with mean value, however, we will consider the global/individual mean of that particular treatment.

In [92]:
def tr_mean_fill(dt, method = 'global'):
    """Fill missing ``outcome`` values with treatment-specific means.

    All means are computed once from the observed (non-missing) outcomes of
    ``dt``, so a value imputed for one row never influences the value imputed
    for another row.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with at least the columns ``t``, ``patient_id``, ``treatment``
        and ``outcome``.
    method : {'global', 'individual'}, default 'global'
        ``'global'``: fill with the mean of all observed outcomes under the
        row's treatment; if that treatment has no observed outcome, use the
        mean of all observed outcomes.

        ``'individual'``: for rows with ``t != 0``, fill with the mean of the
        same patient's observed outcomes under the same treatment. For rows
        with ``t == 0``, or when the patient has no observation under that
        treatment, use the treatment mean; if the treatment has no observed
        outcome, use the patient's mean over all treatments, and finally the
        mean of all observed outcomes.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``dt``; the input frame is not modified.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'global'`` nor ``'individual'``.
    """
    if method not in ('global', 'individual'):
        raise ValueError(
            f"method must be 'global' or 'individual', got {method!r}"
        )

    dt_fill = dt.copy()
    miss_dt = dt_fill[dt_fill['outcome'].isna()]
    if miss_dt.empty:
        return dt_fill

    # Lookup tables built from observed outcomes only. A key that is absent
    # from a table means "no observed outcome for that group".
    observed = dt[dt['outcome'].notna()]
    overall_mean = observed['outcome'].mean()
    treatment_means = observed.groupby('treatment')['outcome'].mean().to_dict()
    patient_means = observed.groupby('patient_id')['outcome'].mean().to_dict()
    patient_treatment_means = (
        observed.groupby(['patient_id', 'treatment'])['outcome'].mean().to_dict()
    )

    for row in miss_dt.itertuples():
        # Fallback value: treatment mean, then (individual method only) the
        # patient's overall mean, then the mean of every observed outcome.
        fallback = treatment_means.get(row.treatment)
        if fallback is None and method == 'individual':
            fallback = patient_means.get(row.patient_id)
        if fallback is None:
            fallback = overall_mean

        fill_value = fallback
        # The patient's own treatment history is only consulted after the
        # first cycle (t == 0 always uses the fallback value).
        if method == 'individual' and row.t != 0:
            own_mean = patient_treatment_means.get((row.patient_id, row.treatment))
            if own_mean is not None:
                fill_value = own_mean

        dt_fill.loc[row.Index, 'outcome'] = fill_value

    return dt_fill

In [93]:
# Missing-value pattern before and after the (default) global treatment-mean fill.
print(dt.isna().value_counts())
filled_dt = tr_mean_fill(dt, method='global')
print(filled_dt.isna().value_counts())

t      patient_id  treatment  outcome
False  False       False      False      87
                              True       13
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      100
Name: count, dtype: int64


**When t != 0**

In [94]:
# Rows of the following time cycle(s): after the first cycle with missing
# values, up to and including the second one.
next_cycle = df.loc[df['t'].gt(nan_t[0]) & df['t'].le(nan_t[1])].copy()
# Append them to the already-filled data and restore the original row order.
next_dt = pd.concat([filled_dt, next_cycle]).sort_index()
next_dt

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.125730
1,1,0,0,-0.132105
30,0,1,0,0.345584
31,1,1,0,0.821618
60,0,2,0,0.189053
...,...,...,...,...
2911,1,97,1,1.160014
2940,0,98,0,-1.087591
2941,1,98,0,-1.244621
2970,0,99,0,0.082494


In [95]:
# Fill the next cycle, still using the 'global' treatment mean.
print(next_dt.isna().value_counts())
next_filled_dt = tr_mean_fill(next_dt, method='global')
print(next_filled_dt.isna().value_counts())

t      patient_id  treatment  outcome
False  False       False      False      191
                              True         9
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      200
Name: count, dtype: int64


In [96]:
# Fill the next cycle using each patient's own treatment mean.
print(next_dt.isna().value_counts())
next_filled_dt = tr_mean_fill(next_dt, method='individual')
print(next_filled_dt.isna().value_counts())

t      patient_id  treatment  outcome
False  False       False      False      191
                              True         9
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      200
Name: count, dtype: int64
