In [5]:
import pandas as pd
import numpy as np

import os
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [10]:
# Limit OpenMP to a single thread to avoid the memory leak.
# NOTE(review): OpenMP runtimes usually read this variable when they are first
# loaded - confirm this cell runs before the libraries that rely on it are imported.
os.environ.update({'OMP_NUM_THREADS': '1'})

In [6]:
# Read the trial data, using the first CSV column as the row index
df1 = pd.read_csv(
    'data/two_treatment/dt0_without_context.csv',
    index_col=0,
)
df1.head()

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.12573
1,1,0,0,-0.132105
2,2,0,0,0.640423
3,3,0,0,0.1049
4,4,0,0,-0.535669


In [7]:
# Work on an independent copy so that df1 keeps the data exactly as loaded
df = df1.copy(deep=True)

#### Creating random NaN values

In [8]:

# Column to blank out and the share of its rows that become NaN
column = 'outcome'
nan_fraction = 0.1
total_rows = df.shape[0]
nan_count = int(nan_fraction * total_rows)

# Seed NumPy's global generator so the same rows are drawn on every run
np.random.seed(0)
nan_indices = np.random.choice(df.index, size=nan_count, replace=False)

# Overwrite the drawn rows of the chosen column with NaN
df.loc[nan_indices, column] = np.nan


#### Separating the dataframe to calculate missing data
In a Bayesian adaptive N-of-1 trial, we are going to calculate the missing data as we prepare to update the joint posterior distribution. Thus, the first step is to separate the dataframe at the point where the first missing value appears.

In [9]:
## Separate the dataframe at the time point of the first missing value.
# Distinct t values of the rows that contain a NaN, sorted ascending so that
# the first entry is the earliest cycle with missing data.
nan_t = df.loc[df.isna().any(axis=1), 't'].sort_values().unique()
ts = nan_t[0]

# Keep every row up to and including that first time point
dt = df.loc[df['t'] <= ts].copy()
dt.head()
     

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.12573
30,0,1,0,
60,0,2,0,0.189053
90,0,3,0,2.040919
120,0,4,0,-0.651791


Now we have the dataset up to the point where the first missing value was encountered in the experiment. Let's fill the missing values with various methods.
#### Context filling method
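For the context filling method, we follow an algorithm similar to clustering to select a suitable filling value. Then, similar to the KNN imputation method, we can decide how many values we want to consider. If 1, then that value fills the missing value. If more than 1 (i.e. K), then the mean of the K nearest values fills the missing value.
The context of each patient together with the treatment forms a vector (i.e. for patient id 1 the vector is $[c_1, c_2, \dots, c_n, \text{treatment}]_{p_1}$). We calculate the distance between the vector of each patient with a missing value and all the other available vectors. Then the mean of the K nearest outcome values is used to fill the missing value. In the implementation below, the available vectors are summarised by K-means cluster centroids, and the fill value is the inverse-distance-weighted mean over the $m$ nearest clusters, $\hat{y} = \frac{\sum_{j=1}^{m} \bar{y}_j / d_j}{\sum_{j=1}^{m} 1 / d_j}$, where $\bar{y}_j$ is the mean observed outcome of cluster $j$ and $d_j$ is the distance to its centroid.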

If no context is available, then we are using the treatment as a singular context.

In [3]:
def get_distance(context_vectors, miss_vec):
    """Return the Euclidean distance from ``miss_vec`` to each context vector.

    Parameters
    ----------
    context_vectors : iterable of array-like
        Vectors to measure against; each must support ``vector - miss_vec``.
    miss_vec : array-like
        Reference vector (the context of the row being imputed).

    Returns
    -------
    list
        One distance per entry of ``context_vectors``, in the same order.
    """
    return [np.linalg.norm(candidate - miss_vec) for candidate in context_vectors]

In [90]:
def context_fill(dt, context_cols=None, N=2, m=1):
    """Fill missing ``outcome`` values from the nearest context clusters.

    Rows with an observed outcome are standardised on ``context_cols`` and
    grouped into ``N`` K-means clusters.  Each row with a missing outcome is
    mapped into the same standardised space and receives the
    inverse-distance-weighted mean of the mean outcomes of its ``m`` nearest
    cluster centroids.

    Parameters
    ----------
    dt : pandas.DataFrame
        Data containing an ``outcome`` column and every column named in
        ``context_cols``.
    context_cols : list of str, str or None, default None
        Columns that form the context vector.  ``None`` uses ``['treatment']``
        (the treatment as the only context).  The previous default was the
        ``list`` type itself, which could not be used as a column selection.
    N : int, default 2
        Number of K-means clusters.
    m : int, default 1
        Number of nearest clusters combined into the fill value.

    Returns
    -------
    pandas.DataFrame
        A copy of ``dt`` with the missing outcomes filled; ``dt`` itself is
        not modified.

    Raises
    ------
    ValueError
        If ``m`` is smaller than 1, or if outcomes are missing but no row has
        an observed outcome to learn from.
    """
    if context_cols is None:
        context_cols = ['treatment']
    elif isinstance(context_cols, str):
        context_cols = [context_cols]
    else:
        context_cols = list(context_cols)
    if m < 1:
        raise ValueError("m must be at least 1")

    dt_fill = dt.copy()
    is_missing = dt_fill['outcome'].isna()
    if not is_missing.any():
        # Nothing to impute: skip the clustering entirely.
        return dt_fill

    observed = dt_fill.loc[~is_missing]
    if observed.empty:
        raise ValueError("no observed outcome available to fill the missing values")

    ## apply KMeans clustering to the standardised context of the observed rows
    scaler = StandardScaler()
    scaled_observed = scaler.fit_transform(observed[context_cols])

    kmeans = KMeans(n_clusters=N, random_state=0)
    kmeans.fit(scaled_observed)
    centroids = kmeans.cluster_centers_

    # Mean observed outcome per cluster, ordered like the centroid rows.
    # It does not depend on the row being filled, so compute it once.
    cluster_means = (
        observed['outcome']
        .groupby(kmeans.labels_)
        .mean()
        .reindex(range(len(centroids)))
        .to_numpy()
    )
    # A cluster that received no observed row has no mean to offer; drop it.
    populated = ~np.isnan(cluster_means)
    centroids = centroids[populated]
    cluster_means = cluster_means[populated]
    n_selected = min(m, len(centroids))

    # The centroids live in the standardised space, so the rows to fill must
    # be transformed with the same scaler before distances are measured.
    scaled_missing = scaler.transform(dt_fill.loc[is_missing, context_cols])

    fill_values = []
    for miss_vec in scaled_missing:
        ## distance from the cluster centroids, nearest first
        dis = np.asarray(get_distance(centroids, miss_vec), dtype=float)
        nearest = np.argsort(dis, kind='stable')[:n_selected]
        nearest_dis = dis[nearest]

        on_centroid = nearest_dis == 0
        if on_centroid.any():
            # 1 / 0 would turn the weighted mean into inf / inf = NaN; a row
            # lying exactly on a centroid takes that cluster's mean directly.
            fill_value = cluster_means[nearest[on_centroid]].mean()
        else:
            # sum(mean_j / d_j) / sum(1 / d_j) over the selected clusters
            fill_value = np.average(cluster_means[nearest], weights=1.0 / nearest_dis)
        fill_values.append(fill_value)

    ## write back through the boolean mask so that only the missing rows
    ## change, even when index labels are duplicated
    dt_fill.loc[is_missing, 'outcome'] = fill_values

    return dt_fill

In [91]:
# Settings for the fill: number of clusters (N), number of nearest clusters
# combined per value (m), and the columns used as context
N, m = 2, 1
context_cols = ['treatment']

# Count the missing-value patterns before and after filling
print(dt.isna().value_counts())
filled_dt = context_fill(dt, context_cols=context_cols, N=N, m=m)
print(filled_dt.isna().value_counts())


t      patient_id  treatment  outcome
False  False       False      False      86
                              True       14
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      100
Name: count, dtype: int64




**When $t \neq 0$**

In [92]:
# Rows after the first time point with missing data, up to and including the second one
next_cycle = df.loc[df['t'].gt(nan_t[0]) & df['t'].le(nan_t[1])].copy()

# Append the new cycle to the already-filled data (original index labels are
# kept) and restore index order
next_dt = pd.concat([filled_dt, next_cycle]).sort_index()
next_dt

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.125730
1,1,0,0,-0.132105
30,0,1,0,-0.130631
31,1,1,0,0.821618
60,0,2,0,0.189053
...,...,...,...,...
2911,1,97,1,1.160014
2940,0,98,0,-1.087591
2941,1,98,0,
2970,0,99,0,0.082494


In [94]:
# Count the missing-value patterns of the extended data before and after filling
print(next_dt.isna().value_counts())
next_filled_dt = context_fill(next_dt, context_cols=context_cols, N=N, m=m)
print(next_filled_dt.isna().value_counts())

t      patient_id  treatment  outcome
False  False       False      False      188
                              True        12
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      200
Name: count, dtype: int64


