In [1]:
import pandas as pd
import numpy as np

#### without Context

In [2]:
# Load the two-treatment dataset that has no context column.
# The first CSV column is used as the row index.
df1 = pd.read_csv('data/two_treatment/dt0_without_context.csv' ,index_col=0)
df1.head()

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.12573
1,1,0,0,-0.132105
2,2,0,0,0.640423
3,3,0,0,0.1049
4,4,0,0,-0.535669


#### creating random nan values

In [3]:
# Work on a copy so the frame loaded in `df1` is left untouched.
df = df1.copy()
column = 'outcome'
nan_fraction = 0.1  # fraction of the values in `column` replaced by NaN (i.e. 10%)

# Translate the fraction into an absolute number of rows (truncated by int()).
total_rows = df.shape[0]
nan_count = int(total_rows * nan_fraction)

# Fix the global NumPy seed so the same rows are blanked out on every run,
# then draw `nan_count` distinct index labels.
np.random.seed(0)
nan_indices = np.random.choice(df.index, size=nan_count, replace=False)

# Blank out `column` at the sampled index labels.
df.loc[nan_indices, column] = np.nan


#### Separating the dataframe to calculate missing data
In a Bayesian adaptive N-of-1 trial, we calculate (impute) the missing data as we prepare to update the posterior joint distribution. Thus, the first step is to split the dataframe at the time cycle where the first missing value appears.

In [4]:
## split the dataframe at the first time cycle that contains a missing value
# Time cycles (`t`) of every row holding at least one NaN, sorted ascending and
# de-duplicated so that element 0 is the earliest affected cycle.
nan_t = df.loc[df.isna().any(axis=1), 't'].sort_values().unique()
ts = nan_t[0]
# Keep every observation up to and including that first affected cycle.
dt = df.loc[df['t'] <= ts].copy()
dt.head()
     

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.12573
30,0,1,0,
60,0,2,0,0.189053
90,0,3,0,2.040919
120,0,4,0,-0.651791


Now we have the dataset up to the point where we encountered the first missing value in the experiment. Let's fill the missing values with various methods.
#### DR (doubly robust estimator for linear regression) filling method
For the context-based DR filling method, we build a linear regression model from the existing non-NaN data. Here, $Y_i$ is the outcome and $X_i$ represents the context. To estimate $\beta_t$, we use the doubly robust estimator as follows:
\begin{equation}
        \hat\beta_t = \big(\sum_{\tau = 1}^{t}\sum_{i= 1}^{N}X_i(\tau)X_i(\tau)^T + \lambda_t I\big)^{-1} \big(\sum_{\tau = 1}^{t}\sum_{i= 1}^{N}X_i(\tau)Y_i(\tau)\big)
\end{equation}

If no context is available, we use the treatment as the only context.

In [26]:
def fit_estimate_beta(df, context_cols, outcome_col, lamb):
    """
    Closed-form regularised least-squares estimate of the regression coefficients.

    Solves ``(X_b^T X_b + lamb * I) beta = X_b^T y``, where ``X_b`` is the
    context matrix with a leading column of ones for the intercept.  The
    identity matrix has ``len(context_cols) + 1`` rows, so the intercept is
    penalised together with the context coefficients.

    df: DataFrame holding the context and outcome columns.
    context_cols: list of column names used as context; a single column name
        given as a string is also accepted.
    outcome_col: outcome column name (string) or list of outcome column names.
    lamb: regularization parameter multiplying the identity matrix.

    Returns the coefficient array; entry 0 is the intercept.  Its shape is
    (k + 1,) when outcome_col is a string and (k + 1, m) when it is a list of
    m names, with k the number of context columns.

    Raises numpy.linalg.LinAlgError when the regularised matrix is singular.
    """
    # A bare string would otherwise make len() count characters, not columns.
    if isinstance(context_cols, str):
        context_cols = [context_cols]

    lambI = np.eye(len(context_cols) + 1) * lamb
    X = df[context_cols].values
    y = df[outcome_col].values

    # Prepend the intercept column.
    X_b = np.c_[np.ones((X.shape[0], 1)), X]

    # Solve the linear system directly instead of forming an explicit inverse.
    beta_DR = np.linalg.solve(X_b.T.dot(X_b) + lambI, X_b.T.dot(y))

    return beta_DR

def predict(beta_DR, X):
    """
    Evaluate the fitted linear model on the feature array X.

    beta_DR: coefficient array whose entry 0 is the intercept and whose
        remaining entries are the feature weights.
    X: numpy array of shape (n_samples, n_features)

    Returns the intercept plus the dot product of X with the weights.
    """
    intercept = beta_DR[0]
    weights = beta_DR[1:]
    linear_part = X.dot(weights)
    return intercept + linear_part
    

In [23]:
def DR_fill(dt, lamb, context_cols = None, outcome_col = None):
    """
    Fill missing outcomes with predictions of the regression fitted on the observed rows.

    The rows whose outcome is present are passed to fit_estimate_beta; the
    resulting coefficients are passed to predict for the rows whose outcome is
    NaN, and the predictions are written into a copy of the input frame.

    dt: input DataFrame; it is not modified.
    lamb: regularization parameter forwarded to fit_estimate_beta.
    context_cols: column name or list of column names considered as context.
        Defaults to ['treatment'], i.e. the treatment is the only context.
    outcome_col: outcome column name, or a one-element list holding it.
        Defaults to 'outcome'.

    Returns a copy of dt in which the missing outcome values are filled.

    Raises ValueError when outcome_col lists anything other than one column.
    """
    # Normalise the context specification to a list of column names.
    if context_cols is None:
        context_cols = ['treatment']
    elif isinstance(context_cols, str):
        context_cols = [context_cols]
    else:
        context_cols = list(context_cols)

    # Normalise the outcome specification to a single column name.
    if outcome_col is None:
        outcome_name = 'outcome'
    elif isinstance(outcome_col, str):
        outcome_name = outcome_col
    else:
        outcome_names = list(outcome_col)
        if len(outcome_names) != 1:
            raise ValueError("outcome_col must name exactly one column")
        outcome_name = outcome_names[0]

    dt_fill = dt.copy()
    missing = dt_fill[outcome_name].isna()
    if not missing.any():
        # Nothing to impute.
        return dt_fill

    # Fit on the observed rows only.
    beta_DR = fit_estimate_beta(dt_fill.loc[~missing], context_cols, [outcome_name], lamb)

    # Predict all missing rows at once. The boolean mask selects rows by
    # position, so duplicated index labels and column names that are not
    # valid Python identifiers are handled correctly.
    X_missing = dt_fill.loc[missing, context_cols].to_numpy(dtype=float)
    fill_values = np.asarray(predict(beta_DR, X_missing), dtype=float).reshape(-1)
    dt_fill.loc[missing, outcome_name] = fill_values

    return dt_fill



In [27]:
# Regularization parameter passed to DR_fill.
lamb = 2
# The treatment column is the only regressor here.
context_cols = ['treatment']
outcome_col = ['outcome']

# Print the missing-value pattern before and after filling.
print(dt.isna().value_counts())
filled_dt = DR_fill(dt,lamb, context_cols, outcome_col)
print(filled_dt.isna().value_counts())


t      patient_id  treatment  outcome
False  False       False      False      86
                              True       14
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      100
Name: count, dtype: int64


**when t != 0**

In [28]:
# Rows after the first NaN cycle (nan_t[0]) up to and including the second one (nan_t[1]).
next_cycle = df.loc[df['t'].gt(nan_t[0]) & df['t'].le(nan_t[1])].copy()
# Stack the already-filled rows on top of the new slice, keeping the original
# index labels, then sort by index.
next_dt = pd.concat([filled_dt, next_cycle]).sort_index()
next_dt

Unnamed: 0,t,patient_id,treatment,outcome
0,0,0,0,0.125730
1,1,0,0,-0.132105
30,0,1,0,-0.124639
31,1,1,0,0.821618
60,0,2,0,0.189053
...,...,...,...,...
2911,1,97,1,1.160014
2940,0,98,0,-1.087591
2941,1,98,0,
2970,0,99,0,0.082494


In [30]:
# Print the missing-value pattern of the extended frame before and after filling.
print(next_dt.isna().value_counts())
next_filled_dt = DR_fill(next_dt, lamb, context_cols, outcome_col)
print(next_filled_dt.isna().value_counts())

t      patient_id  treatment  outcome
False  False       False      False      188
                              True        12
Name: count, dtype: int64
t      patient_id  treatment  outcome
False  False       False      False      200
Name: count, dtype: int64


#### with context data

In [31]:
# Load the two-treatment dataset that includes the context column `c`.
# The first CSV column is used as the row index.
df2 = pd.read_csv('data/two_treatment/dt0_with_context.csv' ,index_col=0)
df2.head()

Unnamed: 0,c,t,patient_id,treatment,outcome
0,0.12573,0,0,0,-0.132105
1,0.640423,1,0,0,0.1049
2,-0.535669,2,0,0,0.361595
3,1.304,3,0,0,0.947081
4,-0.703735,4,0,0,-1.265421


In [32]:
## create random NaN values
# Work on a copy so the frame loaded in `df2` is left untouched.
df = df2.copy()
column = 'outcome'
nan_fraction = 0.1  # fraction of the values in `column` replaced by NaN (i.e. 10%)

# Translate the fraction into an absolute number of rows (truncated by int()).
total_rows = df.shape[0]
nan_count = int(total_rows * nan_fraction)

# Fix the global NumPy seed so the same rows are blanked out on every run,
# then draw `nan_count` distinct index labels.
np.random.seed(0)
nan_indices = np.random.choice(df.index, size=nan_count, replace=False)

# Blank out `column` at the sampled index labels.
df.loc[nan_indices, column] = np.nan

#### Separating the dataframe to calculate missing data
In a Bayesian adaptive N-of-1 trial, we calculate (impute) the missing data as we prepare to update the posterior joint distribution. Thus, the first step is to split the dataframe at the time cycle where the first missing value appears.

In [33]:
## split the dataframe at the first time cycle that contains a missing value
# Time cycles (`t`) of every row holding at least one NaN, sorted ascending and
# de-duplicated so that element 0 is the earliest affected cycle.
nan_t = df.loc[df.isna().any(axis=1), 't'].sort_values().unique()
ts = nan_t[0]
# Keep every observation up to and including that first affected cycle.
dt = df.loc[df['t'] <= ts].copy()
dt.head()

Unnamed: 0,c,t,patient_id,treatment,outcome
0,0.12573,0,0,0,-0.132105
30,0.345584,0,1,0,
60,0.189053,0,2,1,-0.522748
90,2.040919,0,3,1,-2.555665
120,-0.651791,0,4,0,-0.174717


**while t = 0**

In [34]:
# Regularization parameter passed to DR_fill.
lamb = 2
outcome_col = ['outcome']
# Regress on the context column `c` together with the treatment column.
context_cols = ['c','treatment']

# Print the missing-value pattern before and after filling.
print(dt.isna().value_counts())
filled_dt = DR_fill(dt, lamb, context_cols, outcome_col)
print(filled_dt.isna().value_counts())

c      t      patient_id  treatment  outcome
False  False  False       False      False      86
                                     True       14
Name: count, dtype: int64
c      t      patient_id  treatment  outcome
False  False  False       False      False      100
Name: count, dtype: int64


**when t != 0**

In [36]:
# Rows after the first NaN cycle (nan_t[0]) up to and including the second one (nan_t[1]).
next_cycle = df.loc[df['t'].gt(nan_t[0]) & df['t'].le(nan_t[1])].copy()
# Stack the already-filled rows on top of the new slice, keeping the original
# index labels, then sort by index.
next_dt = pd.concat([filled_dt, next_cycle]).sort_index()
next_dt

Unnamed: 0,c,t,patient_id,treatment,outcome
0,0.125730,0,0,0,-0.132105
1,0.640423,1,0,0,0.104900
30,0.345584,0,1,0,-0.156153
31,0.330437,1,1,0,-1.303157
60,0.189053,0,2,1,-0.522748
...,...,...,...,...,...
2911,-0.425905,1,97,0,-0.985005
2940,-1.087591,0,98,0,-1.244621
2941,0.560527,1,98,0,
2970,0.082494,0,99,1,-0.464418


In [37]:
# Print the missing-value pattern of the extended frame before and after filling.
print(next_dt.isna().value_counts())
next_filled_dt = DR_fill(next_dt, lamb, context_cols, outcome_col)
print(next_filled_dt.isna().value_counts())

c      t      patient_id  treatment  outcome
False  False  False       False      False      188
                                     True        12
Name: count, dtype: int64
c      t      patient_id  treatment  outcome
False  False  False       False      False      200
Name: count, dtype: int64
