In [19]:
import os

import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import scipy
from lifelines import KaplanMeierFitter
from scipy.stats import linregress
from sklearn.manifold import TSNE

# --- Global plotting and reproducibility configuration -----------------------
# Several rc keys below are assigned more than once; for each key the last
# assignment executed is the one that takes effect.
fontsize = 18
SMALL_SIZE = 8
MEDIUM_SIZE = 10
BIGGER_SIZE = 12
plt.rc('font', size=MEDIUM_SIZE)  # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)  # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)  # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)  # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title
# The two calls below replace the tick label size of MEDIUM_SIZE set just above.
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)

# NOTE(review): 'normal' is not obviously an installed font family name;
# matplotlib may warn and fall back to its default font - confirm.
font = {'family': 'normal',
        'weight': 'bold',
        'size': 24}

# Replaces the 'font' size (MEDIUM_SIZE) configured earlier in this cell.
plt.rc('font', **font)
# Relative sizes for legend, axes and tick text; applied via rcParams.update
# further down, which again replaces the numeric sizes set above.
params = {'legend.fontsize': 'x-large',
          # 'figure.figsize': (15, 5),
          'axes.labelsize': 'x-large',
          'axes.titlesize': 'x-large',
          'xtick.labelsize': 'x-large',
          'ytick.labelsize': 'x-large'}

# Font type used when exporting PDF / PostScript figures.
# NOTE(review): 42 presumably selects embedded TrueType fonts - confirm.
plt.rcParams['pdf.fonttype'] = 42
plt.rcParams['ps.fonttype'] = 42

plt.rcParams.update(params)
import seaborn as sns
import pandas
sns.set_style('white')
sns.set_context('paper')
# NOTE(review): sns.set() with no arguments applies seaborn's default theme and
# likely overrides the style/context chosen on the two lines above as well as
# the font sizes configured earlier in this cell - confirm this is intended.
sns.set()
# Seed for NumPy's global random state; the simulation cell further down draws
# from this global state.
seed=31415
np.random.seed(seed)

## ACTG-Synthetic Data

We simulate potential outcomes according to a Gompertz-Cox distribution, with selection bias induced by a simple logistic model for $P(A=1 \mid X=x)$ and an AFT-based censoring mechanism. Below is our generative scheme:

$$X = \text{ACTG covariates}$$
$$P(A=1|X=x) = \frac{1}{b} \times \left(a + \sigma\left( \eta ({\rm AGE} - \mu_{\rm AGE} + {\rm CD40} - \mu_{\rm CD40}) \right) \right)$$
$$ U  \sim {\rm Uniform} (0, 1 )$$
$$T_A  =  \frac{1}{\alpha_A} \log \left[1 - \frac{\alpha_A \log U}{ \lambda_A  \exp\left( x ^T  \beta_A\right)  }  \right]$$
$$\log C  \sim {\rm Normal} (\mu_c, \sigma_c^2)$$
$$Y = \min(T_A, C)$$

where $\{ \beta_A, \alpha_A, \lambda_A, b, a, \eta, \mu_c, \sigma_c \}$ are hyper-parameters, and $\{\mu_{\rm AGE}, \mu_{\rm CD40}\}$ are the sample means of age and CD40, respectively.

## Load ACTG175 Data

In [20]:
# Dataset reference: https://rdrr.io/cran/speff2trial/man/ACTG175.html
data_frame = pandas.read_csv('data/actg175/ACTG175.csv', index_col=0)

head_preview = data_frame.head()
frame_shape = data_frame.shape
print("head of data:{}, data shape:{}".format(head_preview, frame_shape))

# Number of distinct patient identifiers.
n_unique_ids = len(np.unique(data_frame[['pidnum']]))
print("unique", n_unique_ids)

print(data_frame.columns)

# Notes carried over from the original cell:
#   categorical = ['hemo', 'homo', 'drugs', 'oprior', 'z30', 'zprior', 'race',
#                  'gender', 'str2', 'strat', 'symptom', 'treat', 'offtrt', 'r']
#   outcome     = ['cens', 'days']
#   treatment   = 'arms'
#   treatment arm (0=zidovudine, 1=zidovudine and didanosine,
#                  2=zidovudine and zalcitabine, 3=didanosine)

# Columns removed before building the covariate matrix: the outcome columns,
# the treatment arm and the patient identifier.
to_drop = ['cens', 'days', 'arms', 'pidnum']

print("head of data:", frame_shape)

head of data:   pidnum  age     wtkg  hemo  homo  drugs  karnof  oprior  z30  zprior  ...  \
1   10056   48  89.8128     0     0      0     100       0    0       1  ...   
2   10059   61  49.4424     0     0      0      90       0    1       1  ...   
3   10089   45  88.4520     0     1      1      90       0    1       1  ...   
4   10093   47  85.2768     0     1      0     100       0    1       1  ...   
5   10124   43  66.6792     0     1      0     100       0    1       1  ...   

   offtrt  cd40  cd420  cd496  r  cd80  cd820  cens  days  arms  
1       0   422    477  660.0  1   566    324     0   948     2  
2       0   162    218    NaN  0   392    564     1  1002     3  
3       1   326    274  122.0  1  2063   1893     0   961     3  
4       0   287    394    NaN  0  1590    966     0  1166     3  
5       0   504    353  660.0  1   870    782     0  1090     0  

[5 rows x 27 columns], data shape:(2139, 27)
unique 2139
Index(['pidnum', 'age', 'wtkg', 'hemo', 'homo', 'dru

In [21]:
# Per-column flag: True where the column holds at least one missing value.
columns_with_missing = data_frame.isna().any()
print(columns_with_missing)

pidnum     False
age        False
wtkg       False
hemo       False
homo       False
drugs      False
karnof     False
oprior     False
z30        False
zprior     False
preanti    False
race       False
gender     False
str2       False
strat      False
symptom    False
treat      False
offtrt     False
cd40       False
cd420      False
cd496       True
r          False
cd80       False
cd820      False
cens       False
days       False
arms       False
dtype: bool


In [22]:
# Summarise the column(s) containing missing values.
na_columns = ['cd496']
na_data = data_frame.loc[:, na_columns]
na_summary = na_data.describe()
print("na_data description:{}".format(na_summary))

na_data description:             cd496
count  1342.000000
mean    328.570790
std     174.656153
min       0.000000
25%     209.250000
50%     321.000000
75%     440.000000
max    1190.000000


In [23]:
def print_missing_prop(covariates):
    """Print the percentage of missing (NaN) entries in ``covariates``.

    Parameters
    ----------
    covariates : array-like or pandas.DataFrame
        Numeric data of any dimensionality (anything ``np.isnan`` accepts).

    Returns
    -------
    None
        The percentage is written to stdout as ``missing_proportion:<value>``.
    """
    missing = np.asarray(np.isnan(covariates), dtype=float)
    # Use the total element count instead of shape[0] * shape[1] so that 1-D
    # (or higher-dimensional) inputs work as well as 2-D tables.
    total = missing.size
    # An empty input has nothing missing; avoid a 0/0 division.
    proportion = np.sum(missing) / total * 100 if total else 0.0
    print("missing_proportion:{}".format(proportion))
    

# Overall share of missing cells in the raw table, as a percentage.
print_missing_prop(covariates=data_frame)

missing_proportion:1.3800148910013332


In [24]:
# Fill every missing entry with the median of its own column.
column_medians = data_frame.median()
data_frame = data_frame.fillna(value=column_medians)

In [25]:
# Re-check after imputation: every column should now report False.
remaining_missing = data_frame.isna().any()
print(remaining_missing)


pidnum     False
age        False
wtkg       False
hemo       False
homo       False
drugs      False
karnof     False
oprior     False
z30        False
zprior     False
preanti    False
race       False
gender     False
str2       False
strat      False
symptom    False
treat      False
offtrt     False
cd40       False
cd420      False
cd496      False
r          False
cd80       False
cd820      False
cens       False
days       False
arms       False
dtype: bool


In [26]:
age_frame = data_frame[['age']]
print("age description:{}".format(age_frame.describe()))
# Flatten the single-column frame into a 1-D vector with one entry per row.
age_data = np.array(age_frame).reshape(-1)
print(age_data.shape)
# Mean age, used to centre age in the treatment-assignment model.
mu_age = age_data.mean()

age description:               age
count  2139.000000
mean     35.248247
std       8.709026
min      12.000000
25%      29.000000
50%      34.000000
75%      40.000000
max      70.000000
(2139,)


In [27]:
cd40_frame = data_frame[['cd40']]
print("cd40_data description:{}".format(cd40_frame.describe()))
# Flatten the single-column frame into a 1-D vector with one entry per row.
cd40_data = np.array(cd40_frame).reshape(-1)
print(cd40_data.shape)
# Mean cd40, used to centre cd40 in the treatment-assignment model.
mu_cd40 = cd40_data.mean()

cd40_data description:              cd40
count  2139.000000
mean    350.501169
std     118.573863
min       0.000000
25%     263.500000
50%     340.000000
75%     423.000000
max    1199.000000
(2139,)


In [28]:
# Covariate table: everything except the columns listed in `to_drop`.
covariate_frame = data_frame.drop(columns=to_drop)
print("covariate description:{}".format(covariate_frame.describe()))
# Plain 2-D array (rows x covariates) used by the simulation below.
x_data = np.array(covariate_frame)
print(x_data.shape)

covariate description:               age         wtkg         hemo         homo        drugs  \
count  2139.000000  2139.000000  2139.000000  2139.000000  2139.000000   
mean     35.248247    75.125311     0.084151     0.661057     0.131370   
std       8.709026    13.263164     0.277680     0.473461     0.337883   
min      12.000000    31.000000     0.000000     0.000000     0.000000   
25%      29.000000    66.679200     0.000000     0.000000     0.000000   
50%      34.000000    74.390400     0.000000     1.000000     0.000000   
75%      40.000000    82.555200     0.000000     1.000000     0.000000   
max      70.000000   159.939360     1.000000     1.000000     1.000000   

            karnof       oprior          z30  zprior      preanti  ...  \
count  2139.000000  2139.000000  2139.000000  2139.0  2139.000000  ...   
mean     95.446470     0.021973     0.550257     1.0   379.175783  ...   
std       5.900985     0.146629     0.497584     0.0   468.657526  ...   
min      70.000

# Semi-Simulations

In [29]:
# Coefficient vector "beta" for T=1, one entry per covariate.
# The covariate labels are taken from the original notes for this cell;
# `zprior` and `treat` were listed as NA there and are set to 0.
# NOTE(review): the entries are presumably in the column order of `x_data`;
# only the length is checked below - confirm the ordering.
_beta_one_by_covariate = (
    ('age', 0.0026987044),
    ('wtkg', 0.0094957416),
    ('hemo', -0.2047708817),
    ('homo', -0.0518243280),
    ('drugs', -0.2168722467),
    ('karnof', 0.0076266828),
    ('oprior', -0.0796099695),
    ('z30', 0.6258748940),
    ('zprior', 0),
    ('preanti', 0.0009670592),
    ('race', -1.0101809693),
    ('gender', -0.4038655688),
    ('str2', -1.5959739338),
    ('strat', -0.0563572096),
    ('symptom', 0.5244218189),
    ('treat', 0),
    ('offtrt', 0.2280296997),
    ('cd40', 0.0035548596),
    ('cd420', -0.0047974742),
    ('cd496', -0.0121293815),
    ('r', -1.0625208970),
    ('cd80', -0.0004266264),
    ('cd820', 0.0005844290),
)

beta_one = np.array([coef for _, coef in _beta_one_by_covariate])
print("beta_one: ", beta_one.shape)

# One coefficient per covariate column.
assert beta_one.shape[0] == x_data.shape[1]






beta_one:  (23,)


In [30]:
# Coefficient vector "beta" for T=0, one entry per covariate.
# The covariate labels are taken from the original notes for this cell;
# `zprior` and `treat` were listed as NA there and are set to 0.
# NOTE(review): the entries are presumably in the column order of `x_data`;
# only the length is checked below - confirm the ordering.
_beta_zero_by_covariate = (
    ('age', 1.148569e-02),
    ('wtkg', 3.896347e-03),
    ('hemo', -3.337743e-02),
    ('homo', -1.215442e-01),
    ('drugs', -6.036002e-01),
    ('karnof', 4.563380e-03),
    ('oprior', -5.217492e-02),
    ('z30', 1.414948e+00),
    ('zprior', 0),
    ('preanti', 9.294612e-06),
    ('race', 7.863787e-02),
    ('gender', 4.756738e-01),
    ('str2', -7.807835e-01),
    ('strat', -1.766999e-01),
    ('symptom', 1.622865e-01),
    ('treat', 0),
    ('offtrt', 1.551692e-01),
    ('cd40', 2.793350e-03),
    ('cd420', -6.417969e-03),
    ('cd496', -9.856514e-03),
    ('r', -1.127284e+00),
    ('cd80', 2.247806e-04),
    ('cd820', 1.952943e-04),
)

beta_zero = np.array([coef for _, coef in _beta_zero_by_covariate])
print("beta_zero: ", beta_zero.shape)

# One coefficient per covariate column.
assert beta_zero.shape[0] == x_data.shape[1]


beta_zero:  (23,)


In [31]:
def sigmoid(a):
    """Logistic function ``1 / (1 + exp(-a))``, evaluated element-wise.

    Parameters
    ----------
    a : float or array-like
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Value(s) in the closed interval [0, 1], same shape as ``a``.
    """
    # For large negative ``a`` the intermediate exp(-a) overflows to inf. The
    # quotient is then exactly 0.0, which is the correct limit, so only the
    # spurious overflow RuntimeWarning is silenced; returned values are the
    # same as for the plain formula.
    with np.errstate(over='ignore'):
        return 1/(1 + np.exp(-a))

In [32]:
# Random variables for the simulated data (x, y, delta, a).
#
# Re-seed at the top of this cell. The global seed is otherwise set only in
# the setup cell, so re-running this cell on its own would silently produce a
# different dataset. None of the cells shown between the setup cell and this
# one draws random numbers, so a fresh "Run All" gives the same stream as
# before this line was added.
np.random.seed(seed)

N = x_data.shape[0]

# Per-row outputs. The "_F" arrays refer to the assigned arm, "_CF" to the
# other arm.
T_F = np.zeros(N)        # event time under the assigned arm
T_CF = np.zeros(N)       # event time under the other arm
Y_F = np.zeros(N)        # min(T_F, C)
Y_CF = np.zeros(N)       # min(T_CF, C)
delta_F = np.zeros(N)    # 1.0 if T_F <= C, else 0.0
delta_CF = np.zeros(N)   # 1.0 if T_CF <= C, else 0.0
A = np.zeros(N)          # sampled treatment indicator (0/1)
prop = np.zeros(N)       # P(A=1 | x) used to sample A

time = 'days'  # not used in this cell; kept because later cells may read it
c_mean_time = 1000  # mean censoring time
c_std_time = 100  # std censoring time

# Baseline hazard scale (lambda) per arm and shared shape parameter (alpha)
# of the event-time formula used in the loop below.
lamd_zero = 6 * 1e-4
lamd_one = 6 * 1e-4
alpha = 0.0055

# One uniform draw per row and per arm, plus one censoring time per row.
U_0 = np.random.uniform(0, 1, size=(N))
U_1 = np.random.uniform(0, 1, size=(N))
# NOTE(review): the markdown description states log C ~ Normal(mu_c, sigma_c^2),
# but here C itself is drawn from Normal(c_mean_time, c_std_time) - confirm
# which of the two is intended.
C = np.random.normal(c_mean_time, c_std_time, size=(N))
gamma = -30   # slope inside the sigmoid of the assignment model
b_zero = 0    # constant shift added to the arm-1 event time
balance = 1.5  # parameter to balance (the divisor in the assignment model)


for i in range(N):

    # Confounding: the assignment probability depends on centred age + cd40.
    pos_age_i = age_data[i]
    beta_i = gamma * ((pos_age_i - mu_age) + (cd40_data[i]-mu_cd40))

    # sigmoid() lies in [0, 1], so prop_i lies in [0.3/1.5, 1.3/1.5].
    prop_i = 1/balance * (0.3 + sigmoid(beta_i))
    prop[i] = prop_i

    # Drawn one row at a time so the random stream is consumed in row order.
    A_i = np.random.binomial(n=1, p=prop_i, size=1)[0]
    A[i] = A_i

    # Covariate-dependent hazard scale for each arm.
    cov_eff_T_0 = lamd_zero * np.exp(np.dot(x_data[i], beta_zero))
    cov_eff_T_1 = lamd_one * np.exp(np.dot(x_data[i], beta_one))

    stoch_0 = alpha * np.log(U_0[i])
    stoch_1 = alpha * np.log(U_1[i])

    # T = 1/alpha * log(1 - alpha * log(U) / (lambda * exp(x^T beta)))
    T_1_i = 1/alpha * np.log(1 - stoch_1/cov_eff_T_1) + b_zero
    T_0_i = 1/alpha * np.log(1 - stoch_0/cov_eff_T_0)

    # Select the arm-specific time according to the sampled assignment.
    T_F_i = A_i * T_1_i + (1-A_i) * T_0_i
    T_CF_i = (1-A_i) * T_1_i + A_i * T_0_i

    C_i = C[i]

    # Observed time is the earlier of event and censoring.
    Y_F_i = min(T_F_i, C_i)
    Y_CF_i = min(T_CF_i, C_i)

    # Event indicator: True when the event happens no later than censoring.
    delta_F_i = T_F_i <= C_i
    delta_F[i] = delta_F_i

    delta_CF_i = T_CF_i <= C_i
    delta_CF[i] = delta_CF_i

    T_F[i] = T_F_i
    T_CF[i] = T_CF_i

    Y_F[i] = Y_F_i
    Y_CF[i] = Y_CF_i
    



  


# Save Data 

In [33]:
# Persist the simulated dataset: covariates and treatment as .npy arrays, and
# the per-row outcome columns as a CSV file.
# Create the output directory first; np.save / to_csv raise FileNotFoundError
# when the target directory does not exist.
os.makedirs('data/actg175_simulated', exist_ok=True)

np.save('data/actg175_simulated/covariates', x_data)
np.save('data/actg175_simulated/treatment', A)

# y_* = min(event, censoring) time, e_* = event indicator, t_* = event time;
# the *_f columns are for the assigned arm, the *_cf columns for the other arm.
data_F = {'y_f': Y_F, 'e_f': delta_F, 't_f': T_F, 'y_cf': Y_CF, 'e_cf': delta_CF, 't_cf': T_CF}
df = pandas.DataFrame.from_dict(data_F)
df.to_csv('data/actg175_simulated/event_pairs.csv', index=False)