In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

import simulate_data.simulate_data as sd

In [None]:
for i in range(200):

    n=250
    # data
    data = sd.make_zaidi_data_A(n, seed=i)
    X = data["X"]
    Y=data["Y"]
    W=data["W"]
    pi=data["p"]
    Y_i_star = sd.get_Y_i_star(Y,W,pi)
    
    X_copy = np.hstack((X.copy(), np.ones(len(Y)).reshape(n,1)))
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_maxabs = max_abs_scaler.fit_transform(X_copy)
    clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
    p_hat = clf.predict_proba(X_maxabs)[:,1]
    
    X_names = ["X"+str(i) for i in range(X.shape[1])]
    column_names=X_names+['Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)']
    output_data = pd.DataFrame(
        np.hstack(
            (
                X, 
                data["Y"].reshape(n,1), 
                data["W"].reshape(n,1), 
                data['p'].reshape(n,1), 
                data['tau'].reshape(n,1), 
                data['Y1'].reshape(n,1), 
                data['Y0'].reshape(n,1),
                Y_i_star.reshape(n,1),
                p_hat.reshape(n,1),
                data['h(x)'].reshape(n,1), 
            )
        ),
        columns=column_names
    )
    output_data.to_csv("simulate_data/zaidi_data_A/"+str(i%10)+"/zaidi_data_A_seed="+str(i)+".csv", index=False)

In [None]:
for i in range(200):
    n=250
    # data
    data = sd.make_zaidi_data_B(n, seed=i)
    Y=data["Y"]
    W=data["W"]
    pi=data["p"]
    Y_i_star = sd.get_Y_i_star(Y,W,pi)
    
    X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_maxabs = max_abs_scaler.fit_transform(X_copy)
    clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
    p_hat = clf.predict_proba(X_maxabs)[:,1]
    
    column_names=['X0','X1','X2','X3','X4', 'Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)']
    output_data = pd.DataFrame(
        np.hstack(
            (
                data["X"], 
                data["Y"].reshape(n,1), 
                data["W"].reshape(n,1), 
                data['p'].reshape(n,1), 
                data['tau'].reshape(n,1), 
                data['Y1'].reshape(n,1), 
                data['Y0'].reshape(n,1),
                Y_i_star.reshape(n,1),
                p_hat.reshape(n,1),
                data['h(x)'].reshape(n,1), 
            )
        ),
        columns=column_names
    )
    output_data.to_csv("simulate_data/zaidi_data_B/zaidi_data_B_seed="+str(i)+".csv", index=False)

In [11]:
linnlin = ["nonlinear"]#["linear","nonlinear"]
hethom = ["heterogeneous"]#["homogeneous", "heterogeneous"]
n_size = [100000]#[250,500]
for ll in range(len(linnlin)):
    for hh in range(len(hethom)):
        for n in range(len(n_size)):
            for i in range(200):
                # data
                data = sd.make_hahn_data(
                    function_type=linnlin[ll], 
                    effect_type=hethom[hh], 
                    n_in_study=n_size[n],
                    seed=i
                )
                Y=data["Y"]
                W=data["W"]
                pi=data["p"]
                data["Y_i_star"] = sd.get_Y_i_star(Y,W,pi)
                
                X=data[["X0","X1","X2","X3","X4_2","X4_3","X5","X1_X3"]].copy()
                X['intercept'] = 1
                max_abs_scaler = preprocessing.MaxAbsScaler()
                X_maxabs = max_abs_scaler.fit_transform(X)
                clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
                p_hat = clf.predict_proba(X_maxabs)[:,1]
                
                data.to_csv("simulate_data/hahn_data_"+linnlin[ll]+"_"+ hethom[hh]+"_n="+ str(n_size[n])+"/"+str(i%20)+"/hahn_data_seed="+str(i)+".csv", index=False)


In [8]:
for i in tqdm(range(200)):

    n=250
    # data
    data = sd.make_zaidi_data_A(n, seed=i, variance=1)
    X = data["X"]
    Y=data["Y"]
    W=data["W"]
    pi=data["p"]
    Y_i_star = sd.get_Y_i_star(Y,W,pi)
    
    X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_maxabs = max_abs_scaler.fit_transform(X_copy)
    clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
    p_hat = clf.predict_proba(X_maxabs)[:,1]
    
    X_names = ["X"+str(i) for i in range(X.shape[1])]
    column_names=X_names+['Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)']
    output_data = pd.DataFrame(
        np.hstack(
            (
                X, 
                data["Y"].reshape(n,1), 
                data["W"].reshape(n,1), 
                data['p'].reshape(n,1), 
                data['tau'].reshape(n,1), 
                data['Y1'].reshape(n,1), 
                data['Y0'].reshape(n,1),
                Y_i_star.reshape(n,1),
                p_hat.reshape(n,1),
                data['h(x)'].reshape(n,1), 
            )
        ),
        columns=column_names
    )
    output_data.to_csv("simulate_data/zaidi_data_A_var=1/"+str(i%10)+"/zaidi_data_A_seed="+str(i)+".csv", index=False)
    

  5%|▌         | 10/200 [00:00<00:06, 29.40it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'simulate_data/zaidi_data_A_var=1/10/zaidi_data_A_seed=10.csv'

In [None]:
for i in tqdm(range(200)):
    n=250
    # data
    data = sd.make_zaidi_data_B(n, seed=i, variance=1)
    Y=data["Y"]
    W=data["W"]
    pi=data["p"]
    Y_i_star = sd.get_Y_i_star(Y,W,pi)
    
    X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_maxabs = max_abs_scaler.fit_transform(X_copy)
    clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
    p_hat = clf.predict_proba(X_maxabs)[:,1]
    
    column_names=['X0','X1','X2','X3','X4', 'Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)']
    output_data = pd.DataFrame(
        np.hstack(
            (
                data["X"], 
                data["Y"].reshape(n,1), 
                data["W"].reshape(n,1), 
                data['p'].reshape(n,1), 
                data['tau'].reshape(n,1), 
                data['Y1'].reshape(n,1), 
                data['Y0'].reshape(n,1),
                Y_i_star.reshape(n,1),
                p_hat.reshape(n,1),
                data['h(x)'].reshape(n,1), 
            )
        ),
        columns=column_names
    )
    output_data.to_csv("simulate_data/zaidi_data_B_var=1/zaidi_data_B_seed="+str(i)+".csv", index=False)

In [None]:
for i in tqdm(range(200)):

    n=250
    # data
    data = sd.make_zaidi_data_A(n, seed=i, variance=25)
    X = data["X"]
    Y=data["Y"]
    W=data["W"]
    pi=data["p"]
    Y_i_star = sd.get_Y_i_star(Y,W,pi)
    
    X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_maxabs = max_abs_scaler.fit_transform(X_copy)
    clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
    p_hat = clf.predict_proba(X_maxabs)[:,1]
    
    X_names = ["X"+str(i) for i in range(X.shape[1])]
    column_names=X_names+['Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)']
    output_data = pd.DataFrame(
        np.hstack(
            (
                X, 
                data["Y"].reshape(n,1), 
                data["W"].reshape(n,1), 
                data['p'].reshape(n,1), 
                data['tau'].reshape(n,1), 
                data['Y1'].reshape(n,1), 
                data['Y0'].reshape(n,1),
                Y_i_star.reshape(n,1),
                p_hat.reshape(n,1),
                data['h(x)'].reshape(n,1), 
            )
        ),
        columns=column_names
    )
    output_data.to_csv("simulate_data/zaidi_data_A_var=25/"+str(i%10)+"/zaidi_data_A_seed="+str(i)+".csv", index=False)
    

In [None]:
for i in tqdm(range(200)):
    n=250
    # data
    data = sd.make_zaidi_data_B(n, seed=i, variance=25)
    Y=data["Y"]
    W=data["W"]
    pi=data["p"]
    Y_i_star = sd.get_Y_i_star(Y,W,pi)
    
    X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_maxabs = max_abs_scaler.fit_transform(X_copy)
    clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
    p_hat = clf.predict_proba(X_maxabs)[:,1]
    
    column_names=['X0','X1','X2','X3','X4', 'Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)']
    output_data = pd.DataFrame(
        np.hstack(
            (
                data["X"], 
                data["Y"].reshape(n,1), 
                data["W"].reshape(n,1), 
                data['p'].reshape(n,1), 
                data['tau'].reshape(n,1), 
                data['Y1'].reshape(n,1), 
                data['Y0'].reshape(n,1),
                Y_i_star.reshape(n,1),
                p_hat.reshape(n,1),
                data['h(x)'].reshape(n,1), 
            )
        ),
        columns=column_names
    )
    output_data.to_csv("simulate_data/zaidi_data_B_var=25/zaidi_data_B_seed="+str(i)+".csv", index=False)

In [None]:
for i in tqdm(range(200)):

    n=250
    # data
    data = sd.make_zaidi_data_A(n, seed=i, variance=1000)
    X = data["X"]
    Y=data["Y"]
    W=data["W"]
    pi=data["p"]
    Y_i_star = sd.get_Y_i_star(Y,W,pi)
    
    X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_maxabs = max_abs_scaler.fit_transform(X_copy)
    clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
    p_hat = clf.predict_proba(X_maxabs)[:,1]
    
    X_names = ["X"+str(i) for i in range(X.shape[1])]
    column_names=X_names+['Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)']
    output_data = pd.DataFrame(
        np.hstack(
            (
                X, 
                data["Y"].reshape(n,1), 
                data["W"].reshape(n,1), 
                data['p'].reshape(n,1), 
                data['tau'].reshape(n,1), 
                data['Y1'].reshape(n,1), 
                data['Y0'].reshape(n,1),
                Y_i_star.reshape(n,1),
                p_hat.reshape(n,1),
                data['h(x)'].reshape(n,1), 
            )
        ),
        columns=column_names
    )
    output_data.to_csv("simulate_data/zaidi_data_A_var=1000/"+str(i%10)+"/zaidi_data_A_seed="+str(i)+".csv", index=False)
    

In [None]:
for i in tqdm(range(200)):

    n=250
    # data
    data = sd.make_zaidi_data_B(n, seed=i, variance=1000)
    X = data["X"]
    Y=data["Y"]
    W=data["W"]
    pi=data["p"]
    Y_i_star = sd.get_Y_i_star(Y,W,pi)
    
    X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_maxabs = max_abs_scaler.fit_transform(X_copy)
    clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
    p_hat = clf.predict_proba(X_maxabs)[:,1]
    
    X_names = ["X"+str(i) for i in range(X.shape[1])]
    column_names=X_names+['Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)']
    output_data = pd.DataFrame(
        np.hstack(
            (
                X, 
                data["Y"].reshape(n,1), 
                data["W"].reshape(n,1), 
                data['p'].reshape(n,1), 
                data['tau'].reshape(n,1), 
                data['Y1'].reshape(n,1), 
                data['Y0'].reshape(n,1),
                Y_i_star.reshape(n,1),
                p_hat.reshape(n,1),
                data['h(x)'].reshape(n,1), 
            )
        ),
        columns=column_names
    )
    output_data.to_csv("simulate_data/zaidi_data_B_var=1000/zaidi_data_B_seed="+str(i)+".csv", index=False)
    

In [3]:
percent_variation = [.1]#[0.001,0.005,0.01,0.05,0.1,0.5,1.0, 2.0,10.0,100.0]
for i in tqdm(range(200)):
    for pv in percent_variation:
        n=250#1000
        # data
        data = sd.make_CMM_data_B(n, per_var=pv, seed=i)
        X = data["X"]
        Y=data["Y_obs"]
        W=data["W"]
        pi=data["p"]
        Y_i_star = data["Y_i_star"]
        sig = np.ones(n)*data['sig']
        
        X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
        max_abs_scaler = preprocessing.MaxAbsScaler()
        X_maxabs = max_abs_scaler.fit_transform(X_copy)
        clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
        p_hat = clf.predict_proba(X_maxabs)[:,1]
        
        X_names = ["X"+str(i) for i in range(X.shape[1])]
        column_names=X_names+['Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)','sig']
        output_data = pd.DataFrame(
            np.hstack(
                (
                    X, 
                    data["Y_obs"].reshape(n,1), 
                    data["W"].reshape(n,1), 
                    data['p'].reshape(n,1), 
                    data['g(x)'].reshape(n,1), 
                    data['Y1'].reshape(n,1), 
                    data['Y0'].reshape(n,1),
                    Y_i_star.reshape(n,1),
                    p_hat.reshape(n,1),
                    data['h(x)'].reshape(n,1),
                    sig.reshape(n,1),
                )
            ),
            columns=column_names
        )
        output_data.to_csv("simulate_data/CMM_data_B_var_percent="+str(pv)+"/CMM_data_B_seed="+str(i)+".csv", index=False)
        

100%|██████████| 200/200 [00:02<00:00, 93.39it/s]


In [2]:
percent_variation = [0.1]#[0.001,0.005,0.01,0.05,0.1,0.5,1.0, 2.0,10.0,100.0]
for i in tqdm(range(200)):
    for pv in percent_variation:
        n=250#1000
        # data
        data = sd.make_CMM_data_C(n, per_var=pv, seed=i)
        X = data["X"]
        Y=data["Y_obs"]
        W=data["W"]
        pi=data["p"]
        Y_i_star = data["Y_i_star"]
        sig = np.ones(n)*data['sig']
        
        X_copy = np.hstack((data["X"].copy(), np.ones(len(Y)).reshape(n,1)))
        max_abs_scaler = preprocessing.MaxAbsScaler()
        X_maxabs = max_abs_scaler.fit_transform(X_copy)
        clf = LogisticRegression(random_state=0).fit(X_maxabs, W)
        p_hat = clf.predict_proba(X_maxabs)[:,1]
        
        X_names = ["X"+str(i) for i in range(X.shape[1])]
        column_names=X_names+['Y', 'W', 'p', 'tau', 'Y1', 'Y0', 'Y_i_star', 'p_hat','h(x)','sig']
        output_data = pd.DataFrame(
            np.hstack(
                (
                    X, 
                    data["Y_obs"].reshape(n,1), 
                    data["W"].reshape(n,1), 
                    data['p'].reshape(n,1), 
                    data['g(x)'].reshape(n,1), 
                    data['Y1'].reshape(n,1), 
                    data['Y0'].reshape(n,1),
                    Y_i_star.reshape(n,1),
                    p_hat.reshape(n,1),
                    data['h(x)'].reshape(n,1),
                    sig.reshape(n,1),
                )
            ),
            columns=column_names
        )
        output_data.to_csv("simulate_data/CMM_data_C_var_percent="+str(pv)+"/CMM_data_C_seed="+str(i)+".csv", index=False)
        

100%|██████████| 200/200 [00:01<00:00, 107.33it/s]
