In [10]:
import pandas as pd
import os
import numpy as np
import statsmodels.api as sm

In [6]:
file_path = "extraction/data_censored.csv"

# Load the censored trial dataset; a missing file is reported instead of raised.
# NOTE(review): on the missing-file path `data_censored` is never bound, so any
# later cell that uses it will stop with a NameError rather than this message.
try:
    data_censored = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"File not found at {file_path}")
else:
    print("Data loaded successfully!")
    print(data_censored.head())  # preview of the first rows

Data loaded successfully!
   id  period  treatment  x1        x2  x3        x4  age     age_s  outcome  \
0   1       0          1   1  1.146148   0  0.734203   36  0.083333        0   
1   1       1          1   1  0.002200   0  0.734203   37  0.166667        0   
2   1       2          1   0 -0.481762   0  0.734203   38  0.250000        0   
3   1       3          1   0  0.007872   0  0.734203   39  0.333333        0   
4   1       4          1   1  0.216054   0  0.734203   40  0.416667        0   

   censored  eligible  
0         0         1  
1         0         0  
2         0         0  
3         0         0  
4         0         0  


In [7]:
# Result folders for the two analyses, both located under the current working directory
trial_pp_dir, trial_itt_dir = (
    os.path.join(os.getcwd(), folder) for folder in ("trial_pp", "trial_itt")
)

# Create each folder; one that already exists is left as it is
os.makedirs(trial_pp_dir, exist_ok=True)
os.makedirs(trial_itt_dir, exist_ok=True)

In [8]:

# Define a function to structure the trial data
def set_data(trial_name, data, id_col, period_col, treatment_col, outcome_col, eligible_col):
    """Bundle a dataset and its key columns into a trial dictionary.

    The result holds the trial name, the dataset itself (the same object, not
    a copy) and one entry per role ("id", "period", "treatment", "outcome",
    "eligible") containing ``data[<column name given for that role>]``.
    """
    role_to_column = {
        "id": id_col,
        "period": period_col,
        "treatment": treatment_col,
        "outcome": outcome_col,
        "eligible": eligible_col,
    }

    trial = {"trial_name": trial_name, "data": data}
    # Columns are looked up in the order of the roles listed above
    for role, column in role_to_column.items():
        trial[role] = data[column]
    return trial

# Both trials are built from the same dataset object and the same column roles
trial_columns = {
    "id_col": "id",
    "period_col": "period",
    "treatment_col": "treatment",
    "outcome_col": "outcome",
    "eligible_col": "eligible",
}

# Per-Protocol (PP)
trial_pp = set_data(trial_name="PP", data=data_censored, **trial_columns)

# Intention-To-Treat (ITT)
trial_itt = set_data(trial_name="ITT", data=data_censored, **trial_columns)

# Print the structured ITT trial data
print(trial_itt)

{'trial_name': 'ITT', 'data':      id  period  treatment  x1        x2  x3        x4  age     age_s  \
0     1       0          1   1  1.146148   0  0.734203   36  0.083333   
1     1       1          1   1  0.002200   0  0.734203   37  0.166667   
2     1       2          1   0 -0.481762   0  0.734203   38  0.250000   
3     1       3          1   0  0.007872   0  0.734203   39  0.333333   
4     1       4          1   1  0.216054   0  0.734203   40  0.416667   
..   ..     ...        ...  ..       ...  ..       ...  ...       ...   
720  99       3          0   0 -0.747906   1  0.575268   68  2.750000   
721  99       4          0   0 -0.790056   1  0.575268   69  2.833333   
722  99       5          1   1  0.387429   1  0.575268   70  2.916667   
723  99       6          1   1 -0.033762   1  0.575268   71  3.000000   
724  99       7          0   0 -1.340497   1  0.575268   72  3.083333   

     outcome  censored  eligible  
0          0         0         1  
1          0         0 

In [11]:
# Define directory for saving models
trial_pp_dir = os.path.join(os.getcwd(), "trial_pp")
os.makedirs(trial_pp_dir, exist_ok=True)

# Treatment received in the previous period, looked up within each patient.
# A plain `shift(1)` over the whole frame hands the first row of every `id`
# the last treatment value of the preceding `id`; shifting inside each `id`
# group leaves those first rows as NaN, so they fall into neither subset.
# Rows are ordered by (id, period) before shifting so the result does not
# depend on the row order of the file; `reindex` restores the original order
# so the mask lines up with `data_censored`.
# NOTE(review): "previous period" here means the previous recorded row of the
# same id — confirm that periods are consecutive within each id.
previous_treatment = (
    data_censored.sort_values(["id", "period"])
    .groupby("id")["treatment"]
    .shift(1)
    .reindex(data_censored.index)
)

# Separate data for treatment = 1 and treatment = 0 in the previous period
data_treated = data_censored[previous_treatment == 1]
data_untreated = data_censored[previous_treatment == 0]

# Define function to fit logistic regression models
def fit_logit_model(data, formula, save_path, outcome_col="treatment"):
    """Fit a logistic regression with an intercept and save its summary as text.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the dependent variable and every predictor column.
    formula : list of str
        Names of the predictor columns. Despite the parameter name this is a
        list of columns, not a formula string.
    save_path : str
        File that receives the plain-text model summary. A missing parent
        directory is created; an existing file is overwritten.
    outcome_col : str, default "treatment"
        Column used as the dependent variable. The default keeps the original
        behaviour (treatment in the current period); pass another column name
        to model a different binary outcome with the same helper.

    Returns
    -------
    The fitted results object returned by ``sm.Logit(y, X).fit()``.

    Raises
    ------
    KeyError
        If ``outcome_col`` or one of the predictors is not a column of ``data``.
    """
    y = data[outcome_col]  # Dependent variable
    X = sm.add_constant(data[formula])  # Predictors plus intercept term

    model = sm.Logit(y, X).fit()

    # Make sure the target folder exists; a bare file name has no parent to create
    parent_dir = os.path.dirname(save_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save model summary
    with open(save_path, "w", encoding="utf-8") as f:
        f.write(model.summary().as_text())

    return model

# Predictor sets of the two switch-weight models
numerator_predictors = ["age"]
denominator_predictors = ["age", "x1", "x3"]

# Fit numerator model (only age as predictor)
numerator_model = fit_logit_model(
    data_censored,
    numerator_predictors,
    os.path.join(trial_pp_dir, "switch_numerator_model.txt"),
)

# Fit denominator model (age + x1 + x3 as predictors)
denominator_model = fit_logit_model(
    data_censored,
    denominator_predictors,
    os.path.join(trial_pp_dir, "switch_denominator_model.txt"),
)

# NOTE(review): both models are fitted on all rows; `data_treated` and
# `data_untreated` defined above are not used here — confirm whether separate
# models per previous-treatment group were intended.

# Predicted probability of treatment = 1 under each model
data_censored["numerator_prob"] = numerator_model.predict(
    sm.add_constant(data_censored[numerator_predictors])
)
data_censored["denominator_prob"] = denominator_model.predict(
    sm.add_constant(data_censored[denominator_predictors])
)

# Compute stabilized weights from the probability of the treatment actually
# received: p for rows with treatment = 1 and (1 - p) for rows with
# treatment = 0. Dividing the two P(treatment = 1) columns for every row would
# give untreated rows the weight that belongs to treated rows.
received_treatment = data_censored["treatment"] == 1
observed_numerator = data_censored["numerator_prob"].where(
    received_treatment, 1 - data_censored["numerator_prob"]
)
observed_denominator = data_censored["denominator_prob"].where(
    received_treatment, 1 - data_censored["denominator_prob"]
)
data_censored["switch_weight"] = observed_numerator / observed_denominator

# Print first few switch weights
print(data_censored[["id", "switch_weight"]].head())

Optimization terminated successfully.
         Current function value: 0.662406
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.660234
         Iterations 5
   id  switch_weight
0   1       0.930088
1   1       0.928634
2   1       1.039459
3   1       1.040816
4   1       0.924292
