# In [7]:
from pathlib import Path

import numpy as np
import pandas as pd

# Cohort size and treatment assignment.
# NOTE: no random seed is set in this section, so each run draws new values.
N = 400  # total number of patients
N_treated = N // 2  # half of the cohort (rounded down) is treated
# The control arm takes whatever is left so the two arms always add up to N.
# Using N // 2 for both arms would make the label array one element short
# for an odd N, and it could no longer be combined with length-N columns.
N_control = N - N_treated

# One label per patient: 0 = control, 1 = treated. The labels are laid out
# group by group and then shuffled in place, so the assignment order is
# random while the group sizes stay exact.
treated_labels = np.repeat([0, 1], [N_control, N_treated])
np.random.shuffle(treated_labels)

# Baseline symptom scores, one value per patient. Each score is a normal
# draw (pain: mean 5, sd 2; urgency: mean 5, sd 1.5; frequency: mean 3, sd 1)
# that is limited to the 0-9 range and then rounded to a whole number.
base_pain = np.rint(np.clip(np.random.normal(5, 2, N), 0, 9)).astype(int)
base_urgency = np.rint(np.clip(np.random.normal(5, 1.5, N), 0, 9)).astype(int)
base_frequency = np.rint(np.clip(np.random.normal(3, 1, N), 0, 9)).astype(int)

# Patient table: start from the 1-based patient id and append the remaining
# columns one at a time (treatment flag first, then the baseline scores).
data = pd.DataFrame({"id": np.arange(1, N + 1)})
data["treated"] = treated_labels
data["pain"] = base_pain
data["urgency"] = base_urgency
data["frequency"] = base_frequency

# Treatment time "t": a uniform draw from [0, 30) rounded to one decimal.
# A value is drawn for every patient, but only treated patients keep it;
# control patients are given NaN instead.
is_treated = data["treated"].to_numpy() == 1
candidate_times = np.random.uniform(0, 30, size=N).round(1)
data["t"] = np.where(is_treated, candidate_times, np.nan)

# Symptom scores at time `t`. Each baseline score is shifted by a normal
# draw -- mean 1, sd 0.5 for treated patients; mean 0, sd 0.3 for controls --
# then limited to the 0-9 range and rounded to a whole number.
for at_t_column, baseline_scores in (
    ("pain_at_t", base_pain),
    ("urgency_at_t", base_urgency),
    ("frequency_at_t", base_frequency),
):
    # Both shifts are drawn for every patient; the label picks which one applies.
    treated_shift = np.random.normal(1, 0.5, N)
    control_shift = np.random.normal(0, 0.3, N)
    applied_shift = np.where(treated_labels == 1, treated_shift, control_shift)
    data[at_t_column] = (baseline_scores + applied_shift).clip(0, 9).round().astype(int)

# Follow-up outcomes at 3 and 6 months. Every outcome starts from the score
# at time `t`, subtracts a fixed reduction for treated patients only, adds
# standard-normal noise, and is then limited to 0-9 and rounded to an integer.
reductionAt3mo = 1
reductionAt6mo = 0.5

for outcome_column, source_column, reduction in (
    ("pain_3mo", "pain_at_t", reductionAt3mo),
    ("pain_6mo", "pain_at_t", reductionAt6mo),
    ("urgency_3mo", "urgency_at_t", reductionAt3mo),
    ("urgency_6mo", "urgency_at_t", reductionAt6mo),
    ("frequency_3mo", "frequency_at_t", reductionAt3mo),
    ("frequency_6mo", "frequency_at_t", reductionAt6mo),
):
    # Controls get a reduction of 0, so only the noise term moves their score.
    treated_reduction = np.where(data["treated"] == 1, reduction, 0)
    noise = np.random.normal(0, 1, N)
    outcome = data[source_column] - treated_reduction + noise
    data[outcome_column] = outcome.clip(0, 9).round().astype(int)

# Write the simulated cohort to CSV without the DataFrame index.
# The target directory is created first (no error if it already exists),
# because to_csv does not create missing parent directories and would
# otherwise fail when "data/" is absent.
output_path = Path("data/patient_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(output_path, index=False)
