In [17]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

########################################
# 1. LOAD AND CLEAN THE PORCINE DATA
########################################

# Read the raw recordings. Later code also reads the "label", "pig_id" and
# "sample" columns from this frame, so the CSV must provide them.
porcine_df = pd.read_csv("porcine_data.csv")

# Columns we will use as features
feature_cols = ["Pleth_av", "RR_var", "ECG_HR",
                "EIT_thorax", "NIRS_abdomen", "EIS_thorax"]

# Drop rows with NaNs in those columns.
# Only the feature columns are checked; a missing "label" is not filtered here.
porcine_df = porcine_df.dropna(subset=feature_cols)

# Features (X) and label (y)
X = porcine_df[feature_cols]
y = porcine_df["label"]   # 0 = healthy, 1 = sick

########################################
# 2. SPLIT + RANDOM FOREST + GRID SEARCH
########################################

# Row-wise 70/30 split with a fixed seed.
# NOTE(review): the split is per row, not per pig, so samples from the same
# pig_id can end up in both train and test -- confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123
)

# 3 x 3 x 3 = 27 hyperparameter combinations to evaluate.
param_grid = {
    "n_estimators": [50, 100, 200],
    "max_depth": [None, 5, 10],
    "min_samples_leaf": [1, 2, 5],
}

rf = RandomForestClassifier(random_state=42)

# 3-fold cross-validated search over param_grid, ranked by accuracy,
# using all available cores (n_jobs=-1).
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

# Best model found by the search (GridSearchCV refits it on the whole
# training split by default).
best_rf = grid_search.best_estimator_

print("Best hyperparameters:", grid_search.best_params_)
print("Training accuracy:", best_rf.score(X_train, y_train))
print("Test accuracy:", best_rf.score(X_test, y_test))

########################################
# 3. GET RISK SCORES FOR THE FULL DATA
########################################

# Column 1 of predict_proba is the second entry of best_rf.classes_; that is
# the "sick" class assuming the labels are exactly {0, 1} -- TODO confirm.
# NOTE(review): X includes the rows the model was trained on, so their scores
# are likely more confident than scores on unseen data.
risk_probs = best_rf.predict_proba(X)[:, 1]  # Probability of being sick
porcine_df["pred_risk"] = risk_probs

########################################
# 4. ASSIGN RISK BUCKETS (5 BUCKETS)
########################################

def risk_to_bucket(r):
    """Map a risk score to one of five bucket indices (0..4).

    Buckets are 0.2 wide: bucket ``k`` holds scores below ``0.2 * (k + 1)``
    that did not fit an earlier bucket.  Anything that is not below 0.8
    (including 1.0) falls into bucket 4.
    """
    # Walk the upper bounds in ascending order; the first bound the score is
    # strictly below decides the bucket.
    for bucket_index, upper_bound in enumerate((0.2, 0.4, 0.6, 0.8)):
        if r < upper_bound:
            return bucket_index
    return 4

porcine_df["risk_bucket"] = porcine_df["pred_risk"].apply(risk_to_bucket)

########################################
# 5. BUILD MARKOV MATRICES (row-by-row)
########################################

num_buckets = 5

# Raw transition tallies: entry [i, j] counts observed moves from bucket i
# at one sample to bucket j at the next sample of the same pig.
trans_counts_healthy = np.zeros((num_buckets, num_buckets), dtype=np.float64)
trans_counts_sick = np.zeros((num_buckets, num_buckets), dtype=np.float64)

# Transitions are counted per time step within each pig_id.  The label at the
# *source* time step decides which matrix receives the count, even when the
# label changes between the two steps.

# Order rows so that consecutive rows of a pig are consecutive samples.
porcine_df = porcine_df.sort_values(by=["pig_id", "sample"])

for _, pig_rows in porcine_df.groupby("pig_id"):
    pig_rows = pig_rows.sort_values("sample")

    bucket_seq = pig_rows["risk_bucket"].to_numpy()
    label_seq = pig_rows["label"].to_numpy()

    # Pair every sample with its successor; the final sample has no
    # successor and therefore contributes no transition.
    for src, dst, src_label in zip(bucket_seq[:-1], bucket_seq[1:], label_seq[:-1]):
        tally = trans_counts_healthy if src_label == 0 else trans_counts_sick
        tally[src, dst] += 1

def row_normalize(counts_2d):
    """Turn a matrix of transition counts into a row-stochastic matrix.

    Each row is divided by its sum.  A row whose sum is zero (no observed
    transitions from that state) is replaced by the uniform distribution
    over the columns (0.2, ..., 0.2 for five buckets).

    The input is never modified: a new float64 array is returned, so the raw
    counts remain available to the caller and integer count arrays are
    accepted as well.

    Parameters
    ----------
    counts_2d : array-like of shape (n_rows, n_cols)
        Transition counts.

    Returns
    -------
    numpy.ndarray of shape (n_rows, n_cols), dtype float64
        Row-normalized probabilities.
    """
    counts = np.asarray(counts_2d, dtype=np.float64)
    n_cols = counts.shape[1]
    if n_cols == 0:
        # Nothing to normalize; avoid dividing by a zero column count below.
        return counts.copy()

    row_sums = counts.sum(axis=1, keepdims=True)
    empty_rows = row_sums == 0

    # Substitute 1 for zero sums so the division never produces NaN/inf;
    # those rows are overwritten with the uniform fallback anyway.
    safe_sums = np.where(empty_rows, 1.0, row_sums)
    return np.where(empty_rows, 1.0 / n_cols, counts / safe_sums)

# Convert the healthy and sick transition tallies into probability matrices
# (healthy first, then sick) and show both.
M_healthy, M_sick = map(row_normalize, (trans_counts_healthy, trans_counts_sick))

print("\nM_healthy:\n", M_healthy)
print("\nM_sick:\n", M_sick)

########################################
# 6. GENERATE SYNTHETIC PATIENTS
########################################

N = 600       # number of synthetic patients to generate
T_max = 20    # last time index; the generation loop emits T_max + 1 rows per patient
p_sick = 0.2  # fraction of new patients that start out sick

# Initial bucket distribution taken from the *overall* bucket usage.
# minlength keeps one entry per bucket even when the highest bucket(s) never
# occur in the data; without it the probability vector can be shorter than
# num_buckets and np.random.choice(num_buckets, p=...) rejects it.
bucket_counts = np.bincount(porcine_df["risk_bucket"], minlength=num_buckets)
bucket_probs = bucket_counts / bucket_counts.sum()

def sample_initial_bucket():
    """Draw a starting bucket index in ``range(num_buckets)`` weighted by ``bucket_probs``."""
    start_bucket = np.random.choice(num_buckets, p=bucket_probs)
    return start_bucket


# Bucket b spans the risk interval from bucket_edges[b] to bucket_edges[b + 1].
bucket_edges = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]


def sample_risk_from_bucket(b):
    """Draw a risk score uniformly from the interval belonging to bucket ``b``."""
    lower = bucket_edges[b]
    upper = bucket_edges[b + 1]
    return np.random.uniform(lower, upper)

synthetic_rows = []

for patient_id in range(N):
    # The health status is fixed for the patient's whole trajectory; there is
    # no mid-trajectory onset of sickness in this simple model.
    label = 1 if np.random.rand() < p_sick else 0

    # The label selects which Markov chain drives the bucket sequence.
    transition = M_sick if label == 1 else M_healthy

    current = sample_initial_bucket()

    for t in range(T_max + 1):
        # Emit the current state with a score drawn inside its bucket ...
        synthetic_rows.append({
            "patient_id":  patient_id,
            "time":        t,
            "risk_bucket": current,
            "risk_score":  sample_risk_from_bucket(current),
            "label":       label,
        })

        # ... then step the chain using the row of the current bucket.
        current = np.random.choice(num_buckets, p=transition[current, :])

synthetic_df = pd.DataFrame(synthetic_rows)

########################################
# 7. SAVE TO CSV
########################################

# Keep the path in one place so the confirmation message always names the
# file that was actually written.
output_path = "synthetic_patients_rf.csv"
synthetic_df.to_csv(output_path, index=False)
print(f"\nDone! Synthetic dataset saved to '{output_path}'.")

Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 5, 'n_estimators': 100}
Training accuracy: 0.9136904761904762
Test accuracy: 0.8388888888888889

M_healthy:
 [[8.57975237e-01 1.14348143e-01 2.47632921e-02 2.18499636e-03
  7.28332119e-04]
 [5.46666667e-01 3.46666667e-01 8.00000000e-02 2.66666667e-02
  0.00000000e+00]
 [3.52112676e-01 3.66197183e-01 2.11267606e-01 5.63380282e-02
  1.40845070e-02]
 [9.52380952e-02 4.76190476e-01 1.90476190e-01 1.90476190e-01
  4.76190476e-02]
 [0.00000000e+00 3.33333333e-01 6.66666667e-01 0.00000000e+00
  0.00000000e+00]]

M_sick:
 [[0.19047619 0.28571429 0.35714286 0.11904762 0.04761905]
 [0.13684211 0.27368421 0.41052632 0.11578947 0.06315789]
 [0.0979021  0.23776224 0.25874126 0.28671329 0.11888112]
 [0.05084746 0.09039548 0.1920904  0.42937853 0.23728814]
 [0.01481481 0.05925926 0.08148148 0.34074074 0.5037037 ]]

Done! Synthetic dataset saved to 'synthetic_data_rf.csv'.


In [15]:
import csv
import random
import numpy as np

def generate_synthetic_patients(
    output_csv="synthetic_patients.csv",
    num_patients=600,
    time_horizon=20,
    sick_fraction=0.20,
    seed=1234
):
    """
    Write simulated patient risk trajectories to a CSV file.

    The file has the columns [patient_id, time, risk_bucket, risk_score, label]
    and contains ``time_horizon + 1`` rows (time 0..time_horizon) per patient.

    - 'label' is fixed per patient: 1 (sick) with probability
      ``sick_fraction``, otherwise 0 (healthy).
    - 'risk_bucket' is an integer 0..4.  The first bucket is drawn uniformly;
      later buckets follow a 5-state Markov chain chosen by the label.
    - 'risk_score' is drawn uniformly inside the bucket's 0.2-wide range and
      written with four decimals.

    Both the global ``random`` and ``numpy.random`` generators are seeded
    with ``seed``, so repeated calls with the same arguments write the same
    file.  A completion message is printed at the end.
    """
    # Labels and scores come from `random`, bucket draws from `numpy.random`;
    # both are seeded for reproducibility.
    random.seed(seed)
    np.random.seed(seed)

    bucket_ids = [0, 1, 2, 3, 4]

    # (low, high) score range of each bucket, indexed by bucket id.
    score_bounds = [
        (0.0, 0.2),
        (0.2, 0.4),
        (0.4, 0.6),
        (0.6, 0.8),
        (0.8, 1.0),
    ]

    # Row i of each matrix is the next-bucket distribution when leaving bucket i.
    # Per the original author's note, the healthy matrix was extracted from the
    # porcine data (not verifiable from this function alone).
    healthy_chain = np.array([
        [0.70, 0.20, 0.10, 0.00, 0.00],
        [0.20, 0.65, 0.15, 0.00, 0.00],
        [0.15, 0.50, 0.35, 0.00, 0.00],
        [0.01, 0.19, 0.60, 0.20, 0.00],
        [0.00, 0.40, 0.50, 0.10, 0.00],
    ])
    sick_chain = np.array([
        [0.05, 0.40, 0.40, 0.10, 0.05],
        [0.00, 0.20, 0.50, 0.20, 0.10],
        [0.00, 0.00, 0.10, 0.70, 0.20],
        [0.00, 0.00, 0.00, 0.40, 0.60],
        [0.00, 0.00, 0.00, 0.10, 0.90],
    ])
    chain_for_label = {0: healthy_chain, 1: sick_chain}

    with open(output_csv, mode="w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(["patient_id", "time", "risk_bucket", "risk_score", "label"])

        for pid in range(num_patients):
            # One sick/healthy draw per patient, valid for the whole trajectory.
            label = int(random.random() < sick_fraction)
            chain = chain_for_label[label]

            # Uniformly random starting bucket.
            bucket = np.random.choice(bucket_ids)

            for step in range(time_horizon + 1):
                lo, hi = score_bounds[bucket]
                score = random.uniform(lo, hi)
                out.writerow([pid, step, bucket, f"{score:.4f}", label])

                # Advance the chain (also after the last row, which keeps the
                # random-number stream unchanged).
                bucket = np.random.choice(bucket_ids, p=chain[bucket])

    print(f"Done! Generated {num_patients} synthetic patients in {output_csv}.")

if __name__ == "__main__":
    # Entry point: generate the default synthetic cohort.
    run_settings = {
        "output_csv": "synthetic_patients.csv",
        "num_patients": 600,     # 500 for train + 100 for test
        "time_horizon": 20,
        "sick_fraction": 0.20,   # ~20% truly hemorrhagic
        "seed": 1234,
    }
    generate_synthetic_patients(**run_settings)

Done! Generated 600 synthetic patients in synthetic_patients.csv.
