In [7]:
import numpy as np
import pandas as pd
import random

def build_transition_dict(df_subset):
    """
    Collect observed state-to-state transitions for one label group.

    Rows are ordered by pig_id and sample, and every pair of adjacent rows
    belonging to the same pig contributes one transition.

    Keys are tuples (sample, EIT, NIRS, EIS) with the sample cast to int and
    the three signals rounded to 2 decimals. Each value is the list of states
    (same tuple layout) that followed that key, in the order encountered;
    duplicates are kept, so frequently observed successors appear more often.
    """
    def as_state(record):
        # record is [sample, EIT, NIRS, EIS] as produced by .values.tolist()
        t, eit, nirs, eis = record
        return (int(t), round(float(eit), 2), round(float(nirs), 2), round(float(eis), 2))

    ordered = df_subset.sort_values(by=['pig_id', 'sample']).reset_index(drop=True)

    transitions = {}
    # Pairs are formed per pig so a transition never spans two animals
    for _, pig_frame in ordered.groupby('pig_id'):
        pig_frame = pig_frame.sort_values(by='sample')
        records = pig_frame[['sample', 'EIT', 'NIRS', 'EIS']].values.tolist()

        # Walk adjacent (earlier, later) rows of this pig
        for earlier, later in zip(records, records[1:]):
            source_state = as_state(earlier)
            target_state = as_state(later)
            transitions.setdefault(source_state, []).append(target_state)

    return transitions

def sample_initial_state(df_subset):
    """
    Draw one random starting state from the given rows.

    Rows with sample == 0 are preferred; when there are none, the draw is
    made from all rows of df_subset instead.

    Returns a tuple (sample, EIT, NIRS, EIS) with the sample cast to int and
    the three signals rounded to 2 decimals.
    """
    at_start = df_subset.loc[df_subset['sample'] == 0]
    pool = df_subset if at_start.empty else at_start

    picked = pool.sample(n=1).iloc[0]

    start_time = int(picked['sample'])
    eit = round(float(picked['EIT']), 2)
    nirs = round(float(picked['NIRS']), 2)
    eis = round(float(picked['EIS']), 2)
    return (start_time, eit, nirs, eis)

def generate_one_patient(transition_dict, df_subset, label_val, patient_id, n_steps=20):
    """
    Generate one synthetic patient trajectory for t=0..n_steps.

    transition_dict: maps (t, EIT, NIRS, EIS) to a list of observed next states.
        It is only read; this function never modifies it.
    df_subset: rows from which the initial state is sampled.
    label_val: 0 for healthy, 1 for sick; copied into every output row.
    patient_id: Unique integer ID; copied into every output row.
    n_steps: number of steps generated after the initial state (default 20).

    Returns a list of n_steps + 1 dicts with keys
    'patient_id', 'time', 'EIT', 'NIRS', 'EIS', 'label'.

    At each step the next state is drawn uniformly from the successors recorded
    for the current state. If the current state has no entry, the successors of
    a randomly chosen recorded state at the same time step are used instead
    (so the trajectory may jump); if no state is recorded for that time step
    at all, the current values are carried forward unchanged.
    """
    def make_row(t, state):
        return {
            'patient_id': patient_id,
            'time': t,
            'EIT': state[1],
            'NIRS': state[2],
            'EIS': state[3],
            'label': label_val
        }

    # Pick an initial state, forcing time to 0
    _, eit0, nirs0, eis0 = sample_initial_state(df_subset)
    current_state = (0, eit0, nirs0, eis0)
    synthetic_rows = [make_row(0, current_state)]

    # Index of recorded keys per time step, built only if a lookup misses.
    # Insertion order of transition_dict is preserved within each bucket.
    keys_by_time = None

    for t in range(1, n_steps + 1):
        # Build key from previous state with rounding
        key = (t - 1,) + tuple(round(value, 2) for value in current_state[1:])

        possible_next_states = transition_dict.get(key)
        if possible_next_states is None:
            if keys_by_time is None:
                keys_by_time = {}
                for known_key in transition_dict:
                    keys_by_time.setdefault(known_key[0], []).append(known_key)

            same_time_keys = keys_by_time.get(t - 1)
            if same_time_keys:
                # Borrow the successors of a random recorded state at time t-1
                possible_next_states = transition_dict[random.choice(same_time_keys)]
            else:
                # Nothing recorded at time t-1: carry the current values forward.
                # Kept local on purpose -- writing this into transition_dict would
                # make later patients treat a fabricated state as observed data.
                possible_next_states = [(t,) + key[1:]]

        chosen_next = random.choice(possible_next_states)

        # For consistency, force the time value to t
        current_state = (t, chosen_next[1], chosen_next[2], chosen_next[3])
        synthetic_rows.append(make_row(t, current_state))

    return synthetic_rows

def main():
    """
    Generate a synthetic patient cohort from the porcine recordings.

    Reads 'porcine_data.csv', builds one transition dictionary per label
    (0 = healthy, 1 = sick), generates 600 patients that are each labelled
    sick with probability 0.20, writes all rows to 'synthetic_patients.csv',
    and prints the first 30 rows.
    """
    # Source recordings (expected columns: [pig_id, sample, label, EIT, NIRS, EIS])
    recordings = pd.read_csv('porcine_data.csv')

    healthy_rows = recordings[recordings['label'] == 0].copy()
    sick_rows = recordings[recordings['label'] == 1].copy()

    # label -> (transition dictionary, rows used to sample the initial state)
    resources = {
        0: (build_transition_dict(healthy_rows), healthy_rows),
        1: (build_transition_dict(sick_rows), sick_rows),
    }

    total_patients = 600
    sick_probability = 0.20  # each patient is sick independently with this chance

    all_rows = []
    for pid in range(1, total_patients + 1):
        label_val = 1 if random.random() < sick_probability else 0
        transitions, source_rows = resources[label_val]
        all_rows.extend(generate_one_patient(transitions, source_rows, label_val, pid))

    column_order = ['patient_id', 'time', 'EIT', 'NIRS', 'EIS', 'label']
    synthetic_df = pd.DataFrame(all_rows, columns=column_order)

    # Persist the cohort, then show a preview
    synthetic_df.to_csv('synthetic_patients.csv', index=False)
    print("Synthetic data generation complete!")
    print(synthetic_df.head(30))

# Run the generation pipeline only when this file is executed directly,
# so importing it for its helper functions has no side effects.
if __name__ == "__main__":
    main()

Synthetic data generation complete!
    patient_id  time    EIT   NIRS     EIS  label
0            1     0  14.92  77.93  505.98      0
1            1     1  14.92  77.93  505.98      0
2            1     2  14.61  75.27  493.36      0
3            1     3  14.54  76.53  483.18      0
4            1     4  14.52  74.31  485.06      0
5            1     5  14.60  73.24  490.62      0
6            1     6  14.45  76.68  493.30      0
7            1     7  14.27  77.64  484.40      0
8            1     8  14.36  77.41  485.44      0
9            1     9  14.50  74.73  483.55      0
10           1    10  14.56  73.05  491.05      0
11           1    11  14.56  74.97  501.82      0
12           1    12  14.54  73.41  501.76      0
13           1    13  14.56  74.36  501.34      0
14           1    14  14.62  75.13  506.31      0
15           1    15  14.48  74.64  503.91      0
16           1    16  14.52  73.00  511.79      0
17           1    17  14.62  73.77  504.38      0
18           1