In [None]:

# Drop 'Case ID': it identifies a case and is not a feature for PCA
df_pca = df_reduced.drop(columns=['Case ID'])

# Standardize before PCA so every feature contributes on the same scale
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_pca)

# Fit an unrestricted PCA first to see how the variance is distributed
pca = PCA()
pca_components = pca.fit_transform(df_scaled)

# Explained variance ratio of every component
explained_variance = pca.explained_variance_ratio_
df_pca_variance = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(len(explained_variance))],
    'Explained Variance Ratio': explained_variance
})

# Smallest number of leading components whose cumulative variance reaches 90%:
# count the components still below the target and add the one that crosses it.
cumulative_variance = explained_variance.cumsum()
num_components = int((cumulative_variance < 0.90).sum()) + 1

# Refit PCA restricted to that number of components
pca_optimal = PCA(n_components=num_components)
df_pca_transformed = pca_optimal.fit_transform(df_scaled)

# Component scores as a DataFrame for easier interpretation
pc_names = [f'PC{i+1}' for i in range(num_components)]
df_pca_final = pd.DataFrame(df_pca_transformed, columns=pc_names)

# Loadings: one row per retained component, one column per original feature
pca_loadings = pd.DataFrame(pca_optimal.components_, columns=df_pca.columns, index=pc_names)

# The three features with the largest absolute loading on each component
top_features_per_pc = {
    pc: pca_loadings.loc[pc].abs().nlargest(3).index.tolist()
    for pc in pca_loadings.index
}
df_top_features_pca = pd.DataFrame.from_dict(
    top_features_per_pc,
    orient='index',
    columns=['Top Feature 1', 'Top Feature 2', 'Top Feature 3'],
)

# Only the last expression of a cell is rendered automatically; the earlier tables
# were previously evaluated and thrown away, so show them explicitly.
display(df_pca_variance.head(15))
display(df_pca_final.head())
df_top_features_pca.head(12)

In [None]:
# Step 1: Keep only the columns needed for the biomarker analysis
columns_to_keep = ['Case ID', 'Activity', 'Leucocytes', 'CRP', 'LacticAcid']
df_filtered = df[columns_to_keep].copy()

# Step 2: Attach each event's successor BEFORE reshaping.
# melt() stacks the frame once per biomarker, so shifting afterwards would make the
# last event of a case in one biomarker block "continue" with the first event of the
# same case in the next block, producing a bogus wrap-around transition.
# NOTE(review): assumes the rows of df are already in chronological order per case.
df_with_next = df_filtered.assign(
    **{'Next Activity': df_filtered.groupby('Case ID')['Activity'].shift(-1)}
)

# Step 3: Reshape to one row per (event, biomarker); keep the established column order
df_melted = df_with_next.melt(
    id_vars=['Case ID', 'Activity', 'Next Activity'],
    value_vars=['Leucocytes', 'CRP', 'LacticAcid'],
    var_name='Biomarker',
    value_name='Value',
)[['Case ID', 'Activity', 'Biomarker', 'Value', 'Next Activity']]

# Step 4: Drop rows where Value or Next Activity (or any other field) is NaN
df_biomarkers_next_activity = df_melted.dropna().reset_index(drop=True)

# Display the resulting DataFrame
df_biomarkers_next_activity.head(20)

In [None]:
df_biomarkers_next_activity.to_csv('Sepsis_Cases_Biomarkers.csv', index=False)
# Check unique biomarkers in the dataset
unique_biomarkers = df_biomarkers_next_activity['Biomarker'].unique()

# Get summary statistics for each biomarker
biomarker_stats = df_biomarkers_next_activity.groupby('Biomarker')['Value'].describe()

# Display the statistics
print(biomarker_stats)


# Leucocytes ranges
leucocytes_bins = [0, 7.5, 12.5, 15.0, 30.0, np.inf]
leucocytes_labels = ['Low', 'Normal', 'Elevated', 'High', 'Critical']

# CRP ranges
crp_bins = [0, 50, 100, 150, 250, np.inf]
crp_labels = ['Low', 'Mild', 'Moderate', 'Severe', 'Critical']

# Lactic Acid ranges
lactic_bins = [0, 1.2, 1.8, 2.5, 4.0, np.inf]
lactic_labels = ['Normal', 'Borderline', 'Elevated', 'High', 'Critical']

# Bin the values with one vectorised pd.cut call per biomarker instead of one call
# per row. Labels are stored as plain objects (strings / NaN), exactly as before,
# so the groupby below only reports ranges that actually occur.
bins_and_labels = {
    'Leucocytes': (leucocytes_bins, leucocytes_labels),
    'CRP': (crp_bins, crp_labels),
    'LacticAcid': (lactic_bins, lactic_labels),
}
value_ranges = pd.Series(index=df_biomarkers_next_activity.index, dtype=object)
for biomarker, values in df_biomarkers_next_activity.groupby('Biomarker')['Value']:
    # A biomarker without its own scale falls back to the lactic-acid bins, as before
    bins, labels = bins_and_labels.get(biomarker, (lactic_bins, lactic_labels))
    value_ranges.loc[values.index] = pd.cut(values, bins=bins, labels=labels).astype(object)
df_biomarkers_next_activity['Range'] = value_ranges


def _most_frequent(activities):
    """Return the most frequent value of a Series (first mode), or None if there is none."""
    modes = activities.mode()
    return modes.iloc[0] if not modes.empty else None


# Determine the most frequent next activity per biomarker range
most_common_next_activity = (
    df_biomarkers_next_activity
    .groupby(['Biomarker', 'Range'])['Next Activity']
    .agg(_most_frequent)
)

# Convert to DataFrame for easier viewing
df_most_common_next_activity = most_common_next_activity.reset_index()

# Display results
print(df_most_common_next_activity)

In [None]:
import matplotlib.pyplot as plt

# Age plotted against the 'Non-NaN Count' column, one point per row of merged_df
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(merged_df['Age'], merged_df['Non-NaN Count'], color='blue', alpha=0.6)

# Title and axis labels
ax.set_title('Scatter Plot of Age vs Non-NaN Count')
ax.set_xlabel('Age')
ax.set_ylabel('Non-NaN Count')

# Render the figure
fig.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Step 1: Melt the data to a long format (one row per case and activity position)
activity_columns = [f"Activity {i}" for i in range(1, 50)]  # Activity 1 to Activity 49
melted_df = merged_df.melt(
    id_vars=['Case ID', 'Leucocytes', 'CRP', 'LacticAcid'],
    value_vars=activity_columns,
    var_name='ActivityOrder',
    value_name='Activity_Value'
)

# "Activity 12" -> 12 so positions sort numerically.
# Raw string: '\d' inside a normal literal is an invalid escape sequence.
melted_df['ActivityOrder'] = (
    melted_df['ActivityOrder'].str.extract(r'(\d+)', expand=False).astype(int)
)

# Step 2: Order each case's activities, then pair every activity with its successor
melted_df = melted_df.sort_values(['Case ID', 'ActivityOrder'])
melted_df['NextActivity'] = melted_df.groupby('Case ID')['Activity_Value'].shift(-1)

# Drop rows where NextActivity is NaN (end of sequence)
melted_df = melted_df.dropna(subset=['NextActivity'])


# Step 3: Summarise each biomarker per Activity_Value -> NextActivity transition
def _transition_stats(biomarker):
    """Mean/min/max of `biomarker` for every (Activity_Value, NextActivity) pair in melted_df."""
    return (
        melted_df.groupby(['Activity_Value', 'NextActivity'])[biomarker]
        .agg(['mean', 'min', 'max'])
        .reset_index()
        .rename(columns={
            'mean': f'{biomarker}_Mean',
            'min': f'{biomarker}_Min',
            'max': f'{biomarker}_Max',
        })
    )


leucocytes_df = _transition_stats('Leucocytes')
crp_df = _transition_stats('CRP')
lactic_acid_df = _transition_stats('LacticAcid')




In [None]:
import pandas as pd

# Template analysis: merged_df is expected to provide the columns
# 'CurrentActivity', 'NextActivity', 'Leucocytes', 'CRP' and 'LacticAcid'.


def _summarise_transitions(frame, biomarker):
    """Mean/min/max of `biomarker` per CurrentActivity -> NextActivity pair of `frame`."""
    per_transition = frame.groupby(['CurrentActivity', 'NextActivity'])[biomarker]
    summary = per_transition.agg(['mean', 'min', 'max']).reset_index()
    return summary.rename(columns={
        'mean': f'{biomarker}_Mean',
        'min': f'{biomarker}_Min',
        'max': f'{biomarker}_Max',
    })


# One summary table per biomarker
leucocytes_df = _summarise_transitions(merged_df, 'Leucocytes')
crp_df = _summarise_transitions(merged_df, 'CRP')
lactic_acid_df = _summarise_transitions(merged_df, 'LacticAcid')

# Report the three tables in turn
print("Leucocytes Analysis:")
print(leucocytes_df)

print("\nCRP Analysis:")
print(crp_df)

print("\nLactic Acid Analysis:")
print(lactic_acid_df)


In [None]:
# Upper-quartile (75th percentile) value of each diagnostic column in merged_df
leucocytes_threshold, crp_threshold, lactic_acid_threshold = (
    merged_df[biomarker].quantile(0.75)
    for biomarker in ('Leucocytes', 'CRP', 'LacticAcid')
)

print(f"Leucocytes Threshold: {leucocytes_threshold}")
print(f"CRP Threshold: {crp_threshold}")
print(f"Lactic Acid Threshold: {lactic_acid_threshold}")


In [None]:
import pandas as pd
import numpy as np


# Per-event diagnostic measurements
diagnosis_df = merged_df[['Case ID', 'Activity', 'Leucocytes', 'CRP', 'LacticAcid']]

# Trace columns only ("Activity 1", "Activity 2", ...), in numeric order.
# A plain substring test ("Activity" in col) would also pick up the 'Activity'
# column itself and any '...Activity' helper column, polluting every trace.
trace_prefix = 'Activity '
trace_columns = sorted(
    (
        col for col in merged_df.columns
        if isinstance(col, str)
        and col.startswith(trace_prefix)
        and col[len(trace_prefix):].isdecimal()
    ),
    key=lambda col: int(col[len(trace_prefix):]),
)
activity_flow_df = merged_df[['Case ID'] + trace_columns]

# One trace per case, looked up by label instead of re-filtering the whole frame for
# every case. The first row of a case wins, matching the former `.iloc[0]` lookup.
case_traces = activity_flow_df.drop_duplicates(subset='Case ID').set_index('Case ID')

# transition -> lists of diagnostic values observed at the transition's source activity
activity_transitions = {}
diagnostic_tests = ('Leucocytes', 'CRP', 'LacticAcid')

# Analyze transitions for each case
for case_id, case_group in diagnosis_df.groupby("Case ID"):
    # Ordered activities of this case; trailing NaN padding removed
    activities = case_traces.loc[case_id].dropna().values

    for current_activity, next_activity in zip(activities[:-1], activities[1:]):
        # Diagnostic rows recorded for the current activity; skip if there are none
        diag_values = case_group[case_group['Activity'] == current_activity]
        if diag_values.empty:
            continue
        first_measurement = diag_values.iloc[0]

        # The transition is registered even when all three values are missing
        recorded = activity_transitions.setdefault(
            (current_activity, next_activity),
            {test: [] for test in diagnostic_tests},
        )
        for test in diagnostic_tests:
            value = first_measurement[test]
            if pd.notna(value):
                recorded[test].append(value)

# Mean / min / max per transition and diagnostic test (tests without values are omitted)
transition_rules = {
    transition: {
        test: {
            'mean': np.mean(test_values),
            'min': np.min(test_values),
            'max': np.max(test_values),
        }
        for test, test_values in values.items()
        if test_values
    }
    for transition, values in activity_transitions.items()
}

# Output the identified thresholds and patterns
for transition, thresholds in transition_rules.items():
    print(f"Transition: {transition}")
    for test, stats in thresholds.items():
        print(f"  {test}: Mean={stats['mean']:.2f}, Min={stats['min']:.2f}, Max={stats['max']:.2f}")
    print()


In [None]:
def predict_next_activity(current_activity, leucocytes=None, crp=None, lactic_acid=None):
    """
    Predict the next activity based on current activity and diagnostic values.

    Each rule maps a (current activity, next activity) pair to the minimum
    diagnostic value required for that transition. A rule applies only when the
    measurement it refers to was actually supplied and reaches the threshold.
    When several rules for the current activity apply, the one with the highest
    threshold wins. Plain first-match would let the lowest threshold shadow every
    stricter rule listed after it, leaving those rules unreachable.

    Parameters:
        current_activity: Name of the activity that has just been performed.
        leucocytes: Leucocytes value, or None if not measured.
        crp: CRP value, or None if not measured.
        lactic_acid: Lactic acid value, or None if not measured.

    Returns:
        The predicted next activity name, or "Unknown" when no rule applies.
    """
    # Define thresholds for transitions
    transition_rules = {
        ('Leucocytes', 'LacticAcid'): {'Leucocytes': 13.75},
        ('LacticAcid', 'CRP'): {'LacticAcid': 2.47},
        ('CRP', 'IV Liquid'): {'CRP': 105.43},
        ('Leucocytes', 'CRP'): {'Leucocytes': 14.58},
        ('CRP', 'LacticAcid'): {'CRP': 123.35},
        ('LacticAcid', 'IV Antibiotics'): {'LacticAcid': 2.21},
        ('Leucocytes', 'IV Antibiotics'): {'Leucocytes': 15.20},
        ('CRP', 'IV Antibiotics'): {'CRP': 104.66},
        ('CRP', 'Admission NC'): {'CRP': 111.18},
        ('LacticAcid', 'Admission IC'): {'LacticAcid': 4.46},
    }
    measurements = {'Leucocytes': leucocytes, 'CRP': crp, 'LacticAcid': lactic_acid}

    best_activity = "Unknown"  # Default: unable to predict next activity
    best_strictness = float('-inf')
    for (start, end), thresholds in transition_rules.items():
        if start != current_activity:
            continue

        # A missing measurement must not satisfy a rule vacuously
        satisfied = all(
            measurements[test] is not None and measurements[test] >= minimum
            for test, minimum in thresholds.items()
        )
        # Every rule above constrains a single test, so the sum is that test's threshold
        strictness = sum(thresholds.values())

        # Strict '>' keeps the earlier rule when two thresholds tie
        if satisfied and strictness > best_strictness:
            best_activity, best_strictness = end, strictness

    return best_activity

# Example usage: only a (high) CRP value is available after the 'CRP' activity
current_activity = "CRP"
leucocytes, lactic_acid = None, None
crp = 130

predicted_next_activity = predict_next_activity(
    current_activity,
    leucocytes=leucocytes,
    crp=crp,
    lactic_acid=lactic_acid,
)
print(f"Predicted Next Activity: {predicted_next_activity}")


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# ---------- STEP 1: DATA PREPROCESSING ----------

# Features
features = [
    'Leucocytes', 'CRP', 'LacticAcid', 'InfectionSuspected', 'DiagnosticBlood',
    'DisfuncOrg', 'SIRSCritTachypnea', 'Hypotensie', 'SIRSCritHeartRate', 'Infusion',
    'DiagnosticArtAstrup', 'Age', 'DiagnosticIC', 'DiagnosticSputum',
    'SIRSCriteria2OrMore', 'DiagnosticXthorax', 'SIRSCritTemperature',
    'DiagnosticUrinaryCulture', 'DiagnosticLacticAcid', 'DiagnosticUrinarySediment', 'DiagnosticECG'
]

# Prepare features (X). Take an explicit copy: writing the scaled 'Age' into a bare
# column selection of merged_df raises SettingWithCopyWarning and leaves it ambiguous
# whether merged_df itself is modified.
X = merged_df[features].copy()
scaler = StandardScaler()
X['Age'] = scaler.fit_transform(X[['Age']]).ravel()  # Scale 'Age' only

# Prepare storage for models and mappings
models = {}
activity_columns = [f'Activity {i}' for i in range(1, 51)]  # Limit to 50 activities
accuracy_scores = []

# ---------- STEP 2: TRAIN MODELS FOR EACH ACTIVITY ----------

for i in range(3, 51):  # Start from Activity 3
    activity_col = f'Activity {i}'

    # Stop at the first missing activity column (later positions are not trained either)
    if activity_col not in merged_df.columns:
        break

    # Get target for the activity
    y = merged_df[activity_col]

    # Encode labels dynamically; every distinct value returned by unique() gets a code
    unique_labels = y.unique()
    activity_mapping = {label: idx for idx, label in enumerate(unique_labels)}
    y_encoded = y.map(activity_mapping)

    # Train-test split (same random_state for every activity position)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

    # Train Random Forest Model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Calculate and store accuracy
    y_pred = rf_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_scores.append(acc)

    # Store the model and label mappings
    models[activity_col] = {
        'model': rf_model,
        'mapping': activity_mapping,
        'reverse_mapping': {v: k for k, v in activity_mapping.items()}
    }

# ---------- STEP 3: FUNCTION TO HANDLE DIAGNOSTICS ----------

# Rule-based successor of a diagnostic activity, driven by fixed example thresholds
def predict_next_activity(current_activity, leucocytes, crp, lactic_acid):
    """Return the activity that follows `current_activity` under fixed thresholds.

    Only the measurement belonging to `current_activity` is inspected:
    Leucocytes > 15 and CRP > 100 lead to 'IV Antibiotics' (otherwise 'CRP' and
    'LacticAcid' respectively), LacticAcid > 2.0 leads to 'ICU Admit' (otherwise
    'Discharge'). Any other activity yields 'END'.
    """
    if current_activity == 'Leucocytes':
        return 'IV Antibiotics' if leucocytes > 15 else 'CRP'  # Example threshold
    if current_activity == 'CRP':
        return 'IV Antibiotics' if crp > 100 else 'LacticAcid'  # Example threshold
    if current_activity == 'LacticAcid':
        return 'ICU Admit' if lactic_acid > 2.0 else 'Discharge'  # Example threshold
    return 'END'

# ---------- STEP 4: PREDICT ACTIVITY SEQUENCE ----------

# Define stopping criteria
stopping_activities = ['Release A', 'Release B', 'Release C', 'Release D', 'Release E', 'Return ER']

def predict_sequence(initial_features, leucocytes, crp, lactic_acid):
    """Predict a patient's activity flow from a single feature vector.

    The sequence starts with 'ER Registration', 'ER Triage'. For every following
    position the matching entry of the global `models` dict ('Activity 3',
    'Activity 4', ...) predicts the activity. Prediction stops when a model is
    missing, a NaN label is predicted, or a stopping activity is reached (the
    stopping activity is included in the result).

    Parameters:
        initial_features: Raw (unscaled) values in the order of the global
            `features` list. 'Age' is scaled here with the global `scaler`.
        leucocytes, crp, lactic_acid: Diagnostic values passed to
            `predict_next_activity` (see the NOTE in the loop).

    Returns:
        List of predicted activity names, including the two-element prefix.
    """
    activities_sequence = ['ER Registration', 'ER Triage']  # Start with first two activities

    # The model input never changes between positions, so build it once.
    # float dtype: an all-integer input would otherwise truncate the scaled 'Age'.
    # Named columns: scaler and models were fitted on DataFrames, so passing bare
    # arrays triggers scikit-learn's feature-name mismatch warning.
    input_data = pd.DataFrame([initial_features], columns=features, dtype=float)
    input_data['Age'] = scaler.transform(input_data[['Age']]).ravel()

    # Iterate through activity predictions
    for i in range(3, 51):
        activity_col = f'Activity {i}'
        if activity_col not in models:
            break

        # Predict the activity at this position and decode its label
        position_model = models[activity_col]
        predicted_label = position_model['model'].predict(input_data)[0]
        predicted_activity = position_model['reverse_mapping'][predicted_label]

        # Stop if activity is NaN (no further activity predicted)
        if pd.isna(predicted_activity):
            break

        activities_sequence.append(predicted_activity)

        # Stop once a stopping activity has been appended
        if predicted_activity in stopping_activities:
            break

        # NOTE(review): the rule-based successor is computed for diagnostic
        # activities but its result has never been used; it does not influence
        # the returned sequence. Kept as a call so behavior (including errors on
        # missing diagnostic values) is unchanged. Decide whether it should be
        # appended to the sequence or removed.
        if predicted_activity in ('Leucocytes', 'CRP', 'LacticAcid'):
            predict_next_activity(predicted_activity, leucocytes, crp, lactic_acid)

    return activities_sequence





In [None]:
# ---------- STEP 5: USER INPUT AND TEST ----------

# Ask for the patient attributes that are not hard-coded below
print("Enter patient details:")
age = float(input("Age: "))
infection_suspected = int(input("Infection Suspected (0 or 1): "))
disfunc_org = int(input("Organ Dysfunction (0 or 1): "))
sirs_criteria = int(input("SIRS Criteria 2 or more (0 or 1): "))
hypotensie = int(input("Hypotension (0 or 1): "))
diagnostic_blood = int(input("Diagnostic Blood (0 or 1): "))
diagnostic_lactic_acid = int(input("Diagnostic Lactic Acid (0 or 1): "))

# Feature vector, positionally aligned with the `features` list used for training
example_input = [
    40,                      # Leucocytes
    90,                      # CRP
    15.5,                    # LacticAcid
    infection_suspected,     # InfectionSuspected
    diagnostic_blood,        # DiagnosticBlood
    disfunc_org,             # DisfuncOrg
    0,                       # SIRSCritTachypnea
    hypotensie,              # Hypotensie
    0,                       # SIRSCritHeartRate
    0,                       # Infusion
    0,                       # DiagnosticArtAstrup
    age,                     # Age
    1,                       # DiagnosticIC
    0,                       # DiagnosticSputum
    sirs_criteria,           # SIRSCriteria2OrMore
    0,                       # DiagnosticXthorax
    0,                       # SIRSCritTemperature
    1,                       # DiagnosticUrinaryCulture
    diagnostic_lactic_acid,  # DiagnosticLacticAcid
    0,                       # DiagnosticUrinarySediment
    0,                       # DiagnosticECG
]

# Diagnostic values handed to the rule-based step
leucocytes, crp, lactic_acid = 100, 150, 10.5

predicted_sequence = predict_sequence(example_input, leucocytes, crp, lactic_acid)

# Report the predicted flow
print("\nPredicted Activity Flow:", predicted_sequence)

# Mean test accuracy over all per-position models
final_accuracy = np.mean(accuracy_scores)
print("\nFinal Model Accuracy Score:", round(final_accuracy, 4))

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# ---------- STEP 1: DATA PREPROCESSING ----------

# Features
features = [
    'Leucocytes', 'CRP', 'LacticAcid', 'InfectionSuspected', 'DiagnosticBlood',
    'DisfuncOrg', 'SIRSCritTachypnea', 'Hypotensie', 'SIRSCritHeartRate', 'Infusion',
    'DiagnosticArtAstrup', 'Age', 'DiagnosticIC', 'DiagnosticSputum',
    'SIRSCriteria2OrMore', 'DiagnosticXthorax', 'SIRSCritTemperature',
    'DiagnosticUrinaryCulture', 'DiagnosticLacticAcid', 'DiagnosticUrinarySediment', 'DiagnosticECG'
]

# Prepare features (X). Take an explicit copy: writing the scaled 'Age' into a bare
# column selection of merged_df raises SettingWithCopyWarning and leaves it ambiguous
# whether merged_df itself is modified.
X = merged_df[features].copy()
scaler = StandardScaler()
X['Age'] = scaler.fit_transform(X[['Age']]).ravel()  # Scale 'Age' only

# Prepare storage for models and mappings
models = {}
activity_columns = [f'Activity {i}' for i in range(1, 51)]  # Limit to 50 activities
accuracy_scores = []

# ---------- STEP 2: TRAIN MODELS FOR EACH ACTIVITY ----------

for i in range(3, 51):  # Start from Activity 3
    activity_col = f'Activity {i}'

    # Stop at the first missing activity column (later positions are not trained either)
    if activity_col not in merged_df.columns:
        break

    # Get target for the activity
    y = merged_df[activity_col]

    # Encode labels dynamically; every distinct value returned by unique() gets a code
    unique_labels = y.unique()
    activity_mapping = {label: idx for idx, label in enumerate(unique_labels)}
    y_encoded = y.map(activity_mapping)

    # Train-test split (same random_state for every activity position)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

    # Train Random Forest Model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Calculate and store accuracy
    y_pred = rf_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_scores.append(acc)

    # Store the model and label mappings
    models[activity_col] = {
        'model': rf_model,
        'mapping': activity_mapping,
        'reverse_mapping': {v: k for k, v in activity_mapping.items()}
    }

# ---------- STEP 3: FUNCTION TO HANDLE DIAGNOSTICS ----------

# Rule-based successor of a diagnostic activity, driven by fixed example thresholds
def predict_next_activity(current_activity, leucocytes, crp, lactic_acid):
    """Return the activity that follows `current_activity` under fixed thresholds.

    Only the measurement belonging to `current_activity` is inspected:
    Leucocytes > 15 and CRP > 100 lead to 'IV Antibiotics' (otherwise 'CRP' and
    'LacticAcid' respectively), LacticAcid > 2.0 leads to 'ICU Admit' (otherwise
    'Discharge'). Any other activity yields 'END'.
    """
    if current_activity == 'Leucocytes':
        return 'IV Antibiotics' if leucocytes > 15 else 'CRP'  # Example threshold
    if current_activity == 'CRP':
        return 'IV Antibiotics' if crp > 100 else 'LacticAcid'  # Example threshold
    if current_activity == 'LacticAcid':
        return 'ICU Admit' if lactic_acid > 2.0 else 'Discharge'  # Example threshold
    return 'END'

# ---------- STEP 4: PREDICT ACTIVITY SEQUENCE ----------

# Define stopping criteria
stopping_activities = ['Release A', 'Release B', 'Release C', 'Release D', 'Release E', 'Return ER']

def predict_sequence(initial_features, leucocytes, crp, lactic_acid):
    """Predict a patient's activity flow from a single feature vector.

    The sequence starts with 'ER Registration', 'ER Triage'. For every following
    position the matching entry of the global `models` dict ('Activity 3',
    'Activity 4', ...) predicts the activity. Prediction stops when a model is
    missing, a NaN label is predicted, or a stopping activity is reached (the
    stopping activity is included in the result).

    Parameters:
        initial_features: Raw (unscaled) values in the order of the global
            `features` list. 'Age' is scaled here with the global `scaler`.
        leucocytes, crp, lactic_acid: Diagnostic values passed to
            `predict_next_activity` (see the NOTE in the loop).

    Returns:
        List of predicted activity names, including the two-element prefix.
    """
    activities_sequence = ['ER Registration', 'ER Triage']  # Start with first two activities

    # The model input never changes between positions, so build it once.
    # float dtype: an all-integer input would otherwise truncate the scaled 'Age'.
    # Named columns: scaler and models were fitted on DataFrames, so passing bare
    # arrays triggers scikit-learn's feature-name mismatch warning.
    input_data = pd.DataFrame([initial_features], columns=features, dtype=float)
    input_data['Age'] = scaler.transform(input_data[['Age']]).ravel()

    # Iterate through activity predictions
    for i in range(3, 51):
        activity_col = f'Activity {i}'
        if activity_col not in models:
            break

        # Predict the activity at this position and decode its label
        position_model = models[activity_col]
        predicted_label = position_model['model'].predict(input_data)[0]
        predicted_activity = position_model['reverse_mapping'][predicted_label]

        # Stop if activity is NaN (no further activity predicted)
        if pd.isna(predicted_activity):
            break

        activities_sequence.append(predicted_activity)

        # Stop once a stopping activity has been appended
        if predicted_activity in stopping_activities:
            break

        # NOTE(review): the rule-based successor is computed for diagnostic
        # activities but its result has never been used; it does not influence
        # the returned sequence. Kept as a call so behavior (including errors on
        # missing diagnostic values) is unchanged. Decide whether it should be
        # appended to the sequence or removed.
        if predicted_activity in ('Leucocytes', 'CRP', 'LacticAcid'):
            predict_next_activity(predicted_activity, leucocytes, crp, lactic_acid)

    return activities_sequence

# ---------- STEP 5: USER INPUT AND TEST ----------

# Ask for the patient attributes that are not hard-coded below
print("Enter patient details:")
age = float(input("Age: "))
infection_suspected = int(input("Infection Suspected (0 or 1): "))
disfunc_org = int(input("Organ Dysfunction (0 or 1): "))
sirs_criteria = int(input("SIRS Criteria 2 or more (0 or 1): "))
hypotensie = int(input("Hypotension (0 or 1): "))
diagnostic_blood = int(input("Diagnostic Blood (0 or 1): "))
diagnostic_lactic_acid = int(input("Diagnostic Lactic Acid (0 or 1): "))

# Feature vector, positionally aligned with the `features` list used for training
example_input = [
    40,                      # Leucocytes
    90,                      # CRP
    15.5,                    # LacticAcid
    infection_suspected,     # InfectionSuspected
    diagnostic_blood,        # DiagnosticBlood
    disfunc_org,             # DisfuncOrg
    0,                       # SIRSCritTachypnea
    hypotensie,              # Hypotensie
    0,                       # SIRSCritHeartRate
    0,                       # Infusion
    0,                       # DiagnosticArtAstrup
    age,                     # Age
    1,                       # DiagnosticIC
    0,                       # DiagnosticSputum
    sirs_criteria,           # SIRSCriteria2OrMore
    0,                       # DiagnosticXthorax
    0,                       # SIRSCritTemperature
    1,                       # DiagnosticUrinaryCulture
    diagnostic_lactic_acid,  # DiagnosticLacticAcid
    0,                       # DiagnosticUrinarySediment
    0,                       # DiagnosticECG
]

# Diagnostic values handed to the rule-based step
leucocytes, crp, lactic_acid = 10, 50, 1.5

predicted_sequence = predict_sequence(example_input, leucocytes, crp, lactic_acid)

# Report the predicted flow
print("\nPredicted Activity Flow:", predicted_sequence)

# Mean test accuracy over all per-position models
final_accuracy = np.mean(accuracy_scores)
print("\nFinal Model Accuracy Score:", round(final_accuracy, 4))


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# ---------- STEP 1: DATA PREPROCESSING ----------

# Features
features = [
    'Leucocytes', 'CRP', 'LacticAcid', 'InfectionSuspected', 'DiagnosticBlood',
    'DisfuncOrg', 'SIRSCritTachypnea', 'Hypotensie', 'SIRSCritHeartRate', 'Infusion',
    'DiagnosticArtAstrup', 'Age', 'DiagnosticIC', 'DiagnosticSputum',
    'SIRSCriteria2OrMore', 'DiagnosticXthorax', 'SIRSCritTemperature',
    'DiagnosticUrinaryCulture', 'DiagnosticLacticAcid', 'DiagnosticUrinarySediment', 'DiagnosticECG'
]

# Prepare features (X). Take an explicit copy: writing the scaled 'Age' into a bare
# column selection of merged_df raises SettingWithCopyWarning and leaves it ambiguous
# whether merged_df itself is modified.
X = merged_df[features].copy()
scaler = StandardScaler()
X['Age'] = scaler.fit_transform(X[['Age']]).ravel()  # Scale 'Age' only

# Prepare storage for models and mappings
models = {}
activity_columns = [f'Activity {i}' for i in range(1, 51)]  # Limit to 50 activities
accuracy_scores = []

# ---------- STEP 2: TRAIN MODELS FOR EACH ACTIVITY ----------

for i in range(3, 51):  # Start from Activity 3
    activity_col = f'Activity {i}'

    # Stop at the first missing activity column (later positions are not trained either)
    if activity_col not in merged_df.columns:
        break

    # Get target for the activity
    y = merged_df[activity_col]

    # Encode labels dynamically; every distinct value returned by unique() gets a code
    unique_labels = y.unique()
    activity_mapping = {label: idx for idx, label in enumerate(unique_labels)}
    y_encoded = y.map(activity_mapping)

    # Train-test split (same random_state for every activity position)
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

    # Train Random Forest Model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Calculate and store accuracy
    y_pred = rf_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy_scores.append(acc)

    # Store the model and label mappings
    models[activity_col] = {
        'model': rf_model,
        'mapping': activity_mapping,
        'reverse_mapping': {v: k for k, v in activity_mapping.items()}
    }

# ---------- STEP 3: PREDICT ACTIVITY SEQUENCE ----------

# Define stopping criteria
stopping_activities = ['Release A', 'Release B', 'Release C', 'Release D', 'Release E', 'Return ER']

def predict_sequence(initial_features):
    """Predict a patient's activity flow from a single feature vector.

    The sequence starts with 'ER Registration', 'ER Triage'. For every following
    position the matching entry of the global `models` dict ('Activity 3',
    'Activity 4', ...) predicts the activity. Prediction stops when a model is
    missing, a NaN label is predicted, or a stopping activity is reached (the
    stopping activity is included in the result). After a diagnostic activity is
    predicted, the corresponding diagnostic value is increased before the next
    position is predicted.

    Parameters:
        initial_features: Raw (unscaled) values in the order of the global
            `features` list. The sequence is not modified.

    Returns:
        List of predicted activity names, including the two-element prefix.
    """
    activities_sequence = ['ER Registration', 'ER Triage']  # Start with first two activities

    # Work on a private copy: the diagnostic values are bumped below and the
    # caller's list must not change as a side effect of predicting.
    current_features = list(initial_features)

    # predicted activity -> (feature position, increment applied afterwards)
    diagnostic_updates = {
        'Leucocytes': (0, 1),
        'CRP': (1, 5),
        'LacticAcid': (2, 0.2),
    }

    # Iterate through activity predictions
    for i in range(3, 51):
        activity_col = f'Activity {i}'
        if activity_col not in models:
            break

        # Rebuilt every round because current_features may have changed.
        # float dtype: an all-integer input would otherwise truncate the scaled 'Age'.
        # Named columns: scaler and models were fitted on DataFrames, so passing bare
        # arrays triggers scikit-learn's feature-name mismatch warning.
        input_data = pd.DataFrame([current_features], columns=features, dtype=float)
        input_data['Age'] = scaler.transform(input_data[['Age']]).ravel()

        # Predict the activity at this position and decode its label
        position_model = models[activity_col]
        predicted_label = position_model['model'].predict(input_data)[0]
        predicted_activity = position_model['reverse_mapping'][predicted_label]

        # Stop if activity is NaN (no further activity predicted)
        if pd.isna(predicted_activity):
            break

        activities_sequence.append(predicted_activity)

        # Stop once a stopping activity has been appended
        if predicted_activity in stopping_activities:
            break

        # Update diagnostic values dynamically
        if predicted_activity in diagnostic_updates:
            position, increment = diagnostic_updates[predicted_activity]
            current_features[position] += increment

    return activities_sequence

# ---------- STEP 4: TEST THE PIPELINE ----------

# Example patient, positionally aligned with the `features` list
# (placeholder values; replace with actual data)
example_input = [
    40,    # Leucocytes
    90,    # CRP
    15.5,  # LacticAcid
    0,     # InfectionSuspected
    1,     # DiagnosticBlood
    0,     # DisfuncOrg
    0,     # SIRSCritTachypnea
    1,     # Hypotensie
    1,     # SIRSCritHeartRate
    0,     # Infusion
    0,     # DiagnosticArtAstrup
    70,    # Age
    1,     # DiagnosticIC
    0,     # DiagnosticSputum
    0,     # SIRSCriteria2OrMore
    0,     # DiagnosticXthorax
    0,     # SIRSCritTemperature
    1,     # DiagnosticUrinaryCulture
    1,     # DiagnosticLacticAcid
    0,     # DiagnosticUrinarySediment
    1,     # DiagnosticECG
]

predicted_sequence = predict_sequence(example_input)

# Report the predicted flow
print("\nPredicted Activity Flow:", predicted_sequence)

# Mean test accuracy over all per-position models
final_accuracy = np.mean(accuracy_scores)
print("\nFinal Model Accuracy Score:", round(final_accuracy, 4))

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# ---------- STEP 1: MODEL TO PREDICT SECOND ACTIVITY ----------

# Select the non-laboratory patient attributes as features
X = merged_df[['InfectionSuspected', 'DisfuncOrg', 'SIRSCritTachypnea', 'Hypotensie',
              'SIRSCritHeartRate', 'SIRSCriteria2OrMore', 'SIRSCritTemperature',
              'DiagnosticBlood', 'DiagnosticArtAstrup', 'Age', 'DiagnosticIC',
              'DiagnosticSputum', 'DiagnosticXthorax', 'DiagnosticLacticAcid',
              'DiagnosticUrinaryCulture', 'DiagnosticUrinarySediment', 'DiagnosticECG',
              'Infusion']]  # Use all relevant columns

# Target: Second Activity
y = merged_df['Activity 2']  # The 'Activity 2' column holds the second activity

# Encode target variable
activity_mapping = {activity: idx for idx, activity in enumerate(y.unique())}
y_encoded = y.map(activity_mapping)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

# The split results are derived from merged_df; copy them before writing the scaled
# 'Age' so the assignment is unambiguous and raises no SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Scale only the 'Age' column; the scaler is fitted on the training part only
scaler = StandardScaler()
X_train['Age'] = scaler.fit_transform(X_train[['Age']]).ravel()
X_test['Age'] = scaler.transform(X_test[['Age']]).ravel()

# Train model
rf_second_activity = RandomForestClassifier(n_estimators=100, random_state=42)
rf_second_activity.fit(X_train, y_train)

# Evaluate model
y_pred = rf_second_activity.predict(X_test)
print("Accuracy for Second Activity:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ---------- STEP 2: RULES FOR SUBSEQUENT ACTIVITIES ----------

# # Function to predict next activity based on diagnosis values
# def predict_next_activity(current_activity, leucocytes, crp, lactic_acid):
#     if current_activity == 'Leucocytes':
#         if leucocytes > 15:  # Example threshold
#             return 'IV Antibiotics'
#         else:
#             return 'CRP'
#     elif current_activity == 'CRP':
#         if crp > 100:  # Example threshold
#             return 'IV Antibiotics'
#         else:
#             return 'LacticAcid'
#     elif current_activity == 'LacticAcid':
#         if lactic_acid > 2.0:  # Example threshold
#             return 'ICU Admit'
#         else:
#             return 'Discharge'
#     else:
#         return 'END'

# ---------- STEP 3: FINAL PREDICTION PIPELINE ----------

def predict_pipeline():
    """Interactively predict the second activity for one patient.

    Prompts for seven patient attributes, fills the remaining model features
    with 0, scales 'Age' with the global `scaler`, and prints the activity
    predicted by the global `rf_second_activity` model. Returns None.
    """
    print("\n--- Patient Information ---")
    # Prompt order is part of the interaction and must stay as is
    age = float(input("Age: "))
    infection_suspected = int(input("Infection Suspected (0 or 1): "))
    disfunc_org = int(input("Organ Dysfunction (0 or 1): "))
    sirs_criteria = int(input("SIRS Criteria 2 or more (0 or 1): "))
    hypotensie = int(input("Hypotension (0 or 1): "))
    diagnostic_blood = int(input("Diagnostic Blood (0 or 1): "))
    diagnostic_lactic_acid = int(input("Diagnostic Lactic Acid (0 or 1): "))

    # One row in the column order the model was trained on; attributes that are
    # not asked for are set to 0 (replace the 0s if more inputs are needed)
    feature_row = [
        infection_suspected, disfunc_org, 0, hypotensie, 0,
        sirs_criteria, 0, diagnostic_blood, 0, age,
        0, 0, 0, diagnostic_lactic_acid, 0, 0, 0, 0,
    ]
    input_features = np.array([feature_row])

    # Column 9 is 'Age', the only scaled feature
    input_features[:, 9] = scaler.transform(input_features[:, [9]])[:, 0]

    # Predict the encoded label and translate it back to the activity name
    second_activity_pred = rf_second_activity.predict(input_features)[0]
    matching_activities = [
        activity for activity, code in activity_mapping.items()
        if code == second_activity_pred
    ]
    second_activity = matching_activities[0]
    print(f"Predicted Second Activity: {second_activity}")

    # Disabled: rule-based prediction of the subsequent activities
    # current_activity = second_activity
    # while current_activity != 'END':
    #     # Get diagnostic values
    #     leucocytes = float(input(f"{current_activity} - Leucocytes: "))
    #     crp = float(input(f"{current_activity} - CRP: "))
    #     lactic_acid = float(input(f"{current_activity} - Lactic Acid: "))

    #     # Predict next activity based on diagnosis trends
    #     next_activity = predict_next_activity(current_activity, leucocytes, crp, lactic_acid)
    #     print(f"Next Activity: {next_activity}")

    #     # Update current activity
    #     if next_activity == 'END':
    #         break
    #     current_activity = next_activity

# Run the interactive pipeline
predict_pipeline()
