In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Load dataset
# The CSV location can be overridden through the MIMIC_SCALED_CSV environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
path = os.environ.get(
    "MIMIC_SCALED_CSV",
    "/Users/cc/Desktop/Work/Uni/Berner FH/DataSets/mimicCSV/mimic-iv-3.1/hosp/shortform/Look Up Tables/scaled_allpatients.csv",
)
dataset = pd.read_csv(path)

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
n_rows, n_cols = dataset.shape
print((n_rows, n_cols))

In [None]:
# Target (y): the mortality label
y = dataset.loc[:, "mortality"]

# Features (X): every remaining column except the identifier.
# "subject_id" is dropped as an identifier and "mortality" because it is the
# label being predicted (keeping it would leak the answer into the features).
X = dataset.drop(["subject_id", "mortality"], axis=1)

In [None]:
# Hold out 30% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)



In [None]:
# Preview the first five training rows (left as the cell's last expression so
# the notebook renders it as a table)
X_train.head(n=5)

In [None]:
# Columns that are expected to arrive already scaled in the loaded CSV.
# NOTE(review): the scaling itself happens outside this notebook — confirm
# upstream; this cell only inspects the result.
columns_to_scale = [
    # length-of-stay aggregates
    "mean_los",
    "total_los",
    "max_los",
    # admission count
    "num_admissions",
    # DRG severity aggregates
    "mean_drg_severity",
    "max_drg_severity",
    "mode_drg_severity",
    # DRG mortality aggregates
    "mean_drg_mortality",
    "max_drg_mortality",
    "mode_drg_mortality",
    # diagnosis code count
    "totalDiagnosesIcdCodes",
]

# Sanity check: per-column mean and standard deviation of those columns
scaled_subset = dataset[columns_to_scale]
scaled_means = scaled_subset.mean()
scaled_stds = scaled_subset.std()

print("Means scaled columns:", scaled_means, sep="\n")
print("\nStandard deviations scaled columns:", scaled_stds, sep="\n")

In [None]:
# Random forest hyperparameters, collected in one place so they are easy to
# read and tweak before the estimator is built.
rf_params = {
    "n_estimators": 1000,                    # number of trees in the forest
    "max_depth": None,                       # no explicit depth limit
    "max_features": "sqrt",                  # features considered per split
    "min_samples_split": 10,                 # min samples to split a node
    "min_samples_leaf": 2,                   # min samples in each leaf
    "class_weight": "balanced_subsample",    # reweight classes per bootstrap sample
    "random_state": 55,                      # fixed seed for repeatable forests
    "n_jobs": -1,                            # use all available cores
}

rf_model = RandomForestClassifier(**rf_params)


In [None]:
# Train the forest on the training split (the fitted estimator is returned,
# so the notebook displays it as this cell's output)
rf_model.fit(X=X_train, y=y_train)


In [None]:
# Score the held-out test set.
# predict_proba returns one column per class; column index 1 is taken as the
# positive-class probability.
class_probabilities = rf_model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Hard class labels for the report and confusion matrix
y_pred = rf_model.predict(X_test)



In [None]:
# Evaluate the model: per-class precision, recall and F1 on the test split
report_text = classification_report(y_test, y_pred)
print("Classification Report:", report_text, sep="\n")


In [None]:
# Confusion matrix on the test split (rows: true class, columns: predicted class)
test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", test_confusion, sep="\n")


In [None]:
# Area under the ROC curve, computed from the positive-class probabilities
# rather than the hard labels
roc_auc = roc_auc_score(
    y_true=y_test,
    y_score=y_pred_proba,
)
print(f"ROCAUC Score: {roc_auc:.4f}")


In [None]:
# How many of the highest-ranked features the plot cell should show
top_features = 14

# Importance of each feature as reported by the fitted forest, plus the
# positions of the features ordered from most to least important
importances = rf_model.feature_importances_
indices = importances.argsort()[::-1]

# Column labels of the feature frame, in the same order the model saw them
feature_names = X.columns

# List the names of (up to) the 100 most important features
ranked_names = feature_names[indices[:100]].tolist()
print(ranked_names)


In [None]:
# Plot the highest-ranked feature importances with their column names.
# The bar count is clamped to the number of available features so the bar
# positions and values always have matching lengths, and the title is derived
# from that count instead of a hard-coded "14".
n_shown = min(top_features, len(indices))
shown_idx = indices[:n_shown]
bar_positions = range(n_shown)

fig, ax = plt.subplots(figsize=(20, 12))
ax.barh(bar_positions, importances[shown_idx], align='center')
ax.set_yticks(bar_positions)
ax.set_yticklabels([feature_names[i] for i in shown_idx])
ax.set_xlabel("F Importance")
ax.set_title(f"Top {n_shown} Features")
ax.invert_yaxis()  # put the most important feature at the top
plt.show()