## BAYESIAN REGRESSION


In [None]:
import pandas as pd
import bambi as bmb
import arviz as az
from sklearn.metrics import roc_auc_score

# Bayesian logistic regression with Bambi (Laplace approximation).
#
# Pipeline: load the CSV, drop non-predictor columns, split 70/30 into
# training/validation rows, standardize predictors with TRAINING statistics,
# fit `outcome ~ <all predictors>` with a Bernoulli family, and report AUC on
# both splits.


def _mean_predicted_probability(bambi_model, idata, new_data):
    """Return the posterior-mean predicted probability for each row of *new_data*.

    `Model.fit` returns an ArviZ ``InferenceData`` object, which has no
    ``predict`` method; predictions are produced by ``Model.predict`` and then
    averaged over the posterior draws.
    """
    predictions = bambi_model.predict(idata, data=new_data, inplace=False)
    posterior = predictions.posterior
    # NOTE(review): the name of the predicted-mean variable depends on the
    # installed Bambi version ('outcome_mean' in older releases, 'p' for the
    # Bernoulli family in newer ones) -- confirm against your environment.
    prob_name = 'outcome_mean' if 'outcome_mean' in posterior else 'p'
    return posterior[prob_name].mean(dim=('chain', 'draw')).values


# Load the data
data = pd.read_csv('RF_imputation_NEW.csv')

# Drop the columns that must not enter the model as predictors
data = data.drop(columns=['deathtime', 'survival_time', 'LOS', 'Unnamed_0', 'V1', 'admittime', 'ID', 'group', 'tLOS', 'subject_id'])

# Every remaining column except the response is a predictor
predictor_columns = data.columns.difference(['outcome'])

# Split BEFORE normalizing so no validation-row statistics leak into the
# training-set preprocessing. `.copy()` avoids chained-assignment warnings.
train_data = data.sample(frac=0.7, random_state=213).copy()
valid_data = data.drop(train_data.index).copy()

# Standardize both splits with the training mean/std only.
train_mean = train_data[predictor_columns].mean()
# A constant column has std 0; divide by 1 instead so it becomes 0 rather than NaN.
train_std = train_data[predictor_columns].std().replace(0, 1.0)
train_data[predictor_columns] = (train_data[predictor_columns] - train_mean) / train_std
valid_data[predictor_columns] = (valid_data[predictor_columns] - train_mean) / train_std

# Define and fit the Bayesian logistic regression model using Laplace approximation
model = bmb.Model('outcome ~ ' + ' + '.join(predictor_columns), train_data, family='bernoulli')
fitted_model = model.fit(inference_method="laplace")

# Summarize the model (InferenceData has no .summary(); ArviZ provides it)
print(az.summary(fitted_model))

# Evaluate the model performance on the training set
train_preds_prob = _mean_predicted_probability(model, fitted_model, train_data)
train_auc_value = roc_auc_score(train_data['outcome'], train_preds_prob)
print("Train AUC:", train_auc_value)

# Evaluate the model performance on the validation set
valid_preds_prob = _mean_predicted_probability(model, fitted_model, valid_data)
valid_auc_value = roc_auc_score(valid_data['outcome'], valid_preds_prob)
print("Validation AUC:", valid_auc_value)



In [1]:
import pandas as pd
import numpy as np
import pymc3 as pm
import arviz as az
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Bayesian logistic regression with PyMC3 (NUTS sampling).
#
# Pipeline: load the CSV, drop non-predictor columns, split 70/30, standardize
# predictors with a scaler fitted on the TRAINING rows only, sample the
# posterior, report AUC on both splits, and list the predictors whose 95% HDI
# excludes zero.
#
# NOTE(review): the "module 'numpy' has no attribute 'bool'" error recorded
# for this cell is raised by code that looks up `np.bool`, which newer NumPy
# releases no longer provide. It presumably originates in PyMC3's Theano
# backend; if so the environment needs an older NumPy (or a migration to a
# maintained PyMC release) -- confirm before re-running.


def _posterior_mean_probability(idata, design_matrix):
    """Return the posterior-mean P(outcome = 1) for each row of *design_matrix*.

    The linear predictor is evaluated for every posterior draw of
    ``Intercept`` and ``coefficients``, mapped through the logistic function,
    and averaged over draws.

    Args:
        idata: ArviZ InferenceData whose posterior holds ``Intercept`` and
            ``coefficients`` with leading (chain, draw) dimensions.
        design_matrix: 2-D array (rows x predictors), columns ordered like
            ``coefficients``.

    Returns:
        1-D NumPy array with one probability per row.
    """
    posterior = idata.posterior
    intercept_draws = posterior['Intercept'].values.reshape(-1)
    coef_draws = posterior['coefficients'].values
    # Collapse (chain, draw) into a single sample axis.
    coef_draws = coef_draws.reshape(-1, coef_draws.shape[-1])
    linear_predictor = intercept_draws[:, None] + coef_draws @ np.asarray(design_matrix).T
    # tanh form of the logistic function: stable for large |linear_predictor|.
    probabilities = 0.5 * (1.0 + np.tanh(0.5 * linear_predictor))
    return probabilities.mean(axis=0)


# Load the data
data = pd.read_csv("RF_imputation_NEW.csv")

# Drop unnecessary columns
data.drop(columns=['deathtime', 'survival_time', 'LOS', 'Unnamed_0', 'V1', 'admittime', 'ID', 'group', 'tLOS', 'subject_id', 'COPD', 'CHD_with_no_MI'], inplace=True)

# Ensure the 'outcome' column is present
if 'outcome' not in data.columns:
    raise ValueError("The 'outcome' column does not exist in the dataframe.")

# Every remaining column except the response is a predictor
predictor_names = data.columns.difference(['outcome'])

# Split the data into Training and Validation Sets BEFORE scaling, so the
# scaler never sees validation rows. Copies avoid chained-assignment warnings.
train_data, valid_data = train_test_split(data, test_size=0.3, random_state=213)
train_data = train_data.copy()
valid_data = valid_data.copy()

# Normalize the predictors using training-set statistics only
scaler = StandardScaler()
train_data[predictor_names] = scaler.fit_transform(train_data[predictor_names])
valid_data[predictor_names] = scaler.transform(valid_data[predictor_names])

# Define the Bayesian logistic regression model
with pm.Model() as logistic_model:
    # Priors for unknown model parameters
    intercept = pm.Normal('Intercept', 0, sigma=1)
    coefficients = pm.Normal('coefficients', 0, sigma=1, shape=len(predictor_names))

    # Linear combination
    linear_combination = intercept + pm.math.dot(train_data[predictor_names].values, coefficients)

    # Likelihood (sampling distribution) of observations
    outcome = pm.Bernoulli('outcome', logit_p=linear_combination, observed=train_data['outcome'].values)

    # Inference
    trace = pm.sample(1000, cores=8, random_seed=213, return_inferencedata=True)

# Posterior predictive checks (simulated outcomes for the training rows)
with logistic_model:
    ppc = pm.sample_posterior_predictive(trace)

# Predicted probabilities come straight from the posterior draws of the
# parameters. This gives smooth probabilities for ranking (AUC) and, unlike
# re-sampling the model, works for rows the model graph was not built on.
train_preds_prob = _posterior_mean_probability(trace, train_data[predictor_names].values)

# Calculate Train AUC
train_auc_value = roc_auc_score(train_data['outcome'], train_preds_prob)
print("Train AUC:", train_auc_value)

# Predict on the Test Data. `pm.sample_posterior_predictive` has no `data`
# argument and the model graph is bound to the training design matrix, so the
# validation probabilities are computed from the posterior draws instead.
test_preds_prob = _posterior_mean_probability(trace, valid_data[predictor_names].values)

# Calculate Test AUC
test_auc_value = roc_auc_score(valid_data['outcome'], test_preds_prob)
print("Test AUC:", test_auc_value)

# Feature selection based on coefficient credible intervals
coef_means = trace.posterior['coefficients'].mean(dim=['chain', 'draw']).values.flatten()
# One (lower, upper) pair per coefficient
coef_hpd = az.hdi(trace, hdi_prob=0.95)['coefficients'].values

# Selected features: coefficients whose credible interval does not include zero
selected_features = predictor_names[(coef_hpd[:, 0] > 0) | (coef_hpd[:, 1] < 0)]
print("Selected features:", selected_features)


  self.ctor = getattr(np, o_type.dtype)


AttributeError: module 'numpy' has no attribute 'bool'.
`np.bool` was a deprecated alias for the builtin `bool`. To avoid this error in existing code, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations