In [3]:
import numpy as np
import pandas as pd

# Simulate a small synthetic patient dataset: four binary risk factors, a
# hospital identifier, and a binary ER-visit outcome whose probability is an
# additive function of the risk factors.

# Set random seed for reproducibility (seeds NumPy's legacy global RNG, which
# every np.random.* call below draws from, in order)
np.random.seed(42)

# Number of samples
n_samples = 1000

# Generate predictors: independent 0/1 draws with the stated prevalence
age_60_plus = np.random.binomial(1, 0.3, n_samples)
high_cholesterol = np.random.binomial(1, 0.4, n_samples)
diabetes = np.random.binomial(1, 0.25, n_samples)
preventative_services = np.random.binomial(1, 0.5, n_samples)

# Generate hospital_id (from 1 to 10; randint's upper bound is exclusive).
# It does not enter the outcome probability below.
hospital_id = np.random.randint(1, 11, n_samples)

# Per-patient probability of an ER visit: a 0.1 base rate shifted additively
# by each risk factor (preventative services lower it). The terms are added in
# the same order as the effects are listed so the floating-point result is
# the same as accumulating them one by one.
er_visit_prob = (
    0.1
    + 0.4 * age_60_plus
    + 0.4 * high_cholesterol
    + 0.2 * diabetes
    - 0.4 * preventative_services
)

# The additive model can leave [0, 1] (e.g. 0.1 - 0.4 or 0.1 + 0.4 + 0.4 + 0.2),
# so clamp before using the values as Bernoulli probabilities.
er_visit_prob = np.clip(er_visit_prob, 0.0, 1.0)

# Simulate the outcome: one Bernoulli draw per patient, vectorized over the
# probability array instead of looping row by row in Python.
er_visit = np.random.binomial(1, er_visit_prob).astype(int)

# Create a DataFrame
data = pd.DataFrame({
    'Hospital ID': hospital_id,
    'Age 60+': age_60_plus,
    'High Cholesterol': high_cholesterol,
    'Diabetes': diabetes,
    'Preventative Services': preventative_services,
    'ER Visit': er_visit
})

# Display the first few rows of the dataset
print(data.head())


   Hospital ID  Age 60+  High Cholesterol  Diabetes  Preventative Services  \
0           10        0                 0         0                      1   
1            5        1                 0         0                      1   
2            5        1                 1         1                      0   
3            7        0                 1         0                      1   
4            4        0                 1         0                      1   

   ER Visit  
0         0  
1         1  
2         1  
3         0  
4         0  


In [4]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import shap

# Train a gradient-boosted classifier on the simulated data and explain one
# prediction with SHAP.

# Prepare the data: every column except the outcome is used as a feature.
# NOTE(review): this includes 'Hospital ID', an integer label that the model
# will treat as an ordinary numeric feature - confirm that is intended.
X = data.drop('ER Visit', axis=1)
y = data['ER Visit']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases
# and may only produce an "unused parameter" warning there - confirm against
# the installed version before removing it.
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions (optional, to evaluate model)
predictions = model.predict(X_test)

# Compute SHAP values for every row of X (train and test rows alike).
# Calling the explainer returns a shap.Explanation object, not a bare array.
explainer = shap.Explainer(model)
shap_values = explainer(X)

# Plot SHAP values (for example, for the first prediction)
shap.initjs() # Initialize JavaScript visualization in Jupyter
# The (base_value, shap_values, features) form of force_plot expects the SHAP
# values as a NumPy array, so pass the Explanation's underlying `.values` row
# rather than the Explanation slice itself.
shap.force_plot(explainer.expected_value, shap_values.values[0, :], X.iloc[0, :])
