In [1]:
import numpy as np
import pandas as pd

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples
n_samples = 1000

# Generate predictors
age_60_plus = np.random.binomial(1, 0.3, n_samples)
high_cholesterol = np.random.binomial(1, 0.4, n_samples)
diabetes = np.random.binomial(1, 0.25, n_samples)
preventative_services = np.random.binomial(1, 0.5, n_samples)

# Generate hospital_id (from 1 to 10)
hospital_id = np.random.randint(1, 11, n_samples)

# Initialize the outcome array
er_visit = np.zeros(n_samples, dtype=int)

# Simulate the outcome based on correlations
for i in range(n_samples):
    prob = 0.1  # Base probability of ER visit
    if age_60_plus[i] == 1:
        prob += 0.4
    if high_cholesterol[i] == 1:
        prob += 0.4
    if diabetes[i] == 1:
        prob += 0.2
    if preventative_services[i] == 1:
        prob -= 0.4

    er_visit[i] = np.random.binomial(1, min(max(prob, 0), 1))

# Create a DataFrame
data = pd.DataFrame({
    'Hospital ID': hospital_id,
    'Age 60+': age_60_plus,
    'High Cholesterol': high_cholesterol,
    'Diabetes': diabetes,
    'Preventative Services': preventative_services,
    'ER Visit': er_visit
})

# Display the first few rows of the dataset
print(data.head())


   Hospital ID  Age 60+  High Cholesterol  Diabetes  Preventative Services  \
0           10        0                 0         0                      1   
1            5        1                 0         0                      1   
2            5        1                 1         1                      0   
3            7        0                 1         0                      1   
4            4        0                 1         0                      1   

   ER Visit  
0         0  
1         1  
2         1  
3         0  
4         0  


In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import shap

# Prepare the data
X = data.drop(['ER Visit', 'Hospital ID'], axis=1)
y = data['ER Visit']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the XGBoost model
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions (opxtional, to evaluate model)
predictions = model.predict(X_test)


In [17]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import shap

# Assuming 'data' is your DataFrame and is already defined
# ... [Your existing code for data preparation and model training] ...

# Create a Tree explainer
explainer = shap.Explainer(model, X_train, model_output = 'probability', feature_perturbation="interventional")

# Calculate SHAP values - this might take some time for larger datasets
shap_values = explainer(X_test)
pd.DataFrame(shap_values.values)

Unnamed: 0,0,1,2,3
0,-0.160862,0.174142,-0.038191,0.168637
1,0.264341,-0.164026,-0.025950,-0.226336
2,-0.134345,0.157260,-0.028542,-0.194240
3,-0.134345,0.157260,-0.028542,-0.194240
4,0.324540,-0.188153,-0.044580,0.205025
...,...,...,...,...
195,0.342920,-0.197386,0.154030,0.234527
196,-0.145189,-0.154176,-0.030979,0.145722
197,-0.145189,-0.154176,-0.030979,0.145722
198,-0.092359,-0.092741,-0.009581,-0.119486


In [18]:
# Assuming shap_values and X have the same order
column_mapping = {i: name for i, name in enumerate(data.drop(['ER Visit', 'Hospital ID'], axis=1).columns)}

# Rename columns in the DataFrame created from SHAP values
shap_df = pd.DataFrame(shap_values.values).rename(columns=column_mapping)

In [19]:
shap_df

Unnamed: 0,Age 60+,High Cholesterol,Diabetes,Preventative Services
0,-0.160862,0.174142,-0.038191,0.168637
1,0.264341,-0.164026,-0.025950,-0.226336
2,-0.134345,0.157260,-0.028542,-0.194240
3,-0.134345,0.157260,-0.028542,-0.194240
4,0.324540,-0.188153,-0.044580,0.205025
...,...,...,...,...
195,0.342920,-0.197386,0.154030,0.234527
196,-0.145189,-0.154176,-0.030979,0.145722
197,-0.145189,-0.154176,-0.030979,0.145722
198,-0.092359,-0.092741,-0.009581,-0.119486


In [47]:
# Convert hospital_id to a Pandas Series (if it's not already one)
hospital_id_series = pd.Series(hospital_id, name='Hospital ID')

# Concatenate hospital_id_series with shap_df
shap_df_id = pd.concat([hospital_id_series, shap_df], axis=1).reset_index(drop=True)

# Display the resulting DataFrame
shap_df_id

Unnamed: 0,Hospital ID,Age 60+,High Cholesterol,Diabetes,Preventative Services
0,10,-1.446149,-1.559696,-0.152059,-1.805626
1,5,1.750355,-0.946588,-0.180997,-1.271719
2,5,2.127268,1.134607,1.186097,1.105225
3,7,-0.801110,1.168890,-0.239809,-1.175996
4,4,-0.801110,1.168890,-0.239809,-1.175996
...,...,...,...,...,...
995,3,-0.924451,0.961066,0.621626,0.866335
996,8,1.623267,0.895238,-0.349454,0.904883
997,7,-0.844911,-0.884697,-0.253785,1.075322
998,3,1.750355,-0.946588,-0.180997,-1.271719


In [49]:
import numpy as np
import pandas as pd

# Assuming X, shap_df, and hospital_id_series are already defined

# Calculate medians for each column in the original dataset
medians = X.median()

# Initialize a mask for rows to keep
keep_rows = np.ones(len(shap_df), dtype=bool)

# Iterate over each column and update the mask
for col in X.columns:
    keep_rows &= X[col] > medians[col]

# Apply the mask to the SHAP values DataFrame
filtered_shap_df = shap_df[keep_rows]

# Also filter the hospital_id_series to match the filtered SHAP values
filtered_hospital_id_series = hospital_id_series[keep_rows]

# Define the standard deviation of the noise
std_dev = 0.05  # Adjust this value as needed

# Generate random noise with the same shape as filtered_shap_df
noise = np.random.normal(0, std_dev, filtered_shap_df.shape)

# Add the noise to the filtered SHAP values
filtered_shap_df_noisy = filtered_shap_df 

# Concatenate the filtered 'Hospital ID' with the noisy SHAP values
final_shap_df = pd.concat([filtered_hospital_id_series.reset_index(drop=True), filtered_shap_df_noisy.reset_index(drop=True)], axis=1)

# Display the resulting DataFrame with noise added
final_shap_df


Unnamed: 0,Hospital ID,Age 60+,High Cholesterol,Diabetes,Preventative Services
0,8,1.782061,1.158435,0.846665,-1.316452
1,1,1.782061,1.158435,0.846665,-1.316452
2,3,1.782061,1.158435,0.846665,-1.316452
3,1,1.782061,1.158435,0.846665,-1.316452
4,2,1.782061,1.158435,0.846665,-1.316452
5,10,1.782061,1.158435,0.846665,-1.316452
6,10,1.782061,1.158435,0.846665,-1.316452
7,9,1.782061,1.158435,0.846665,-1.316452
8,2,1.782061,1.158435,0.846665,-1.316452
9,9,1.782061,1.158435,0.846665,-1.316452


In [45]:
hospital_id_id_series =  filtered_hospital_id_series.reset_index(drop = True)
hospital_id_id_series

0      8
1      1
2      3
3      1
4      2
5     10
6     10
7      9
8      2
9      9
10     7
11     8
12     1
13     6
14     8
15     9
16     1
17     8
18     1
Name: Hospital ID, dtype: int64

In [42]:
data_shap_noise = pd.concat([hospital_id_id_series, data_shap_noise], axis = 1)
data_shap_noise

Unnamed: 0,Hospital ID,Age 60+,High Cholesterol,Diabetes,Preventative Services
0,8.0,-1.404424,-1.569653,-0.116414,-1.858880
1,1.0,1.731227,-0.993487,-0.158861,-1.280777
2,3.0,2.215922,1.101086,1.134117,1.120062
3,1.0,-0.809921,1.194015,-0.242892,-1.183030
4,2.0,-0.834508,1.155410,-0.316292,-1.116113
...,...,...,...,...,...
995,,-0.909574,1.022304,0.659704,0.942161
996,,1.577863,0.940106,-0.388079,0.988329
997,,-0.822505,-0.859354,-0.282886,1.048754
998,,1.770619,-0.859175,-0.101878,-1.253795


Unnamed: 0,Age 60+,High Cholesterol,Diabetes,Preventative Services
0,-1.404424,-1.569653,-0.116414,-1.858880
1,1.731227,-0.993487,-0.158861,-1.280777
2,2.215922,1.101086,1.134117,1.120062
3,-0.809921,1.194015,-0.242892,-1.183030
4,-0.834508,1.155410,-0.316292,-1.116113
...,...,...,...,...
995,-0.909574,1.022304,0.659704,0.942161
996,1.577863,0.940106,-0.388079,0.988329
997,-0.822505,-0.859354,-0.282886,1.048754
998,1.770619,-0.859175,-0.101878,-1.253795
