In [9]:
import os
import boto3
from io import BytesIO
import pandas as pd
import numpy as np
# Directory holding the demo CSVs.  Set the PHM_ROI_DATA_DIR environment variable
# to run the notebook outside the original workspace; when it is unset the
# original location is used.
path = os.environ.get('PHM_ROI_DATA_DIR', '/home/sagemaker-user/ds-dev-repo/phm_roi/src/data')

# Relative file names used from here on (this read and the CSV written near the
# end of the notebook) resolve against this directory.
os.chdir(path)
data = pd.read_csv('data_phm_roi_demo.csv')

In [10]:
import xgboost as xgb 

import pandas as pd 

  

# Columns that are never used as model features.
exclude_cols = ['member_id', 'omada_costs', 'web_md_costs']

# Working copy of the full dataset; later cells add prediction and ROI columns
# to it, leaving `data` untouched.
all_data = data.copy()

# Subset of rows whose `no_intervention` flag equals 1 (used to fit the models).
no_intervention_data = data.loc[data['no_intervention'].eq(1)]

  

def train_model(data, target, exclude_cols):
    """Fit a default-configured XGBoost regressor for one target column.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows. Must contain ``target`` and every name in ``exclude_cols``.
    target : str
        Name of the column to predict.
    exclude_cols : list
        Columns removed from the feature matrix in addition to ``target``.

    Returns
    -------
    The fitted ``xgb.XGBRegressor`` instance.
    """
    # Everything except the excluded columns and the target itself is a feature.
    non_feature_cols = exclude_cols + [target]
    features = data.drop(non_feature_cols, axis=1)
    labels = data[target]

    regressor = xgb.XGBRegressor()
    regressor.fit(features, labels)
    return regressor

  

def predict_with_model(models, data, exclude_cols):
    """Predict every target in ``models`` for the rows of ``data``.

    Parameters
    ----------
    models : dict
        Maps a target column name to a fitted estimator exposing ``predict``.
    data : pandas.DataFrame
        Rows to score. Must contain every name in ``exclude_cols`` and each
        target column; those are dropped before predicting.
    exclude_cols : list
        Columns that are never used as features.

    Returns
    -------
    dict
        Maps each target name to whatever ``model.predict`` returned for it.

    Raises
    ------
    KeyError
        If an excluded or target column is absent from ``data``.
    ValueError
        If the estimator recorded its training feature names and some of them
        are absent from ``data``.
    """
    predictions = {}

    for target, model in models.items():
        X = data.drop(columns=exclude_cols + [target])

        # When the estimator recorded the columns it was fitted on, pass exactly
        # those columns in that order.  Without this, any extra column present in
        # `data` (for example derived columns added after training) changes the
        # feature set and the prediction call fails or misaligns.
        fitted_cols = getattr(model, 'feature_names_in_', None)
        if fitted_cols is not None:
            fitted_cols = list(fitted_cols)
            missing = [col for col in fitted_cols if col not in X.columns]
            if missing:
                raise ValueError(
                    f"data is missing feature columns required by the '{target}' model: {missing}"
                )
            X = X[fitted_cols]

        predictions[target] = model.predict(X)

    return predictions

In [11]:
# Outcomes to model; one regressor is fitted per target.
targets = ['costs_diff', 'er_visits_diff']

# Fit each model on the no-intervention rows only.
models = {target: train_model(no_intervention_data, target, exclude_cols) for target in targets}

# Score the entire dataset.  The scoring frame is restricted to the columns of the
# training frame: all_data accumulates derived columns (the pred_* columns below
# and whatever later cells add), and passing those along would make the feature
# set differ from training whenever this cell is re-run.
scoring_frame = all_data[no_intervention_data.columns]
predictions = predict_with_model(models, scoring_frame, exclude_cols)

# Attach each target's predictions to the working frame as pred_<target>.
for target, preds in predictions.items():
    all_data[f'pred_{target}'] = preds


In [17]:

# Total program spend per row: sum of the two program cost columns.
all_data['total_program_costs'] = all_data['omada_costs'] + all_data['web_md_costs']

# Observed cost difference minus the model's prediction for the same row.
cost_diff_vs_predicted = all_data['costs_diff'] - all_data['pred_costs_diff']

# The same quantity net of program spend.
all_data['change_costs'] = cost_diff_vs_predicted - all_data['total_program_costs']

# Ratio of the cost effect to program spend.  Rows with zero program spend have
# no defined ratio; mask the zero denominator so they come out as NaN instead of
# +/-inf, which would otherwise leak into the exported CSV and any aggregates.
nonzero_program_costs = all_data['total_program_costs'].where(all_data['total_program_costs'] != 0)
all_data['roi_costs'] = cost_diff_vs_predicted / nonzero_program_costs

# NOTE(review): despite the column name this is a plain difference (observed minus
# predicted ER-visit change), not a ratio -- confirm that is the intended metric.
all_data['roi_er_visits'] = all_data['er_visits_diff'] - all_data['pred_er_visits_diff']

In [18]:
# Label every row with the combination of programs flagged on it.
# conditions[i] pairs with choices[i]; np.select uses the first condition that holds.
conditions = [
    all_data['omada'].eq(1) & all_data['web_md'].eq(0),
    all_data['omada'].eq(0) & all_data['web_md'].eq(1),
    all_data['omada'].eq(1) & all_data['web_md'].eq(1),
    all_data['no_intervention'].eq(1),
]
choices = ['omada', 'web_md', 'both', 'no_intervention']

# Rows matching none of the conditions are also labelled 'no_intervention'.
all_data['intervention_group'] = np.select(conditions, choices, default='no_intervention')

In [19]:
# Display the change_costs column (observed minus predicted cost difference, less program costs).
all_data['change_costs']

0      -881.762312
1         0.000313
2     -1011.587272
3         0.002788
4      -963.180978
          ...     
995       0.077952
996   -1187.473517
997    -154.940994
998       0.121282
999    -195.233234
Name: change_costs, Length: 1000, dtype: float64

In [20]:
# Write the enriched frame to CSV without the index; the relative name resolves
# against the working directory set by os.chdir in the first cell.
all_data.to_csv('data_analyses_phm_roi_demo.csv', index = False)

In [8]:
# Number of rows assigned to each intervention_group label.
all_data['intervention_group'].value_counts()

intervention_group
no_intervention    258
web_md             252
both               249
omada              241
Name: count, dtype: int64