In [2]:
import os
import boto3
from io import BytesIO
# Working directory for the notebook: every relative path used afterwards
# (including the CSV written by the final cell) is resolved against it.
# The default is the original SageMaker checkout location; set the
# PHM_ROI_DATA_DIR environment variable to run the notebook from another
# machine or checkout without editing this cell.
path = os.environ.get(
    'PHM_ROI_DATA_DIR',
    '/home/sagemaker-user/ds-dev-repo/phm_roi/src/data',
)
os.chdir(path)

In [28]:
import pandas as pd 

import numpy as np 

  

# -----------------------------------------------------------------------------
# Synthetic member-level dataset for the PHM ROI demo.
#
# Every random draw below goes through NumPy's legacy global RNG, seeded once.
# The ORDER of the np.random calls therefore determines the generated values:
# do not reorder, insert or remove draws unless you intend to change the data.
# -----------------------------------------------------------------------------

# --- Tunable constants -------------------------------------------------------
n_members = 1000   # number of synthetic members (rows)
SEED = 42          # seed for reproducibility

ER_VISITS_PRE_RATE = 2                      # Poisson rate of pre-period ER visits
COSTS_PRE_MEAN, COSTS_PRE_SD = 1000, 100    # normal(mean, sd) of pre-period monthly cost

# (mean, sd) of the normally distributed REDUCTION applied per program flag.
ER_EFFECT = {
    'web_md': (0.5, 0.2),
    'omada': (0.7, 0.3),
    'no_intervention': (0.1, 0.1),
}
COST_EFFECT = {
    'web_md': (200, 20),
    'omada': (500, 30),
    'no_intervention': (10, 10),
}

# Program cost per member: Omada is 145 + 855 * Beta(2, 5), i.e. within
# [145, 1000]; WebMD is a flat amount.
OMADA_COST_BASE, OMADA_COST_SPAN = 145, 855
OMADA_COST_BETA = (2, 5)
WEB_MD_COST = 1.65

np.random.seed(SEED)

# --- Member attributes -------------------------------------------------------
member_id = np.arange(1, n_members + 1)                       # 1..n_members
age = np.random.randint(18, 65, size=n_members)               # upper bound exclusive: 18..64
female = np.random.choice([0, 1], size=n_members)
months_enrolled = np.random.randint(1, 13, size=n_members)    # 1..12

# --- Intervention assignment -------------------------------------------------
# The two program flags are drawn independently, so a member may be enrolled in
# both.  `no_intervention` marks members enrolled in neither, which already
# guarantees both flags are 0 wherever it is 1 (no further fix-up is needed).
web_md = np.random.choice([0, 1], size=n_members)
omada = np.random.choice([0, 1], size=n_members)
no_intervention = np.where((web_md == 0) & (omada == 0), 1, 0)

# --- Pre-intervention outcomes -----------------------------------------------
average_er_visits_pre = np.random.poisson(ER_VISITS_PRE_RATE, size=n_members)
costs_per_month_pre = np.random.normal(COSTS_PRE_MEAN, COSTS_PRE_SD, size=n_members)

# --- Post-intervention outcomes ----------------------------------------------
# Reductions are additive: a member in both programs receives both effects, and
# members with no intervention still receive a small drift.
er_drop_web_md = np.random.normal(*ER_EFFECT['web_md'], size=n_members)
er_drop_omada = np.random.normal(*ER_EFFECT['omada'], size=n_members)
er_drop_none = np.random.normal(*ER_EFFECT['no_intervention'], size=n_members)
average_er_visits_post = average_er_visits_pre - (
    web_md * er_drop_web_md +
    omada * er_drop_omada +
    no_intervention * er_drop_none
)
average_er_visits_post = np.clip(average_er_visits_post, 0, None)  # no negative visit counts

cost_drop_web_md = np.random.normal(*COST_EFFECT['web_md'], size=n_members)
cost_drop_omada = np.random.normal(*COST_EFFECT['omada'], size=n_members)
cost_drop_none = np.random.normal(*COST_EFFECT['no_intervention'], size=n_members)
costs_per_month_post = costs_per_month_pre - (
    web_md * cost_drop_web_md +
    omada * cost_drop_omada +
    no_intervention * cost_drop_none
)
costs_per_month_post = np.clip(costs_per_month_post, 0, None)  # no negative costs

# --- Pre/post differences (computed on the clipped post values) ---------------
er_visits_diff = average_er_visits_post - average_er_visits_pre
costs_diff = costs_per_month_post - costs_per_month_pre

# --- Intervention costs ------------------------------------------------------
# The beta draw is made for every member and then zeroed for non-participants.
omada_costs = np.where(
    omada == 1,
    np.random.beta(*OMADA_COST_BETA, size=n_members) * OMADA_COST_SPAN + OMADA_COST_BASE,
    0,
)
web_md_costs = np.where(web_md == 1, WEB_MD_COST, 0)

# --- Assemble the frame ------------------------------------------------------
data = pd.DataFrame({
    'member_id': member_id,
    'age': age,
    'female': female,
    'months_enrolled': months_enrolled,
    'web_md': web_md,
    'omada': omada,
    'no_intervention': no_intervention,
    'average_er_visits_pre': average_er_visits_pre,
    'average_er_visits_post': average_er_visits_post,
    'costs_per_month_pre': costs_per_month_pre,
    'costs_per_month_post': costs_per_month_post,
    'omada_costs': omada_costs,
    'web_md_costs': web_md_costs,
    'er_visits_diff': er_visits_diff,
    'costs_diff': costs_diff
})

# --- Invariants --------------------------------------------------------------
assert len(data) == n_members and data['member_id'].is_unique
assert not np.any((no_intervention == 1) & ((web_md == 1) | (omada == 1))), \
    "no_intervention must exclude both programs"
assert (average_er_visits_post >= 0).all() and (costs_per_month_post >= 0).all()


In [29]:
# Mean pre/post monthly cost change, split by the web_md flag.
data.groupby('web_md')[['costs_diff']].mean()

Unnamed: 0_level_0,costs_diff
web_md,Unnamed: 1_level_1
0,-246.558844
1,-448.055121


In [17]:
# Summary statistics for every numeric column, shown to two decimals.
# NOTE(review): re-run this cell after regenerating `data` so the displayed
# table reflects the current frame.
data.describe().round(decimals=2)

Unnamed: 0,member_id,age,female,months_enrolled,web_md,omada,no_intervention,average_er_visits_pre,average_er_visits_post,costs_per_month_pre,costs_per_month_post,omada_costs,web_md_costs
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,500.5,40.99,0.47,6.42,0.5,0.49,0.26,1.99,1.47,495.35,433.69,189.92,0.83
std,288.82,13.5,0.5,3.47,0.5,0.5,0.44,1.37,1.31,100.55,111.8,215.91,0.83
min,1.0,18.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,141.79,110.99,0.0,0.0
25%,250.75,29.0,0.0,3.0,0.0,0.0,0.0,1.0,0.28,430.97,358.08,0.0,0.0
50%,500.5,42.0,0.0,6.0,1.0,0.0,0.0,2.0,1.27,496.24,434.58,0.0,1.65
75%,750.25,52.0,1.0,9.0,1.0,1.0,1.0,3.0,2.39,561.73,505.87,369.13,1.65
max,1000.0,64.0,1.0,12.0,1.0,1.0,1.0,6.0,6.05,801.47,793.47,820.28,1.65


In [32]:
# Write the demo dataset to the current working directory, leaving out the two
# post-period columns and the row index.
post_period_columns = ['costs_per_month_post', 'average_er_visits_post']
data.drop(post_period_columns, axis='columns').to_csv("data_phm_roi_demo.csv", index=False)