In [1]:
%load_ext lab_black

In [1]:
import numpy as np
import pandas as pd
from pyprojroot import here
import os

# Locate the data directory and make it the current working directory, so the
# relative CSV paths used by later cells (read_csv / to_csv) resolve inside it.
# here() comes from pyprojroot -- presumably it resolves "./data" against the
# project root; confirm against the pyprojroot docs.
# NOTE: os.chdir changes process-wide state for the whole kernel session.
path_data = here("./data")
os.chdir(path_data)

In [4]:
# Load the per-member-per-month cost dataset from the current working directory.
# NOTE(review): 'data_pmpm.csv' is written by the simulation cell that appears
# further down in this notebook, so on a fresh checkout that cell has to run
# before this one. `data_pmpm` is also rebound to a small sample frame in a
# later cell.
data_pmpm = pd.read_csv('data_pmpm.csv')

In [2]:
import numpy as np
import pandas as pd

# Simulate a per-member-per-month (PMPM) cost dataset and save it as CSV.
# Binary predictors and a hospital id are drawn first, then a normally
# distributed baseline cost is shifted by fixed per-condition amounts and
# clamped to a fixed range.

# Set random seed for reproducibility
np.random.seed(42)

# Number of samples
n_samples = 100000

# Generate predictors (0/1 indicators). The order of these draws is part of the
# seeded random stream, so reordering them would change the generated data.
high_blood_pressure = np.random.binomial(1, 0.3, n_samples)
high_cholesterol = np.random.binomial(1, 0.4, n_samples)
diabetes = np.random.binomial(1, 0.25, n_samples)
preventative_services = np.random.binomial(1, 0.5, n_samples)

# Generate hospital_id. np.random.randint excludes the upper bound, so the ids
# produced here are 1, 2 or 3.
# NOTE(review): this comment previously said "from 1 to 10"; that range would
# need randint(1, 11) and would change the generated dataset -- confirm intent.
hospital_id = np.random.randint(1, 4, n_samples)

# Adjusted mean and standard deviation of the baseline cost
mean_cost = 4000
std_dev_cost = 2000

# Range the final cost is clamped to
min_cost = 100
max_cost = 10000

# Baseline cost: one normal draw per member, taken in a single vectorised call
# instead of a 100,000-iteration Python loop. With the seeded legacy generator
# this consumes the random stream in the same order as repeated scalar draws.
costs = np.random.normal(mean_cost, std_dev_cost, n_samples)

# Adjust cost based on conditions. Each fixed amount is multiplied by the 0/1
# indicator, so members without the condition get +0. The adjustments are
# applied one after another in this order.
costs += 2700 * high_blood_pressure
costs += 2300 * high_cholesterol
costs += 2200 * diabetes
costs -= 2200 * preventative_services

# Consider interaction effects: additional cost for comorbidity
# (high blood pressure AND diabetes both present)
costs += 500 * (high_blood_pressure & diabetes)

# Ensure the cost is within the specified range
costs = np.clip(costs, min_cost, max_cost)

# Create a DataFrame
data = pd.DataFrame(
    {
        "Hospital ID": hospital_id,
        "High Blood Pressure": high_blood_pressure,
        "High Cholesterol": high_cholesterol,
        "Diabetes": diabetes,
        "Preventative Services": preventative_services,
        "Per Member Per Month Cost": costs,
    }
)

# Persist to the current working directory without the row index
data.to_csv("data_pmpm.csv", index=False)

In [25]:
# Pairwise correlation matrix of every column in the simulated dataset
# (DataFrame.corr computes Pearson correlation unless `method` is given).
data.corr()

Unnamed: 0,Hospital ID,High Blood Pressure,High Cholesterol,Diabetes,Preventative Services,Per Member Per Month Cost
Hospital ID,1.0,-0.00316,0.005881,0.003493,0.004019,0.000151
High Blood Pressure,-0.00316,1.0,-0.00054,-0.006216,-0.002099,0.421567
High Cholesterol,0.005881,-0.00054,1.0,-0.000524,0.000998,0.368588
Diabetes,0.003493,-0.006216,-0.000524,1.0,0.001747,0.318707
Preventative Services,0.004019,-0.002099,0.000998,0.001747,1.0,-0.36327
Per Member Per Month Cost,0.000151,0.421567,0.368588,0.318707,-0.36327,1.0


In [45]:
import pandas as pd
from scipy.stats import median_abs_deviation

# Flag high-cost members: a member is "high cost" when their cost exceeds the
# median by more than 2 x the median absolute deviation (MAD).

# Sample data, replace this with your actual DataFrame
# NOTE: this rebinds `data_pmpm`, so a frame previously stored under that name
# is replaced by these eight sample rows.
data_pmpm = pd.DataFrame({'Per Member Per Month Cost': [100, 150, 200, 250, 300, 1000, 1100, 1200]})

# Assuming 'Per Member Per Month Cost' is the column of interest
# NOTE: `costs` is rebound here to a pandas Series.
costs = data_pmpm['Per Member Per Month Cost']

# Calculate the Median
median_cost = costs.median()

# Calculate the Median Absolute Deviation
mad = median_abs_deviation(costs)

# Identify high outliers (only the upper side: cost above median + threshold)
threshold = 2 * mad  # You can adjust this threshold as needed
# Convert boolean to 1 and 0
is_high_outlier = ((costs - median_cost) > threshold).astype(int)

# Build sequential member ids 'M1', 'M2', ... in row order. The Series carries
# data_pmpm's own index so the column assignment below (which aligns on index)
# cannot produce NaN ids when the frame does not have a default 0..n-1 index.
member_id_with_m = 'M' + pd.Series(
    np.arange(1, len(data_pmpm) + 1), index=data_pmpm.index
).astype(str)

# Create a new DataFrame with selected columns without altering data_pmpm
data_high_cost_members = data_pmpm.copy()
data_high_cost_members['Member ID'] = member_id_with_m
data_high_cost_members['High Cost Member'] = is_high_outlier
data_high_cost_members = data_high_cost_members[['Member ID', 'Per Member Per Month Cost', 'High Cost Member']]

# Save the new DataFrame to CSV in the current working directory.
# NOTE(review): the row index is written as an unnamed first column here,
# whereas the data_pmpm.csv export passes index=False -- confirm which is intended.
data_high_cost_members.to_csv('data_high_cost_members.csv')

# Display the first rows of the new DataFrame
data_high_cost_members.head()

Unnamed: 0,Member ID,Per Member Per Month Cost,High Cost Member
0,M1,100,0
1,M2,150,0
2,M3,200,0
3,M4,250,0
4,M5,300,0


In [47]:
import plotly.express as px

# Keep only the members whose high-cost flag is set (value 1)
flag_is_set = data_high_cost_members['High Cost Member'] == 1
high_cost_members = data_high_cost_members.loc[flag_is_set]

# Order the rows by cost from lowest to highest, and cast 'Member ID' to string
# so it is treated as a categorical variable on the chart axis
high_cost_members_sorted = high_cost_members.sort_values(
    by='Per Member Per Month Cost', ascending=True
).assign(**{'Member ID': lambda frame: frame['Member ID'].astype(str)})

# Horizontal Plotly bar chart: cost on the x axis, one bar per member on the y axis
axis_labels = {
    'Member ID': 'Member ID',
    'Per Member Per Month Cost': 'Per Member Per Month Cost ($)',
}
fig = px.bar(
    high_cost_members_sorted,
    x='Per Member Per Month Cost',
    y='Member ID',
    orientation='h',
    labels=axis_labels,
    title='High Cost Members',
)

# Render the figure
fig.show()




In [39]:
# Print a structural summary of the plotted frame: index range, column names,
# non-null counts, dtypes and memory usage.
high_cost_members_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 7 to 5
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Member ID                  3 non-null      object
 1   Per Member Per Month Cost  3 non-null      int64 
 2   High Cost Member           3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 176.0+ bytes
