In [8]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [9]:
import numpy as np
import pandas as pd
from pyprojroot import here
import os

# Make the data directory the working directory so the bare file names used by
# later cells (e.g. "data_pmpm.csv") are read from / written to that folder.
# NOTE(review): `here` comes from pyprojroot and presumably resolves "./data"
# relative to the detected project root -- confirm against the project layout.
path_data = here("./data")
os.chdir(path_data)

In [2]:
import numpy as np
import pandas as pd

# Simulate a synthetic member-level cost dataset and write it to "data_pmpm.csv".
#
# Each row is one simulated member: four binary indicators, a hospital ID and a
# per-member-per-month cost whose mean shifts with the indicators.

# Seed NumPy's legacy global RNG so every draw below is reproducible
np.random.seed(42)

# Number of samples
n_samples = 100000

# Generate predictors (independent Bernoulli draws, 1 = condition/service present)
high_blood_pressure = np.random.binomial(1, 0.3, n_samples)
high_cholesterol = np.random.binomial(1, 0.4, n_samples)
diabetes = np.random.binomial(1, 0.25, n_samples)
preventative_services = np.random.binomial(1, 0.5, n_samples)

# Generate hospital_id in {1, 2, 3}: randint's upper bound (4) is exclusive.
# NOTE(review): an earlier comment here said "from 1 to 10"; the code has only
# ever produced three hospitals -- switch to randint(1, 11, ...) if ten are wanted.
hospital_id = np.random.randint(1, 4, n_samples)

# Adjusted mean and standard deviation of the baseline cost
mean_cost = 4000
std_dev_cost = 2000

# Baseline cost for every member in a single vectorized draw. The legacy global
# RNG fills an array from the same stream as repeated scalar calls, so this is
# expected to reproduce the values of a per-row loop.
raw_costs = np.random.normal(mean_cost, std_dev_cost, n_samples)

# Adjust cost based on conditions. The indicators are 0/1, so multiplying by
# them adds the increment only where the condition is present; the order of
# the additions is kept fixed so floating-point results do not change.
raw_costs += 2700 * high_blood_pressure
raw_costs += 2300 * high_cholesterol
raw_costs += 2200 * diabetes
raw_costs -= 2200 * preventative_services

# Interaction effect: additional cost for the blood-pressure + diabetes comorbidity
raw_costs += 500 * (high_blood_pressure & diabetes)

# Ensure the cost is within the specified range [100, 10000]
costs = np.clip(raw_costs, 100, 10000)

# Create a DataFrame
data = pd.DataFrame(
    {
        "Hospital ID": hospital_id,
        "High Blood Pressure": high_blood_pressure,
        "High Cholesterol": high_cholesterol,
        "Diabetes": diabetes,
        "Preventative Services": preventative_services,
        "Per Member Per Month Cost": costs,
    }
)

# Persist for the downstream cells (path is relative to the working directory)
data.to_csv("data_pmpm.csv", index=False)

In [25]:
# Pairwise correlation matrix of all columns in `data`, shown as the cell output
# (pandas defaults to Pearson correlation).
data.corr()

Unnamed: 0,Hospital ID,High Blood Pressure,High Cholesterol,Diabetes,Preventative Services,Per Member Per Month Cost
Hospital ID,1.0,-0.00316,0.005881,0.003493,0.004019,0.000151
High Blood Pressure,-0.00316,1.0,-0.00054,-0.006216,-0.002099,0.421567
High Cholesterol,0.005881,-0.00054,1.0,-0.000524,0.000998,0.368588
Diabetes,0.003493,-0.006216,-0.000524,1.0,0.001747,0.318707
Preventative Services,0.004019,-0.002099,0.000998,0.001747,1.0,-0.36327
Per Member Per Month Cost,0.000151,0.421567,0.368588,0.318707,-0.36327,1.0


In [17]:
import pandas as pd
from scipy.stats import median_abs_deviation

# Flag high-cost members: reload the saved member-level table, mark members
# whose cost sits far above the median, and keep only ID, cost and flag.

# Load the member-level dataset from the working directory
data_pmpm = pd.read_csv("data_pmpm.csv")

# Column that drives the outlier rule
costs = data_pmpm["Per Member Per Month Cost"]

# Robust centre and spread: median and median absolute deviation (MAD)
median_cost = costs.median()
mad = median_abs_deviation(costs)

# Only the upper tail is flagged: cost must exceed the median by more than
# two MADs. The multiplier is a tunable choice.
threshold = 2 * mad
is_high_outlier = (costs - median_cost).gt(threshold).astype(int)  # 1 = high cost

# Sequential member identifiers: "M1", "M2", ...
member_id_with_m = "M" + pd.Series(np.arange(1, len(data_pmpm) + 1)).astype(str)

# Build the result on a fresh frame (data_pmpm itself is left untouched), keep
# the three output columns, and round the numeric columns to whole numbers.
data_high_cost_members = data_pmpm.assign(
    **{"Member ID": member_id_with_m, "High Cost Member": is_high_outlier}
)[["Member ID", "Per Member Per Month Cost", "High Cost Member"]].round(0)

In [18]:
import pandas as pd
import numpy as np


# Perturb each member's cost with random noise of magnitude ~N(1000, 250) and a
# random sign. The "High Cost Member" column is not touched by this cell.
# NOTE: this modifies data_high_cost_members in place, so re-running the cell
# stacks a second round of noise on top of the first.

# One noise value per row of the frame
n_rows = len(data_high_cost_members)

# Noise magnitudes drawn from a normal distribution (mean 1000, sd 250)
random_values = np.random.normal(loc=1000, scale=250, size=n_rows)

# Per-row sign: -1 subtracts the noise, +1 adds it
add_or_subtract = np.random.choice([-1, 1], size=n_rows)

# Apply the signed noise to the cost column
signed_noise = add_or_subtract * random_values
data_high_cost_members["Per Member Per Month Cost"] = (
    data_high_cost_members["Per Member Per Month Cost"] + signed_noise
)

# Show the modified frame as the cell output
data_high_cost_members

Unnamed: 0,Member ID,Per Member Per Month Cost,High Cost Member
0,M1,2882.216964,0
1,M2,5931.772709,0
2,M3,5252.993283,0
3,M4,2527.333235,0
4,M5,3612.845814,0
...,...,...,...
99995,M99996,7571.423202,0
99996,M99997,10778.390818,1
99997,M99998,3210.464085,0
99998,M99999,4593.307426,0


In [19]:
# Write the member / cost / high-cost-flag table to CSV in the current working
# directory; index=False omits the DataFrame's row index from the file.
data_high_cost_members.to_csv("data_high_cost_members.csv", index=False)