In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from tqdm import tqdm

In [2]:
# Number of train/test splits; the loop index of each split doubles as its random_state.
num_random_seeds = 10
# Number of models fitted (each on its own random subsample) for every split.
num_bootstrap_models = 1000
# Model family used by get_bootstrap_model and embedded in the output file name:
# "lr" -> LinearRegression, "nn" -> MLPRegressor, "rf" -> RandomForestRegressor.
MODEL_CLASS = "lr"  # one of: "lr", "nn", "rf"

In [3]:
# Input dataset and destination of the prediction table (file name embeds the model class).
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"
prediction_output = f"predictions/obermeyer/bootstrap_{MODEL_CLASS}.csv"

# Column the models are trained to predict.
target_variable = "cost_t"

# Model inputs. The "_tm1" suffix presumably marks prior-period values -- TODO confirm
# against the dataset documentation.
features = [
    # demographics
    'dem_female',
    'dem_age_band_18-24_tm1',
    'dem_age_band_25-34_tm1',
    'dem_age_band_35-44_tm1',
    'dem_age_band_45-54_tm1',
    'dem_age_band_55-64_tm1',
    'dem_age_band_65-74_tm1',
    'dem_age_band_75+_tm1',
    # comorbidity flag
    'hypertension_elixhauser_tm1',
    # cost breakdown
    'cost_dialysis_tm1',
    'cost_emergency_tm1',
    'cost_home_health_tm1',
    'cost_ip_medical_tm1',
    'cost_ip_surgical_tm1',
    'cost_laboratory_tm1',
    'cost_op_primary_care_tm1',
    'cost_op_specialists_tm1',
    'cost_op_surgery_tm1',
    'cost_other_tm1',
    'cost_pharmacy_tm1',
    'cost_physical_therapy_tm1',
    'cost_radiology_tm1',
    # comorbidity score
    'gagne_sum_tm1',
]

# Columns carried alongside the features for bookkeeping; they are dropped before fitting
# and copied into the prediction table instead.
other_variables = [
    'person_id',
    'qualified_cost_25',
    'qualified_cost_75',
    'qualified_cost_50',
    'qualified_gagne_1',
    'qualified_gagne_2',
    'qualified_gagne_3',
]

In [4]:
# Read the cleaned dataset once.
df = pd.read_csv(data_source)

# X holds the model features *plus* the bookkeeping columns (ids / qualification flags),
# so that both travel together through train_test_split; the bookkeeping columns are
# removed again right before fitting.
selected_columns = features + other_variables
X = df.loc[:, selected_columns]
y = df.loc[:, target_variable]

In [5]:
def get_bootstrap_model(X_train, y_train, X_test, random_seed, bootstrap_size=0.5, model_class=None):
    """Fit one model on a random subsample of the training data and predict on X_test.

    The subsample is the first ``int(len(X_train) * bootstrap_size)`` entries of a
    seeded random permutation of the row positions, i.e. rows are drawn *without*
    replacement.

    Parameters
    ----------
    X_train, y_train : array-like supporting integer-array indexing
        Training features and targets (the notebook passes NumPy arrays).
    X_test : array-like
        Rows to predict on.
    random_seed : int
        Seed for the subsample draw and, for "rf" / "nn", for the model itself.
    bootstrap_size : float, default 0.5
        Fraction of the training rows used to fit the model.
    model_class : {"lr", "rf", "nn"} or None, default None
        Model family to fit. ``None`` falls back to the notebook-level ``MODEL_CLASS``.

    Returns
    -------
    The output of ``model.predict(X_test)``.

    Raises
    ------
    ValueError
        If the model class is not one of "lr", "rf" or "nn".
    """
    if model_class is None:
        model_class = MODEL_CLASS

    if model_class == "lr":
        model = LinearRegression()
    elif model_class == "rf":
        model = RandomForestRegressor(random_state=random_seed)
    elif model_class == "nn":
        model = MLPRegressor(random_state=random_seed)
    else:
        # Previously an unknown value left `model` unbound and surfaced as an
        # UnboundLocalError further down; fail early with an actionable message.
        raise ValueError(
            f"Unknown model class {model_class!r}; expected one of 'lr', 'rf', 'nn'."
        )

    # A local RandomState yields the same permutation as np.random.seed(random_seed)
    # followed by np.random.permutation, but leaves the global NumPy RNG untouched.
    rng = np.random.RandomState(random_seed)
    n_rows = len(X_train)
    idx = rng.permutation(n_rows)[:int(n_rows * bootstrap_size)]

    model.fit(X_train[idx], y_train[idx])
    return model.predict(X_test)

In [6]:
# Fraction of rows held out as the test set in every split.
TEST_SIZE = 0.33
# Fraction of the training rows each bootstrap model is fitted on.
BOOTSTRAP_SIZE = 0.5

# Bookkeeping columns copied into the prediction table, listed in the exact order
# they appear in the output (after "y", "person_id" and "seed").
qualification_columns = [
    'qualified_cost_25', 'qualified_cost_50', 'qualified_cost_75',
    'qualified_gagne_1', 'qualified_gagne_2', 'qualified_gagne_3',
]
# One prediction column per bootstrap model: m_1 ... m_<num_bootstrap_models>.
model_columns = [f'm_{i+1}' for i in range(num_bootstrap_models)]

per_seed_frames = []
for random_seed in range(num_random_seeds):
    print("random seed", random_seed)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=random_seed
    )

    # Save the ids / flags of the held-out rows before they are stripped from the inputs.
    test_person_ids = X_test['person_id'].to_numpy()
    test_flags = {col: X_test[col].to_numpy() for col in qualification_columns}

    # Models only ever see the feature columns, as plain arrays.
    X_train_arr = X_train.drop(columns=other_variables).to_numpy()
    y_train_arr = y_train.to_numpy()
    X_test_arr = X_test.drop(columns=other_variables).to_numpy()
    y_test_arr = y_test.to_numpy()

    # Bootstrap model i is seeded with i in every split, so the subsampled row
    # positions repeat across splits while the underlying training rows differ.
    predictions = [
        get_bootstrap_model(X_train_arr, y_train_arr, X_test_arr,
                            random_seed=i, bootstrap_size=BOOTSTRAP_SIZE)
        for i in tqdm(range(num_bootstrap_models))
    ]

    # Rows = test samples, columns = bootstrap models.
    predictions_df = pd.DataFrame(predictions).transpose()
    predictions_df.columns = model_columns
    predictions_df["y"] = y_test_arr
    predictions_df["person_id"] = test_person_ids
    predictions_df["seed"] = random_seed
    for col in qualification_columns:
        predictions_df[col] = test_flags[col]

    per_seed_frames.append(predictions_df)

# ignore_index gives the combined table a unique 0..N-1 index instead of repeating
# each per-split row label once per seed.
output = pd.concat(per_seed_frames, ignore_index=True)

random seed 0


100%|██████████████████████████████████████| 1000/1000 [00:07<00:00, 141.74it/s]


random seed 1


100%|██████████████████████████████████████| 1000/1000 [00:07<00:00, 133.60it/s]


random seed 2


100%|██████████████████████████████████████| 1000/1000 [00:07<00:00, 141.05it/s]


random seed 3


100%|██████████████████████████████████████| 1000/1000 [00:06<00:00, 143.80it/s]


random seed 4


100%|██████████████████████████████████████| 1000/1000 [00:07<00:00, 142.79it/s]


random seed 5


100%|██████████████████████████████████████| 1000/1000 [00:07<00:00, 136.50it/s]


random seed 6


100%|██████████████████████████████████████| 1000/1000 [00:07<00:00, 139.79it/s]


random seed 7


100%|██████████████████████████████████████| 1000/1000 [00:07<00:00, 141.64it/s]


random seed 8


100%|██████████████████████████████████████| 1000/1000 [00:06<00:00, 143.37it/s]


random seed 9


100%|██████████████████████████████████████| 1000/1000 [00:07<00:00, 142.41it/s]


In [7]:
# Preview the first rows of the combined prediction table as a quick sanity check.
output.head()

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_1000,y,person_id,seed,qualified_cost_25,qualified_cost_50,qualified_cost_75,qualified_gagne_1,qualified_gagne_2,qualified_gagne_3
0,0.006753,0.008166,0.00779,0.008346,0.007259,0.007138,0.008137,0.00838,0.00664,0.007848,...,0.007635,0.009628,2545,0,0,1,1,1,0,0
1,0.013049,0.013692,0.013846,0.014468,0.014478,0.01445,0.014055,0.013888,0.014334,0.014554,...,0.014145,0.004905,8198,0,0,0,1,1,1,0
2,0.021138,0.018986,0.018488,0.018745,0.017153,0.019729,0.018739,0.017566,0.018916,0.019219,...,0.020756,0.009446,46461,0,0,1,1,0,0,0
3,0.002706,0.002252,0.002763,0.002001,0.00347,0.001979,0.002444,0.00295,0.002548,0.002147,...,0.002862,0.002361,30620,0,0,0,1,0,0,0
4,0.00998,0.01281,0.01443,0.010169,0.011937,0.010077,0.014131,0.012601,0.011287,0.011969,...,0.01221,0.003996,47418,0,0,0,1,1,1,0


In [8]:
# Report row/column counts, dtypes and memory footprint of the combined prediction table.
output.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160990 entries, 0 to 16098
Columns: 1009 entries, m_1 to qualified_gagne_3
dtypes: float64(1001), int64(8)
memory usage: 1.2 GB


In [9]:
# Make sure the destination folder exists so a long run is not lost to a missing
# directory at the very last step, then write the table without the index column.
Path(prediction_output).parent.mkdir(parents=True, exist_ok=True)
output.to_csv(prediction_output, index=False)