In [1]:
import pandas as pd
import numpy as np
# from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
num_random_seeds = 5
num_bootstrap_models = 100

In [3]:
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"
prediction_output = "predictions/obermeyer/bootstrap_regression.csv"
target_variable = "cost_t"
features = ['dem_female', 'dem_age_band_18-24_tm1', 'dem_age_band_25-34_tm1', 'dem_age_band_35-44_tm1', 'dem_age_band_45-54_tm1',
            'dem_age_band_55-64_tm1', 'dem_age_band_65-74_tm1', 'dem_age_band_75+_tm1', 'hypertension_elixhauser_tm1', 'cost_dialysis_tm1',
            'cost_emergency_tm1', 'cost_home_health_tm1', 'cost_ip_medical_tm1', 'cost_ip_surgical_tm1', 'cost_laboratory_tm1',
            'cost_op_primary_care_tm1', 'cost_op_specialists_tm1', 'cost_op_surgery_tm1', 'cost_other_tm1', 'cost_pharmacy_tm1',
            'cost_physical_therapy_tm1', 'cost_radiology_tm1', 'gagne_sum_tm1', 'person_id', 'threshold_25', 'threshold_50', 'threshold_75']

In [4]:
df = pd.read_csv(data_source)
X = df[features]
y = df[target_variable]

In [5]:
def get_bootstrap_model(X_train, y_train, X_test, random_seed, bootstrap_size=0.5):
    np.random.seed(random_seed)
    idx = np.random.permutation(len(X_train))[:int(len(X_train)*bootstrap_size)]
    model = LinearRegression()
    # model = RandomForest(n_estimators=25, random_state=random_seed)
    model.fit(X_train[idx], y_train[idx])
    scores = model.predict(X_test)
    # scores = model.predict_proba(X_test)[:, 1]
    return scores

In [6]:
output = []
for random_seed in range(num_random_seeds):
    print("random seed", random_seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_seed)
    threshold_25 = X_test['threshold_25'].to_numpy()
    threshold_50 = X_test['threshold_50'].to_numpy()
    threshold_75 = X_test['threshold_75'].to_numpy()
    test_idx = X_test['person_id'].to_numpy()
    # print(test_idx.isna().sum())
    X_train = X_train.drop(columns=['person_id']).to_numpy()
    y_train = y_train.to_numpy()
    X_test = X_test.drop(columns=['person_id']).to_numpy()
    y_test = y_test.to_numpy()

    predictions = []
    columns = []
    accuracy = []
    for i in tqdm(range(num_bootstrap_models)):
        columns.append(f'm_{i+1}')
        scores = get_bootstrap_model(X_train, y_train, X_test, random_seed=i, bootstrap_size=0.5)
        predictions.append(scores)
        accuracy.append(float((np.round(scores)==y_test).sum() / len(y_test)))
    
    predictions_df = pd.DataFrame(predictions).transpose()
    predictions_df.columns=columns
    predictions_df["y"] = y_test
    predictions_df["idx"] = test_idx 
    predictions_df["seed"] = random_seed
    predictions_df['threshold_25'] = threshold_25
    predictions_df['threshold_50'] = threshold_50
    predictions_df['threshold_75'] = threshold_75
    output.append(predictions_df)
output = pd.concat(output)

random seed 0


100%|████████████████████████████████████████| 100/100 [00:00<00:00, 124.94it/s]


random seed 1


100%|████████████████████████████████████████| 100/100 [00:00<00:00, 129.93it/s]


random seed 2


100%|████████████████████████████████████████| 100/100 [00:00<00:00, 130.73it/s]


random seed 3


100%|████████████████████████████████████████| 100/100 [00:00<00:00, 128.72it/s]


random seed 4


100%|████████████████████████████████████████| 100/100 [00:00<00:00, 129.93it/s]


In [7]:
output.head()

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_97,m_98,m_99,m_100,y,idx,seed,threshold_25,threshold_50,threshold_75
0,0.002421,0.00361,0.003165,0.003973,0.002966,0.003023,0.004011,0.003947,0.002069,0.003313,...,0.003707,0.002858,0.003587,0.002265,0.009628,2545,0,1,1,0
1,0.002649,0.003736,0.003636,0.00447,0.004497,0.004651,0.003741,0.00364,0.003776,0.003868,...,0.003432,0.004454,0.004208,0.003848,0.004905,8198,0,1,0,0
2,0.011027,0.009651,0.008816,0.009,0.00781,0.010168,0.008848,0.007403,0.009044,0.009038,...,0.00808,0.009222,0.009075,0.009436,0.009446,46461,0,1,1,0
3,0.000716,0.000319,0.000656,0.00032,0.001263,0.000155,0.000235,0.000826,0.000541,2.6e-05,...,0.000658,0.000799,0.000269,0.000765,0.002361,30620,0,1,0,0
4,0.000632,0.004498,0.005682,0.001974,0.003122,0.002559,0.005822,0.003704,0.002814,0.003566,...,0.00403,0.002461,0.003393,0.003114,0.003996,47418,0,1,0,0


In [8]:
output.to_csv(prediction_output, index=False)