In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor, MLPClassifier
from tqdm import tqdm

In [None]:
# PREDICTION VARIABLES
# Number of train/validation/test re-splits; split seeds run 0..n-1.
num_random_seeds = 1
# Number of subsampled models fitted on each split.
num_bootstrap_models = 10
# Share of all rows held out as the test set.
test_size = 0.20
# Share of the remaining (non-test) rows held out as the validation set.
validation_size = 0.25

# Model family to fit: "lr" (logistic regression), "nn" (MLP), "rf" (random forest).
MODEL_CLASS = "nn"
# Fraction of the training rows each individual model is fitted on.
BOOTSTRAP_SIZE = 0.50

# Name of the label column, and the CSV the predictions are written to.
TARGET_VARIABLE = "qualified_gagne_3"
prediction_output = f"predictions/obermeyer/bootstrap_{MODEL_CLASS}_{TARGET_VARIABLE}.csv"

In [None]:
# SETUP VARIABLES
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"

# Model input columns, grouped by column-name family. The concatenation
# order below is the column order of the feature matrix.
_demographic_cols = [
    'dem_female',
    'dem_age_band_18-24_tm1',
    'dem_age_band_25-34_tm1',
    'dem_age_band_35-44_tm1',
    'dem_age_band_45-54_tm1',
    'dem_age_band_55-64_tm1',
    'dem_age_band_65-74_tm1',
    'dem_age_band_75+_tm1',
]
_hypertension_cols = ['hypertension_elixhauser_tm1']
_cost_cols = [
    'cost_dialysis_tm1',
    'cost_emergency_tm1',
    'cost_home_health_tm1',
    'cost_ip_medical_tm1',
    'cost_ip_surgical_tm1',
    'cost_laboratory_tm1',
    'cost_op_primary_care_tm1',
    'cost_op_specialists_tm1',
    'cost_op_surgery_tm1',
    'cost_other_tm1',
    'cost_pharmacy_tm1',
    'cost_physical_therapy_tm1',
    'cost_radiology_tm1',
]
_gagne_cols = ['gagne_sum_tm1']
features = _demographic_cols + _hypertension_cols + _cost_cols + _gagne_cols

# Columns carried along with the rows for the output file; they are dropped
# again before any model is fitted.
other_variables = ['person_id', 'gagne_sum_t', 'cost_t']

In [4]:
# Read the cleaned dataset, then split it into the model-input frame
# (feature columns plus the pass-through columns) and the target series.
df = pd.read_csv(data_source)
selected_columns = [*features, *other_variables]
X = df[selected_columns]
y = df[TARGET_VARIABLE]

In [5]:
def get_bootstrap_model(X_train, y_train, random_seed, bootstrap_size=BOOTSTRAP_SIZE):
    """Fit one classifier of the configured family on a random subsample.

    A fraction ``bootstrap_size`` of the training rows is selected by taking
    the leading part of a seeded permutation, i.e. the rows are drawn
    *without* replacement. A fresh model chosen by the module-level
    ``MODEL_CLASS`` is then fitted on those rows.

    Parameters
    ----------
    X_train, y_train
        Training inputs and labels. Both are indexed with an integer index
        array (``X_train[idx]``), so they must support positional
        integer-array indexing, as NumPy arrays do.
    random_seed : int
        Seed for the row subsample; also passed as ``random_state`` to the
        "rf" and "nn" models.
    bootstrap_size : float
        Fraction of the training rows to fit on.

    Returns
    -------
    The fitted scikit-learn classifier.

    Raises
    ------
    ValueError
        If ``MODEL_CLASS`` is not one of "lr", "rf" or "nn".
    """
    # Fail fast with a clear message; previously an unknown MODEL_CLASS left
    # `model` unbound and surfaced as an UnboundLocalError at `model.fit`.
    if MODEL_CLASS == "lr":
        model = LogisticRegression()
    elif MODEL_CLASS == "rf":
        model = RandomForestClassifier(random_state=random_seed)
    elif MODEL_CLASS == "nn":
        model = MLPClassifier(random_state=random_seed)
    else:
        raise ValueError(
            f"Unknown MODEL_CLASS {MODEL_CLASS!r}; expected one of 'lr', 'rf', 'nn'"
        )

    # A local RandomState yields the same permutation as seeding the global
    # generator with `random_seed`, without reseeding NumPy's process-wide
    # RNG as a side effect of every call.
    rng = np.random.RandomState(random_seed)
    n_rows = len(X_train)
    idx = rng.permutation(n_rows)[:int(n_rows * bootstrap_size)]

    model.fit(X_train[idx], y_train[idx])
    return model

In [6]:
# For every split seed: re-split the data, fit `num_bootstrap_models`
# subsampled models, and collect each model's test-set predictions together
# with its training and validation log-loss. All seeds are stacked into
# `output` at the end.
seed_frames = []
for random_seed in range(num_random_seeds):
    print("random seed", random_seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_seed)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_size, random_state=random_seed)

    # Keep the pass-through columns of the test rows before they are dropped.
    cost = X_test["cost_t"].to_numpy()
    gagne = X_test["gagne_sum_t"].to_numpy()
    person_id = X_test['person_id'].to_numpy()

    # The models are fitted and scored on plain arrays that hold only the
    # feature columns.
    X_train = X_train.drop(columns=other_variables).to_numpy()
    y_train = y_train.to_numpy()
    X_val = X_val.drop(columns=other_variables).to_numpy()
    y_val = y_val.to_numpy()
    X_test = X_test.drop(columns=other_variables).to_numpy()
    y_test = y_test.to_numpy()

    predictions = {}
    training_loss = {}
    validation_loss = {}
    for i in tqdm(range(num_bootstrap_models)):
        # NOTE(review): the subsample seed is `i` alone, so seeds 0..n-1 are
        # reused for every outer `random_seed` — confirm this is intended.
        model = get_bootstrap_model(X_train, y_train, random_seed=i, bootstrap_size=BOOTSTRAP_SIZE)
        # Second predict_proba column; presumably the positive class of a
        # binary target — verify against `model.classes_`.
        predictions[f'm_{i+1}'] = model.predict_proba(X_test)[:, 1]
        # Loss over the whole training split, not only the subsample the
        # model was fitted on.
        training_loss[f'm_{i+1}'] = log_loss(y_train, model.predict_proba(X_train))
        validation_loss[f'm_{i+1}'] = log_loss(y_val, model.predict_proba(X_val))

    # One row per test example (columns m_1..m_k), followed by two summary
    # rows: the training losses, then the validation losses.
    predictions_df = pd.concat([
        pd.DataFrame(predictions),
        pd.DataFrame(training_loss, index=[0]),
        pd.DataFrame(validation_loss, index=[0])]).reset_index(drop=True)

    # Per-example columns get two padding entries so they line up with the
    # two loss rows appended above.
    predictions_df["y"] = np.concatenate([y_test, [np.nan, np.nan]])
    predictions_df["person_id"] = np.concatenate([person_id, [-2, -1]]) # -1 indicates validation loss, -2 indicates training loss
    predictions_df['cost_t'] = np.concatenate([cost, [np.nan, np.nan]])
    predictions_df['gagne_sum_t'] = np.concatenate([gagne, [np.nan, np.nan]])
    predictions_df["seed"] = random_seed

    seed_frames.append(predictions_df)

# ignore_index gives the combined table one unique 0..n-1 index instead of
# repeating each seed's own 0..k labels. The per-seed frames are collected
# under a separate name so `output` is only ever a DataFrame.
output = pd.concat(seed_frames, ignore_index=True)

random seed 0


100%|███████████████████████████████████████████| 10/10 [00:27<00:00,  2.76s/it]


In [63]:
# Create the destination folder first: to_csv does not create missing
# parent directories and fails if the folder is absent.
Path(prediction_output).parent.mkdir(parents=True, exist_ok=True)
output.to_csv(prediction_output, index=False)

In [None]:
# Preview the first five rows of the combined predictions table.
output.head(n=5)