In [13]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import scipy as sp

In [14]:
# Experiment configuration.
# Number of train/test splits; each value in range(NUM_RANDOM_SEEDS) is used as the split's random_state.
NUM_RANDOM_SEEDS = 5
# Baseline model identifier; only "lr" (LinearRegression) is handled. Also embedded in the output file name.
MODEL_CLASS = "lr"
# Upper bound on the Euclidean norm of the random perturbation added to the baseline weights.
EPSILON = 0.1
# Number of perturbed weight vectors (prediction columns m_1..m_N) generated per split.
NUM_PERTURBED_MODELS = 100

In [15]:
# Input CSV read by pd.read_csv.
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"
# Destination CSV for the stacked predictions; the file name includes MODEL_CLASS.
prediction_output = "predictions/obermeyer/model_weight_perturbation_"+MODEL_CLASS+".csv"
# Column used as the regression target y.
target_variable = "cost_t"
# Columns used as model inputs.
features = ['dem_female', 'dem_age_band_18-24_tm1', 'dem_age_band_25-34_tm1', 'dem_age_band_35-44_tm1', 'dem_age_band_45-54_tm1',
            'dem_age_band_55-64_tm1', 'dem_age_band_65-74_tm1', 'dem_age_band_75+_tm1', 'hypertension_elixhauser_tm1', 'cost_dialysis_tm1',
            'cost_emergency_tm1', 'cost_home_health_tm1', 'cost_ip_medical_tm1', 'cost_ip_surgical_tm1', 'cost_laboratory_tm1',
            'cost_op_primary_care_tm1', 'cost_op_specialists_tm1', 'cost_op_surgery_tm1', 'cost_other_tm1', 'cost_pharmacy_tm1',
            'cost_physical_therapy_tm1', 'cost_radiology_tm1', 'gagne_sum_tm1']
# Columns carried through the train/test split and copied into the output, but dropped before model fitting.
other_variables = ['person_id', 'qualified_cost_25', 'qualified_cost_75', 'qualified_cost_50', 'qualified_gagne_1', 'qualified_gagne_2', 'qualified_gagne_3']

In [16]:
# Load the cleaned dataset from disk.
df = pd.read_csv(data_source)
# X keeps the pass-through columns next to the model features so they follow the same
# train/test split; they are dropped again before the feature matrix is built.
X = df[features+other_variables]
# Regression target.
y = df[target_variable]

In [17]:
def get_baseline_model_coefficients(X_train, y_train):
    """Fit the baseline model selected by ``MODEL_CLASS`` and return its coefficients.

    Parameters
    ----------
    X_train : array-like
        Training design matrix, passed unchanged to ``model.fit``.
    y_train : array-like
        Training targets, passed unchanged to ``model.fit``.

    Returns
    -------
    The fitted model's ``coef_`` attribute. Only the coefficients are
    returned; the fitted ``intercept_`` is discarded.

    Raises
    ------
    ValueError
        If ``MODEL_CLASS`` is not a supported identifier (currently only "lr").
    """
    if MODEL_CLASS == "lr":
        model = LinearRegression()
    else:
        # Without this branch `model` would be unbound and the call below would
        # fail with an opaque UnboundLocalError.
        raise ValueError(
            f"Unsupported MODEL_CLASS {MODEL_CLASS!r}; only 'lr' is implemented."
        )
    model.fit(X_train, y_train)
    return model.coef_

In [18]:
def get_perturbed_weights(baseline_weights, epsilon, rng=None):
    """Return ``baseline_weights`` shifted by a random vector of norm at most ``epsilon``.

    A direction is drawn from a standard normal distribution and scaled to unit
    Euclidean norm; a radius is drawn uniformly from ``[0, epsilon)``; the
    returned weights are ``baseline_weights + radius * direction``.

    Parameters
    ----------
    baseline_weights : array-like
        Weights to perturb. Anything ``np.shape`` understands is accepted.
    epsilon : float
        Upper bound on the Euclidean norm of the perturbation.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. When omitted, NumPy's global random state is
        used (seed it with ``np.random.seed`` for reproducible draws).

    Returns
    -------
    The perturbed weights, with the same shape as ``baseline_weights``.
    """
    shape = np.shape(baseline_weights)
    if rng is None:
        # Legacy global-state path: the normal draw comes first, then the
        # uniform draw, so seeded callers see the same stream as before.
        direction = np.random.randn(*shape)
        radius = np.random.uniform(0, epsilon)
    else:
        direction = rng.standard_normal(shape)
        radius = rng.uniform(0, epsilon)
    # Normalise to unit length so `radius` alone sets the perturbation norm.
    direction = direction / np.linalg.norm(direction)
    perturbation = radius * direction
    return baseline_weights + perturbation

In [19]:
def get_predictions_columns(X_test, baseline_weights, epsilon):
    """Generate predictions from ``NUM_PERTURBED_MODELS`` randomly perturbed weight vectors.

    Each prediction is the plain dot product of ``X_test`` with a freshly
    perturbed copy of ``baseline_weights``; no intercept term is added.

    Returns
    -------
    (predictions, columns)
        ``predictions`` is a list with one prediction array per perturbed
        model; ``columns`` is the matching list of names ``m_1`` .. ``m_N``.
    """
    model_count = NUM_PERTURBED_MODELS
    columns = [f'm_{model_number}' for model_number in range(1, model_count + 1)]
    predictions = [
        np.dot(X_test, get_perturbed_weights(baseline_weights, epsilon))
        for _ in tqdm(range(model_count))
    ]
    return predictions, columns

In [27]:
# Build one predictions table per train/test split and stack them into `output`.
output = []
# Pass-through columns copied from the test rows into the predictions table,
# listed in the order they appear in the output.
passthrough_columns = [
    'qualified_cost_25', 'qualified_cost_50', 'qualified_cost_75',
    'qualified_gagne_1', 'qualified_gagne_2', 'qualified_gagne_3',
]
for random_seed in range(NUM_RANDOM_SEEDS):
    print("random seed", random_seed)
    # The weight perturbations are drawn from NumPy's global random state; seed it
    # per split so the saved predictions are reproducible on a fresh kernel.
    np.random.seed(random_seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_seed)

    # Capture the pass-through values now: X_test is replaced by a bare feature array below.
    passthrough = {column: X_test[column].to_numpy() for column in passthrough_columns}
    test_idx = X_test['person_id'].to_numpy()

    # Stack train and test feature rows (train first) so both are expressed in the same basis.
    X_combined = np.concatenate([
        X_train.drop(columns=other_variables).to_numpy(),
        X_test.drop(columns=other_variables).to_numpy()
    ], axis=0)

    # Prepend a constant column, orthonormalize, then drop column `intercept_idx`.
    # NOTE(review): scipy.linalg.orth returns an SVD-based orthonormal basis of the
    # column space, so its columns do not map one-to-one onto the input columns;
    # deleting column 0 removes the first basis vector, not necessarily the
    # constant direction. Confirm this is the intended behaviour.
    intercept_idx = 0
    X_combined = np.insert(X_combined, intercept_idx, 1.0, axis=1)
    X_combined_orth = sp.linalg.orth(X_combined)
    X_combined_orth = np.delete(X_combined_orth, intercept_idx, axis=1)

    # Split the orthonormalized matrix back into its train and test rows.
    n_train = X_train.shape[0]
    X_train = X_combined_orth[:n_train]
    X_test = X_combined_orth[n_train:]

    y_train = y_train.to_numpy()
    y_test = y_test.to_numpy()

    baseline_weights = get_baseline_model_coefficients(X_train, y_train)
    predictions, columns = get_predictions_columns(X_test, baseline_weights, EPSILON)

    # `predictions` holds one array per model; transpose so each model becomes a column.
    predictions_df = pd.DataFrame(predictions).transpose()
    predictions_df.columns = columns
    predictions_df["y"] = y_test
    predictions_df["person_id"] = test_idx
    predictions_df["seed"] = random_seed
    for column, values in passthrough.items():
        predictions_df[column] = values

    output.append(predictions_df)
output = pd.concat(output)

random seed 0


100%|███████████████████████████████████████| 100/100 [00:00<00:00, 2002.93it/s]


random seed 1


100%|███████████████████████████████████████| 100/100 [00:00<00:00, 2080.99it/s]


random seed 2


100%|███████████████████████████████████████| 100/100 [00:00<00:00, 2095.42it/s]


random seed 3


100%|███████████████████████████████████████| 100/100 [00:00<00:00, 2152.93it/s]


random seed 4


100%|███████████████████████████████████████| 100/100 [00:00<00:00, 2236.14it/s]


In [28]:
# Preview the first rows of the stacked predictions table.
output.head()

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_100,y,person_id,seed,qualified_cost_25,qualified_cost_50,qualified_cost_75,qualified_gagne_1,qualified_gagne_2,qualified_gagne_3
0,-0.006895,-0.00686,-0.006961,-0.006942,-0.006943,-0.00696,-0.00685,-0.006885,-0.006895,-0.006859,...,-0.006749,0.009628,2545,0,0,1,1,1,0,0
1,-0.000319,-0.00029,-0.000192,-0.00029,-0.000243,-0.000289,-0.000264,-0.000278,-0.000243,-0.000281,...,-0.000296,0.004905,8198,0,0,0,1,1,1,0
2,0.005642,0.005514,0.00564,0.005482,0.005524,0.005469,0.005553,0.005569,0.005633,0.005514,...,0.005221,0.009446,46461,0,0,1,1,0,0,0
3,-0.011559,-0.011712,-0.011597,-0.011541,-0.011707,-0.011757,-0.01171,-0.01172,-0.011693,-0.011781,...,-0.011596,0.002361,30620,0,0,0,1,0,0,0
4,-0.002397,-0.002779,-0.002622,-0.002816,-0.002862,-0.002866,-0.002767,-0.002808,-0.002819,-0.002803,...,-0.002643,0.003996,47418,0,0,0,1,1,1,0


In [29]:
# Summarize row count, column count, dtypes and memory footprint of the stacked table.
output.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80495 entries, 0 to 16098
Columns: 109 entries, m_1 to qualified_gagne_3
dtypes: float64(101), int64(8)
memory usage: 67.6 MB


In [30]:
# Persist the predictions. index=False omits the row index, which repeats across
# splits after pd.concat; rows remain identifiable via the person_id and seed columns.
output.to_csv(prediction_output, index=False)

In [35]:
# Mean squared error of each perturbed model against y, pooled over all splits in `output`.
# Iterate over NUM_PERTURBED_MODELS rather than a literal 100 so this check stays
# in sync with the number of m_* columns that were generated.
for i in range(NUM_PERTURBED_MODELS):
    mse = np.mean((output['m_'+str(i+1)] - output['y']) ** 2)
    print(mse)

0.0010677815574273667
0.00106787081786986
0.0010676674279346296
0.001067718680443427
0.0010677464965612336
0.0010677849193506351
0.0010677863113521281
0.001067540876452576
0.0010678586659573586
0.001067925883381988
0.0010678195320781242
0.0010678285896931808
0.0010677829885522227
0.0010678810264436918
0.001067758871357681
0.0010674680749241897
0.0010676986971594083
0.0010679853835873813
0.0010675868687960103
0.0010678739521049806
0.0010676416261707015
0.0010677156721232409
0.001067804030095661
0.0010677097401678734
0.0010679343698735573
0.0010677526137731904
0.0010678201428109186
0.0010677689881927435
0.0010678335320472923
0.0010678442800411718
0.0010678378627502626
0.0010678914417320752
0.001067803187921204
0.0010679374317967784
0.0010678125289776098
0.0010675241052864868
0.0010678651249766053
0.0010678116743822463
0.0010677434685988057
0.0010677467579614797
0.0010677164080885183
0.0010678590148270595
0.001067896363974434
0.001068001363379565
0.0010678442978597454
0.001067806079723360