In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [18]:
NUM_RANDOM_SEEDS = 10
MODEL_CLASS = "lr"
EPSILON = 0.05
NUM_PERTURBED_MODELS = 100

In [4]:
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"
prediction_output = "predictions/obermeyer/model_weight_perturbation_"+MODEL_CLASS+".csv"
target_variable = "cost_t"
features = ['dem_female', 'dem_age_band_18-24_tm1', 'dem_age_band_25-34_tm1', 'dem_age_band_35-44_tm1', 'dem_age_band_45-54_tm1',
            'dem_age_band_55-64_tm1', 'dem_age_band_65-74_tm1', 'dem_age_band_75+_tm1', 'hypertension_elixhauser_tm1', 'cost_dialysis_tm1',
            'cost_emergency_tm1', 'cost_home_health_tm1', 'cost_ip_medical_tm1', 'cost_ip_surgical_tm1', 'cost_laboratory_tm1',
            'cost_op_primary_care_tm1', 'cost_op_specialists_tm1', 'cost_op_surgery_tm1', 'cost_other_tm1', 'cost_pharmacy_tm1',
            'cost_physical_therapy_tm1', 'cost_radiology_tm1', 'gagne_sum_tm1']
other_variables = ['person_id', 'qualified_cost_25', 'qualified_cost_75', 'qualified_cost_50', 'qualified_gagne_1', 'qualified_gagne_2', 'qualified_gagne_3']

In [19]:
df = pd.read_csv(data_source)
X = df[features+other_variables]
y = df[target_variable]

In [None]:
def get_baseline_model_coefficients(X_train, y_train):
    if MODEL_CLASS == "lr":
        model = LinearRegression()
    model.fit(X_train, y_train)
    return model.coef_

In [21]:
def get_perturbed_weights(baseline_weights, epsilon):
    random_vector = np.random.randn(*baseline_weights.shape)
    random_vector = random_vector / np.linalg.norm(random_vector)
    perturbation = np.random.uniform(0, epsilon) * random_vector
    w = baseline_weights + perturbation
    if np.linalg.norm(w - baseline_weights) > epsilon:
        print(f"Oh no! Got a distance of {np.linalg.norm(w - baseline_weights)}")
    return w

In [22]:
def get_predictions_columns(X_test, X_train, y_train, baseline_weights, epsilon, y_true, baseline_loss):
    predictions = []
    columns = []
    count = 0
    better = 0
    for i in tqdm(range(NUM_PERTURBED_MODELS)):
        perturbed_weights = get_perturbed_weights(baseline_weights, epsilon)
        y_pred = np.dot(X_test, perturbed_weights)
        train_pred = np.dot(X_train, perturbed_weights)
        rss_loss = np.sum((y_train - train_pred) ** 2)
        if rss_loss - baseline_loss > epsilon:
            # print(f"Uh oh! Got a RSS loss of {rss_loss}") # check this against the training loss
            count += 1
            continue
        if rss_loss < baseline_loss:
            # print(f"perturbation {i} led to a better model")
            better += 1
        predictions.append(y_pred)
        columns.append(f'm_{i+1}')
    print(f"Number of models with high loss: {count}")
    print(f"Number of models with lower loss: {better}")
    return predictions, columns

In [None]:
output = []
for random_seed in range(NUM_RANDOM_SEEDS):
    print("random seed", random_seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=random_seed)
    
    cost_threshold_25 = X_test['qualified_cost_25'].to_numpy()
    cost_threshold_50 = X_test['qualified_cost_50'].to_numpy()
    cost_threshold_75 = X_test['qualified_cost_75'].to_numpy()
    gagne_threshold_1 = X_test['qualified_gagne_1'].to_numpy()
    gagne_threshold_2 = X_test['qualified_gagne_2'].to_numpy()
    gagne_threshold_3 = X_test['qualified_gagne_3'].to_numpy()
    test_idx = X_test['person_id'].to_numpy()

    X_train = X_train.drop(columns=other_variables).to_numpy()
    # print(X_train.shape)
    # basis = sp.linalg.orth(X_train)
    # basis = np.transpose(basis)
    # print(basis.shape)
    # X_train_proj = np.dot(X_train, basis)
    # print(X_train_proj.shape)
    y_train = y_train.to_numpy()
    # print(y_train.shape)
    X_test = X_test.drop(columns=other_variables).to_numpy()
    # X_test_proj = np.dot(X_test, basis)
    y_test = y_test.to_numpy()


    intercept_idx = 0
    Xf_train = np.insert(X_train, intercept_idx, 1.0, axis=1)
    print(Xf_train.shape)
    X_train_proj = sp.linalg.orth(X_train)
    print(X_train_proj.shape)
    # Xf_test = np.insert(X_train, intercept_idx, 1.0, axis=1)
    X_test_proj = sp.linalg.orth(X_test)
    # X_train_proj = X_train
    # X_test_proj = X_test
    # print("arrays all set up")

    baseline_weights = get_baseline_model_coefficients(X_train_proj, y_train)
    baseline_prediction = np.dot(X_test_proj, baseline_weights)
    baseline_loss = np.sum((y_test - baseline_prediction) ** 2)
    print(f"baseline loss {baseline_loss}")
    predictions, columns = get_predictions_columns(X_test_proj, X_train_proj, y_train, baseline_weights, EPSILON, y_test, baseline_loss)
    
    
    predictions_df = pd.DataFrame(predictions).transpose()
    predictions_df.columns=columns
    predictions_df["y"] = y_test
    predictions_df["person_id"] = test_idx 
    predictions_df["seed"] = random_seed
    
    predictions_df['qualified_cost_25'] = cost_threshold_25
    predictions_df['qualified_cost_50'] = cost_threshold_50
    predictions_df['qualified_cost_75'] = cost_threshold_75
    predictions_df['qualified_gagne_1'] = gagne_threshold_1
    predictions_df['qualified_gagne_2'] = gagne_threshold_2
    predictions_df['qualified_gagne_3'] = gagne_threshold_3

    output.append(predictions_df)
    print()
output = pd.concat(output)

random seed 0
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 21.541711873644218


100%|██████████| 100/100 [00:00<00:00, 466.08it/s]


Number of models with high loss: 100
Number of models with lower loss: 0

random seed 1
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 20.23699967509895


100%|██████████| 100/100 [00:00<00:00, 274.93it/s]


Number of models with high loss: 100
Number of models with lower loss: 0

random seed 2
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 29.938669394658575


100%|██████████| 100/100 [00:00<00:00, 287.22it/s]


Number of models with high loss: 0
Number of models with lower loss: 100

random seed 3
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 26.492346390315113


100%|██████████| 100/100 [00:00<00:00, 610.93it/s]


Number of models with high loss: 100
Number of models with lower loss: 0

random seed 4
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 18.077839767411866


100%|██████████| 100/100 [00:00<00:00, 512.29it/s]


Number of models with high loss: 100
Number of models with lower loss: 0

random seed 5
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 21.410827405051165


100%|██████████| 100/100 [00:00<00:00, 274.27it/s]


Number of models with high loss: 100
Number of models with lower loss: 0

random seed 6
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 34.546349138162434


100%|██████████| 100/100 [00:00<00:00, 554.00it/s]


Number of models with high loss: 0
Number of models with lower loss: 100

random seed 7
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 23.272498343866864


100%|██████████| 100/100 [00:00<00:00, 346.13it/s]


Number of models with high loss: 100
Number of models with lower loss: 0

random seed 8
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 23.74339618698992


100%|██████████| 100/100 [00:00<00:00, 381.07it/s]


Number of models with high loss: 100
Number of models with lower loss: 0

random seed 9
(32685, 24)
(32685, 23)
beginning to train model
model has been trained!
baseline loss 23.987708565133435


100%|██████████| 100/100 [00:00<00:00, 338.53it/s]


Number of models with high loss: 100
Number of models with lower loss: 0



In [28]:
output.head()

Unnamed: 0,y,person_id,seed,qualified_cost_25,qualified_cost_50,qualified_cost_75,qualified_gagne_1,qualified_gagne_2,qualified_gagne_3,m_1,...,m_91,m_92,m_93,m_94,m_95,m_96,m_97,m_98,m_99,m_100
0,0.009628,2545,0,0,1,1,1,0,0,,...,,,,,,,,,,
1,0.004905,8198,0,0,0,1,1,1,0,,...,,,,,,,,,,
2,0.009446,46461,0,0,1,1,0,0,0,,...,,,,,,,,,,
3,0.002361,30620,0,0,0,1,0,0,0,,...,,,,,,,,,,
4,0.003996,47418,0,0,0,1,1,1,0,,...,,,,,,,,,,


In [29]:
output.info()

<class 'pandas.core.frame.DataFrame'>
Index: 160990 entries, 0 to 16098
Columns: 109 entries, y to m_100
dtypes: float64(101), int64(8)
memory usage: 135.1 MB


In [30]:
output.to_csv(prediction_output, index=False)