In [1]:
import pandas as pd
import numpy as np
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier
from sklearn.model_selection import train_test_split
import time
from sklearn.linear_model import LogisticRegression
from fasterrisk.binarization_util import convert_continuous_df_to_binary_df
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [2]:
num_random_seeds = 1

sparsity = [5, 6, 7, 8, 9, 10]
coeff_bounds = [5, 10, 15]

TARGET_VARIABLE = "qualified_gagne_2"
prediction_output = "predictions/obermeyer/fasterrisk_"+TARGET_VARIABLE+".csv"

In [3]:
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"
features = ['dem_female', 'dem_age_band_18-24_tm1', 'dem_age_band_25-34_tm1', 'dem_age_band_35-44_tm1', 'dem_age_band_45-54_tm1',
            'dem_age_band_55-64_tm1', 'dem_age_band_65-74_tm1', 'dem_age_band_75+_tm1', 'hypertension_elixhauser_tm1', 'cost_dialysis_tm1',
            'cost_emergency_tm1', 'cost_home_health_tm1', 'cost_ip_medical_tm1', 'cost_ip_surgical_tm1', 'cost_laboratory_tm1',
            'cost_op_primary_care_tm1', 'cost_op_specialists_tm1', 'cost_op_surgery_tm1', 'cost_other_tm1', 'cost_pharmacy_tm1',
            'cost_physical_therapy_tm1', 'cost_radiology_tm1', 'gagne_sum_tm1']
other_variables = ['person_id', 'gagne_sum_t', 'cost_t']

In [4]:
df = pd.read_csv(data_source)
X = df[features]
y = df[[TARGET_VARIABLE]+other_variables]
y.loc[:, TARGET_VARIABLE] = (y.loc[:, TARGET_VARIABLE]*2)-1

X_binarized_df, featureIndex_to_groupIndex = convert_continuous_df_to_binary_df(X, get_featureIndex_to_groupIndex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y[TARGET_VARIABLE] = (y[TARGET_VARIABLE]*2)-1


Converting continuous features to binary features in the dataframe......
We select thresholds for each continuous feature by sampling (without replacement) <= max_num_thresholds_per_feature values from all unique values in that feature column.
Finish converting continuous features to binary features......


In [5]:
output = []

for random_seed in range(num_random_seeds):
    print("Random Seed:", random_seed)
    X_train, X_test, y_train, y_test = train_test_split(X_binarized_df, y, test_size=0.20, random_state=random_seed)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=random_seed)
    
    cost = y_test["cost_t"].to_numpy()
    gagne = y_test["gagne_sum_t"].to_numpy()
    person_id = y_test['person_id'].to_numpy()

    y_train = y_train[TARGET_VARIABLE].to_numpy()
    y_val = y_val[TARGET_VARIABLE].to_numpy()
    y_test = y_test[TARGET_VARIABLE].to_numpy()
    X_train = X_train.to_numpy()
    X_val = X_val.to_numpy()
    X_test = X_test.to_numpy()

    predictions = {}
    training_loss = {}
    validation_loss = {}
    i = 0
    
    for k in sparsity:
        for b in coeff_bounds:
            RiskScoreOptimizer_m = RiskScoreOptimizer(X = X_train, y = y_train, k = k, lb = -b, ub = b,
                                                     select_top_m=100, parent_size=20, num_ray_search=40)
                                              #group_sparsity = 3, \
                                              #featureIndex_to_groupIndex = featureIndex_to_groupIndex)    
            RiskScoreOptimizer_m.optimize()
            multipliers, sparseDiversePool_beta0_integer, sparseDiversePool_betas_integer = RiskScoreOptimizer_m.get_models()

            print(k, b, "We generate {} risk score models from the sparse diverse pool".format(len(multipliers)))

            val_loss = []
            for model_index in range(len(multipliers)):
                multiplier = multipliers[model_index]
                intercept = sparseDiversePool_beta0_integer[model_index]
                coefficients = sparseDiversePool_betas_integer[model_index]
                RiskScoreClassifier_m = RiskScoreClassifier(multiplier, intercept, coefficients, X_train = X_train)
                
                predictions[f'm_{i+1}'] = RiskScoreClassifier_m.predict_prob(X_test)
                training_loss[f'm_{i+1}'] = log_loss(y_train, RiskScoreClassifier_m.predict_prob(X_train))
                validation_loss[f'm_{i+1}'] = log_loss(y_val, RiskScoreClassifier_m.predict_prob(X_val))
                val_loss.append(log_loss(y_val, RiskScoreClassifier_m.predict_prob(X_val)))
                i += 1
            print(np.mean(val_loss))
            print()
            
    predictions_df = pd.concat([
        pd.DataFrame(predictions),
        pd.DataFrame(training_loss, index=[0]),
        pd.DataFrame(validation_loss, index=[0])]).reset_index(drop=True)
    
    predictions_df["y"] = np.concatenate([y_test, [np.nan, np.nan]])
    predictions_df["person_id"] = np.concatenate([person_id, [-2, -1]]) # -1 indicates validation loss, -2 indicates training loss
    predictions_df['cost_t'] = np.concatenate([cost, [np.nan, np.nan]]) 
    predictions_df['gagne_sum_t'] = np.concatenate([gagne, [np.nan, np.nan]])
    predictions_df["seed"] = random_seed

    output.append(predictions_df)
    
output = pd.concat(output)
for c in output.columns:
    if c.startswith("m_"):
        output[c] = output[c].astype('float32')

Random Seed: 0
5 5 We generate 84 risk score models from the sparse diverse pool
0.2896223875720162

5 10 We generate 100 risk score models from the sparse diverse pool
0.2897917168263317

5 15 We generate 100 risk score models from the sparse diverse pool
0.28967314827369384

6 5 We generate 84 risk score models from the sparse diverse pool
0.2892845315626386

6 10 We generate 100 risk score models from the sparse diverse pool
0.2880386389099772

6 15 We generate 100 risk score models from the sparse diverse pool
0.288060961078057

7 5 We generate 82 risk score models from the sparse diverse pool
0.2892658217783924

7 10 We generate 98 risk score models from the sparse diverse pool
0.2868125495492103

7 15 We generate 100 risk score models from the sparse diverse pool
0.28665173505892255

8 5 We generate 86 risk score models from the sparse diverse pool
0.2871914591557787

8 10 We generate 100 risk score models from the sparse diverse pool
0.28556188268887583

8 15 We generate 100 ris

In [6]:
output.to_csv(prediction_output, index=False)

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_1673,m_1674,m_1675,m_1676,m_1677,y,person_id,cost_t,gagne_sum_t,seed
0,0.111365,0.109609,0.114006,0.109575,0.112977,0.092255,0.112993,0.112984,0.112886,0.112885,...,0.186702,0.186631,0.186546,0.186536,0.186467,-1.0,2545,5300.0,1.0,0
1,0.799726,0.801626,0.796891,0.801664,0.797992,0.681817,0.797975,0.797985,0.798090,0.798091,...,0.773174,0.773242,0.773324,0.773334,0.773400,1.0,8198,2700.0,2.0,0
2,0.111365,0.109609,0.114006,0.109575,0.112977,0.092255,0.112993,0.112984,0.112886,0.112885,...,0.079246,0.079189,0.079121,0.079113,0.079058,-1.0,46461,5200.0,0.0,0
3,0.030429,0.029563,0.016288,0.029546,0.031235,0.021654,0.031243,0.031239,0.031189,0.031189,...,0.024627,0.024599,0.024566,0.024562,0.024535,-1.0,30620,1300.0,0.0,0
4,0.799726,0.801626,0.796891,0.801664,0.797992,0.681817,0.797975,0.797985,0.798090,0.798091,...,0.813298,0.813369,0.813454,0.813464,0.813533,1.0,47418,2200.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9754,0.888635,0.890391,0.885994,0.942321,0.887023,0.821167,0.887007,0.887016,0.887114,0.887115,...,0.900910,0.900972,0.901047,0.901056,0.901117,1.0,16951,3600.0,5.0,0
9755,0.799726,0.801626,0.796891,0.801664,0.797992,0.681817,0.797975,0.797985,0.798090,0.798091,...,0.773174,0.773242,0.773324,0.773334,0.773400,1.0,35677,4100.0,2.0,0
9756,0.030429,0.029563,0.031755,0.029546,0.031235,0.021654,0.031243,0.031239,0.031189,0.031189,...,0.039603,0.039565,0.039519,0.039513,0.039476,-1.0,27577,3200.0,1.0,0
9757,0.280880,0.281003,0.281237,0.281314,0.281352,0.281358,0.281374,0.281375,0.281410,0.281439,...,0.275322,0.275357,0.275361,0.275361,0.275366,,-2,,,0
