In [1]:
import pandas as pd
import numpy as np
from fasterrisk.fasterrisk import RiskScoreOptimizer, RiskScoreClassifier
from sklearn.model_selection import train_test_split
import time
from sklearn.linear_model import LogisticRegression
from fasterrisk.binarization_util import convert_continuous_df_to_binary_df
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [8]:
num_random_seeds = 2

sparsity = [5] #[5, 6, 7, 8, 9, 10]
coeff_bounds = [5] #[5, 10, 15]

TARGET_VARIABLE = "qualified_gagne_2"
prediction_output = "predictions/obermeyer/fasterrisk_"+TARGET_VARIABLE+".csv"

In [9]:
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"
features = ['dem_female', 'dem_age_band_18-24_tm1', 'dem_age_band_25-34_tm1', 'dem_age_band_35-44_tm1', 'dem_age_band_45-54_tm1',
            'dem_age_band_55-64_tm1', 'dem_age_band_65-74_tm1', 'dem_age_band_75+_tm1', 'hypertension_elixhauser_tm1', 'cost_dialysis_tm1',
            'cost_emergency_tm1', 'cost_home_health_tm1', 'cost_ip_medical_tm1', 'cost_ip_surgical_tm1', 'cost_laboratory_tm1',
            'cost_op_primary_care_tm1', 'cost_op_specialists_tm1', 'cost_op_surgery_tm1', 'cost_other_tm1', 'cost_pharmacy_tm1',
            'cost_physical_therapy_tm1', 'cost_radiology_tm1', 'gagne_sum_tm1']
other_variables = ['person_id', 'gagne_sum_t', 'cost_t']

In [10]:
df = pd.read_csv(data_source)
X = df[features]
y = df[[TARGET_VARIABLE]+other_variables]
y.loc[:, TARGET_VARIABLE] = (y[TARGET_VARIABLE] * 2) - 1

X_binarized_df, featureIndex_to_groupIndex = convert_continuous_df_to_binary_df(X, get_featureIndex_to_groupIndex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y.loc[:, TARGET_VARIABLE] = (y[TARGET_VARIABLE] * 2) - 1


Converting continuous features to binary features in the dataframe......
We select thresholds for each continuous feature by sampling (without replacement) <= max_num_thresholds_per_feature values from all unique values in that feature column.
Finish converting continuous features to binary features......


In [11]:
output = []

for random_seed in range(num_random_seeds):
    X_train, X_test, y_train, y_test = train_test_split(X_binarized_df, y, test_size=0.20, random_state=random_seed)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=random_seed)
    
    cost = y_test["cost_t"].to_numpy()
    gagne = y_test["gagne_sum_t"].to_numpy()
    person_id = y_test['person_id'].to_numpy()

    y_train = y_train[TARGET_VARIABLE].to_numpy()
    y_val = y_val[TARGET_VARIABLE].to_numpy()
    y_test = y_test[TARGET_VARIABLE].to_numpy()
    X_train = X_train.to_numpy()
    X_val = X_val.to_numpy()
    X_test = X_test.to_numpy()

    predictions = {}
    training_loss = {}
    validation_loss = {}
    i = 0
    
    for k in sparsity:
        for b in coeff_bounds:
            print("Random Seed, k, b:", random_seed, k, b)

            RiskScoreOptimizer_m = RiskScoreOptimizer(X = X_train, y = y_train, k = k, lb = -b, ub = b,
                                                     select_top_m=100, parent_size=20, num_ray_search=40)
                                              #group_sparsity = 3, \
                                              #featureIndex_to_groupIndex = featureIndex_to_groupIndex)    
            RiskScoreOptimizer_m.optimize()
            multipliers, sparseDiversePool_beta0_integer, sparseDiversePool_betas_integer = RiskScoreOptimizer_m.get_models()

            val_loss = []
            for model_index in range(len(multipliers)):
                multiplier = multipliers[model_index]
                intercept = sparseDiversePool_beta0_integer[model_index]
                coefficients = sparseDiversePool_betas_integer[model_index]
                RiskScoreClassifier_m = RiskScoreClassifier(multiplier, intercept, coefficients, X_train = X_train)
                
                predictions[f'm_{i+1}'] = RiskScoreClassifier_m.predict_prob(X_test)
                training_loss[f'm_{i+1}'] = log_loss(y_train, RiskScoreClassifier_m.predict_prob(X_train))
                validation_loss[f'm_{i+1}'] = log_loss(y_val, RiskScoreClassifier_m.predict_prob(X_val))
                val_loss.append(log_loss(y_val, RiskScoreClassifier_m.predict_prob(X_val)))
                i += 1
            print(np.mean(val_loss))
            print()
            
    predictions_df = pd.concat([
        pd.DataFrame(predictions),
        pd.DataFrame(training_loss, index=[0]),
        pd.DataFrame(validation_loss, index=[0])]).reset_index(drop=True)
    
    predictions_df["y"] = np.concatenate([y_test, [np.nan, np.nan]])
    predictions_df["person_id"] = np.concatenate([person_id, [-2, -1]]) # -1 indicates validation loss, -2 indicates training loss
    predictions_df['cost_t'] = np.concatenate([cost, [np.nan, np.nan]]) 
    predictions_df['gagne_sum_t'] = np.concatenate([gagne, [np.nan, np.nan]])
    predictions_df["seed"] = random_seed

    output.append(predictions_df)
    
output = pd.concat(output)
output.loc[:, "y"] = (output["y"] + 1) / 2
for c in output.columns:
    if c.startswith("m_"):
        output[c] = output[c].astype('float32')

Random Seed, k, b: 0 5 5
0.2896223875720162

Random Seed, k, b: 1 5 5
0.2787109538982964



In [12]:
output.to_csv(prediction_output, index=False)