In [8]:
import pandas as pd
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
import random
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [10]:
# PREDICTION VARIABLES
num_random_seeds = 2
iterations_per_seed = 10
test_size = 0.20
validation_size = 0.25

MIN_EPOCH = 150
MAX_EPOCH = 250
NNEURON = 100
NLAYER = 1
BETA1 = 0.9
BETA2 = 0.999
TRAINLR = 0.001
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

TARGET_VARIABLE = "qualified_gagne_2"
prediction_output = "predictions/obermeyer/nn_variance_"+TARGET_VARIABLE+".csv"

In [11]:
# SETUP VARIABLES
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"
features = ['dem_female', 'dem_age_band_18-24_tm1', 'dem_age_band_25-34_tm1', 'dem_age_band_35-44_tm1', 'dem_age_band_45-54_tm1',
            'dem_age_band_55-64_tm1', 'dem_age_band_65-74_tm1', 'dem_age_band_75+_tm1', 'hypertension_elixhauser_tm1', 'cost_dialysis_tm1',
            'cost_emergency_tm1', 'cost_home_health_tm1', 'cost_ip_medical_tm1', 'cost_ip_surgical_tm1', 'cost_laboratory_tm1',
            'cost_op_primary_care_tm1', 'cost_op_specialists_tm1', 'cost_op_surgery_tm1', 'cost_other_tm1', 'cost_pharmacy_tm1',
            'cost_physical_therapy_tm1', 'cost_radiology_tm1', 'gagne_sum_tm1']
other_variables = ['person_id', 'gagne_sum_t', 'cost_t']

In [12]:
df = pd.read_csv(data_source)
X = df[features+other_variables]
y = df[TARGET_VARIABLE]

In [13]:
## Multi-layer preceptron with weight perturbation
class MLP(nn.Module):
    def __init__(self, nn_arch):
        super(MLP, self).__init__()
        self.nfeature, self.nclass, self.nneuron, self.nlayer = nn_arch

        self.read_in = nn.Linear(self.nfeature, self.nneuron)
        self.ff = nn.Linear(self.nneuron, self.nneuron)
        self.read_out = nn.Linear(self.nneuron, self.nclass)

    def forward(self, x):
        x = self.read_in(x)
        for _ in range(self.nlayer):
            x = F.relu(self.ff(x))

        logits = self.read_out(x)
        return logits


def train_model(model, X, y, epoch, optimizer, criterion, device):
    for _ in range(epoch):
        model.train()
        optimizer.zero_grad()
        outputs = model(torch.Tensor(X).to(device))
        loss = criterion(torch.squeeze(outputs), torch.Tensor(y).type(torch.LongTensor).to(device))
        loss.backward()
        optimizer.step()

    return model

def get_next_epoch(model, X, y, optimizer, criterion, device):
    model.train()
    optimizer.zero_grad()  
    outputs = model(torch.Tensor(X).to(device))
    loss = criterion(torch.squeeze(outputs), torch.Tensor(y).type(torch.LongTensor).to(device))
    loss.backward()
    optimizer.step()
    return model

def evaluate(model, X, y, criterion, device):
    model.eval()
    logits = model(torch.Tensor(X).to(device))
    return criterion(torch.squeeze(logits), torch.Tensor(y).type(torch.LongTensor).to(device)).item()

def predict(model, X, class_target=1):
    model.eval()
    logits = model(torch.Tensor(X).to(device))
    pred = F.softmax(logits, dim=1)
    return pred[:, class_target].detach().numpy()

In [14]:
output = []

for random_seed in range(num_random_seeds):
    print("Random Seed:", random_seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_seed)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_size, random_state=random_seed)
    
    cost = X_test["cost_t"].to_numpy()
    gagne = X_test["gagne_sum_t"].to_numpy()
    person_id = X_test['person_id'].to_numpy()
    
    X_train = X_train.drop(columns=other_variables).to_numpy()
    y_train = y_train.to_numpy()
    X_val = X_val.drop(columns=other_variables).to_numpy()
    y_val = y_val.to_numpy()
    X_test = X_test.drop(columns=other_variables).to_numpy()
    y_test = y_test.to_numpy()

    nfeature = X_train.shape[1]
    nclass = len(set(y_train))
    nn_arch = [nfeature, nclass, NNEURON, NLAYER]

    predictions = {}
    training_loss = {}
    validation_loss = {}
    i = 0
    for iteration in tqdm(range(iterations_per_seed)):
        np.random.seed(iteration*random_seed)
        random.seed(iteration*random_seed)
        torch.manual_seed(iteration*random_seed)

        model = MLP(nn_arch).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), betas=[BETA1, BETA2], lr=TRAINLR)
        model = train_model(model, X_train, y_train, MIN_EPOCH-1, optimizer, criterion, device)

        for epoch in range(MIN_EPOCH, MAX_EPOCH):
            model = get_next_epoch(model, X_train, y_train, optimizer, criterion, device)
            predictions[f'm_{i+1}'] = predict(model, X_test)
            training_loss[f'm_{i+1}'] = evaluate(model, X_train, y_train, criterion, device)
            validation_loss[f'm_{i+1}'] = evaluate(model, X_val, y_val, criterion, device)
            i += 1

    predictions_df = pd.concat([
        pd.DataFrame(predictions),
        pd.DataFrame(training_loss, index=[0]),
        pd.DataFrame(validation_loss, index=[0])]).reset_index(drop=True)
    
    predictions_df["y"] = np.concatenate([y_test, [np.nan, np.nan]])
    predictions_df["person_id"] = np.concatenate([person_id, [-2, -1]]) # -1 indicates validation loss, -2 indicates training loss
    predictions_df['cost_t'] = np.concatenate([cost, [np.nan, np.nan]]) 
    predictions_df['gagne_sum_t'] = np.concatenate([gagne, [np.nan, np.nan]])
    predictions_df["seed"] = random_seed

    output.append(predictions_df)
    
output = pd.concat(output)    

Random Seed: 0


100%|██████████| 10/10 [00:59<00:00,  5.91s/it]


Random Seed: 1


100%|██████████| 10/10 [00:56<00:00,  5.69s/it]


In [86]:
output.to_csv(prediction_output, index=False)

In [7]:
output.head()

Unnamed: 0,m_1,m_2,m_3,m_4,m_5,m_6,m_7,m_8,m_9,m_10,...,m_996,m_997,m_998,m_999,m_1000,y,person_id,cost_t,gagne_sum_t,seed
0,0.284669,0.283793,0.282828,0.280251,0.279028,0.277213,0.275613,0.2753,0.274382,0.273067,...,0.226767,0.225796,0.223017,0.227865,0.220843,0.0,2545,0.009628,1.0,0
1,0.584526,0.585354,0.586762,0.58728,0.589542,0.590826,0.591272,0.592424,0.59301,0.593878,...,0.667722,0.667625,0.665167,0.671884,0.663922,1.0,8198,0.004905,2.0,0
2,0.12033,0.119835,0.119696,0.119251,0.119615,0.119472,0.118841,0.118437,0.117844,0.117455,...,0.115787,0.115565,0.114366,0.116751,0.113287,0.0,46461,0.009446,0.0,0
3,0.0268,0.026547,0.026326,0.026077,0.025937,0.025758,0.025532,0.025317,0.025043,0.024752,...,0.015689,0.015637,0.015515,0.015659,0.015348,0.0,30620,0.002361,0.0,0
4,0.617911,0.620277,0.621888,0.622014,0.622948,0.623737,0.624935,0.627131,0.628835,0.630214,...,0.700903,0.701107,0.700283,0.703297,0.700155,1.0,47418,0.003996,2.0,0
