In [8]:
import os
import random

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch import nn
from tqdm import tqdm

In [10]:
# PREDICTION VARIABLES

# Experiment design
num_random_seeds = 10      # number of train/validation/test splits (one per split seed)
iterations_per_seed = 10   # networks trained on each split
test_size = 0.20           # share of all rows held out as the test set
validation_size = 0.25     # share of the remaining rows used for validation

# Training schedule: per-epoch snapshots are recorded for epochs MIN_EPOCH..MAX_EPOCH-1
MIN_EPOCH = 200
MAX_EPOCH = 300

# Network size
NNEURON = 100              # hidden width
NLAYER = 1                 # number of hidden passes

# Adam optimiser settings
BETA1 = 0.9
BETA2 = 0.999
TRAINLR = 0.001

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Target column and the CSV file the predictions are written to
TARGET_VARIABLE = "qualified_gagne_2"
prediction_output = f"predictions/obermeyer/nn_variance_{TARGET_VARIABLE}.csv"

In [11]:
# SETUP VARIABLES
data_source = "data/obermeyer/obermeyer_data_cleaned.csv"

# Model inputs, in the column order fed to the network.
features = [
    # demographics
    'dem_female',
    'dem_age_band_18-24_tm1',
    'dem_age_band_25-34_tm1',
    'dem_age_band_35-44_tm1',
    'dem_age_band_45-54_tm1',
    'dem_age_band_55-64_tm1',
    'dem_age_band_65-74_tm1',
    'dem_age_band_75+_tm1',
    # comorbidity flag
    'hypertension_elixhauser_tm1',
    # cost columns
    'cost_dialysis_tm1',
    'cost_emergency_tm1',
    'cost_home_health_tm1',
    'cost_ip_medical_tm1',
    'cost_ip_surgical_tm1',
    'cost_laboratory_tm1',
    'cost_op_primary_care_tm1',
    'cost_op_specialists_tm1',
    'cost_op_surgery_tm1',
    'cost_other_tm1',
    'cost_pharmacy_tm1',
    'cost_physical_therapy_tm1',
    'cost_radiology_tm1',
    # comorbidity score
    'gagne_sum_tm1',
]

# Columns carried alongside the features for reporting; they are dropped
# before the data reaches the model.
other_variables = [
    'person_id',
    'gagne_sum_t',
    'cost_t',
]

In [12]:
# Read the cleaned dataset, then separate the model/reporting columns (X)
# from the target column (y).
df = pd.read_csv(data_source)
X = df[[*features, *other_variables]]
y = df[TARGET_VARIABLE]

In [13]:
## Multi-layer perceptron classifier
class MLP(nn.Module):
    """Feed-forward classifier: linear read-in, repeated hidden layer, linear read-out.

    ``nn_arch`` is a 4-item sequence ``(nfeature, nclass, nneuron, nlayer)``.

    Notes:
        * A single ``ff`` layer is applied ``nlayer`` times, so its weights are
          shared by every hidden pass.
        * The read-in projection is not followed by an activation of its own;
          ReLU is applied only after each ``ff`` pass.
        * The class itself applies no weight perturbation or dropout.
    """

    def __init__(self, nn_arch):
        super().__init__()
        n_in, n_out, width, depth = nn_arch
        self.nfeature = n_in
        self.nclass = n_out
        self.nneuron = width
        self.nlayer = depth

        # Layers are constructed in the order read_in -> ff -> read_out.
        self.read_in = nn.Linear(n_in, width)
        self.ff = nn.Linear(width, width)
        self.read_out = nn.Linear(width, n_out)

    def forward(self, x):
        """Return unnormalised class scores (logits) for ``x``."""
        hidden = self.read_in(x)
        for _ in range(self.nlayer):
            hidden = self.ff(hidden)
            hidden = F.relu(hidden)
        return self.read_out(hidden)


def train_model(model, X, y, epoch, optimizer, criterion, device):
    """Run ``epoch`` full-batch optimisation steps on ``(X, y)`` and return the model.

    Args:
        model: network to train; updated in place.
        X: feature data accepted by ``torch.Tensor(...)``; presumably laid out as
            (n_samples, n_features) -- confirm against the caller.
        y: integer class labels accepted by ``torch.Tensor(...)``.
        epoch: number of gradient steps; nothing happens when it is <= 0.
        optimizer: optimiser already bound to ``model``'s parameters.
        criterion: loss called as ``criterion(logits, targets)``.
        device: device the input tensors are moved to.

    Returns:
        The same ``model`` object, left in training mode if any step ran.
    """
    if epoch <= 0:
        return model

    # The data never changes between epochs, so convert and transfer it once
    # instead of rebuilding both tensors on every step.
    inputs = torch.Tensor(X).to(device)
    targets = torch.Tensor(y).type(torch.LongTensor).to(device)

    model.train()
    for _ in range(epoch):
        optimizer.zero_grad()
        outputs = model(inputs)
        # Flatten to (n_samples, n_classes). A bare squeeze would also drop the
        # batch axis when there is a single sample and make the loss call fail.
        logits = outputs.reshape(-1, outputs.shape[-1])
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

    return model

def get_next_epoch(model, X, y, optimizer, criterion, device):
    """Advance ``model`` by exactly one full-batch optimisation step.

    Args:
        model: network to update in place; switched to training mode.
        X: feature data accepted by ``torch.Tensor(...)``.
        y: integer class labels accepted by ``torch.Tensor(...)``.
        optimizer: optimiser already bound to ``model``'s parameters.
        criterion: loss called as ``criterion(logits, targets)``.
        device: device the input tensors are moved to.

    Returns:
        The same ``model`` object after one gradient step.
    """
    model.train()
    optimizer.zero_grad()
    outputs = model(torch.Tensor(X).to(device))
    targets = torch.Tensor(y).type(torch.LongTensor).to(device)
    # Flatten to (n_samples, n_classes). A bare squeeze would also drop the
    # batch axis when there is a single sample and make the loss call fail.
    logits = outputs.reshape(-1, outputs.shape[-1])
    loss = criterion(logits, targets)
    loss.backward()
    optimizer.step()
    return model

def evaluate(model, X, y, criterion, device):
    """Return the loss of ``model`` on ``(X, y)`` as a Python float.

    The model is put in evaluation mode and the forward pass runs without
    gradient tracking, so no autograd graph is built for a value that is only
    read back with ``.item()``.

    Args:
        model: network to evaluate; left in evaluation mode.
        X: feature data accepted by ``torch.Tensor(...)``.
        y: integer class labels accepted by ``torch.Tensor(...)``.
        criterion: loss called as ``criterion(logits, targets)``.
        device: device the input tensors are moved to.

    Returns:
        float: the scalar loss value.
    """
    model.eval()
    with torch.no_grad():
        outputs = model(torch.Tensor(X).to(device))
        targets = torch.Tensor(y).type(torch.LongTensor).to(device)
        # Flatten to (n_samples, n_classes). A bare squeeze would also drop the
        # batch axis when there is a single sample and make the loss call fail.
        logits = outputs.reshape(-1, outputs.shape[-1])
        return criterion(logits, targets).item()

def predict(model, X, class_target=1):
    """Return the softmax probability of class ``class_target`` for every row of ``X``.

    The input is moved to the device the model's parameters live on (CPU when
    the model has no parameters), so the function does not rely on a
    notebook-level ``device`` variable being defined.

    Args:
        model: network producing logits with the class axis at dimension 1.
        X: feature data accepted by ``torch.Tensor(...)``.
        class_target: index of the class whose probability is returned.

    Returns:
        numpy.ndarray: one probability per row of ``X``.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")
    # Inference only: skip gradient tracking to avoid building an autograd graph.
    with torch.no_grad():
        logits = model(torch.Tensor(X).to(model_device))
        probabilities = F.softmax(logits, dim=1)
    return probabilities[:, class_target].cpu().numpy()

In [14]:
# Variance experiment: for every data split (`random_seed`) train
# `iterations_per_seed` networks and, after each epoch in
# [MIN_EPOCH, MAX_EPOCH), record test-set predictions together with the
# training and validation loss. Every (iteration, epoch) snapshot becomes one
# `m_<k>` column of the per-split frame.
seed_frames = []

for random_seed in range(num_random_seeds):
    print("Random Seed:", random_seed)
    # First split off the test set, then carve the validation set out of what remains.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_seed)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=validation_size, random_state=random_seed)

    # Reporting columns for the test rows; kept for the output, never fed to the model.
    cost = X_test["cost_t"].to_numpy()
    gagne = X_test["gagne_sum_t"].to_numpy()
    person_id = X_test['person_id'].to_numpy()

    X_train = X_train.drop(columns=other_variables).to_numpy()
    y_train = y_train.to_numpy()
    X_val = X_val.drop(columns=other_variables).to_numpy()
    y_val = y_val.to_numpy()
    X_test = X_test.drop(columns=other_variables).to_numpy()
    y_test = y_test.to_numpy()

    nfeature = X_train.shape[1]
    nclass = len(set(y_train))
    nn_arch = [nfeature, nclass, NNEURON, NLAYER]

    predictions = {}
    training_loss = {}
    validation_loss = {}
    snapshot = 0
    for iteration in tqdm(range(iterations_per_seed)):
        # Each (split, iteration) pair gets its own RNG seed. A plain product
        # `iteration * random_seed` is 0 whenever either factor is 0 and repeats
        # for swapped pairs, which makes supposedly independent runs identical.
        run_seed = random_seed * iterations_per_seed + iteration
        np.random.seed(run_seed)
        random.seed(run_seed)
        torch.manual_seed(run_seed)

        model = MLP(nn_arch).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), betas=(BETA1, BETA2), lr=TRAINLR)
        # Train MIN_EPOCH-1 epochs up front; the first pass of the loop below
        # is then epoch number MIN_EPOCH.
        model = train_model(model, X_train, y_train, MIN_EPOCH-1, optimizer, criterion, device)

        for epoch in range(MIN_EPOCH, MAX_EPOCH):
            model = get_next_epoch(model, X_train, y_train, optimizer, criterion, device)
            snapshot += 1
            column = f'm_{snapshot}'
            predictions[column] = predict(model, X_test)
            training_loss[column] = evaluate(model, X_train, y_train, criterion, device)
            validation_loss[column] = evaluate(model, X_val, y_val, criterion, device)

    # Row layout: one row per test record, then one training-loss row and one
    # validation-loss row. Only m_* columns exist at this point, so the whole
    # frame is stored as float32 before the reporting columns are attached.
    predictions_df = pd.concat([
        pd.DataFrame(predictions),
        pd.DataFrame(training_loss, index=[0]),
        pd.DataFrame(validation_loss, index=[0])]).reset_index(drop=True).astype('float32')

    predictions_df["y"] = np.concatenate([y_test, [np.nan, np.nan]])
    predictions_df["person_id"] = np.concatenate([person_id, [-2, -1]]) # -1 indicates validation loss, -2 indicates training loss
    predictions_df['cost_t'] = np.concatenate([cost, [np.nan, np.nan]])
    predictions_df['gagne_sum_t'] = np.concatenate([gagne, [np.nan, np.nan]])
    predictions_df["seed"] = random_seed

    seed_frames.append(predictions_df)

# Stack the per-split frames; the m_* columns are already float32 in each of them.
output = pd.concat(seed_frames)

Random Seed: 0


100%|██████████| 10/10 [00:59<00:00,  5.91s/it]


Random Seed: 1


100%|██████████| 10/10 [00:56<00:00,  5.69s/it]


In [86]:
# Create the destination folder first: `to_csv` does not create missing
# directories, and failing here would discard the whole training run.
os.makedirs(os.path.dirname(prediction_output) or ".", exist_ok=True)
output.to_csv(prediction_output, index=False)