# Labeling alerts based on existing tags

In [None]:
import os.path
import copy
import pickle
import pandas
import numpy
import numpy.random

# Old-fashioned learning
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, f1_score

# Deep learning
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
import torch.nn.functional as F
from torch.autograd import Variable

from intensix.monitor.models import Alert

In [None]:
# Relative locations of the input data and of the saved models.
DATA = "../data"
MODELS = "../models"

STAY_TAGS = os.path.join(DATA, "stay_tags.pkl")
SOMESTAYS = os.path.join(DATA, "stays-3-14-days")

# Tags per stay; label_alerts() looks stays up in this mapping by stay id.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open(STAY_TAGS, "rb") as f:
    stay_tags = pickle.load(f)

# One stay id per line, with surrounding whitespace stripped.
with open(SOMESTAYS) as f:
    somestays = [line.strip() for line in f]

## Function for labeling a single stay

In [None]:
def label_alerts(stayid, df, alerts):
    """Return a copy of ``alerts`` with a trailing tag-based label column.

    The new column starts out as 0. When ``stayid`` has entries in the
    module-level ``stay_tags``, the times of the tags whose ``concept``
    contains 'deterioration' are converted to whole minutes elapsed since
    ``df.index[0]``, de-duplicated and sorted. The rows of ``alerts`` are
    then walked in order against those tag times: tag times smaller than the
    row's column-0 value are skipped for good, and the row gets label 1 if a
    tag time is left over. As soon as the tag times run out, the remaining
    rows keep label 0.

    NOTE(review): column 0 of ``alerts`` is presumably the alert time in
    minutes since the start of the stay, in ascending order -- confirm.
    NOTE(review): under that assumption every alert up to the latest
    deterioration tag is labeled positive, however long before the tag it
    was raised -- confirm that this is the intended labeling.
    """
    labeled = numpy.append(alerts, numpy.zeros_like(alerts[:, -1:]), axis=1)
    if stayid not in stay_tags:
        return labeled

    tag_minutes = {
        (pandas.to_datetime(tag['time']) - df.index[0]).total_seconds() // 60
        for tag in stay_tags[stayid]
        if 'deterioration' in tag['concept']
    }
    pending = iter(sorted(tag_minutes))
    current = next(pending, None)
    # Each row is a view into `labeled`, so writing to it updates the result.
    for row in labeled:
        # Drop tag times that lie before this alert.
        while current is not None and row[0] > current:
            current = next(pending, None)
        if current is None:
            break
        row[-1] = 1
    return labeled

## Loop over the dataset 

In [None]:
# Label the alerts of every stay listed in `somestays`, save the per-stay
# result, and accumulate the combined dataset together with the counts of
# positive (POS) and total (TOT) alerts. Prints "+" for each stay whose
# input files were loaded and "-" for each stay with a missing file.
POS = 0
TOT = 0
dataset = []
for stayid in somestays:
    # Only loading the two input files is expected to fail; keeping the try
    # body this small means a FileNotFoundError raised while labeling or
    # saving is not silently reported as a missing stay.
    try:
        with open(os.path.join(DATA, "monitor-dataset-{}.pkl".format(stayid)),
                  "rb") as f:
            # NOTE: unpickling can execute arbitrary code -- trusted files only.
            df = pickle.load(f)
        with open(os.path.join(DATA, "monitor-dataset-{}-alerts.npy".format(stayid)),
                  "rb") as f:
            alerts = numpy.load(f)
    except FileNotFoundError:
        print("-", end="")
        continue
    print("+", end="")
    labeled_alerts = label_alerts(stayid, df, alerts)
    numpy.save(os.path.join(DATA, "monitor-dataset-{}-labeled-alerts.npy".format(stayid)),
               labeled_alerts)
    dataset.append(labeled_alerts)
    POS += int(numpy.sum(labeled_alerts[:, -1]))
    TOT += len(labeled_alerts)
print()
# TOT is 0 when no stay could be loaded or no loaded stay had any alerts;
# report a ratio of 0 instead of dividing by zero.
print("{} positive alerts out of {} total ({:.2f})".format(POS, TOT, POS / TOT if TOT else 0.0))
if not dataset:
    # numpy.concatenate() cannot combine an empty list; fail with a message
    # that points at the actual problem.
    raise ValueError("no stay datasets could be loaded from {}".format(DATA))
dataset = numpy.concatenate(dataset, axis=0)
numpy.save(os.path.join(DATA, "labeled-alerts.npy"), dataset)

## Baseline tests --- scikit-learn style classification

In [None]:
# Number of cross-validation folds, and the share of the data that one fold
# represents (used below as the held-out test fraction).
CV = 5
TF = 1.0 / CV
# Each class is weighted by the number of samples in the *other* class, so
# the less frequent class receives the larger weight.
class_weight = {0: POS, 1: TOT - POS}

### Logistic regression

In [None]:
# Class-weighted logistic regression, scored with F1 over CV folds.
# The feature matrix is dataset[:, 2:-1]; the last column holds the labels.
model = LogisticRegression(class_weight=class_weight)
scores = cross_val_score(
    model,
    dataset[:, 2:-1],
    dataset[:, -1],
    scoring='f1',
    cv=CV,
)
print(numpy.mean(scores), numpy.std(scores))

In [None]:
# Hold out the trailing TF share of the rows (no shuffling) and fit the
# model on the leading rows.
ntrain = int(len(dataset) * (1 - TF))
trainset, testset = dataset[:ntrain], dataset[ntrain:]
model.fit(trainset[:, 2:-1], trainset[:, -1])

# Confusion matrices are divided by the size of the split, so every entry
# is a fraction of that split rather than a count.
train_cm = confusion_matrix(trainset[:, -1], model.predict(trainset[:, 2:-1]))
print("TRAIN:\n{}".format(train_cm / len(trainset)))
test_cm = confusion_matrix(testset[:, -1], model.predict(testset[:, 2:-1]))
print("\nTEST:\n{}".format(test_cm / len(testset)))

### Linear SVM 

In [None]:
# Class-weighted linear classifier trained with SGD (default loss), scored
# with F1 over CV folds on the same feature/label columns as above.
model = SGDClassifier(tol=0.001, class_weight=class_weight)
scores = cross_val_score(
    model,
    dataset[:, 2:-1],
    dataset[:, -1],
    scoring='f1',
    cv=CV,
)
print(numpy.mean(scores), numpy.std(scores))

In [None]:
# Same unshuffled split as for the other baseline: the leading rows train
# the model, the trailing TF share of the rows is held out.
ntrain = int(len(dataset) * (1 - TF))
trainset, testset = dataset[:ntrain], dataset[ntrain:]
model.fit(trainset[:, 2:-1], trainset[:, -1])

# Entries of the printed confusion matrices are fractions of the split.
train_cm = confusion_matrix(trainset[:, -1], model.predict(trainset[:, 2:-1]))
print("TRAIN:\n{}".format(train_cm / len(trainset)))
test_cm = confusion_matrix(testset[:, -1], model.predict(testset[:, 2:-1]))
print("\nTEST:\n{}".format(test_cm / len(testset)))

## Deep learning

In [None]:
# Training hyper-parameters.
NEPOCHS, BATCH_SIZE = 100, 16
LEARNING_RATE = 1e-4

# Project-defined network and its optimizer.
model = Alert(hidden_size=128, p=0.5)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Unshuffled split: leading rows for training, trailing TF share for testing.
# torch.from_numpy() shares memory with the numpy array instead of copying.
ntrain = int(len(dataset) * (1 - TF))
trainset, testset = (torch.from_numpy(part)
                     for part in (dataset[:ntrain], dataset[ntrain:]))

def truepred(model, dset):
    """Return ``(y_true, y_pred)`` for the rows of ``dset`` as numpy arrays.

    ``y_true`` is the last column of ``dset``. ``y_pred`` is obtained by
    applying ``model`` to the columns ``dset[:, 2:-1]`` and rounding the
    first column of its output to the nearest integer.

    NOTE(review): presumably the model outputs one probability per row, so
    the rounding acts as a 0.5 threshold -- confirm against the model.
    """
    features = Variable(dset[:, 2:-1])
    outputs = model(features).data.numpy()
    y_pred = numpy.round(outputs[:, 0])
    y_true = dset[:, -1].numpy()
    return y_true, y_pred
    
# Epoch counter; it lives outside the training cell so that running the
# training cell again continues from the epochs already done.
iepoch = 0
# Mini-batches of training rows, reshuffled on every pass over the data.
train_loader = torch.utils.data.DataLoader(
    trainset, shuffle=True, batch_size=BATCH_SIZE)

In [None]:
# Train for NEPOCHS further epochs starting from the current value of
# `iepoch`, so running this cell again continues the epoch numbering.
iepoch0 = iepoch
# The running loss is printed roughly five times per epoch. max(1, ...)
# keeps the modulus non-zero when there are fewer than 5 * BATCH_SIZE
# training rows (the unguarded expression was a modulo by zero then).
report_every = max(1, len(trainset) // BATCH_SIZE // 5)
while iepoch != iepoch0 + NEPOCHS:
    model.train()
    train_loss = 0.
    train_samples = 0
    print("batch", end=" ")
    for ibatch, batch in enumerate(train_loader):
        optimizer.zero_grad()
        # Columns 2..-2 of each row feed the model; the last column is the
        # label.
        z = model(Variable(batch[:, 2:-1]))
        # unsqueeze(1) turns the label vector into a single column, like the
        # deprecated non-in-place resize() it replaces.
        # NOTE(review): assumes the model output has shape (batch, 1) --
        # confirm against Alert.
        y = Variable(batch[:, -1]).unsqueeze(1)
        # Per-sample weight: the weight of the sample's class, normalized by
        # the sum of both class weights.
        weight = (y * class_weight[1] + (1 - y) * class_weight[0])/(class_weight[0] + class_weight[1])
        loss = F.binary_cross_entropy(z, y, weight)
        loss.backward()
        # float() reads the scalar loss value; indexing it with [0] raises
        # on the 0-dim loss tensors returned by current PyTorch.
        train_loss += float(loss.data) * batch.size(0)
        train_samples += batch.size(0)
        optimizer.step()
        if (ibatch + 1) % report_every == 0:
            print("{}({}): {:.4f}".format(ibatch + 1, iepoch + 1, train_loss / train_samples), end=" ")
    print()
    train_loss /= train_samples

    # Weighted loss on the held-out rows, with the model in evaluation mode.
    model.eval()
    z = model(Variable(testset[:, 2:-1]))
    y = Variable(testset[:, -1]).unsqueeze(1)
    weight = (y * class_weight[1] + (1 - y) * class_weight[0])/(class_weight[0] + class_weight[1])
    loss = F.binary_cross_entropy(z, y, weight)
    print("EPOCH {}: train loss: {:.4f}, test loss: {:.4f}".format(iepoch + 1, train_loss, float(loss.data)))
    iepoch += 1

    # Every 10th epoch: confusion matrices (as fractions of each split) and
    # F1 scores for both splits.
    if iepoch % 10 == 0:
        train_true, train_pred = truepred(model, trainset)
        test_true, test_pred = truepred(model, testset)

        print("\nTRAIN:\n{}\nF1 = {:.4f}\n\nTEST:\n{}\nF1 = {:.4f}\n"
              .format(confusion_matrix(train_true, train_pred)/len(trainset),
                      f1_score(train_true, train_pred),
                      confusion_matrix(test_true, test_pred)/len(testset),
                      f1_score(test_true, test_pred)))

In [None]:
# Store only the model's state dict (not the model object itself) under
# MODELS.
torch.save(
    model.state_dict(),
    os.path.join(MODELS, "alerts.model"),
)