In [4]:
import sys
# Put this directory first on the module search path so that the imports in
# the following cells are looked up there before any other location.
sys.path.insert(0,"/work/pip")

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy, time
from torch.autograd import Variable

## Things to Incorporate/Consider:

### Model Design 
- MSE Loss for All Outputs
    - Add all Losses Together
- Linear Layer Output After Encoding
    - Compare to Target Masked Input 
    - MSE Loss for only masks considered
- Linear Layer Output of Decoding = Remove Final Softmax
    - Compare Decoding Output to Future Target Sequence
    - MSE Loss
- Allow Model to Learn Positional Embedding
    - Embedding with Vocab = Sequence Length
- No Input Embedding vs. Simple Linear Layer
    - Need Linear Layer for Small Input Dimensionality
- Idea: Part way through the encoding process extract the prediction for the masked input
    - Intuition
        - The First Few Layers Function to Fill in Missingness/Noise
        - Remaining Layers Then Encode Input Further to Allow for more tailored input to the decode-encode multiheaded attention layer in the decoder. 
        - Requires the First Encoding layers to be different. 
    - 3x Encoding Reconstruction Layers --> 3x Encoding Layers
   

### Training Design
- Needs to Ingest Patients as a Rolling Window with Patient Level Loss as the average of the Loss from each window
- Determine a method to mask out members of the sequence
    - Simply Zeroing out = Adding Noise aka a Normal Value
- For the Data: Need to Give Input, Shifted Target, Actual Target

# Model Architecture Class

In [6]:
class EncoderDecoder(nn.Module):
    """
    Generic encoder-decoder wrapper.

    Holds an encoder, a decoder, one embedding module per side and a
    generator. The generator is only stored here; `forward` returns the
    decoder output without applying it.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoder, self).__init__()
        # Tuple assignment binds left to right, so sub-modules are registered
        # in the order encoder, decoder, src_embed, tgt_embed, generator.
        self.encoder, self.decoder = encoder, decoder
        self.src_embed, self.tgt_embed = src_embed, tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        "Embed `src` and run the encoder over it with `src_mask`."
        embedded_src = self.src_embed(src)
        return self.encoder(embedded_src, src_mask)

    def decode(self, memory, src_mask, tgt, tgt_mask):
        "Embed `tgt` and run the decoder against the encoder output `memory`."
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Encode `src`, then decode `tgt` against the encoded result."
        memory = self.encode(src, src_mask)
        return self.decode(memory, src_mask, tgt, tgt_mask)

In [7]:
class Generator(nn.Module):
    "Linear projection from d_model to vocab size followed by log-softmax."
    def __init__(self, d_model, vocab):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        "Return log-probabilities over the last dimension of the projection."
        logits = self.proj(x)
        return F.log_softmax(logits, dim=-1)

## Encoder and Decoder Classes   

### Encoder

The encoder is composed of a stack of $N=6$ identical layers. 

In [8]:
def clones(module, N):
    "Return an nn.ModuleList holding N independent deep copies of `module`."
    copies = nn.ModuleList()
    for _ in range(N):
        copies.append(copy.deepcopy(module))
    return copies

In [9]:
class Encoder(nn.Module):
    "Encoder core: N cloned copies of `layer` followed by a final LayerNorm."
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layers = clones(layer, N)
        # `layer.size` is the feature width the final norm is built for.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, mask):
        "Feed x (together with mask) through the layers in order, then normalise."
        hidden = x
        for enc_layer in self.layers:
            hidden = enc_layer(hidden, mask)
        return self.norm(hidden)

### Defining Encoder Layers

In [10]:
class EncoderLayer(nn.Module):
    "One encoder layer: a self-attention sublayer, then a feed-forward sublayer."
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        # Two residual wrappers: index 0 for attention, index 1 for feed-forward.
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size

    def forward(self, x, mask):
        "Run both sublayers in sequence (Figure 1, left)."
        attn_sublayer, ff_sublayer = self.sublayer

        def self_attend(normed):
            # Query, key and value are all the same tensor.
            return self.self_attn(normed, normed, normed, mask)

        attended = attn_sublayer(x, self_attend)
        return ff_sublayer(attended, self.feed_forward)

In [11]:
class LayerNorm(nn.Module):
    "Layer normalisation over the last dimension with a learnable gain and bias."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))   # gain, starts at 1
        self.b_2 = nn.Parameter(torch.zeros(features))  # bias, starts at 0
        self.eps = eps

    def forward(self, x):
        "Return a_2 * (x - mean) / (std + eps) + b_2, statistics taken over the last axis."
        mu = x.mean(-1, keepdim=True)
        sigma = x.std(-1, keepdim=True)
        # Gain is applied before the division, as in the original expression.
        scaled = self.a_2 * (x - mu)
        return scaled / (sigma + self.eps) + self.b_2

In [12]:
class SublayerConnection(nn.Module):
    """
    Residual wrapper around a sublayer: x + dropout(sublayer(norm(x))).
    The layer norm is applied to the sublayer input (norm first), not to
    the sum, which keeps the code simple.
    """
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        "Apply `sublayer` to the normalised input and add the result back onto x."
        normed = self.norm(x)
        branch = self.dropout(sublayer(normed))
        return x + branch

### Decoder Class and Layer

In [13]:
class Decoder(nn.Module):
    "Decoder core: N cloned copies of `layer` followed by a final LayerNorm."
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        # `layer.size` is the feature width the final norm is built for.
        self.norm = LayerNorm(layer.size)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Pass x through every layer (each also receives memory and both masks), then normalise."
        hidden = x
        for dec_layer in self.layers:
            hidden = dec_layer(hidden, memory, src_mask, tgt_mask)
        return self.norm(hidden)

In [14]:
class DecoderLayer(nn.Module):
    "One decoder layer: self-attention, attention over the encoder memory, then feed-forward."
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        # Three residual wrappers, one per sublayer, used in the order above.
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory, src_mask, tgt_mask):
        "Run the three sublayers in sequence (Figure 1, right)."
        self_sub, memory_sub, ff_sub = self.sublayer

        def attend_self(normed):
            # Query, key and value are the decoder stream; restricted by tgt_mask.
            return self.self_attn(normed, normed, normed, tgt_mask)

        def attend_memory(normed):
            # Queries come from the decoder stream, keys/values from memory.
            return self.src_attn(normed, memory, memory, src_mask)

        after_self = self_sub(x, attend_self)
        after_memory = memory_sub(after_self, attend_memory)
        return ff_sub(after_memory, self.feed_forward)

#### Decoder Masking

In [15]:
def subsequent_mask(size):
    "Return a (1, size, size) mask that is true where column <= row and false for later positions."
    # Ones strictly above the diagonal mark the positions to hide.
    hidden = torch.triu(torch.ones(1, size, size, dtype=torch.uint8), diagonal=1)
    return hidden == 0

## Attention

In [171]:
def attention(query, key, value, mask=None, dropout=None):
    """
    Scaled dot-product attention.

    Scores are query . key^T divided by sqrt(d_k), with d_k the size of the
    query's last dimension. Where `mask == 0` the score is replaced by -1e9
    before the softmax. `dropout`, if given, is applied to the attention
    weights. Returns (weights @ value, weights).
    """
    scale = math.sqrt(query.size(-1))
    scores = torch.matmul(query, key.transpose(-2, -1)) / scale
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    weights = F.softmax(scores, dim=-1)
    if dropout is not None:
        weights = dropout(weights)
    return torch.matmul(weights, value), weights

In [172]:
class MultiHeadedAttention(nn.Module):
    "Multi-head scaled dot-product attention (Figure 2)."
    def __init__(self, h, d_model, dropout=0.1):
        "h is the number of heads; d_model is the model width and must be divisible by h."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # d_v is taken to be equal to d_k.
        self.d_k = d_model // h
        self.h = h
        # Four d_model -> d_model projections: query, key, value, and output.
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        # Holds the attention weights of the most recent forward call.
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Project the inputs, attend per head, merge the heads and project the result."
        if mask is not None:
            # Extra axis so the same mask is shared by all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        def to_heads(linear, tensor):
            # d_model -> (h, d_k), with the head axis moved in front of the sequence axis.
            return linear(tensor).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)

        q_lin, k_lin, v_lin, out_lin = self.linears
        q_heads = to_heads(q_lin, query)
        k_heads = to_heads(k_lin, key)
        v_heads = to_heads(v_lin, value)

        context, self.attn = attention(q_heads, k_heads, v_heads, mask=mask,
                                       dropout=self.dropout)

        # Undo the head split: back to (nbatches, seq, h * d_k), then the output projection.
        merged = context.transpose(1, 2).contiguous()
        merged = merged.view(nbatches, -1, self.h * self.d_k)
        return out_lin(merged)

## Position-wise Feed-Forward Networks                               

In [18]:
class PositionwiseFeedForward(nn.Module):
    "Feed-forward block: w_2(dropout(relu(w_1(x))))."
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)   # expand d_model -> d_ff
        self.w_2 = nn.Linear(d_ff, d_model)   # project d_ff -> d_model
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        "Expand, apply ReLU and dropout, then project back to d_model."
        hidden = F.relu(self.w_1(x))
        hidden = self.dropout(hidden)
        return self.w_2(hidden)

## Embeddings and Softmax

In [19]:
class Embeddings(nn.Module):
    "Embedding lookup table whose output is scaled by sqrt(d_model)."
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        "Look up the indices in x and multiply the vectors by sqrt(d_model)."
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

In [81]:
class LinearEmbeddings(nn.Module):
    "Linear map from d_input features to d_model, scaled by sqrt(d_model)."
    def __init__(self, d_model, d_input):
        super(LinearEmbeddings, self).__init__()
        self.lut = nn.Linear(d_input, d_model)
        self.d_model = d_model

    def forward(self, x):
        "Project x with the linear layer and multiply by sqrt(d_model)."
        scale = math.sqrt(self.d_model)
        return self.lut(x) * scale

## Positional Encoding

In [67]:
class PositionalEncoding(nn.Module):
    "Add fixed sinusoidal position encodings to the input, then apply dropout."
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # One angle per (position, frequency) pair; frequencies are built in log space.
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term

        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)   # even feature indices
        table[:, 1::2] = torch.cos(angles)   # odd feature indices
        # Stored as a buffer (not a parameter) with a leading axis of size 1.
        self.register_buffer('pe', table.unsqueeze(0))

    def forward(self, x):
        "Add the first x.size(1) rows of the table to x and apply dropout."
        encodings = Variable(self.pe[:, :x.size(1)],
                             requires_grad=False)
        return self.dropout(x + encodings)

In [68]:
class PositionalEmbedding(nn.Module):
    "Positional embedding class, where the positional embedding is learned."
    def __init__(self, d_model, max_len=24):
        super(PositionalEmbedding, self).__init__()

        # One learned d_model-sized vector per position 0 .. max_len-1.
        self.lut = nn.Embedding(max_len, d_model)
        self.d_model = d_model
        self.max_len = max_len

    def forward(self, x):
        """
        Add the learned position vectors for positions 0 .. x.size(1)-1 to x.

        Only as many positions as x actually has along dim 1 are looked up,
        so inputs shorter than max_len work (previously all max_len vectors
        were always added, which failed to broadcast or, for length 1,
        silently expanded the input). The index tensor is created on x's
        device.

        Raises:
            ValueError: if x.size(1) exceeds max_len.
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                "sequence length %d exceeds max_len %d" % (seq_len, self.max_len))
        positions = torch.arange(seq_len, device=x.device)
        return x + self.lut(positions)

## Making Model

### Vanilla Encoder

In [69]:
def make_model(src_vocab, tgt_vocab, N=6, 
               d_model=512, d_ff=2048, h=8, dropout=0.1):
    """
    Helper: construct an EncoderDecoder model from hyperparameters.

    src_vocab / tgt_vocab size the two embedding tables (tgt_vocab also
    sizes the generator output); N is the number of encoder layers and of
    decoder layers; d_model, d_ff and h are passed to the attention and
    feed-forward blocks; dropout is used by every block that takes one.
    """
    c = copy.deepcopy
    # Forward `dropout` to attention as well; it was previously left at the
    # attention module's own default (0.1) regardless of this argument.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    # Every use gets its own deep copy so that no weights are shared.
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), 
                             c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    # Only tensors with more than one dimension are re-initialised.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

### Medical Sequence Modeling

#### Architecture:
1. Input Embedding
    1. Linear Embedding d_input $\rightarrow{}$ 4x d_input = d_model
    2. Positional Embedding 
        - Learned By Model
2. Encoding
    1. Masked Sequence Model Encoding = 3 Layers
        - Separate Linear Layer to Predict Masked Input 
    2. Future Sequence Prediction Encoding = 3 Layers
3. Decoding
    1. Future Sequence Prediction Decoding = 6 Layers
        - Uses Final Output from Encode for Key and Values for Attention

In [70]:
class GeneratorMed(nn.Module):
    "Linear output head mapping d_model features to d_input values (no softmax is applied)."
    def __init__(self, d_model, d_input):
        super(GeneratorMed, self).__init__()
        self.proj = nn.Linear(d_model, d_input)

    def forward(self, x):
        "Return the linear projection of x."
        projected = self.proj(x)
        return projected

In [181]:
class EncoderDecoderMed(nn.Module):
    """
    Encoder-decoder with a two-stage encoder.

    encoder_A is the encoder passed in and encoder_B is a deep copy of it.
    The source goes through encoder_A and then encoder_B; the decoder
    attends over the encoder_B output, while the encoder_A output is
    returned alongside the decoder output.
    """
    def __init__(self, encoder, decoder, src_embed, tgt_embed, generator):
        super(EncoderDecoderMed, self).__init__()
        self.encoder_A = encoder
        self.encoder_B = copy.deepcopy(encoder)
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.generator = generator

    def encode(self, src, src_mask):
        "Return (encoder_B output, encoder_A output) for the embedded source."
        stage_a = self.encoder_A(self.src_embed(src), src_mask)
        stage_b = self.encoder_B(stage_a, src_mask)
        return stage_b, stage_a

    def decode(self, memory, src_mask, tgt, tgt_mask):
        "Embed `tgt` and run the decoder against `memory`."
        embedded_tgt = self.tgt_embed(tgt)
        return self.decoder(embedded_tgt, memory, src_mask, tgt_mask)

    def forward(self, src, tgt, src_mask, tgt_mask):
        "Return (encoder_A output, decoder output); the generator is not applied here."
        memory, first_stage = self.encode(src, src_mask)
        decoded = self.decode(memory, src_mask, tgt, tgt_mask)
        return first_stage, decoded

In [252]:
def make_model_Med(d_input, N=3, h=4, max_len = 24, dropout=0.1):
    """
    Helper: construct an EncoderDecoderMed model from hyperparameters.

    d_input is the number of input features per step; the model width is
    d_input * h and the feed-forward width is four times that. N is the
    depth of the encoder passed to EncoderDecoderMed; the decoder gets
    2 * N layers. max_len sizes the learned positional embedding.
    """
    #Define Dimensions
    d_model = d_input*h
    d_ff = d_model*4
    
    c = copy.deepcopy
    # Forward `dropout` to attention as well; it was previously left at the
    # attention module's own default (0.1) regardless of this argument.
    attn = MultiHeadedAttention(h, d_model, dropout)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEmbedding(d_model, max_len)
    # Every use gets its own deep copy so that no weights are shared.
    model = EncoderDecoderMed(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N*2),
        nn.Sequential(LinearEmbeddings(d_model, d_input), c(position)),
        nn.Sequential(LinearEmbeddings(d_model, d_input), c(position)),
        GeneratorMed(d_model, d_input))
    
    # This was important from their code. 
    # Initialize parameters with Glorot / fan_avg.
    # Only tensors with more than one dimension are re-initialised.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
    
    

In [309]:
def data_genMed(batch, seq_len, nbatches, d_input=17):
    """
    Generate random data for a src-tgt copy task.

    Yields `nbatches` BatchMed objects. Each is built from one tensor of
    shape (batch, seq_len + 1, d_input) drawn uniformly from [-1, 1), whose
    first step along dim 1 is overwritten with ones; the same tensor is
    passed as both source and target.

    d_input is the number of features per step (default 17, the value that
    used to be hard-coded).
    """
    for _ in range(nbatches):
        data = torch.from_numpy(
            np.random.uniform(-1, 1, size=(batch, seq_len+1, d_input)))
        # Constant first step; BatchMed drops it from src and trg_y and keeps it in trg.
        data[:, 0, :] = 1
        yield BatchMed(data, data, 0)

In [310]:
class BatchMed:
    """
    Object for holding a batch of data with masks during training.

    Attributes set by the constructor:
      src       source with its first step along dim 1 removed
      src_mask  boolean mask for src, true at steps that are not padding
      trg       target without its last step (only when trg is given)
      trg_y     target without its first step (only when trg is given)
      trg_mask  boolean mask for trg that also hides later positions

    A step counts as padding when every feature along dim 2 equals `pad`.
    The masks are true/non-zero at positions that may be attended to,
    which is the convention `attention` expects (it hides `mask == 0`).
    The previous version summed `x == pad`, producing the opposite
    polarity, and combined integer counts with a bitwise AND.
    """
    def __init__(self, src, trg=None, pad=0):
        self.src = src[:, 1:, :]
        self.src_mask = self._valid_steps(self.src, pad)
        if trg is not None:
            self.trg = trg[:, :-1, :]
            self.trg_y = trg[:, 1:, :]
            self.trg_mask = \
                self.make_std_mask(self.trg, pad)

    @staticmethod
    def _valid_steps(seq, pad):
        "True where at least one feature of a step differs from `pad`; an axis of size 1 is inserted before the last."
        return (seq != pad).any(dim=2).unsqueeze(-2)

    @staticmethod
    def make_std_mask(tgt, pad):
        "Create a mask that hides padding steps and later positions."
        tgt_mask = BatchMed._valid_steps(tgt, pad)
        # type_as matches the dtype (and device) of the padding mask before the AND.
        no_peek = subsequent_mask(tgt.size(1)).type_as(tgt_mask)
        return tgt_mask & no_peek

In [311]:
d_in = 17
seq_len = 24
batches = 500  # passed to data_genMed as its first argument, i.e. the number of samples per batch
h = 4
model = make_model_Med(d_input = d_in, N=3, h=h, max_len = seq_len, dropout=0.1).float()


# Smoke test: push one synthetic batch through the model and print both output shapes.
for batch in data_genMed(batches, seq_len, 1):
    # Call the module itself rather than .forward() so that any registered hooks run.
    out = model(batch.src.float(), batch.trg.float(),
                batch.src_mask, batch.trg_mask)
    print(out[0].shape)
    print(out[1].shape)

torch.Size([500, 24, 68])
torch.Size([500, 24, 68])


## Loss Calculation

- Format for Loss Computation
    1. Loss (MSE) is Averaged Across Each Window
        - Masked Sequence (MS) Loss is computed only for Masked Input Elements
        - Future Sequence (FS) Loss is computed for all elements
    2. Respective Losses are Normalized Based on the Number of Elements in the Window
    3. MS and FS Losses are added together by a given proportion

- Inputs:
    1. Unmasked Input
    2. Output of MS Encoder
    3. Target Future Sequence
    4. Shifted Input Target
    5. Output of Decoder
    6. Masked Hours for Each Patient
    7. Number of Windows for a Given Patient  

In [312]:
class LossCompute:
    """
    Loss computation normalised per window.

    The generator is applied to the model output, then the criterion is
    evaluated one sample at a time and each value is divided by that
    sample's entry of `norm`. The per-sample terms are summed.
    """
    def __init__(self, generator, criterion):
        self.generator = generator
        self.criterion = criterion
        
    def __call__(self, x, y, norm, mask = None):
        """
        Return the summed, normalised loss between generator(x) and y.

        x, y, norm (and mask, if given) are iterated together along their
        first axis; iteration stops at the shortest of them. With a mask,
        only the steps where the mask equals 1 enter the criterion, and a
        sample without any such step is skipped: passing empty tensors to
        a mean-reducing criterion would yield NaN and poison the total.

        Returns 0 (a plain int) when no sample contributes.
        """
        x = self.generator(x)
        loss = 0
        if mask is None:
            for pred, target, n in zip(x, y, norm):
                loss = loss + self.criterion(pred.contiguous(), target.contiguous()) / n
            return loss

        for pred, target, m, n in zip(x, y, mask, norm):
            keep = (m == 1)
            if not bool(keep.any()):
                # Nothing was masked in this window, so there is nothing to score.
                continue
            loss = loss + self.criterion(pred[keep].contiguous(),
                                         target[keep].contiguous()) / n
        return loss

In [313]:
class MedEmbedLoss:
    "Embedding loss: weighted sum of the Masked Sequence loss and the Future Sequence loss."
    def __init__(self, generator, criterion, opt=None):
        self.generator = generator
        self.criterion = criterion
        self.opt = opt
        self.loss = LossCompute(generator, criterion)

    def __call__(self, MSx, MSy, MSmask, FSx, FSy, norm, MSprop=0.5):
        """
        Compute MSprop * MS loss + (1 - MSprop) * FS loss, backpropagate it,
        and return it as a Python number.

        The MS loss is computed with `MSmask`, the FS loss without a mask;
        both use the same `norm`. When an optimizer wrapper was given, it is
        stepped and its inner optimizer's gradients are zeroed afterwards.
        """
        masked_term = self.loss(MSx, MSy, norm, MSmask)
        future_term = self.loss(FSx, FSy, norm)
        total = MSprop*masked_term + (1-MSprop)*future_term

        total.backward()
        wrapper = self.opt
        if wrapper is not None:
            wrapper.step()
            wrapper.optimizer.zero_grad()
        return total.item()

In [314]:
class NoamOpt:
    """
    Optimizer wrapper that sets the learning rate on every step.

    The rate is
        factor * model_size**-0.5 * min(step**-0.5, step * warmup**-1.5)
    """
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0
        
    def step(self):
        "Advance the step counter, write the new rate into every param group, and step the optimizer."
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate
        self._rate = rate
        self.optimizer.step()
        
    def rate(self, step = None):
        """
        Return the learning rate for `step` (default: the current step).

        For step <= 0 the rate is 0.0, matching the initial `_rate`; the
        formula itself would raise ZeroDivisionError there because it
        evaluates 0 ** -0.5.
        """
        if step is None:
            step = self._step
        if step <= 0:
            return 0.0
        return self.factor * \
            (self.model_size ** (-0.5) *
            min(step ** (-0.5), step * self.warmup ** (-1.5)))
        
def get_std_opt(model):
    "Wrap Adam (lr=0, betas=(0.9, 0.98), eps=1e-9) in a NoamOpt with factor 2 and 4000 warmup steps."
    # The model size comes from the first module of the source embedding.
    model_size = model.src_embed[0].d_model
    adam = torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    return NoamOpt(model_size, 2, 4000, adam)

In [315]:
# Toy inputs for the loss: 30 windows of 24 steps, 4 randomly chosen steps
# marked as masked (the same 4 in every window).
mask = torch.zeros(30,24)
sel = torch.randperm(24)[:4]
mask[:, sel] = 1
# Per-window normalisers: the first pt1 windows use pt1, the remaining ones use pt2.
pt1 = 12
pt2 = 45
nx = torch.ones(pt1)*pt1
ny = torch.ones(30-pt1)*pt2
norm = torch.cat([nx, ny])

# Use the generator that belongs to the model instead of a fresh
# GeneratorMed(68, 17), so the loss runs through the model's own output head.
loss = MedEmbedLoss(model.generator, torch.nn.MSELoss())

# Arguments are (MSx, MSy, MSmask, FSx, FSy, norm). The earlier call passed
# `mask` a second time, which shifted it into `norm` and `norm` into `MSprop`.
# Only the first 30 samples of the batch are scored, because the loss zips
# the outputs with the 30-row mask/norm.
# NOTE(review): FSy is batch.trg (the shifted decoder input); batch.trg_y
# may be the intended target - confirm.
l = loss(out[0], batch.src.float(), mask, out[1], batch.trg.float(), norm)

## Training Loop

In [None]:
def run_epoch(data_iter, model, loss_compute, MSprop):
    """
    Standard training and logging function.

    For every batch from `data_iter` the model is run and `loss_compute` is
    called as
        loss_compute(out[0], batch.src, batch.mask,
                     out[1], batch.trg, batch.norm, MSprop)
    Its return value is added to a running total. Batches must expose
    src_masked, trg, src_pad, trg_pad, src, mask and norm; `data_iter`
    must expose n_samples.

    Returns the accumulated loss divided by data_iter.n_samples.
    """
    start = time.time()
    total_loss = 0
    for i, batch in enumerate(data_iter):
        out = model(batch.src_masked, batch.trg, batch.src_pad, batch.trg_pad)
        # Use the loss callable that was passed in; the previous code called
        # the MedEmbedLoss class itself with these seven arguments.
        loss = loss_compute(out[0], batch.src, batch.mask,
                            out[1], batch.trg, batch.norm, MSprop)
        total_loss += loss
        if i % 50 == 1:
            elapsed = time.time() - start
            # The loss printed here is the running total, not the last batch.
            print("Epoch Step: %d Loss: %f, Time Elapsed: %f" %
                    (i, total_loss, elapsed))
            start = time.time()
    return total_loss / data_iter.n_samples

In [337]:
# Scratch arithmetic; what the two factors stand for is not stated in this notebook.
12000*200

2400000