# Embeddings from Language Models (ELMo)

In [1]:
import os
import numpy as np
from allennlp.modules.elmo import Elmo as allennlp_Elmo

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

## Set Utils

In [3]:
def count_parameters(model):
    """Return the number of trainable scalar parameters in `model`.

    Only parameters whose `requires_grad` flag is set are counted; frozen
    parameters are skipped.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total

## Set Configs

In [4]:
# Model hyper-parameters (passed to the Elmo constructor below).
OUTPUT_SIZE = 1
DROPOUT = 0.5
# Optimizer settings (Adam learning rate and weight decay).
LR = 2e-4
WEIGHT_DECAY = 1e-4
# StepLR schedule settings: decay period and multiplicative decay factor.
STEP_SIZE = 1
GAMMA = 0.9

# Paths of the ELMo options (JSON) and weights (HDF5) files handed to the
# model; as written they are resolved relative to the working directory.
options_file = 'elmo_2x4096_512_2048cnn_2xhighway_options.json'
weight_file = 'elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'

In [5]:
# Run on the GPU whenever CUDA is usable, otherwise fall back to the CPU.
is_cuda = torch.cuda.is_available()
device = torch.device('cuda' if is_cuda else 'cpu')

## Build ELMo Network

<img src='images/elmo-architecture.png' width=50% />

In [6]:
class Elmo(nn.Module):
    """Prediction head on top of a pre-trained AllenNLP ELMo encoder.

    The ELMo representation of each sentence is passed through a 1-D
    convolution over the token axis, ReLU6, adaptive max pooling to a fixed
    length, dropout and a final linear layer.

    Args:
        output_size: number of output features of the final linear layer.
        dropout: dropout probability, used both inside the ELMo module and
            right before the final linear layer.
        options_file: path of the ELMo options JSON file.
        weight_file: path of the ELMo weights HDF5 file.
    """

    # Fixed layer sizes. The convolution expects 1024 input channels, i.e.
    # the ELMo representation is assumed to be 1024-dimensional.
    EMBEDDING_DIM = 1024
    CONV_CHANNELS = 16
    KERNEL_SIZE = 3
    POOLED_LENGTH = 128

    def __init__(self, output_size, dropout=0.5,
                       options_file='elmo_2x4096_512_2048cnn_2xhighway_options.json',
                       weight_file='elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5'):
        super(Elmo, self).__init__()

        self.options_file = options_file
        self.weight_file = weight_file

        # Use ELMo, a deep bi-directional LSTM, to extract contextualized
        # sentence/word embeddings (a single output representation).
        # `dropout` must be passed by keyword: the fourth positional parameter
        # of allennlp's Elmo is not the dropout rate (it is `requires_grad` in
        # releases that have it), so a positional value would silently toggle
        # that flag and leave the dropout rate at the library default.
        self.elmo = allennlp_Elmo(options_file, weight_file, 1,
                                  do_layer_norm=False, dropout=dropout)
        self.conv_layer = nn.Conv1d(self.EMBEDDING_DIM, self.CONV_CHANNELS, self.KERNEL_SIZE)
        self.pool_layer = nn.AdaptiveMaxPool1d(self.POOLED_LENGTH)
        self.relu = nn.ReLU6()
        self.dropout = nn.Dropout(dropout)
        # Flattened feature size: 16 channels * 128 pooled positions = 2048.
        self.fc_layer = nn.Linear(self.CONV_CHANNELS * self.POOLED_LENGTH, output_size)

    def init_weights(self):
        """Re-initialise the linear and convolution layers.

        Biases are set to zero and weights are drawn with Xavier-uniform
        initialisation. The ELMo module is left untouched.
        """
        # Keep the fc -> conv order so random-number consumption is stable.
        for layer in (self.fc_layer, self.conv_layer):
            for name, param in layer.named_parameters():
                if 'bias' in name:
                    nn.init.constant_(param, 0.0)
                elif 'weight' in name:
                    nn.init.xavier_uniform_(param)

    def forward(self, sentences):
        """Compute the model output for a batch of sentences.

        Args:
            sentences: input forwarded unchanged to the ELMo module
                (presumably character ids as produced by allennlp's
                `batch_to_ids` - confirm with the caller).

        Returns:
            Tensor of shape (batch, output_size).
        """
        elmo_out = self.elmo(sentences)
        x = elmo_out['elmo_representations'][0]
        # Conv1d convolves over the last axis, so move the feature axis to
        # the channel position: (batch, seq, dim) -> (batch, dim, seq).
        x = x.transpose(1, 2)
        x = self.conv_layer(x)
        x = self.relu(x)
        x = self.pool_layer(x)
        # Flatten per sample; keeping the batch dimension explicit makes a
        # feature-size mismatch fail loudly in the linear layer instead of
        # being silently folded into the batch dimension.
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        output = self.fc_layer(x)

        return output

#### Initialize ELMo Network

In [None]:
# Build the model from the configured sizes and ELMo files, then move its
# parameters and buffers to the selected device.
elmo = Elmo(OUTPUT_SIZE, DROPOUT, options_file, weight_file)
elmo.to(device)

In [None]:
# Report how many parameters of the model are trainable (requires_grad).
print("Number of params:", count_parameters(elmo))

## Set Loss Function

In [None]:
# Softmax cross-entropy over class logits.
# NOTE(review): the model is built with OUTPUT_SIZE = 1. If this loss is
# applied to that single-logit output, softmax cross-entropy is identically
# zero and gives no gradient - confirm against the training loop; a binary
# loss such as nn.BCEWithLogitsLoss (float targets) may be what is intended.
ce_loss = nn.CrossEntropyLoss()

## Set Optimizer

In [None]:
# Adam over all parameters of the model, with the configured learning rate
# and weight decay.
optimizer = torch.optim.Adam(elmo.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
# Multiply the learning rate by GAMMA every STEP_SIZE calls to
# lr_scheduler.step().
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

---