In [386]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torchtext
import json
import numpy as np
from tqdm import tqdm

<h1>3-2. ELMo<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prepare-dataset:-IMDB" data-toc-modified-id="Prepare-dataset:-IMDB-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prepare dataset: IMDB</a></span><ul class="toc-item"><li><span><a href="#For-pretraining" data-toc-modified-id="For-pretraining-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>For pretraining</a></span></li><li><span><a href="#For-training" data-toc-modified-id="For-training-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>For training</a></span></li></ul></li><li><span><a href="#Build-the-model" data-toc-modified-id="Build-the-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build the model</a></span><ul class="toc-item"><li><span><a href="#Bidirectional-language-model" data-toc-modified-id="Bidirectional-language-model-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Bidirectional language model</a></span></li><li><span><a href="#ELMo" data-toc-modified-id="ELMo-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>ELMo</a></span></li></ul></li><li><span><a href="#Train-the-model" data-toc-modified-id="Train-the-model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train the model</a></span><ul class="toc-item"><li><span><a href="#Pretrain-bidirectional-language-model" data-toc-modified-id="Pretrain-bidirectional-language-model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Pretrain bidirectional language model</a></span></li><li><span><a href="#Train-ELMo-for-sentiment-analysis" data-toc-modified-id="Train-ELMo-for-sentiment-analysis-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Train ELMo for sentiment analysis</a></span></li></ul></li></ul></div>

## Prepare dataset: IMDB

In [2]:
from torchtext.experimental.datasets import IMDB
from torchtext.data.utils import get_tokenizer

# Tokenize reviews with the spaCy tokenizer and load the IMDB train/test
# splits; the dataset archive is stored under ~/torchdata/.
tokenizer = get_tokenizer("spacy")
train, test = IMDB(tokenizer=tokenizer, root="~/torchdata/")

aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [02:08<00:00, 654kB/s] 
100%|██████████| 25000/25000 [00:24<00:00, 1028.00lines/s]


In [3]:
from collections import Counter

# Start from the word vocabulary attached to the training split.
vocab = train.get_vocab()

# Register the sentence-boundary tokens so that `vocab.stoi` can resolve
# "<s>" and "</s>" when the n-gram windows are built.
v = Counter(['<s>', '</s>'])
v = torchtext.vocab.Vocab(v, specials=['<s>', '</s>'])
vocab.extend(v)

In [369]:
# Number of entries in the word vocabulary (including the boundary tokens);
# used as the number of classes of the adaptive softmax loss.
VOCAB_SIZE = len(vocab)

In [84]:
ngrams = 2
bos = "<s>"
eos = "</s>"

# x: context windows of `ngrams` word indices
# y: index of the word each window has to predict
x = []
y = []
bos_ix = vocab.stoi[bos]
eos_ix = vocab.stoi[eos]
for _, words in tqdm(train):
    n_words = len(words)
    if n_words < ngrams:
        # Too short to form even the begin-of-sentence sample
        # (words[ngrams-1] does not exist), so the review is skipped.
        continue

    # Begin-of-sentence token
    x.append([bos_ix] + words[0:ngrams-1].tolist())
    y.append(words[ngrams-1].item())

    # in-sentence tokens
    for i in range(n_words - ngrams):
        x.append(words[i:i+ngrams].tolist())
        y.append(words[i+ngrams].item())

    # End-of-sentence token.
    # The window start is computed from the review length instead of reusing
    # the loop variable `i`: when a review has exactly `ngrams` words the loop
    # above never runs, and `i` would be undefined or left over from the
    # previous review.
    # NOTE(review): this window already contains </s> and its label is </s>
    # as well - confirm that this is the intended training sample.
    tail = n_words - ngrams
    x.append(words[tail:tail+ngrams-1].tolist() + [eos_ix])
    y.append(eos_ix)

100%|██████████| 25000/25000 [00:47<00:00, 527.94it/s]


```python
with open("./IMBD_bigram.json", "w") as w:
    json.dump({"data": x, "label": y}, w)
```

### For pretraining

* transform to character-level n-gram dataset
    * original script
```
%load https://gist.githubusercontent.com/akurniawan/30719686669dced49e7ced720329a616/raw/7b9f9967c01ce87ac505520a5aa58d3b24c55c66/translation_char_example.py
```
    * modified
```
%load https://gist.github.com/naturale0/6bb3b8a5c682bd281de87e408fa71bf1/raw/df8b7e198f149f81c4f72af977760b2eb3226cdf/translation_char_example.py
```

In [121]:
# Modified a little to fit classification
import itertools
from torchtext.experimental.datasets import TextClassificationDataset
from torchtext.vocab import build_vocab_from_iterator
from torchtext.experimental.functional import sequential_transforms

def build_char_vocab(data, index,
                     bow="<w>", eow="</w>",
                     bos="<s>", eos="</s>"):
    """
    Build the character-level vocabulary.

    Each boundary marker (begin/end of word, begin/end of sentence) is
    registered as a single token. Every entry of `data` then contributes the
    characters of all the words stored at position `index`.
    """
    # one single-token "line" per boundary marker
    marker_lines = [[marker] for marker in (bow, eow, bos, eos)]

    # flatten the words of each entry into one list of characters
    char_lines = [
        list(itertools.chain.from_iterable(entry[index]))
        for entry in data
    ]

    return build_vocab_from_iterator(marker_lines + char_lines)


def stoi(vocab):
    """
    Return a transform that maps nested tokens (a list of words, each a list
    of character tokens) to their indices in `vocab`.
    """
    def func(tok_iter):
        indexed = []
        for word in tok_iter:
            indexed.append([vocab[char] for char in word])
        return indexed

    return func


def tokenize_char(bow="<w>", eow="</w>", 
                  bos="<s>", eos="</s>",
                  max_word_length=20,
                  n_words=None, pad="<pad>"):
    """
    Wrap every word in bow/eow markers and pad it to a fixed width.

    Parameters
    ----------
    bow, eow : markers placed before / after the characters of each word.
    bos, eos : sentence-boundary tokens. A word whose characters join to one
        of them is kept as a single token instead of being split up.
    max_word_length : words are truncated to this many characters.
    n_words : number of rows of the output grid. When None, the notebook-level
        `ngrams` is read each time the transform is called.
    pad : token used for every unused cell.

    Returns
    -------
    A function mapping a list of words (each a list of characters) to an
    object array of shape (n_words, max_word_length + 2).
    """
    def func(tok_iter):
        rows = ngrams if n_words is None else n_words
        result = np.empty((rows, max_word_length+2), dtype=object)
        result[:] = pad

        for i, word in enumerate(tok_iter):
            token = "".join(word)
            if token == bos:
                # Begin-of-sentence token (compared against the `bos`
                # argument rather than a hard-coded "<s>")
                result[i, :3] = [bow, bos, eow]
            elif token == eos:
                # End-of-sentence token
                result[i, :3] = [bow, eos, eow]
            else:
                # in-sentence words: truncate, then wrap in bow/eow;
                # remaining cells keep the pad token
                chars = list(word[:max_word_length])
                result[i, :len(chars)+2] = [bow] + chars + [eow]

        return result

    return func

In [101]:
# Cache training data for vocabulary construction:
# each entry is (target word index, context window as word strings)
train_data = [
    (target, [vocab.itos[ix] for ix in window])
    for target, window in tqdm(zip(y, x))
]

6810715it [00:07, 935578.38it/s] 


In [105]:
# Peek at the first three (target index, context words) pairs.
train_data[:3]

[(1657, ['<s>', 'I']), (11, ['I', 'rented']), (14567, ['rented', 'I'])]

In [106]:
# Set up the character vocabulary from the word strings (element 1) of every
# cached sample. The word vocabulary itself is not rebuilt here.
char_vocab = build_char_vocab(train_data, index=1)

6810719lines [00:07, 908319.44lines/s]


In [122]:
# Building the dataset with character level tokenization
def char_tokenizer(words):
    """Split every word into the list of its characters."""
    return list(map(list, words))

# Text pipeline, applied in order:
# words -> lists of characters -> padded grid of char tokens
#       -> char indices -> tensor
char_transform = sequential_transforms(
    char_tokenizer, 
    tokenize_char(), 
    stoi(char_vocab),
    lambda x: torch.tensor(x)
)

# The transform pair is (identity, char_transform).
# NOTE(review): presumably the first applies to the label and the second to
# the text of each (label, text) sample - confirm against torchtext.
trainset = TextClassificationDataset(
    train_data,
    char_vocab,
    (lambda x: x, char_transform),
)

In [140]:
# Decode the second word of sample 17 back to character tokens to
# sanity-check the bow/eow markers and the padding.
print([[char_vocab.itos[i] for i in w] for w in trainset[17][1]] [1][:13])

['<w>', 's', 'u', 'r', 'r', 'o', 'u', 'n', 'd', 'e', 'd', '</w>', '<pad>']


In [148]:
# Prepare DataLoader
def collate_fn(batch):
    """
    Merge a list of (label, text) samples into one batch.

    Returns the labels as a LongTensor and the text tensors stacked along a
    new leading dimension.
    """
    labels, texts = zip(*batch)
    return torch.LongTensor(labels), torch.stack(texts)

# Batches of 32 n-gram samples for language-model pretraining.
# NOTE(review): no `shuffle` argument is passed, so samples are served in
# dataset order - confirm this is intended for training.
pretrainloader = data.DataLoader(trainset, batch_size=32, collate_fn=collate_fn)

In [390]:
# Pull one batch to use as sample input for the model cells.
# NOTE: this rebinds `x` and `y`, replacing the n-gram lists built earlier
# in the notebook.
y, x = next(iter(pretrainloader))

### For training

In [61]:
# Modified a little to fit classification
import itertools
from torchtext.experimental.datasets import TextClassificationDataset
from torchtext.vocab import build_vocab_from_iterator
from torchtext.experimental.functional import sequential_transforms

def build_char_vocab(data, index, bow="<w>", eow="</w>"):
    """
    Build the character-level vocabulary.

    The begin/end-of-word markers are registered as single tokens. Every
    entry of `data` then contributes the characters of all the words stored
    at position `index`.
    """
    # one single-token "line" per word-boundary marker
    marker_lines = [[bow], [eow]]

    # flatten the words of each entry into one list of characters
    char_lines = [
        list(itertools.chain.from_iterable(entry[index]))
        for entry in data
    ]

    return build_vocab_from_iterator(marker_lines + char_lines)


def stoi(vocab):
    """
    Return a transform that replaces every character token of every word
    with its index in `vocab`.
    """
    def func(tok_iter):
        def encode(word):
            return [vocab[char] for char in word]

        return [encode(word) for word in tok_iter]

    return func


def tokenize_char(bow="<w>", eow="</w>", max_word_length=20,
                  max_seq_length=None, pad="<pad>"):
    """
    Wrap every word in bow/eow markers and pad the sequence to a fixed grid.

    Parameters
    ----------
    bow, eow : markers placed before / after the characters of each word.
    max_word_length : words are truncated to this many characters.
    max_seq_length : number of rows (words) of the output grid. When None,
        `max(len_seq)` is read from the notebook namespace on first use and
        cached for the lifetime of this transform.
    pad : token used for every unused cell, including the rows after the
        last word of a short sequence.

    Returns
    -------
    A function mapping a list of words (each a list of characters) to an
    object array of shape (max_seq_length, max_word_length + 2).
    """
    cached_rows = []

    def n_rows():
        if max_seq_length is not None:
            return max_seq_length
        if not cached_rows:
            # computed once instead of scanning `len_seq` for every sample
            cached_rows.append(max(len_seq))
        return cached_rows[0]

    def func(tok_iter):
        result = np.empty((n_rows(), max_word_length+2), dtype=object)
        # Pre-fill so that rows beyond the sequence hold the pad token
        # rather than uninitialised None entries.
        result[:] = pad

        for i, word in enumerate(tok_iter):
            # truncate, then wrap in bow/eow; remaining cells keep the pad
            chars = list(word[:max_word_length])
            result[i, :len(chars)+2] = [bow] + chars + [eow]

        return result

    return func

In [54]:
# Cache training data for vocabulary construction:
# each entry is (label, review as a list of word strings)
train_data = [
    (label, [vocab.itos[ix] for ix in words])
    for label, words in train
]

In [59]:
# store the number of words of every review (for packing later)
len_seq = [len(sample[1]) for sample in train_data]

In [62]:
# Set up the character vocabulary from the word strings (element 1) of every
# review. NOTE: this rebinds `char_vocab` from the pretraining section.
char_vocab = build_char_vocab(train_data, index=1)

6760717lines [00:05, 1185785.41lines/s]


In [66]:
# Building the dataset with character level tokenization
def char_tokenizer(words):
    """Turn each word into the list of its characters."""
    split_words = []
    for word in words:
        split_words.append(list(word))
    return split_words

# Text pipeline, applied in order:
# words -> lists of characters -> padded grid of char tokens
#       -> char indices -> tensor
char_transform = sequential_transforms(
    char_tokenizer, 
    tokenize_char(), 
    stoi(char_vocab),
    lambda x: torch.tensor(x)
)

# The transform pair is (identity, char_transform).
# NOTE(review): presumably the first applies to the label and the second to
# the text of each (label, text) sample - confirm against torchtext.
# NOTE: this rebinds `trainset` from the pretraining section.
trainset = TextClassificationDataset(
    train_data,
    char_vocab,
    (lambda x: x, char_transform),
)

In [67]:
# Prepare DataLoader
def collate_fn(batch):
    """
    Merge a list of (label, text) samples into one batch.

    Labels and texts are both tensors here; each group is stacked along a
    new leading dimension.
    """
    labels, texts = zip(*batch)
    return torch.stack(labels), torch.stack(texts)

# Batches of 32 reviews for the downstream (sentiment) task.
# NOTE(review): no `shuffle` argument is passed, so samples are served in
# dataset order - confirm this is intended for training.
trainloader = data.DataLoader(trainset, batch_size=32, collate_fn=collate_fn)

## Build the model

### Bidirectional language model

In [380]:
class CharConv(nn.Module):
    """
    Character-level CNN word encoder.

    Characters are embedded, convolved along the character axis with filters
    of width 1 to 7, max-pooled over the whole word, and the pooled features
    of all filters are concatenated (4+4+8+16+32+64+128 = 256 channels).

    Parameters
    ----------
    num_embeddings : size of the character vocabulary. When None, the
        notebook-level `char_vocab` is used.
    """

    def __init__(self, num_embeddings=None):
        super(CharConv, self).__init__()

        # Embedding layer
        CHAR_EMBEDDING_DIM = 16
        if num_embeddings is None:
            num_embeddings = len(char_vocab)
        self.char_embedding = nn.Embedding(num_embeddings, CHAR_EMBEDDING_DIM)

        # Conv layers as (filter width, output channels).
        # They are kept in an nn.ModuleList so that their weights are
        # registered: a plain Python list hides them from .parameters(),
        # .to(device) and state_dict().
        filters = [(1, 4), (2, 4), (3, 8), (4, 16), (5, 32), (6, 64), (7, 128)]
        self.convs = nn.ModuleList([
            nn.Conv2d(CHAR_EMBEDDING_DIM, out_channels, (1, width))
            for width, out_channels in filters
        ])

    def forward(self, x):
        """
        Encode a batch of character indices.

        `x` is expected to be a 3-D integer tensor (batch, words, chars);
        the result has shape (batch, 256, words).
        """
        # (batch, words, chars, emb) -> (batch, emb, words, chars)
        x = self.char_embedding(x).permute(0, 3, 1, 2)

        pooled = []
        for conv in self.convs:
            feat = conv(x)
            # max over every character position of the word
            feat = F.max_pool2d(feat, kernel_size=(1, feat.shape[3]))
            # drop only the pooled axis; a bare squeeze() would also remove
            # the batch (or word) axis whenever it has size 1
            pooled.append(feat.squeeze(3))

        # concatenate the filter outputs along the channel axis
        return torch.cat(pooled, dim=1)

In [381]:
class BiLSTM(nn.Module):
    """
    Two bidirectional LSTM layers joined by a projected skip connection.

    The first layer's output is projected down to twice the input width and
    added to the input repeated twice; the sum feeds the second layer.
    `forward` returns the final hidden states of both layers.
    """
    def __init__(self):
        super(BiLSTM, self).__init__()
        # Bi-LSTM
        self.lstm1 = nn.LSTM(256, 1024, bidirectional=True)
        self.proj = nn.Linear(2*1024, 2*256, bias=False)
        self.lstm2 = nn.LSTM(2*256, 1024, bidirectional=True)

    def forward(self, x):
        # reshape input to (ngrams, batch, 256): sequence-first for the LSTM
        seq = x.view(-1, 256, ngrams).permute(2, 0, 1)

        # 1st LSTM layer
        layer1_out, (h1, _cell1) = self.lstm1(seq)

        # skip connection (input repeated to the projected width)
        # plus main connection (projected 1st-layer output)
        merged = seq.repeat(1, 1, 2) + self.proj(layer1_out)

        # 2nd LSTM layer
        _layer2_out, (h2, _cell2) = self.lstm2(merged)
        return h1, h2

In [384]:
class BiLangModel(nn.Module):
    """
    Bidirectional language model (will be pretrained)

    Pipeline: character CNN -> highway layer -> bidirectional LSTM stack
    -> linear layer producing a 1024-dimensional feature per sample.

    Parameters
    ----------
    char_cnn : module whose output can be flattened to (batch, ngrams * 256).
    bi_lstm : module returning a pair; the second element is used as the
        final hidden state, expected as (2 directions, batch, 1024).
    """
    def __init__(self, char_cnn, bi_lstm):
        super(BiLangModel, self).__init__()

        # Highway connection
        self.highway = nn.Linear(ngrams * 256, ngrams * 256)
        self.transform = nn.Linear(ngrams * 256, ngrams * 256)
        self.char_cnn = char_cnn
        self.bi_lstm = bi_lstm

        # last layer: classifier.
        # Its input is the final hidden state of both LSTM directions, so the
        # width is 2 * 1024 regardless of the n-gram length.
        self.fc = nn.Linear(2 * 1024, 1024)

    def forward(self, x):
        # Character-level convolution
        x = self.char_cnn(x)
        x = x.view(-1, ngrams*256)

        # highway: gated mix of the transformed and the untouched input
        h = self.highway(x)
        t_gate = torch.sigmoid(self.transform(x))
        c_gate = 1 - t_gate
        x = h * t_gate + x * c_gate

        # Bi-LSTM: keep the final hidden state of the 2nd layer
        _, h_n = self.bi_lstm(x)

        # Bring the batch axis to the front before flattening. Viewing the
        # (directions, batch, hidden) tensor directly as (-1, 2048) would
        # glue the hidden states of different samples into one row.
        x = h_n.permute(1, 0, 2).reshape(h_n.size(1), -1)

        # fully-connected layer (classify)
        x = self.fc(x)

        return x

In [385]:
# Smoke test: run the sample batch through an untrained model and display
# the output shape.
charconv = CharConv()
bilstm = BiLSTM()
bilm = BiLangModel(charconv, bilstm)
bilm(x).shape

torch.Size([32, 1024])

In [401]:
from torchviz import make_dot

charconv = CharConv()
bilstm = BiLSTM()

model = BiLangModel(charconv, bilstm)

# The adaptive softmax maps the 1024-dim model output to the word vocabulary.
# It owns trainable weights of its own, so it is created before the optimizer
# and its parameters are optimised together with the model's.
criterion = nn.AdaptiveLogSoftmaxWithLoss(1024, VOCAB_SIZE, cutoffs=[10, 100, 1000, 10000])
optimizer = optim.SGD(
    list(model.parameters()) + list(criterion.parameters()),
    lr=0.01, momentum=0.9,
)

# Render the autograd graph of one loss evaluation to BiLanguageModel.png
make_dot(criterion(model(x), y).loss).render("BiLanguageModel", format="png")

'BiLanguageModel.png'

### ELMo

In [None]:
# Placeholder cell: for now it only runs the language model on the sample batch.
bilm(x)

## Train the model

### Pretrain bidirectional language model

In [404]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [405]:
# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [432]:
def accuracy(output, y):
    """
    Fraction of samples whose most likely class equals the target.

    Class log-probabilities come from the notebook-level `criterion`
    (`criterion.log_prob`); the prediction is their argmax over dim 1.
    """
    with torch.no_grad():
        log_probs = criterion.log_prob(output)
        pred = log_probs.argmax(dim=1)
    hits = (pred == y).float()
    return hits.sum() / len(hits)

In [436]:
def train(model, dataloader, criterion, optimizer):
    """
    Run one training epoch.

    Parameters
    ----------
    model : module mapping a batch of inputs to features.
    dataloader : iterable of (y, x) batches.
    criterion : callable whose result exposes the scalar loss as `.loss`.
    optimizer : optimizer stepped once per batch.

    Returns
    -------
    (loss_epoch, acc_epoch): the per-batch loss and accuracy summed over the
    epoch. Divide by the number of batches to get means. Accuracy is computed
    by the notebook-level `accuracy` helper.
    """
    # Batches are moved to wherever the model lives; otherwise a model on the
    # GPU would be fed CPU tensors.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    model.train()
    loss_epoch = 0.
    acc_epoch = 0.
    for y, x in dataloader:
        if device is not None:
            x = x.to(device)
            y = y.to(device)

        # Clear the gradients of the model and of everything else the
        # optimizer manages (e.g. the parameters of the loss module), so
        # that nothing accumulates across batches.
        model.zero_grad()
        optimizer.zero_grad()

        out = model(x)
        loss = criterion(out, y).loss

        loss.backward()
        optimizer.step()

        loss_epoch += loss.item()
        acc_epoch += accuracy(out, y).item()
    return loss_epoch, acc_epoch

In [437]:
N_EPOCH = 2
LOG_EVERY = 5  # report every LOG_EVERY epochs (and always after the last one)

# per-epoch sums of the batch losses / batch accuracies
losses = []
accs = []
# number of batches per epoch, used to turn the sums into means when logging
n_batches = len(pretrainloader)
for i in range(1, N_EPOCH+1):
    loss_epoch, acc_epoch = train(model, pretrainloader, criterion, optimizer)
    losses.append(loss_epoch)
    accs.append(acc_epoch)

    # The last epoch is always reported; otherwise a run shorter than
    # LOG_EVERY epochs would print nothing at all.
    if i % LOG_EVERY == 0 or i == N_EPOCH:
        print(f"epoch: {i:03}, loss: {loss_epoch/n_batches: .3f}, acc: {acc_epoch/n_batches: .4f}")

KeyboardInterrupt: 

### Train ELMo for sentiment analysis