In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import json

<h1>3-2. ELMo<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prepare-dataset:-Gutenberg" data-toc-modified-id="Prepare-dataset:-Gutenberg-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Prepare dataset: Gutenberg</a></span><ul class="toc-item"><li><span><a href="#For-pretraining" data-toc-modified-id="For-pretraining-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>For pretraining</a></span></li><li><span><a href="#For-training" data-toc-modified-id="For-training-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>For training</a></span></li></ul></li><li><span><a href="#Build-the-model" data-toc-modified-id="Build-the-model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Build the model</a></span><ul class="toc-item"><li><span><a href="#Bidirectional-language-model" data-toc-modified-id="Bidirectional-language-model-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Bidirectional language model</a></span></li><li><span><a href="#ELMo" data-toc-modified-id="ELMo-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>ELMo</a></span></li></ul></li><li><span><a href="#Train-the-model" data-toc-modified-id="Train-the-model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Train the model</a></span><ul class="toc-item"><li><span><a href="#Pretrain-bidirectional-language-model" data-toc-modified-id="Pretrain-bidirectional-language-model-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Pretrain bidirectional language model</a></span></li><li><span><a href="#Train-ELMo-for-sentiment-analysis" data-toc-modified-id="Train-ELMo-for-sentiment-analysis-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Train ELMo for sentiment analysis</a></span></li></ul></li></ul></div>

## Prepare dataset: Gutenberg

In [11]:
from torchtext.experimental.datasets import IMDB
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer("spacy")
train, test = IMDB(tokenizer=tokenizer)

  SRC = (PWD / "_custom_kernels.cu").open("r", encoding="utf8").read()
  MMH_SRC = (PWD / "_murmur3.cu").open("r", encoding="utf8").read()
100%|██████████| 25000/25000 [00:26<00:00, 936.68lines/s] 


In [25]:
vocab = train.get_vocab()

In [48]:
ngrams = 2
x = []
y = []
for _, words in train:
    for i in range(len(words)-ngrams):
        text = words[i:i+ngrams]
        label = words[i+ngrams]
        x.append(text.tolist())
        y.append(label.tolist())

```python
with open("./IMBD_bigram.json", "w") as w:
    json.dump({"data": x, "label": y}, w)
```

### For pretraining

* transform to character-level n-gram dataset
    * original script
```
%load https://gist.githubusercontent.com/akurniawan/30719686669dced49e7ced720329a616/raw/7b9f9967c01ce87ac505520a5aa58d3b24c55c66/translation_char_example.py
```
    * modified
```
%load https://gist.github.com/naturale0/6bb3b8a5c682bd281de87e408fa71bf1/raw/df8b7e198f149f81c4f72af977760b2eb3226cdf/translation_char_example.py
```

In [83]:
# Modified a little to fit classification
import itertools
from torchtext.experimental.datasets import TextClassificationDataset
from torchtext.vocab import build_vocab_from_iterator
from torchtext.experimental.functional import sequential_transforms

def build_char_vocab(data, index, bow="<w>", eow="</w>"):
    """
    build character level vocabulary
    """
    tok_list = [
        [bow],
        [eow],
    ]
    for line in data:
        tokens = list(itertools.chain.from_iterable(line[index]))
        tok_list.append(tokens)
    return build_vocab_from_iterator(tok_list)


def stoi(vocab):
    """
    change string to index
    """
    def func(tok_iter):
        return [[vocab[char] for char in word]\
                for word in tok_iter]

    return func


def tokenize_char(bow="<w>", eow="</w>", max_word_length=20):
    """
    attach bow, eow token and pad with token
    """
    def func(tok_iter):
        result = np.empty((max(len_seq), max_word_length+2), dtype=object)
        
        # "≥" for padding
        result[:len(tok_iter)] = [
            [bow] + word + [eow] \
            + ["<pad>"] * (max_word_length - len(word)) \
            if len(word) < max_word_length \
            else [bow] + word[:max_word_length] + [eow]
        for word in tok_iter]
        
        return result

    return func

In [81]:
# Cache training data for vocabulary construction
train_data = [(line[0], [vocab.itos[ix] for ix in line[1]]) for line in zip(y, x)]

In [82]:
train_data[:3]

[(11, ['I', 'rented']), (14567, ['rented', 'I']), (36197, ['I', 'AM'])]

In [84]:
# Setup vocabularies (both words and chars)
char_vocab = build_char_vocab(train_data, index=1)

6760717lines [00:06, 993084.93lines/s]


In [85]:
# Building the dataset with character level tokenization
def char_tokenizer(words):
    return [list(word) for word in words]

char_transform = sequential_transforms(
    char_tokenizer, 
    tokenize_char(), 
    stoi(char_vocab),
    lambda x: torch.tensor(x)
)

trainset = TextClassificationDataset(
    train_data,
    char_vocab,
    (lambda x: x, char_transform),
)

In [93]:
# Prepare DataLoader
def collate_fn(batch):
    label, text = zip(*batch)
    label = torch.LongTensor(label)
    text = torch.stack(text)
    #lens = list(map(lambda x: len(x[(x != 0).all(dim=1)]), text))
    
    return label, text

pretrainloader = data.DataLoader(trainset, batch_size=32, collate_fn=collate_fn)

### For training

In [61]:
# Modified a little to fit classification
import itertools
from torchtext.experimental.datasets import TextClassificationDataset
from torchtext.vocab import build_vocab_from_iterator
from torchtext.experimental.functional import sequential_transforms

def build_char_vocab(data, index, bow="<w>", eow="</w>"):
    """
    build character level vocabulary
    """
    tok_list = [
        [bow],
        [eow],
    ]
    for line in data:
        tokens = list(itertools.chain.from_iterable(line[index]))
        tok_list.append(tokens)
    return build_vocab_from_iterator(tok_list)


def stoi(vocab):
    """
    change string to index
    """
    def func(tok_iter):
        return [[vocab[char] for char in word]\
                for word in tok_iter]

    return func


def tokenize_char(bow="<w>", eow="</w>", max_word_length=20):
    """
    attach bow, eow token and pad with token
    """
    def func(tok_iter):
        result = np.empty((max(len_seq), max_word_length+2), dtype=object)
        
        # "≥" for padding
        result[:len(tok_iter)] = [
            [bow] + word + [eow] \
            + ["<pad>"] * (max_word_length - len(word)) \
            if len(word) < max_word_length \
            else [bow] + word[:max_word_length] + [eow]
        for word in tok_iter]
        
        return result

    return func

In [54]:
# Cache training data for vocabulary construction
train_data = [(line[0], [vocab.itos[ix] for ix in line[1]]) for line in train]

In [59]:
# store list of seq length for packing later
len_seq = list(map(lambda x: len(x), train_data))

In [62]:
# Setup vocabularies (both words and chars)
char_vocab = build_char_vocab(train_data, index=1)

6760717lines [00:05, 1185785.41lines/s]


In [66]:
# Building the dataset with character level tokenization
def char_tokenizer(words):
    return [list(word) for word in words]

char_transform = sequential_transforms(
    char_tokenizer, 
    tokenize_char(), 
    stoi(char_vocab),
    lambda x: torch.tensor(x)
)

trainset = TextClassificationDataset(
    train_data,
    char_vocab,
    (lambda x: x, char_transform),
)

In [67]:
# Prepare DataLoader
def collate_fn(batch):
    label, text = zip(*batch)
    label = torch.stack(label)
    text = torch.stack(text)
    #lens = list(map(lambda x: len(x[(x != 0).all(dim=1)]), text))
    
    return label, text

trainloader = data.DataLoader(trainset, batch_size=32, collate_fn=collate_fn)

## Build the model

### Bidirectional language model

In [108]:
x = next(iter(pretrainloader))[1]
x.shape

torch.Size([32, 2, 22])

In [109]:
x2 = nn.Embedding(len(char_vocab), 4)(x)
x2.shape

torch.Size([32, 2, 22, 4])

In [110]:
x2.permute(0,3,1,2).shape

torch.Size([32, 4, 2, 22])

In [111]:
x3 = nn.Conv2d(4, 5, (1, 3))(x2.permute(0,3,1,2))
x3.shape

torch.Size([32, 5, 2, 20])

In [155]:
x4 = F.max_pool2d(x3, kernel_size=(1, x3.shape[3]))
x4.shape

torch.Size([32, 5, 2, 1])

In [163]:
x5 = torch.squeeze(x4)
x5.shape, x5.view(-1, ngrams*5).shape, x5.permute(2, 0, 1).shape  # (seq_len, batch, input_size)

(torch.Size([32, 5, 2]), torch.Size([32, 10]), torch.Size([2, 32, 5]))

In [165]:
x6 = nn.Linear(ngrams*5, 10)(x5.view(-1, ngrams*5))
x6.shape

torch.Size([32, 10])

In [132]:
o, h = nn.LSTM(5, 6)(x5.permute(2, 0, 1))
len(h), o.shape

(2, torch.Size([1, 32, 6]))

In [None]:
class CharConv(nn.Module):
    
    def __init__(self):
        super(self, BiLM).__init__()
        
        # Embedding layer
        CHAR_EMBEDDING_DIM = 16
        self.char_embedding = nn.Embedding(len(char_vocab), CHAR_EMBEDDING_DIM)
        
        # Conv layers
        self.convs = [
            nn.Conv2d(CHAR_EMBEDDING_DIM, 4, 1),
            nn.Conv2d(CHAR_EMBEDDING_DIM, 4, (1, 2)),
            nn.Conv2d(CHAR_EMBEDDING_DIM, 8, (1, 3)),
            nn.Conv2d(CHAR_EMBEDDING_DIM, 16, (1, 4)),
            nn.Conv2d(CHAR_EMBEDDING_DIM, 32, (1, 5)),
            nn.Conv2d(CHAR_EMBEDDING_DIM, 64, (1, 6)),
            nn.Conv2d(CHAR_EMBEDDING_DIM, 128, (1, 7))
        ]
        
    
    def forward(self, x):
        # character-level convolution
        x = self.char_embedding(x).permute(0,3,1,2)
        x = [conv(x) for conv in self.convs]
        x = [F.max_pool2d(x_c, kernel_size=x_c.shape[3]) for x_c in x]
        x = [torch.squeeze(x_p) for x_p in x]
        x = torch.vstack(x)  # 1, n_batch, concat_length
        x = x.permute(2, 0, 1)
        
        return x

In [None]:
class BiLSTM(nn.Module):
    def __init__(self):
        # Bi-LSTM
        self.lstm1 = nn.LSTM(256, 1024, bidirectional=True)
        self.proj = nn.Linear(1024, 128, bias=False)
        self.lstm2 = nn.LSTM(128, 1024, bidirectional=True)
    
    def forward(self, x):
        pass

In [None]:
class BiLM(nn.Module):
    def __init__(self, char_cnn, bi_lstm):
        
        # Highway connection
        CHAR_EMBEDDING_DIM = 16
        self.highway = nn.Linear(ngrams * 256, 256)
        self.transform = nn.Linear(ngrams * 256, 256)
        self.char_cnn = char_cnn
        self.bi_lstm = bi_lstm
    
    def forward(self, x):
        # Character-level convolution
        x = self.char_cnn(x)
        x = x.view(1, -1)
        
        # highway
        h = self.highway(x)
        t_gate = torch.sigmoid(self.transform(x))
        c_gate = 1 - t_gate
        x = h * t_gate + x * c_gate
        
        # Bi-LSTM
        self.bi_lstm(x)

### ELMo

## Train the model

### Pretrain bidirectional language model

### Train ELMo for sentiment analysis