## Часть 2. [3 балла] Извлечение именованных сущностей
1. Обучите стандартную модель для извлечения именованных сущностей, CNN-BiLSTM-CRF, для извлечения именованных *низкоуровневых именованных сущностей*, т.е. для самых коротких из вложенных сущностей. 
Модель устроена так: сверточная сеть на символах + эмбеддинги слов + двунаправленная LSTM сеть (модель последовательности) + CRF (глобальная нормализация).
2. Замените часть модели на символах и словах (CNN + эмбеддинги словах) на ELMo и / или BERT. Должна получиться модель ELMo / BERT + BiLSTM + CRF. 
3. Замените модель последовательности (BiLSTM) на другой слой, например, на Transformer. Должна получиться модель CNN  + Transformer + CRF. 

In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

torch.manual_seed(1)

import itertools as it
from operator import itemgetter

from nltk import ngrams

__Доработать по 1 заданию:__
1. BiLSTM по батчам
2. добавить CNN
3. добавить предобученные вектора

__2 задание:__
1. Подставить BERT\ELMo вместо представлений

__3 задание:__
1. Подставить Transformer вместо BiLSTM


# 1. Загрузка данных

### 1.1 Preprocess data

In [155]:
from tager.transform_tsv import load_data_from
sent_train, sent_dev, sent_test = load_data_from("litbank/entities/tsv/")

100%|██████████| 80/80 [00:00<00:00, 155.91it/s]
100%|██████████| 10/10 [00:00<00:00, 132.02it/s]
100%|██████████| 10/10 [00:00<00:00, 135.94it/s]


In [None]:
# words
words = it.chain(sent_train, sent_dev, sent_test)
words = it.chain(*map(itemgetter(0), words))
ix_to_word = sorted(set(words))
word_to_ix = dict(zip(ix_to_word, range(len(ix_to_word))))

# chars
ix_to_char = sorted(set(it.chain(*ix_to_word)))
char_to_ix = dict(zip(ix_to_char, range(len(ix_to_char))))
char_to_ix["<pad>"] = len(char_to_ix)

# tags
START_TAG = "<START>"
STOP_TAG = "<STOP>"

tags = it.chain(sent_train, sent_dev, sent_test)
tags = it.chain(*map(itemgetter(1), tags))
ix_to_tag = sorted(set(tags))+[START_TAG, STOP_TAG]
tag_to_ix = dict(zip(ix_to_tag, range(len(ix_to_tag))))

len(word_to_ix), len(char_to_ix), len(tag_to_ix)

### 1.2. Load pretrained word embeddings

In [175]:
! wget http://nlp.stanford.edu/data/wordvecs/glove.6B.zip

--2019-10-30 14:07:01--  http://nlp.stanford.edu/data/wordvecs/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/wordvecs/glove.6B.zip [following]
--2019-10-30 14:07:02--  https://nlp.stanford.edu/data/wordvecs/glove.6B.zip
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.6B.zip [following]
--2019-10-30 14:07:03--  http://downloads.cs.stanford.edu/nlp/data/wordvecs/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, a

In [176]:
# ! wget https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip

In [177]:
! unzip glove.6B.zip -d glove_vectors

Archive:  glove.6B.zip
  inflating: glove_vectors/glove.6B.100d.txt  
  inflating: glove_vectors/glove.6B.200d.txt  
  inflating: glove_vectors/glove.6B.300d.txt  
  inflating: glove_vectors/glove.6B.50d.txt  


In [203]:
def load_word_embeds(word_to_ix, path_to_glove_file):
    """
    Params:
        words: set
        path_to_glove_file: str
    
    Return:
        dict: key - word, value - np.ndarray
    """
    embeds_matrix = np.random.normal(size=(len(word_to_ix), 100))
    
    word_to_ix = dict((k.lower(), v) for k,v in word_to_ix.items())
    
    with open(path_to_glove_file) as f:
        for line in f.readlines():
            word, *vec = line.split()
            if word in word_to_ix:
                embeds_matrix[word_to_ix[word],:] = np.array(list(map(float, vec)))
                
    return embeds_matrix

In [211]:
glove_word_matrix = load_word_embeds(word_to_ix, "glove_vectors/glove.6B.100d.txt")
glove_embeds = nn.Embedding(len(word_to_ix), 100)
glove_embeds.load_state_dict({'weight':torch.tensor(glove_word_matrix)})
del glove_word_matrix

### 1.3. Data Loader

In [2]:
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

  data = yaml.load(f.read()) or {}


In [3]:
import csv

class LitBankDataset(Dataset):
    """LitBank dataset."""

    def __init__(self, path_to_tsv_files, file_indexes):
        """
        Args:
            path_to_tsv_files (string): Path to the tsv file location.
        """
        self.path_to_tsv_files = path_to_tsv_files
        
        # parse files
        books = sorted(os.listdir(path_to_tsv_files))
        books = books[file_indexes[0]:file_indexes[1]]
        
        self.sentences = list()
        for book in books:
            self.sentences.extend(self.get_sent_and_tags_from_tsv(path_to_tsv_files + book))
            
    @staticmethod
    def get_sent_and_tags_from_tsv(full_file_name):
        """ Функция преобразует разметку из формата tsv в формат [sentence: list, tags: list].
        """
        df = pd.read_csv(full_file_name, sep="\t",
                         header=None, quoting=csv.QUOTE_NONE)
        df = df.loc[:, :1]
        df.columns = ["word", "tag"]

        split_ix = df.index[(df["word"] == ".") & (df["tag"] == "O")].tolist()  # sent split index
        words = df["word"].tolist()
        tags = df["tag"].tolist()

        sent = [words[start + 1:end + 1] for start, end in zip([0] + split_ix, split_ix)]
        tags = [tags[start + 1:end + 1] for start, end in zip([0] + split_ix, split_ix)]

        return list(zip(sent, tags))
            
    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
            
        return self.sentences[idx]

In [4]:
dataset_train = LitBankDataset("litbank/entities/tsv/", file_indexes=(0,80))
dataset_dev = LitBankDataset("litbank/entities/tsv/", file_indexes=(80,90))
dataset_test = LitBankDataset("litbank/entities/tsv/", file_indexes=(90,100))

In [5]:
for sent, tags in dataset_train:
    print(sent)
    print(tags)
    break

['I', 'In', 'Chancery', 'London', '.']
['O', 'O', 'B-FAC', 'B-GPE', 'O']


# 2. Model

In [6]:
from tager.utils import argmax, prepare_sequence, log_sum_exp
from tager.model import BaseBiLSTM_CRF

### 2.1. CNN_BiLSTM_CRF

#### Модификация архитектуры

In [160]:
class LayerCharCNN(nn.Module):
    """ LayerCharCNN implements character-level convolutional 1D layer.
    source: https://github.com/achernodub/targer/blob/master/src/layers/layer_char_cnn.py
    """

    def __init__(self, char_embeddings_dim, filter_num, char_window_size, char_to_ix):
        super().__init__()
        self.char_embeddings_dim = char_embeddings_dim
        self.char_cnn_filter_num = filter_num
        self.char_window_size = char_window_size
        self.output_dim = char_embeddings_dim * filter_num
        self.char_to_ix = char_to_ix
        
        self.char_embeds = nn.Embedding(len(char_to_ix), char_embeddings_dim)
        
        self.conv1d = nn.Conv1d(in_channels=char_embeddings_dim,
                                out_channels=char_embeddings_dim * filter_num,
                                kernel_size=char_window_size,
                                groups=char_embeddings_dim)

    def prepare_word(self, seq, max_word_len):
        """Фукнция преобразует набор токенов в тензор из их id."""
        seq = it.chain(seq, ("<pad>" for i in range(max_word_len-len(seq))))
        idxs = [self.char_to_ix[ch] for ch in seq]
        return torch.tensor(idxs, dtype=torch.long)
        
    def _get_char_embeds(self, sentence):
        max_word_len = max(map(len, sentence))
        char_idx = torch.cat([self.prepare_word(word, max_word_len).view(1, -1)
                                 for word in sentence], dim=0)
        embeds = self.char_embeds(char_idx)
        
        return embeds
        
    def forward(self, sentence):
        char_embeddings_feature = self._get_char_embeds(sentence).transpose(2, 1)
        char_embeddings_feature = self.conv1d(char_embeddings_feature)
        max_pooling_out, _ = torch.max(char_embeddings_feature, dim=2)

        return max_pooling_out  # shape: seq_len x filter_num*char_embeddings_dim

In [133]:
char_cnn = LayerCharCNN(char_embeddings_dim=100, filter_num=1, char_window_size=3, char_to_ix=char_to_ix)
sent = ["send", "letter"]
char_cnn(sent).size()

torch.Size([2, 100])

In [209]:
class CNN_BiLSTM_CRF(BaseBiLSTM_CRF):
    
    def __init__(self, vocab_size, tag_to_ix, char_to_ix, embedding_dim, char_embedding_dim,
                 hidden_dim, char_window_size, max_word_len, word_embeds):
        
        super().__init__(vocab_size, tag_to_ix, embedding_dim, hidden_dim)
        
        self.embedding_dim = embedding_dim
        
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.char_cnn = LayerCharCNN(
            char_embeddings_dim=char_embedding_dim,
            filter_num=1,
            char_window_size=char_window_size,
            char_to_ix=char_to_ix
        )

        self.word_embeds = word_embeds
        self.lstm = nn.LSTM(embedding_dim*2, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        

    def _get_lstm_features(self, sentence):
        
        # words to idxs
        word_idxs = prepare_sentence(sentence)
        
        self.hidden = self.init_hidden()
        
        # Get word add char embedings.
        words_embeds = self.word_embeds(word_idxs)
        chars_embeds = self.char_cnn(sentence)
        embeds = torch.cat((words_embeds, chars_embeds), dim=1).view(len(sentence), 1, -1)
    
        # Run BiLSTM.
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)

        # Transfom embeds to tag space.
        lstm_feats = self.hidden2tag(lstm_out)

        return lstm_feats
    
    
    def forward(self, sentence):
        lstm_feats = self._get_lstm_features(sentence)
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [212]:
conf = dict(
    vocab_size = len(word_to_ix),
    tag_to_ix = tag_to_ix,
    char_to_ix = char_to_ix,
    embedding_dim = 100,
    char_embedding_dim = 100,
    hidden_dim = 100,
    char_window_size = 3,
    max_word_len=max_word_len,
    word_embeds=glove_embeds
)

model = CNN_BiLSTM_CRF(**conf)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)
model(["send", "letter"])

(tensor(4.7945, grad_fn=<SelectBackward>), [1, 0])

In [214]:
from functools import partial

prepare_sentence = partial(prepare_sequence, to_ix=word_to_ix)
prepare_tags = partial(prepare_sequence, to_ix=tag_to_ix)

In [215]:
# Make sure prepare_sequence from earlier in the LSTM section is loaded
from tqdm import tqdm_notebook

for epoch in range(3):
    print("Epoch", epoch)
    
    for sentence, tags in tqdm_notebook(dataset_train):

        model.zero_grad()
        targets = prepare_tags(tags)
        loss = model.neg_log_likelihood(sentence, targets)

        loss.backward()
        optimizer.step()

Epoch 0


HBox(children=(IntProgress(value=0, max=6130), HTML(value='')))

RuntimeError: $ Torch: invalid memory size -- maybe an overflow? at /opt/conda/conda-bld/pytorch-cpu_1549632688322/work/aten/src/TH/THGeneral.cpp:188

In [219]:
dataset_train[597]

(['First',
  ',',
  'however',
  ',',
  'she',
  'waited',
  'for',
  'a',
  'few',
  'minutes',
  'to',
  'see',
  'if',
  'she',
  'was',
  'going',
  'to',
  'shrink',
  'any',
  'further',
  ':',
  'she',
  'felt',
  'a',
  'little',
  'nervous',
  'about',
  'this',
  ';',
  '‘',
  'for',
  'it',
  'might',
  'end',
  ',',
  'you',
  'know',
  ',',
  '’',
  'said',
  'Alice',
  'to',
  'herself',
  ',',
  '‘',
  'in',
  'my',
  'going',
  'out',
  'altogether',
  ',',
  'like',
  'a',
  'candle',
  '.'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-PER',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'])

In [17]:
# Check predictions after training
with torch.no_grad():
    precheck_sent = prepare_sequence(sent_train[3][0], word_to_ix)
    print(model(precheck_sent))
# We got it!

(tensor(260.5890), [12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12])
