## Binary structure classification used in tree building: Step 3. BiMPM

Prepare data and model-related scripts.

Evaluate models.

Output:
 - ``models/structure_predictor_bimpm/*``

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import glob
import pandas as pd
import numpy as np
import pickle
from utils.file_reading import read_edus, read_gold, read_negative, read_annotation

### Make a directory

In [3]:
MODEL_PATH = 'models/structure_predictor_bimpm'
! mkdir $MODEL_PATH

TRAIN_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf_train.tsv')
DEV_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf_dev.tsv')
TEST_FILE_PATH = os.path.join(MODEL_PATH, 'structure_cf_test.tsv')

mkdir: cannot create directory ‘models/structure_predictor_bimpm’: File exists


In [4]:
! ls models/structure_predictor_bimpm/

config_elmo.json  structure_cf_dev.tsv	 structure_cf_train.tsv
strprdelmo	  structure_cf_test.tsv


### Prepare train/test sets

In [130]:
IN_PATH = 'data_structure'

train_samples = pd.read_pickle(os.path.join(IN_PATH, 'train_samples.pkl'))
dev_samples = pd.read_pickle(os.path.join(IN_PATH, 'dev_samples.pkl'))
test_samples = pd.read_pickle(os.path.join(IN_PATH, 'test_samples.pkl'))

In [131]:
import razdel

def tokenize(text):
    result = ' '.join([tok.text for tok in razdel.tokenize(text)])
    return result
    
train_samples['snippet_x'] = train_samples.snippet_x.map(tokenize)
train_samples['snippet_y'] = train_samples.snippet_y.map(tokenize)

dev_samples['snippet_x'] = dev_samples.snippet_x.map(tokenize)
dev_samples['snippet_y'] = dev_samples.snippet_y.map(tokenize)

test_samples['snippet_x'] = test_samples.snippet_x.map(tokenize)
test_samples['snippet_y'] = test_samples.snippet_y.map(tokenize)

In [132]:
train_samples.relation.value_counts()

0    33744
1    16400
Name: relation, dtype: int64

In [133]:
train_samples[train_samples.snippet_x.map(len) < 5].head(3)

Unnamed: 0,level_0,snippet_x,snippet_y,category_id,order,filename,is_broken,token_begin_x,token_begin_y,token_end_y,...,eucl_embed_dist,snippet_x_tmp,snippet_y_tmp,postags_x,postags_y,sm_x_positive,sm_x_negative,sm_y_positive,sm_y_negative,relation
350,,выло,и яростно трещало,,,./data/blogs_16,False,0.935768,0.937028,0.942065,...,0.917148,выть_VERB,"и_CONJ яростно_ADV трещать_VERB ,",VERB,CONJ ADV VERB,1e-05,1e-05,0.002126,0.341593,0
580,2405.0,Это,помимо явных перемен в виде тут же появившихся...,elaboration,NS,news1_15,False,0.199614,0.200579,0.216972,...,0.529479,это_PRON,помимо_ADP явный_ADJ перемена_NOUN в_ADP вид_N...,PRON,ADP ADJ NOUN ADP NOUN ADV PART VERB ADJ NOUN P...,1e-05,1e-05,0.017452,0.065615,1
883,1316.0,Боль,от,cause,NS,blogs_65,False,0.128819,0.129328,0.130346,...,1.167933,боль_NOUN,от_ADP потеря_NOUN,NOUN,ADP NOUN,1e-05,0.384922,1e-05,0.156115,1


In [134]:
length = train_samples.snippet_x.map(len)
length.describe()

count    50144.000000
mean       104.064734
std        100.394467
min          3.000000
25%         39.000000
50%         70.000000
75%        130.000000
max        726.000000
Name: snippet_x, dtype: float64

In [135]:
counts = train_samples['relation'].value_counts(normalize=False).values
NUMBER_CLASSES = len(counts)
print("number of classes:", NUMBER_CLASSES)
print("class weights:", np.round(counts.min() / counts, decimals=6))

number of classes: 2
class weights: [0.486012 1.      ]


In [136]:
train_samples = train_samples.reset_index()
train_samples[['relation', 'snippet_x', 'snippet_y', 'same_sentence', 'same_paragraph', 'index']].to_csv(
    TRAIN_FILE_PATH, sep='\t', header=False, index=False)

dev_samples = dev_samples.reset_index()
dev_samples[['relation', 'snippet_x', 'snippet_y', 'same_sentence', 'same_paragraph', 'index']].to_csv(
    DEV_FILE_PATH, sep='\t', header=False, index=False)

test_samples = test_samples.reset_index()
test_samples[['relation', 'snippet_x', 'snippet_y', 'same_sentence', 'same_paragraph', 'index']].to_csv(
    TEST_FILE_PATH, sep='\t', header=False, index=False)

In [137]:
train_samples.shape

(50144, 2066)

### Customize BiMPM model with adding inputs 

In [7]:
! rm -r models/bimpm_custom_package
! mkdir models/bimpm_custom_package
! touch models/bimpm_custom_package/__init__.py
! mkdir models/bimpm_custom_package/tokenizers
! mkdir models/bimpm_custom_package/dataset_readers
! mkdir models/bimpm_custom_package/model

In [11]:
%%writefile models/bimpm_custom_package/dataset_readers/__init__.py

try:
    from bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from bimpm_custom_package.dataset_readers.custom_reader import CustomDataReader
except ModuleNotFoundError:
    from models.bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from models.bimpm_custom_package.dataset_readers.custom_reader import CustomDataReader

Overwriting models/bimpm_custom_package/dataset_readers/__init__.py


In [12]:
%%writefile models/bimpm_custom_package/tokenizers/whitespace_tokenizer.py

from allennlp.data.tokenizers import Token, Tokenizer
from overrides import overrides
from typing import List


@Tokenizer.register("whitespace_tokenizer")
class WhitespaceTokenizer(Tokenizer):
    def __init__(self, max_length=None) -> None:
        super().__init__()
        self.max_length = max_length

    def _tokenize(self, text):
        if self.max_length:
            return [Token(token) for token in text.split()][:self.max_length]
        return [Token(token) for token in text.split()]

    @overrides
    def tokenize(self, text: str) -> List[Token]:
        tokens = self._tokenize(text)

        return tokens


Overwriting models/bimpm_custom_package/tokenizers/whitespace_tokenizer.py


In [42]:
! head $TRAIN_FILE_PATH

0	что решение визового вопроса займет гораздо более длительный срок .	""" Поскольку это касается Шенгенской зоны в целом , изменение визового режима должно быть одобрено всеми странами , входящими в ее состав"	0	0	0
1	начиная жестокими избиениями журналистов	и заканчивая незаконным лишением свободы для « обеспечения национальной безопасности »	1	1	1
0	у кого-то очень плохой обмен веществ , чтобы его восстановить и сделать его быстрее . У кого-то уже патологические нарушения в работе гормонов , которые не позволяют увидеть быстрый результат .	Ваше тело нужно правильно программировать	0	0	2
0	По дороге обратно в отель стояли в огромной пробке минут 40 .	IMG Конечно , надо было зайти на базар	0	0	3
0	Также задавал этот вопрос в личной беседе опытным JavaScript-разработчикам , выступающим на митапах с докладами , и людям не из мира фронтенда , результат развед-опроса был сильно похож на статистику ответов в twitter .	Я знал ответ , это же	0	0	4
0	поскольку большинство российских предп

In [14]:
# import csv

# file_path = TRAIN_FILE_PATH
# with open(file_path, "r") as data_file:
#     tsv_in = csv.reader(data_file, delimiter="\t")
#     for row in tsv_in:
#         if len(row) == 6:
#             print('+')

In [62]:
%%writefile models/bimpm_custom_package/dataset_readers/custom_reader.py

import csv
import logging
from typing import Optional, Dict

import numpy as np
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.fields import LabelField, TextField, Field, ArrayField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer, PretrainedTransformerTokenizer

logger = logging.getLogger(__name__)


@DatasetReader.register("custom_pairs_reader")
class CustomDataReader(DatasetReader):
    """
    # Parameters
    tokenizer : `Tokenizer`, optional
        Tokenizer to use to split the premise and hypothesis into words or other kinds of tokens.
        Defaults to `WhitespaceTokenizer`.
    token_indexers : `Dict[str, TokenIndexer]`, optional
        Indexers used to define input token representations. Defaults to `{"tokens":
        SingleIdTokenIndexer()}`.
    """

    def __init__(
            self,
            tokenizer: Tokenizer = None,
            token_indexers: Dict[str, TokenIndexer] = None,
            combine_input_fields: Optional[bool] = None,
            **kwargs) -> None:

        super().__init__(**kwargs)
        self._tokenizer = tokenizer or SpacyTokenizer()
        self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}

        if isinstance(self._tokenizer, PretrainedTransformerTokenizer):
            assert not self._tokenizer._add_special_tokens

        if combine_input_fields is not None:
            self._combine_input_fields = combine_input_fields
        else:
            self._combine_input_fields = isinstance(self._tokenizer, PretrainedTransformerTokenizer)

    def _read(self, file_path):
        logger.info("Reading instances from lines in file at: %s", file_path)
        file_path = cached_path(file_path)
        with open(file_path, "r") as data_file:
            tsv_in = csv.reader(data_file, delimiter="\t")
            for row in tsv_in:
                if len(row) == 6:
                    yield self.text_to_instance(premise=row[1], hypothesis=row[2], label=row[0],
                                                same_sentence=row[3], same_paragraph=row[4])

    def text_to_instance(
            self,  # type: ignore
            premise: str,
            hypothesis: str,
            label: str,
            same_sentence: str,
            same_paragraph: str,
    ) -> Instance:

        fields: Dict[str, Field] = {}
        tokenized_premise = self._tokenizer.tokenize(premise)
        tokenized_hypothesis = self._tokenizer.tokenize(hypothesis)

        if self._combine_input_fields:
            tokens = self._tokenizer.add_special_tokens(tokenized_premise, tokenized_hypothesis)
            fields["tokens"] = TextField(tokens, self._token_indexers)
        else:
            tokenized_premise = self._tokenizer.add_special_tokens(tokenized_premise)
            tokenized_hypothesis = self._tokenizer.add_special_tokens(tokenized_hypothesis)
            fields["premise"] = TextField(tokenized_premise, self._token_indexers)
            fields["hypothesis"] = TextField(tokenized_hypothesis, self._token_indexers)

        _same_sentence = list(map(list, zip(*same_sentence)))
        _same_paragraph = list(map(list, zip(*same_paragraph)))
        fields["same_sentence"] = ArrayField(np.array(_same_sentence).astype(np.float32))
        fields["same_paragraph"] = ArrayField(np.array(_same_paragraph).astype(np.float32))

        if label is not None:
            fields["label"] = LabelField(label)

        return Instance(fields)


Overwriting models/bimpm_custom_package/dataset_readers/custom_reader.py


In [48]:
%%writefile models/bimpm_custom_package/model/__init__.py

try:
    from bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from bimpm_custom_package.model.custom_bimpm import BiMpm as CustomBiMpm
    from bimpm_custom_package.model.multiclass_bimpm import BiMpm as MulticlassBiMpm
    from bimpm_custom_package.model.custom_bimpm_predictor import CustomBiMPMPredictor
except ModuleNotFoundError:
    from models.bimpm_custom_package.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from models.bimpm_custom_package.model.custom_bimpm import BiMpm as CustomBiMpm
    from models.bimpm_custom_package.model.multiclass_bimpm import BiMpm as MulticlassBiMpm
    from models.bimpm_custom_package.model.custom_bimpm_predictor import CustomBiMPMPredictor

Overwriting models/bimpm_custom_package/model/__init__.py


In [49]:
%%writefile models/bimpm_custom_package/model/custom_bimpm.py
"""
BiMPM (Bilateral Multi-Perspective Matching) model implementation.
"""

from typing import Dict, List

import torch
from allennlp.common.checks import check_dimensions_match
from allennlp.data import Vocabulary
from allennlp.models.model import Model
from allennlp.modules import FeedForward, Seq2SeqEncoder, Seq2VecEncoder, TextFieldEmbedder
from allennlp.modules.bimpm_matching import BiMpmMatching
from allennlp.nn import InitializerApplicator
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from overrides import overrides


@Model.register("custom_bimpm")
class BiMpm(Model):
    """
    This ``Model`` augments with additional features the BiMPM model described in `Bilateral Multi-Perspective 
    Matching for Natural Language Sentences <https://arxiv.org/abs/1702.03814>`_ by Zhiguo Wang et al., 2017.
    implemented in https://github.com/galsang/BIMPM-pytorch>`_.
    Additional features are added before the feedforward classifier.
    """

    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 matcher_word: BiMpmMatching,
                 encoder1: Seq2SeqEncoder,
                 matcher_forward1: BiMpmMatching,
                 matcher_backward1: BiMpmMatching,
                 encoder2: Seq2SeqEncoder,
                 matcher_forward2: BiMpmMatching,
                 matcher_backward2: BiMpmMatching,
                 aggregator: Seq2VecEncoder,
                 classifier_feedforward: FeedForward,
                 encode_together: bool = False,
                 encode_lstm: bool = True,
                 dropout: float = 0.1,
                 class_weights: list = [],
                 initializer: InitializerApplicator = InitializerApplicator(),
                 **kwargs) -> None:
        super().__init__(vocab, **kwargs)

        self.text_field_embedder = text_field_embedder

        self.matcher_word = matcher_word

        self.encoder1 = encoder1
        self.matcher_forward1 = matcher_forward1
        self.matcher_backward1 = matcher_backward1

        self.encoder2 = encoder2
        self.matcher_forward2 = matcher_forward2
        self.matcher_backward2 = matcher_backward2

        self.aggregator = aggregator

        self.encode_together = encode_together
        self.encode_lstm = encode_lstm

        matching_dim = self.matcher_word.get_output_dim()

        if self.encode_lstm:
            matching_dim += self.matcher_forward1.get_output_dim(
            ) + self.matcher_backward1.get_output_dim(
            ) + self.matcher_forward2.get_output_dim(
            ) + self.matcher_backward2.get_output_dim(
            )

        check_dimensions_match(matching_dim, self.aggregator.get_input_dim(),
                               "sum of dim of all matching layers", "aggregator input dim")

        self.classifier_feedforward = classifier_feedforward

        self.dropout = torch.nn.Dropout(dropout)

        if class_weights:
            self.class_weights = class_weights
        else:
            self.class_weights = [1.] * self.classifier_feedforward.get_output_dim()

        self.metrics = {"accuracy": CategoricalAccuracy(),
                        "f1": F1Measure(1)}

        self.loss = torch.nn.CrossEntropyLoss(weight=torch.FloatTensor(self.class_weights))

        initializer(self)

    @overrides
    def forward(self,
                premise: Dict[str, torch.LongTensor],
                hypothesis: Dict[str, torch.LongTensor],
                same_sentence: List[Dict[str, torch.IntTensor]],
                same_paragraph: List[Dict[str, torch.IntTensor]],
                label: torch.LongTensor = None,  # pylint:disable=unused-argument
                ) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        premise : Dict[str, torch.LongTensor]
            The premise from a ``TextField``
        hypothesis : Dict[str, torch.LongTensor]
            The hypothesis from a ``TextField``
        label : torch.LongTensor, optional (default = None)
            The label for the pair of the premise and the hypothesis
        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
            Additional information about the pair
        Returns
        -------
        An output dictionary consisting of:
        logits : torch.FloatTensor
            A tensor of shape ``(batch_size, num_labels)`` representing unnormalised log
            probabilities of the entailment label.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """

        def encode_pair(x1, x2, mask1=None, mask2=None):
            _joined_pair: Dict[str, torch.LongTensor] = {}

            for key in premise.keys():
                bsz = premise[key].size(0)
                x1_len, x2_len = premise[key].size(1), hypothesis[key].size(1)
                sep = torch.empty([bsz, 1], dtype=torch.long, device=premise[key].device)
                sep.data.fill_(0)  # 2 is the id for </s>

                x = torch.cat([premise[key], hypothesis[key]], dim=1)
                _joined_pair[key] = x

            x_output = self.dropout(self.text_field_embedder(_joined_pair))
            return x_output[:, :x1_len], x_output[:, -x2_len:], mask1, mask2

        mask_premise = util.get_text_field_mask(premise)
        mask_hypothesis = util.get_text_field_mask(hypothesis)

        if self.encode_together:
            embedded_premise, embedded_hypothesis, _, _ = encode_pair(premise, hypothesis)
        else:
            embedded_premise = self.dropout(self.text_field_embedder(premise))
            embedded_hypothesis = self.dropout(self.text_field_embedder(hypothesis))

        # embedding and encoding of the premise
        encoded_premise1 = self.dropout(self.encoder1(embedded_premise, mask_premise))
        encoded_premise2 = self.dropout(self.encoder2(encoded_premise1, mask_premise))

        # embedding and encoding of the hypothesis
        encoded_hypothesis1 = self.dropout(self.encoder1(embedded_hypothesis, mask_hypothesis))
        encoded_hypothesis2 = self.dropout(self.encoder2(encoded_hypothesis1, mask_hypothesis))

        matching_vector_premise: List[torch.Tensor] = []
        matching_vector_hypothesis: List[torch.Tensor] = []

        def add_matching_result(matcher, encoded_premise, encoded_hypothesis):
            # utility function to get matching result and add to the result list
            matching_result = matcher(encoded_premise, mask_premise, encoded_hypothesis, mask_hypothesis)
            matching_vector_premise.extend(matching_result[0])
            matching_vector_hypothesis.extend(matching_result[1])

        # calculate matching vectors from word embedding, first layer encoding, and second layer encoding
        add_matching_result(self.matcher_word, embedded_premise, embedded_hypothesis)
        half_hidden_size_1 = self.encoder1.get_output_dim() // 2
        add_matching_result(self.matcher_forward1,
                            encoded_premise1[:, :, :half_hidden_size_1],
                            encoded_hypothesis1[:, :, :half_hidden_size_1])
        add_matching_result(self.matcher_backward1,
                            encoded_premise1[:, :, half_hidden_size_1:],
                            encoded_hypothesis1[:, :, half_hidden_size_1:])

        half_hidden_size_2 = self.encoder2.get_output_dim() // 2
        add_matching_result(self.matcher_forward2,
                            encoded_premise2[:, :, :half_hidden_size_2],
                            encoded_hypothesis2[:, :, :half_hidden_size_2])
        add_matching_result(self.matcher_backward2,
                            encoded_premise2[:, :, half_hidden_size_2:],
                            encoded_hypothesis2[:, :, half_hidden_size_2:])

        # concat the matching vectors
        matching_vector_cat_premise = self.dropout(torch.cat(matching_vector_premise, dim=2))
        matching_vector_cat_hypothesis = self.dropout(torch.cat(matching_vector_hypothesis, dim=2))

        # aggregate the matching vectors
        aggregated_premise = self.dropout(self.aggregator(matching_vector_cat_premise, mask_premise))
        aggregated_hypothesis = self.dropout(self.aggregator(matching_vector_cat_hypothesis, mask_hypothesis))

        # encode additional information
        batch_size, _ = aggregated_premise.size()
        encoded_same_sentence = same_sentence.float().view(batch_size, -1)
        encoded_same_paragraph = same_paragraph.float().view(batch_size, -1)

        # the final forward layer
        logits = self.classifier_feedforward(
            torch.cat([aggregated_premise,
                       aggregated_hypothesis,
                       encoded_same_sentence,
                       encoded_same_paragraph], dim=-1))

        probs = torch.nn.functional.softmax(logits, dim=-1)

        output_dict = {'logits': logits, "probs": probs}

        if label is not None:
            loss = self.loss(logits, label)
            output_dict["loss"] = loss

            for metric in self.metrics.values():
                metric(logits, label)

        return output_dict

    def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Does a simple argmax over the probabilities, converts index to string label, and
        add `"label"` key to the dictionary with the result.
        """
        predictions = output_dict["label_probs"]
        if predictions.dim() == 2:
            predictions_list = [predictions[i] for i in range(predictions.shape[0])]
        else:
            predictions_list = [predictions]
        classes = []
        for prediction in predictions_list:
            label_idx = prediction.argmax(dim=-1).item()
            label_str = self.vocab.get_index_to_token_vocabulary("labels").get(
                label_idx, str(label_idx)
            )
            classes.append(label_str)
        output_dict["label"] = classes
        return output_dict

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        return {
            "f1": self.metrics["f1"].get_metric(reset)['f1'],
            "accuracy": self.metrics["accuracy"].get_metric(reset)
        }

    default_predictor = 'custom_bimpm_predictor'


Overwriting models/bimpm_custom_package/model/custom_bimpm.py


In [50]:
%%writefile models/bimpm_custom_package/model/custom_bimpm_predictor.py

from typing import Dict, List

import numpy
from allennlp.common import JsonDict
from allennlp.data import Instance
from allennlp.data.fields.label_field import LabelField
from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
from allennlp.predictors import Predictor
from overrides import overrides


# You need to name your predictor and register so that `allennlp` command can recognize it
# Note that you need to use "@Predictor.register", not "@Model.register"!
@Predictor.register("custom_bimpm_predictor")
class CustomBiMPMPredictor(Predictor):

    def predict(self, premise: str, hypothesis: str, same_sentence: str, same_paragraph: str) -> JsonDict:
        return self.predict_json({"premise": premise, "hypothesis": hypothesis,
                                  "same_sentence": same_sentence, "same_paragraph": same_paragraph})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Expects JSON that looks like `{"premise": "...", "hypothesis": "...", "metadata": "..."}`.
        """
        premise_text = json_dict["premise"]
        hypothesis_text = json_dict["hypothesis"]
        same_sentence = json_dict["same_sentence"]
        same_paragraph = json_dict["same_paragraph"]
        reader_has_tokenizer = (
                getattr(self._dataset_reader, "tokenizer", None) is not None
                or getattr(self._dataset_reader, "_tokenizer", None) is not None
        )
        if not reader_has_tokenizer:
            tokenizer = SpacyTokenizer()
            premise_text = tokenizer.tokenize(premise_text)
            hypothesis_text = tokenizer.tokenize(hypothesis_text)

        return self._dataset_reader.text_to_instance(premise_text,
                                                     hypothesis_text,
                                                     label=None,
                                                     same_sentence=same_sentence,
                                                     same_paragraph=same_paragraph)

    def predictions_to_labeled_instances(
            self, instance: Instance, outputs: Dict[str, numpy.ndarray]
    ) -> List[Instance]:
        new_instance = instance.duplicate()
        label = numpy.argmax(outputs["label_logits"])
        # Skip indexing, we have integer representations of the strings "entailment", etc.
        new_instance.add_field("label", LabelField(int(label), skip_indexing=True))
        return [new_instance]


Overwriting models/bimpm_custom_package/model/custom_bimpm_predictor.py


###  Generate config file

In [19]:
print(TRAIN_FILE_PATH)
print(DEV_FILE_PATH)
print(TEST_FILE_PATH)

models/structure_predictor_bimpm/structure_cf_train.tsv
models/structure_predictor_bimpm/structure_cf_dev.tsv
models/structure_predictor_bimpm/structure_cf_test.tsv


In [98]:
! head $TRAIN_FILE_PATH

0	что решение визового вопроса займет гораздо более длительный срок .	""" Поскольку это касается Шенгенской зоны в целом , изменение визового режима должно быть одобрено всеми странами , входящими в ее состав"	0	0	0
1	начиная жестокими избиениями журналистов	и заканчивая незаконным лишением свободы для « обеспечения национальной безопасности »	1	1	1
0	у кого-то очень плохой обмен веществ , чтобы его восстановить и сделать его быстрее . У кого-то уже патологические нарушения в работе гормонов , которые не позволяют увидеть быстрый результат .	Ваше тело нужно правильно программировать	0	0	2
0	По дороге обратно в отель стояли в огромной пробке минут 40 .	IMG Конечно , надо было зайти на базар	0	0	3
0	Также задавал этот вопрос в личной беседе опытным JavaScript-разработчикам , выступающим на митапах с докладами , и людям не из мира фронтенда , результат развед-опроса был сильно похож на статистику ответов в twitter .	Я знал ответ , это же	0	0	4
0	поскольку большинство российских предп

In [61]:
! mkdir models/structure_predictor_bimpm

mkdir: cannot create directory ‘models/structure_predictor_bimpm’: File exists


In [140]:
1074-1024

50

In [141]:
1124-1024

100

In [144]:
MODEL_PATH

'models/structure_predictor_bimpm'

In [145]:
%%writefile $MODEL_PATH/config_elmo.json

// Configuration for a sentence matching model based on:
//   Wang, Zhiguo, Wael Hamza, and Radu Florian. "Bilateral multi-perspective matching for natural language sentences."
//   Proceedings of the 26th International Joint Conference on Artificial Intelligence. 2017.
// (Augmented with additional granularity related features)

local NUM_EPOCHS = 50;
local LR = 1e-5;
local MAX_LEN = 1000;
local LSTM_ENCODER_HIDDEN = 25;

{
  "dataset_reader": {
    "type": "custom_pairs_reader",
    "tokenizer": {
      "type": "just_spaces"
    },
#     "token_indexers": {
#       "tokens": {
#         "type": "single_id",
#         "lowercase_tokens": true
#       },
#       "elmo": {
#         "type": "elmo_characters"
#       }
#     }
    "token_indexers": {
      "token_characters": {
        "type": "characters",
        "min_padding_length": 3
      },
      "elmo": {
        "type": "elmo_characters",
      },
    }
  },
  "train_data_path": "structure_predictor_bimpm/structure_cf_train.tsv",
  "validation_data_path": "structure_predictor_bimpm/structure_cf_dev.tsv",
  "model": {
    "type": "custom_bimpm",
    "dropout": 0.5,
    "class_weights": [0.5, 1.0],
    "encode_together": false,
    "text_field_embedder": {
        "token_embedders": {
            "elmo": {
                    "type": "elmo_token_embedder",
                    "options_file": "rsv_elmo/options.json",
                    "weight_file": "rsv_elmo/model.hdf5",
                    "do_layer_norm": true,
                    "dropout": 0.2
            },
            "token_characters": {
                "type": "character_encoding",
                "dropout": 0.2,
                "embedding": {
                    "embedding_dim": 20,
                    "sparse": false,
                    "vocab_namespace": "token_characters"
                },
                "encoder": {
                    "type": "lstm",
                    "input_size": $.model.text_field_embedder.token_embedders.token_characters.embedding.embedding_dim,
                    "hidden_size": LSTM_ENCODER_HIDDEN,
                    "num_layers": 1,
                    "bidirectional": true,
                    "dropout": 0.4
              },
            }
        }
    },
    "matcher_word": {
      "is_forward": true,
      "hidden_dim": 1024+LSTM_ENCODER_HIDDEN+LSTM_ENCODER_HIDDEN,
      "num_perspectives": 10,
      "with_full_match": false
    },
    "encoder1": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": 1024+LSTM_ENCODER_HIDDEN+LSTM_ENCODER_HIDDEN,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward1": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward1": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "encoder2": {
      "type": "lstm",
      "bidirectional": true,
      "input_size": $.model.matcher_forward1.hidden_dim+$.model.matcher_backward1.hidden_dim,
      "hidden_size": 200,
      "num_layers": 1
    },
    "matcher_forward2": {
      "is_forward": true,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "matcher_backward2": {
      "is_forward": false,
      "hidden_dim": 200,
      "num_perspectives": 10
    },
    "aggregator":{
      "type": "lstm",
      "bidirectional": true,
      "input_size": 264,
      "hidden_size": 100,
      "num_layers": 1,
      "dropout": 0.1,
    },
    "classifier_feedforward": {
      "input_dim": $.model.matcher_forward2.hidden_dim+$.model.matcher_backward2.hidden_dim+1+1,
      "num_layers": 2,
      "hidden_dims": [200, 2],
      "activations": ["mish", "mish"],
      "dropout": [0.5, 0.0]
    },
    "initializer": {
      "regexes": [
        [".*linear_layers.*weight", {"type": "xavier_normal"}],
        [".*linear_layers.*bias", {"type": "constant", "val": 0}],
        [".*weight_ih.*", {"type": "xavier_normal"}],
        [".*weight_hh.*", {"type": "orthogonal"}],
        [".*bias.*", {"type": "constant", "val": 0}],
        [".*matcher.*match_weights.*", {"type": "kaiming_normal"}]
      ]
    }
  },
  "data_loader": {
    "batch_sampler": {
        "type": "bucket",
        "batch_size": 20,
        "padding_noise": 0.0,
        "sorting_keys": ["premise"],
    },
  },
  "trainer": {
    "num_epochs": NUM_EPOCHS,
    "patience": 5,
    "grad_clipping": 5.0,
    "validation_metric": "+f1",
    "cuda_device": 0,
    "optimizer": {
      "type": "adam",
      "lr": LR
    },
  }
}

Overwriting models/structure_predictor_bimpm/config_elmo.json


In [64]:
# ! mv ../../../maintenance_rst/models/structure_predictor_bimpm ../../../maintenance_rst/models/structure_predictor_bimpm_OLD

In [29]:
! cp -r models/structure_predictor_bimpm ../../../maintenance_rst/models/structure_predictor_bimpm

### 3. Scripts for training/prediction 

#### Option 1. Directly from the config

Train a model

In [96]:
%%writefile models/train_structure_predictor.sh
# usage:
# $ cd models 
# $ sh train_structure_predictor.sh {bert|elmo} result_directory

export METHOD=${1}
export RESULT_DIR=${2}
export DEV_FILE_PATH="structure_cf_dev.tsv"
export TEST_FILE_PATH="structure_cf_test.tsv"

rm -r structure_predictor_bimpm/${RESULT_DIR}/
allennlp train -s structure_predictor_bimpm/${RESULT_DIR}/ structure_predictor_bimpm/config_${METHOD}.json \
   --include-package bimpm_custom_package

Overwriting models/train_structure_predictor.sh


Predict on dev&test

In [18]:
%%writefile models/eval_structure_predictor.sh
# usage:
# $ cd models 
# $ sh eval_structure_predictor.sh {bert|elmo} result_directory

export METHOD=${1}
export RESULT_DIR=${2}
export DEV_FILE_PATH="structure_cf_dev.tsv"
export TEST_FILE_PATH="structure_cf_test.tsv"

allennlp predict --use-dataset-reader --silent \
    --output-file structure_predictor_bimpm/${RESULT_DIR}/predictions_dev.json \
    structure_predictor_bimpm/${RESULT_DIR}/model.tar.gz structure_predictor_bimpm/${DEV_FILE_PATH} \
    --include-package bimpm_custom_package \
    --predictor custom_bimpm_predictor

allennlp predict --use-dataset-reader --silent \
    --output-file structure_predictor_bimpm/${RESULT_DIR}/predictions_test.json \
    structure_predictor_bimpm/${RESULT_DIR}/model.tar.gz structure_predictor_bimpm/${TEST_FILE_PATH} \
    --include-package bimpm_custom_package \
    --predictor custom_bimpm_predictor

Overwriting models/eval_structure_predictor.sh


#### Option 2. Using wandb for parameters adjustment

In [None]:
%%writefile models/wandb_structure_predictor.yaml
# usage:
# $ cd models
# wandb sweep wandb_structure_predictor.yaml

name: structure_predictor_stacked
program: wandb_allennlp # this is a wrapper console script around allennlp commands. It is part of wandb-allennlp
method: bayes
## Do not for get to use the command keyword to specify the following command structure
command:
  - ${program} #omit the interpreter as we use allennlp train command directly
  - "--subcommand=train"
  - "--include-package=bimpm_custom_package" # add all packages containing your registered classes here
  - "--config_file=structure_predictor_bimpm/config_elmo.json"
  - ${args}
metric:
    name: best_f1
    goal: maximize
parameters:
    model.type:
        values: ["custom_bimpm",]
    iterator.batch_size:
        values: [20,]
    model.encode_together:
        values: ["false",]
    trainer.optimizer.lr:
        values: [0.001,]
    model.dropout:
        values: [0.5]


3. Run training

``wandb sweep wandb_structure_predictor.yaml``

(returns %sweepname)

``wandb agent --count 1 %sweepname``

Move the best model in structure_predictor_bimpm

In [None]:
! cp -r models/wandb/run-20200720_203050-84hl3zwy/training_dumps models/structure_predictor_bimpm/snowy-sweep-2

In [None]:
! mv models/wandb/run-20200929_034343-5tmisocu models/structure_predictor_bimpm/colorful-sweep-1

###  Evaluate classifier

In [None]:
def load_predictions(path):
    result = []
    
    with open(path, 'r') as file:
        for line in file.readlines():
            result.append(json.loads(line)["label"])
            
    result = list(map(int, result))
    print('length of result:', len(result))
    return result

In [None]:
! cp ../../../maintenance_rst/models/structure_predictor_bimpm/colorful-sweep-1-dumps/*.json models/structure_predictor_bimpm/colorful-sweep-1-dumps/

In [None]:
RESULT_DIR = 'colorful-sweep-1-dumps'

On dev set

In [None]:
import pandas as pd
import json

true = pd.read_csv(DEV_FILE_PATH, sep='\t', header=None)[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')
print('length of true labels:', len(true))

In [None]:
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

print('f1: %.2f'%(f1_score(true[:len(pred)], pred)*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred)*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred)*100))

print(classification_report(true[:len(pred)], pred, digits=4))

On test set

In [None]:
import pandas as pd
import json

true = pd.read_csv(TEST_FILE_PATH, sep='\t', header=None)[0].values.tolist()
pred = load_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')
print('length of true labels:', len(true))

In [None]:
from sklearn.metrics import classification_report

print('f1: %.2f'%(f1_score(true[:len(pred)], pred)*100))
print('pr: %.2f'%(precision_score(true[:len(pred)], pred)*100))
print('re: %.2f'%(recall_score(true[:len(pred)], pred)*100))

print(classification_report(true[:len(pred)], pred, digits=4))

### Ensemble 

In [None]:
model_vocab = [0, 1]
catboost_vocab = [0, 1]

def load_neural_predictions(path):
    result = []
    
    with open(path, 'r') as file:
        for line in file.readlines():
            probs = json.loads(line)['probs']
            probs = {model_vocab[i]: probs[i] for i in range(len(model_vocab))}
            result.append(probs)
            
    return result

def load_scikit_predictions(model, X):
    result = []
    
    try:
        predictions = model.predict_proba(X)
    except AttributeError:
        predictions = model._predict_proba_lr(X)
    
    for prediction in predictions:
        probs = {catboost_vocab[j]: prediction[j] for j in range(len(catboost_vocab))}
        result.append(probs)
    
    return result

def vote_predictions(pred1, pred2, soft=True):
    assert len(pred1) == len(pred2)
    result = []
    
    for i in range(len(pred1)):
        sample_result = {}
        for key in pred1[i].keys():
            if soft:
                sample_result[key] = pred1[i][key] + pred2[i][key]
            else:
                sample_result[key] = max(pred1[i][key], pred2[i][key])
        
        result.append(sample_result)
    
    return result

def probs_to_classes(pred):
    result = []
    
    for sample in pred:
        best_class = ''
        best_prob = 0.
        for key in sample.keys():
            if sample[key] > best_prob:
                best_prob = sample[key]
                best_class = key
        
        result.append(best_class)
    
    return result

In [None]:
import pickle

model = pickle.load(open('models/structure_predictor_baseline/model.pkl', 'rb'))
scaler = pickle.load(open('models/structure_predictor_baseline/scaler.pkl', 'rb'))
drop_columns = pickle.load(open('models/structure_predictor_baseline/drop_columns.pkl', 'rb'))

In [None]:
IN_PATH = 'data_structure'

train_samples = pd.read_pickle(os.path.join(IN_PATH, 'train_samples.pkl'))
dev_samples = pd.read_pickle(os.path.join(IN_PATH, 'dev_samples.pkl'))
test_samples = pd.read_pickle(os.path.join(IN_PATH, 'test_samples.pkl'))

y_train, X_train = train_samples['relation'].to_frame(), train_samples.drop('relation', axis=1).drop(
    columns=drop_columns + ['category_id'])
y_dev, X_dev = dev_samples['relation'].to_frame(), dev_samples.drop('relation', axis=1).drop(
    columns=drop_columns + ['category_id'])
y_test, X_test = test_samples['relation'].to_frame(), test_samples.drop('relation', axis=1).drop(
    columns=drop_columns + ['category_id'])

In [None]:
X_scaled_np = scaler.transform(X_dev)
X_dev = pd.DataFrame(X_scaled_np, index=X_dev.index)

X_scaled_np = scaler.transform(X_test)
X_test = pd.DataFrame(X_scaled_np, index=X_test.index)

In [None]:
from sklearn import metrics


TARGET = 'relation'
svm_predictions = load_scikit_predictions(model, X_dev)
neural_predictions = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_dev.json')

tmp = vote_predictions(neural_predictions, svm_predictions, soft=True)
ensemble_pred = probs_to_classes(tmp)

print('f1: %.2f'%(metrics.f1_score(y_dev, ensemble_pred)*100.))
print('pr: %.2f'%(metrics.precision_score(y_dev, ensemble_pred)*100.))
print('re: %.2f'%(metrics.recall_score(y_dev, ensemble_pred)*100.))
print()
print(metrics.classification_report(y_dev, ensemble_pred, digits=4))

In [None]:
svm_predictions = load_scikit_predictions(model, X_test)
neural_predictions = load_neural_predictions(f'{MODEL_PATH}/{RESULT_DIR}/predictions_test.json')

tmp = vote_predictions(neural_predictions, svm_predictions, soft=True)
ensemble_pred = probs_to_classes(tmp)

print('f1: %.2f'%(metrics.f1_score(y_test, ensemble_pred)*100.))
print('pr: %.2f'%(metrics.precision_score(y_test, ensemble_pred)*100.))
print('re: %.2f'%(metrics.recall_score(y_test, ensemble_pred)*100.))
print()
print(metrics.classification_report(y_test, ensemble_pred, digits=4))