In [1]:
import os
import sys

# NOTE(review): `import_tensorflow` is neither defined nor imported in the visible cells of this
# notebook; presumably it is injected by the kernel/startup environment — confirm, otherwise this
# line raises NameError on a fresh kernel.
tf = import_tensorflow()
from bert.tokenization import FullTokenizer

sys.path.insert(0, "/tf/datadrive/datascientist/relation-extraction/")
from src.model.dependency_parsing import BertForDependencyParsing
from src.data.io import from_conllu
from src.data.preprocessing import apply_bpe, fit_encodings, apply_encodings

%load_ext autoreload
%autoreload 2

W0506 08:40:22.112535 140183032723264 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0506 08:40:22.122165 140183032723264 deprecation_wrapper.py:119] From /tf/datadrive/datascientist/relation-extraction/src/model/base.py:62: The name tf.Session is deprecated. Please use tf.compat.v1.Session instead.



In [None]:
# Expose only GPU 0 to CUDA-based libraries in this process.
# NOTE(review): this cell runs after TensorFlow has already been imported in the first cell;
# presumably it still takes effect because no session has been created yet — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
# Report the TensorFlow version and stop early if TensorFlow cannot see a usable GPU.
print(tf.__version__)
assert tf.test.is_gpu_available()

### load data

In [2]:
# Read the three SynTagRus splits from their CoNLL-U files. Each call to `from_conllu`
# prints one "DATASET INFO" summary (see the cell output).
path_mask = "/tf/datadrive/data/syntagrus/ru_syntagrus-ud-{}.conllu"
examples_by_part = {}
for part in ["train", "dev", "test"]:
    path = path_mask.format(part)
    examples_by_part[part] = from_conllu(path=path, warn=False)

# The "dev" split is referred to as the validation set in the rest of the notebook.
examples_train = examples_by_part["train"]
examples_valid = examples_by_part["dev"]
examples_test = examples_by_part["test"]

===== DATASET INFO =====
num documents: 522
num sentences: 47962
num tokens: 850548
num sentences ignored: 852
===== DATASET INFO =====
num documents: 47
num sentences: 6425
num tokens: 114371
num sentences ignored: 159
===== DATASET INFO =====
num documents: 53
num sentences: 6347
num tokens: 113789
num sentences ignored: 144


In [3]:
def remove_spaces(s):
    """Return `s` with every ASCII space and every non-breaking space (U+00A0) removed.

    Other whitespace characters (tabs, newlines, ...) are left untouched.
    """
    return s.translate({ord(' '): None, ord('\xa0'): None})

def check():
    """Verify that the tokens of every chunk concatenate back to the chunk text.

    Spaces and non-breaking spaces are ignored on both sides of the comparison.
    For each mismatching chunk, its id, both strings, the raw text and up to the
    first 500 token texts are printed. The function returns after the fifth
    mismatch; it prints nothing when all chunks are consistent.
    """
    remaining = 5
    all_examples = examples_train + examples_valid + examples_test
    for example in all_examples:
        for chunk in example.chunks:
            joined_tokens = ''.join(remove_spaces(token.text) for token in chunk.tokens)
            reference = remove_spaces(chunk.text)
            if joined_tokens == reference:
                continue
            print(chunk.id)
            print(joined_tokens)
            print(reference)
            print("text:", chunk.text)
            print("first tokens:", [token.text for token in chunk.tokens[:500]])
            print()
            remaining -= 1
            if remaining == 0:
                return
# Run the token/text consistency check; it only prints for chunks that mismatch.
check()

### bpe

In [4]:
# Directory of the pretrained RuBERT model; also passed to the model config as "dir".
bert_dir = "/tf/datadrive/nn_lfs/rubert_cased_L-12_H-768_A-12_v2/"
# The directory name says "cased", so lower-casing is disabled in the tokenizer.
tokenizer = FullTokenizer(vocab_file=os.path.join(bert_dir, "vocab.txt"), do_lower_case=False)

W0506 08:40:43.379161 140183032723264 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [5]:
# Run the BERT tokenizer over every chunk of every split.
# Presumably apply_bpe fills in `token.pieces`, which later cells read — confirm in
# src/data/preprocessing.
for x in examples_train + examples_valid + examples_test:
    for chunk in x.chunks:
        apply_bpe(
            chunk, 
            tokenizer=tokenizer, 
            ner_prefix_joiner=None,
            ner_encoding="bio"  # TODO: hack / workaround
        )

In [6]:
# Spot-check: print each token of the first validation chunk next to its word pieces.
for t in examples_valid[0].chunks[0].tokens:
    print(t.text, "\t", t.pieces)

Алгоритм 	 ['Алгоритм']
, 	 [',']
от 	 ['от']
имени 	 ['имени']
учёного 	 ['учёного']
аль 	 ['аль']
- 	 ['-']
Хорезми 	 ['Хорезм', '##и']
, 	 [',']
- 	 ['-']
точный 	 ['точный']
набор 	 ['набор']
инструкций 	 ['инструкций']
, 	 [',']
описывающих 	 ['описывающих']
порядок 	 ['порядок']
действий 	 ['действий']
исполнителя 	 ['исполнителя']
для 	 ['для']
достижения 	 ['достижения']
результата 	 ['результата']
решения 	 ['решения']
задачи 	 ['задачи']
за 	 ['за']
конечное 	 ['конечное']
время 	 ['время']
. 	 ['.']


### filter

In [7]:
# Upper bound on the total number of word pieces a chunk may contain to be kept.
max_pieces_per_chunk = 256


def count_pieces(chunk):
    """Return the total number of word pieces over all tokens of `chunk`."""
    return sum(len(t.pieces) for t in chunk.tokens)


# Drop over-long chunks from every split (in place) and report the chunk counts
# before and after filtering.
before = 0
after = 0
for x in examples_train + examples_valid + examples_test:
    before += len(x.chunks)
    x.chunks = [chunk for chunk in x.chunks if count_pieces(chunk) <= max_pieces_per_chunk]
    after += len(x.chunks)

print(before, after)

60734 60734


### encodings

In [8]:
def get_unique_labels(examples):
    """Return the set of distinct `rel` values over every token of every chunk in `examples`."""
    return {
        token.rel
        for example in examples
        for chunk in example.chunks
        for token in chunk.tokens
    }

In [9]:
# Collect labels over all three splits so that every relation seen anywhere gets an id.
labels = get_unique_labels(examples_train + examples_valid + examples_test)
# Sort before enumerating so that the label -> id mapping is deterministic.
rel2id = {label: i for i, label in enumerate(sorted(labels))}

In [10]:
# Show the number of distinct relation labels and the full label -> id mapping.
print(len(rel2id))
print(rel2id)

40
{'acl': 0, 'acl:relcl': 1, 'advcl': 2, 'advmod': 3, 'amod': 4, 'appos': 5, 'aux': 6, 'aux:pass': 7, 'case': 8, 'cc': 9, 'ccomp': 10, 'compound': 11, 'conj': 12, 'cop': 13, 'csubj': 14, 'csubj:pass': 15, 'dep': 16, 'det': 17, 'discourse': 18, 'expl': 19, 'fixed': 20, 'flat': 21, 'flat:foreign': 22, 'flat:name': 23, 'iobj': 24, 'mark': 25, 'nmod': 26, 'nsubj': 27, 'nsubj:pass': 28, 'nummod': 29, 'nummod:entity': 30, 'nummod:gov': 31, 'obj': 32, 'obl': 33, 'orphan': 34, 'parataxis': 35, 'punct': 36, 'root': 37, 'vocative': 38, 'xcomp': 39}


### fit

In [11]:
# Hyperparameters handed to BertForDependencyParsing (see the model construction cell).
# Meanings of individual keys are defined by the model code in src/model; the comments
# below only describe where each value in this notebook comes from.
config = {
    "model": {
        "bert": {
            "test_mode": False,
            # Same directory the tokenizer vocabulary was loaded from.
            "dir": bert_dir,
            "dim": 768,
            "attention_probs_dropout_prob": 0.5,  # default 0.1
            "hidden_dropout_prob": 0.1,
            "dropout": 0.2,
            "scope": "bert",
            # Special-token ids are looked up in the tokenizer vocabulary.
            "pad_token_id": tokenizer.vocab["[PAD]"],
            "cls_token_id": tokenizer.vocab["[CLS]"],
            "sep_token_id": tokenizer.vocab["[SEP]"],
            # The vocabulary entry "[unused1]" is used as the root token id.
            "root_token_id": tokenizer.vocab["[unused1]"]
        },
        "parser": {
            # The "rnn" settings below presumably only matter when use_birnn is True — confirm.
            "use_birnn": False,
            "rnn": {
                "num_layers": 1,
                "cell_dim": 8,
                "dropout": 0.5,
                "recurrent_dropout": 0.0
            },
            "biaffine_arc": {
                "num_mlp_layers": 1,
                "activation": "relu",
                "head_dim": 1024,
                "dep_dim": 1024,
                "dropout": 0.33,
                "num_labels": 1,
            },
            "biaffine_type": {
                "num_mlp_layers": 1,
                "activation": "relu",
                "head_dim": 256,
                "dep_dim": 256,
                "dropout": 0.33,
                # One output per dependency relation label in rel2id.
                "num_labels": len(rel2id),
            }
        }
    },
    "training": {
        "num_epochs": 10,
        "batch_size": 8,
        # NOTE(review): 20 exceeds num_epochs (10); if this is an early-stopping patience
        # it can never trigger with these settings — confirm that this is intended.
        "max_epochs_wo_improvement": 20,
        # Total number of training chunks (after the length filter applied above).
        "num_train_samples": sum(len(x.chunks) for x in examples_train),
    },
    "optimizer": {
        "init_lr": 2e-5,
        "warmup_proportion": 0.1,
    },
    "inference": {
        "max_tokens_per_batch": 10000,
        "window": 1
    }
}

In [12]:
# Start from an empty default graph so that re-running this cell does not add a second
# copy of the model to the graph built by a previous run.
tf.reset_default_graph()
sess_config = tf.ConfigProto()
# Let TensorFlow grow its GPU memory allocation on demand instead of reserving it all up front.
sess_config.gpu_options.allow_growth = True
sess = tf.Session(config=sess_config)
# rel2id is passed as the relation-label encoding (rel_enc).
model = BertForDependencyParsing(sess=sess, config=config, rel_enc=rel2id)
model.build()
# Presumably initializes variables and/or loads the pretrained BERT weights — TODO confirm in src/model.
model.reset_weights()

W0506 08:42:21.374456 140183032723264 deprecation_wrapper.py:119] From /tf/datadrive/datascientist/relation-extraction/src/model/base.py:448: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0506 08:42:21.381497 140183032723264 deprecation_wrapper.py:119] From /tf/datadrive/datascientist/relation-extraction/src/model/base.py:124: The name tf.variable_scope is deprecated. Please use tf.compat.v1.variable_scope instead.

W0506 08:42:21.389999 140183032723264 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/modeling.py:409: The name tf.get_variable is deprecated. Please use tf.compat.v1.get_variable instead.

W0506 08:42:21.437133 140183032723264 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/modeling.py:490: The name tf.assert_less_equal is deprecated. Please use tf.compat.v1.assert_less_equal instead.

W0506 08:42:21.790929 140183032723264 lazy_loader.py:50] 
The TensorFlow contrib module will

In [13]:
# Output directory passed to model.train(); the training log shows model.ckpt being saved here.
model_dir = "/tmp/bert_for_dependency_parsing"

In [14]:
def verbose_fn(d):
    """Print the metrics dict `d` with every value rounded to 4 decimal places."""
    rounded = {}
    for name, value in d.items():
        rounded[name] = round(value, 4)
    print(rounded)

In [15]:
# Fine-tune on the training split. Per the log below, the validation split is evaluated after
# every epoch, verbose_fn prints that evaluation's metrics, a checkpoint is saved to model_dir
# whenever the score improves, and the saved checkpoint is restored when training ends.
model.train(
    examples_train=examples_train,
    examples_valid=examples_valid,
    model_dir=model_dir,
    verbose=True,
    verbose_fn=verbose_fn
)

  0%|          | 0/5996 [00:00<?, ?it/s]

model dir /tmp/bert_for_dependency_parsing created


100%|██████████| 5996/5996 [12:25<00:00,  8.04it/s]


epoch 0 finished. mean train loss: 2.2538764476776123. evaluation starts.
{'loss': 0.5333, 'loss_arc': 0.3679, 'loss_type': 0.1654, 'score': 0.8962, 'uas': 0.9253, 'las': 0.8962}
current score: 0.8961624887427757
!!! new best score: 0.8961624887427757


  0%|          | 1/5996 [00:00<12:43,  7.86it/s]

saved new head to /tmp/bert_for_dependency_parsing/model.ckpt


100%|██████████| 5996/5996 [12:18<00:00,  8.12it/s]


epoch 1 finished. mean train loss: 1.340116262435913. evaluation starts.
{'loss': 0.3953, 'loss_arc': 0.2744, 'loss_type': 0.1208, 'score': 0.9208, 'uas': 0.9419, 'las': 0.9208}
current score: 0.9208190887550166
!!! new best score: 0.9208190887550166


  0%|          | 1/5996 [00:00<11:50,  8.43it/s]

saved new head to /tmp/bert_for_dependency_parsing/model.ckpt


100%|██████████| 5996/5996 [12:14<00:00,  8.16it/s]


epoch 2 finished. mean train loss: 0.9953550696372986. evaluation starts.
{'loss': 0.3596, 'loss_arc': 0.2532, 'loss_type': 0.1063, 'score': 0.9263, 'uas': 0.9455, 'las': 0.9263}
current score: 0.9263012476939084
!!! new best score: 0.9263012476939084


  0%|          | 1/5996 [00:00<10:23,  9.61it/s]

saved new head to /tmp/bert_for_dependency_parsing/model.ckpt


100%|██████████| 5996/5996 [12:15<00:00,  8.15it/s]


epoch 3 finished. mean train loss: 0.8089344501495361. evaluation starts.
{'loss': 0.3359, 'loss_arc': 0.2376, 'loss_type': 0.0984, 'score': 0.9321, 'uas': 0.9498, 'las': 0.9321}
current score: 0.9321418891152478
!!! new best score: 0.9321418891152478


  0%|          | 1/5996 [00:00<10:41,  9.34it/s]

saved new head to /tmp/bert_for_dependency_parsing/model.ckpt


100%|██████████| 5996/5996 [12:17<00:00,  8.13it/s]


epoch 4 finished. mean train loss: 0.688689649105072. evaluation starts.
{'loss': 0.3463, 'loss_arc': 0.2482, 'loss_type': 0.0981, 'score': 0.9338, 'uas': 0.951, 'las': 0.9338}
current score: 0.933838123300487
!!! new best score: 0.933838123300487


  0%|          | 1/5996 [00:00<12:22,  8.07it/s]

saved new head to /tmp/bert_for_dependency_parsing/model.ckpt


100%|██████████| 5996/5996 [12:16<00:00,  8.14it/s]


epoch 5 finished. mean train loss: 0.6044422388076782. evaluation starts.


  0%|          | 1/5996 [00:00<12:22,  8.07it/s]

{'loss': 0.3406, 'loss_arc': 0.2444, 'loss_type': 0.0962, 'score': 0.9337, 'uas': 0.9501, 'las': 0.9337}
current score: 0.9336545103216725
best score: 0.933838123300487
steps wo improvement: 1


100%|██████████| 5996/5996 [12:15<00:00,  8.15it/s]


epoch 6 finished. mean train loss: 0.541215181350708. evaluation starts.
{'loss': 0.3491, 'loss_arc': 0.256, 'loss_type': 0.0931, 'score': 0.934, 'uas': 0.9505, 'las': 0.934}
current score: 0.9339692754282117
!!! new best score: 0.9339692754282117


  0%|          | 1/5996 [00:00<12:06,  8.25it/s]

saved new head to /tmp/bert_for_dependency_parsing/model.ckpt


100%|██████████| 5996/5996 [12:14<00:00,  8.16it/s]


epoch 7 finished. mean train loss: 0.49125516414642334. evaluation starts.
{'loss': 0.346, 'loss_arc': 0.251, 'loss_type': 0.0949, 'score': 0.9352, 'uas': 0.9512, 'las': 0.9352}
current score: 0.9352283358543687
!!! new best score: 0.9352283358543687


  0%|          | 1/5996 [00:00<11:28,  8.71it/s]

saved new head to /tmp/bert_for_dependency_parsing/model.ckpt


100%|██████████| 5996/5996 [12:14<00:00,  8.16it/s]


epoch 8 finished. mean train loss: 0.45110705494880676. evaluation starts.
{'loss': 0.3588, 'loss_arc': 0.2632, 'loss_type': 0.0956, 'score': 0.9363, 'uas': 0.9522, 'las': 0.9363}
current score: 0.9362862963513479
!!! new best score: 0.9362862963513479


  0%|          | 1/5996 [00:00<12:34,  7.94it/s]

saved new head to /tmp/bert_for_dependency_parsing/model.ckpt


100%|██████████| 5996/5996 [12:14<00:00,  8.17it/s]


epoch 9 finished. mean train loss: 0.4183713495731354. evaluation starts.
{'loss': 0.356, 'loss_arc': 0.2605, 'loss_type': 0.0955, 'score': 0.9365, 'uas': 0.9524, 'las': 0.9365}
current score: 0.936478652805344
!!! new best score: 0.936478652805344
saved new head to /tmp/bert_for_dependency_parsing/model.ckpt
restoring model from /tmp/bert_for_dependency_parsing/model.ckpt


In [17]:
# Final evaluation on the held-out test split; metrics are printed rounded to 4 decimals.
d_test = model.evaluate(examples=examples_test)
verbose_fn(d_test)

{'loss': 0.3073, 'loss_arc': 0.2316, 'loss_type': 0.0757, 'score': 0.9451, 'uas': 0.9576, 'las': 0.9451, 'support': 113789}


In [None]:
# Baseline for comparison — DeepPavlov: 95.2 UAS, 93.7 LAS (this notebook's test result above: 95.76 UAS, 94.51 LAS).