In [1]:
!pip install transformers
!pip install tensorboardx
!pip install simpletransformers
!pip install -U jupyter ipywidgets

Requirement already up-to-date: jupyter in /home/user/conda/lib/python3.7/site-packages (1.0.0)
Requirement already up-to-date: ipywidgets in /home/user/conda/lib/python3.7/site-packages (7.6.3)


In [None]:
# Mount Google Drive at /content/drive.
# NOTE(review): google.colab is only importable inside a Colab runtime — confirm this cell
# is still needed; no later visible cell reads from /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import math 

import pandas as pd
import numpy as np

import json

import re
from nltk.tokenize import RegexpTokenizer
# Regex tokenizer; alternatives are tried left to right: three or two space-separated digit
# groups, Latin words joined by dots, a capitalised Latin word, digit groups joined by
# . , : + or -, and finally any run of word characters.
# NOTE(review): the name `tokenizer` is rebound later by AutoTokenizer.from_pretrained and no
# visible cell uses this instance — confirm whether it is still needed.
tokenizer = RegexpTokenizer(r'\d+[ ]+\d+[ ]+\d+|\d+[ ]+\d+|[a-zA-Z]+[.]+[a-zA-Z]+|[A-Z]+[a-z]+|\d+[.,:+-]+\d+|\w+')

from tqdm import tqdm
import random

In [3]:
# Fetch the RuCoS train (zipped), test and val splits into the working directory.
# `wget -c` resumes a partial download; `unzip -o` overwrites existing files without prompting.
!wget https://onti2020.ai-academy.ru/task/rucos_train.jsonl.zip -c
!unzip -o rucos_train.jsonl.zip
!wget https://onti2020.ai-academy.ru/task/rucos_test.jsonl -c
!wget https://onti2020.ai-academy.ru/task/rucos_val.jsonl -c

--2021-03-02 05:41:28--  https://onti2020.ai-academy.ru/task/rucos_train.jsonl.zip
Resolving onti2020.ai-academy.ru (onti2020.ai-academy.ru)... 213.159.215.214
Connecting to onti2020.ai-academy.ru (onti2020.ai-academy.ru)|213.159.215.214|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 46875590 (45M) [application/zip]
Saving to: ‘rucos_train.jsonl.zip’


2021-03-02 05:41:31 (15.0 MB/s) - ‘rucos_train.jsonl.zip’ saved [46875590/46875590]

Archive:  rucos_train.jsonl.zip
  inflating: rucos_train.jsonl       
  inflating: __MACOSX/._rucos_train.jsonl  
--2021-03-02 05:41:34--  https://onti2020.ai-academy.ru/task/rucos_test.jsonl
Resolving onti2020.ai-academy.ru (onti2020.ai-academy.ru)... 213.159.215.214
Connecting to onti2020.ai-academy.ru (onti2020.ai-academy.ru)|213.159.215.214|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17002756 (16M)
Saving to: ‘rucos_test.jsonl’


2021-03-02 05:41:35 (43.1 MB/s) - ‘rucos_test.jsonl’ saved [17002

In [3]:
def repl_bad_chars(text):
    """Normalise quotes, dashes and whitespace in ``text``.

    Steps, in order:
      1. typographic quotes (“ ” « ») become a straight double quote;
      2. runs of double quotes collapse to a single one;
      3. runs of em/en dashes become a single hyphen;
      4. newlines become spaces;
      5. runs of spaces collapse to a single space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string (not stripped at the ends).
    """
    text = re.sub(r'[“”«»]', '"', text)
    text = re.sub(r'["]+', '"', text)
    text = re.sub(r'[—–]+', '-', text)

    text = re.sub('\n', ' ', text)
    # Raw string: "\ " is not a valid escape in a normal literal and triggers an
    # invalid-escape-sequence warning; the pattern itself is unchanged.
    text = re.sub(r'[\ ]+', ' ', text)

    return text

def strip_text(text):
    """Strip surrounding whitespace and one pair of enclosing double quotes.

    Args:
        text: the string to trim.

    Returns:
        ``text`` without leading/trailing whitespace; if the trimmed text both starts and
        ends with ``"``, those two characters are removed and the result is trimmed again.
        Empty or whitespace-only input yields ``''``.
    """
    text = text.strip()
    # startswith/endswith are safe on the empty string, unlike text[0] / text[-1].
    if text.startswith('"') and text.endswith('"'):
        text = text[1:-1]
    return text.strip()

def _make_row(question, text, candidate, label):
    """Build one example: the query with @placeholder filled by ``candidate``, plus the passage."""
    return {
        'text': f'Содержание: {question.replace("@placeholder", candidate)} Текст: {text}',
        'label': label
    }


def make_df(name):
    """Turn ``rucos_<name>.jsonl`` into a list of ``{'text', 'label'}`` example dicts.

    For every record the candidate entities are sliced out of the raw passage (deduplicated,
    first-occurrence order), the passage and the first query are cleaned, and each example is
    the query with ``@placeholder`` replaced by a candidate followed by the passage.

    * ``name == 'train'``: at most two examples per record — one randomly chosen correct
      candidate (label 1) and, only when the record has more than 10 incorrect candidates,
      one randomly chosen incorrect candidate (label 0).
    * any other ``name``: one example per candidate, all with the placeholder label 0.

    Args:
        name: split name used to build the file name ``rucos_<name>.jsonl``.

    Returns:
        A list of dicts with keys ``'text'`` and ``'label'``.
    """
    with open("rucos_{}.jsonl".format(name), "r", encoding = "UTF-8", newline = "\n") as fin:
        json_list = list(fin)

    df = []
    for json_str in tqdm(json_list):
        if not json_str.strip():
            # Tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads.
            continue
        result = json.loads(json_str)

        text = result['passage']['text']

        # Entity offsets index into the raw passage, so slice before any cleaning.
        labels = []
        for entity in result['passage']['entities']:
            label = text[entity['start']:entity['end']]
            if label not in labels:
                labels.append(label)

        text = strip_text(repl_bad_chars(text))
        # Drop "@word" markers from the passage. Raw string avoids the invalid "\@" escape.
        text = re.sub(r'\@+[a-z]+', '', text)
        question = strip_text(repl_bad_chars(result['qas'][0]['query']))

        if name == 'train':
            answers = {answer['text'] for answer in result['qas'][0]['answers']}
            correct_labels = [label for label in labels if label in answers]
            incorrect_labels = [label for label in labels if label not in answers]

            # NOTE(review): randint(0, 100) >= 0 is always true, so every record is used.
            # The call is kept so the sequence of random draws is unchanged — confirm whether
            # a sub-sampling threshold was intended here.
            if random.randint(0, 100) >= 0:
                if len(correct_labels) > 0:
                    ind_c = random.randint(0, len(correct_labels) - 1)
                    df.append(_make_row(question, text, correct_labels[ind_c], 1))

                # NOTE(review): negatives are emitted only for records with more than 10
                # incorrect candidates — confirm the threshold is intentional.
                if len(incorrect_labels) > 10:
                    ind_inc = random.randint(0, len(incorrect_labels) - 1)
                    df.append(_make_row(question, text, incorrect_labels[ind_inc], 0))
        else:
            # Inference splits: one row per candidate; the label is only a placeholder.
            for label in labels:
                df.append(_make_row(question, text, label, 0))

    return df

In [4]:
# Build the training examples and wrap them in a DataFrame; the bare name displays it.
df_train_bert = pd.DataFrame(make_df('train'))
df_train_bert

100%|██████████| 72193/72193 [00:09<00:00, 7495.93it/s]


Unnamed: 0,text,label
0,"Содержание: Кроме того, серьезным вызовом для ...",1
1,"Содержание: Кроме того, серьезным вызовом для ...",0
2,Содержание: Россия категорически опровергла со...,1
3,Содержание: Россия категорически опровергла со...,0
4,"Содержание: Инго Маннтойфель, руководитель отд...",1
...,...,...
140801,"Содержание: В Иосифа Сталина подчеркнули, что ...",0
140802,Содержание: Это же заметили в комментариях к п...,1
140803,Содержание: Это же заметили в комментариях к п...,0
140804,"Содержание: По версии защиты, Фоменко в указан...",1


In [5]:
# Build one row per candidate entity for the validation split; the bare name displays it.
df_val_bert = pd.DataFrame(make_df('val'))
df_val_bert

100%|██████████| 7577/7577 [00:01<00:00, 7074.69it/s]


Unnamed: 0,text,label
0,"Содержание: В него вошли ООН, Россия, Украина ...",0
1,"Содержание: В него вошли Донбасса, Россия, Укр...",0
2,"Содержание: В него вошли МИДа, Россия, Украина...",0
3,"Содержание: В него вошли Берлине, Россия, Укра...",0
4,"Содержание: В него вошли Германии, Россия, Укр...",0
...,...,...
75308,"Содержание: Как сообщил ""Ленте.ру"" источник в ...",0
75309,"Содержание: Как сообщил ""Ленте.ру"" источник в ...",0
75310,"Содержание: Как сообщил ""Ленте.ру"" источник в ...",0
75311,"Содержание: Как сообщил ""Ленте.ру"" источник в ...",0


In [6]:
# Build one row per candidate entity for the test split; the bare name displays it.
df_test_bert = pd.DataFrame(make_df('test'))
df_test_bert

100%|██████████| 7257/7257 [00:00<00:00, 7415.25it/s]


Unnamed: 0,text,label
0,Содержание: Благодаря этому компромиссу местны...,0
1,Содержание: Благодаря этому компромиссу местны...,0
2,Содержание: Благодаря этому компромиссу местны...,0
3,Содержание: Благодаря этому компромиссу местны...,0
4,Содержание: Благодаря этому компромиссу местны...,0
...,...,...
67746,Содержание: РИА Новости объяснил введение таки...,0
67747,Содержание: Синиша Мали объяснил введение таки...,0
67748,Содержание: Вучича объяснил введение таких стр...,0
67749,Содержание: Сербия объяснил введение таких стр...,0


In [7]:
from simpletransformers.classification import ClassificationModel

In [7]:
# Delete the outputs/, runs/ and cache/ directories left by earlier runs
# (cache/ is the cache_dir passed to the model in the next cell).
!rm -rf outputs/ runs/ cache/

rm: cannot remove 'runs/Mar02_05-44-40_basic-0': Directory not empty


In [8]:
# XLM-RoBERTa classifier initialised from the vicgalle/xlm-roberta-large-xnli-anli checkpoint.
# Batches of 6 with 8 accumulation steps give 48 examples per optimiser step.
model = ClassificationModel(
    'xlmroberta',
    'vicgalle/xlm-roberta-large-xnli-anli',
    use_cuda=True,
    args={
        'overwrite_output_dir': True,
        'max_seq_length': 512,
        'train_batch_size': 6,
        'gradient_accumulation_steps': 8,
        'eval_batch_size': 6,
        'use_cuda': True,
        'warmup_steps': 500,
        'num_train_epochs': 3,
        'learning_rate': 1e-5,
        'cache_dir': "cache",
        'manual_seed': 56,
        'save_model_every_epoch': False,
        'save_steps': -1,
    },
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at vicgalle/xlm-roberta-large-xnli-anli and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  f"use_multiprocessing automatically disabled as {model_type}"


In [9]:
# Fine-tune on the training frame. CUDA use is already configured on the model itself;
# extra keyword arguments to train_model are interpreted as additional metric functions,
# so use_cuda must not be passed here.
# NOTE(review): the frame's columns are 'text'/'label', so simpletransformers presumably
# falls back to positional columns (see the warning in the output) — confirm.
model.train_model(df_train_bert)

  "Dataframe headers not specified. Falling back to using column 0 as text and column 1 as labels."


HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=3.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 3'), FloatProgress(value=0.0, max=23468.0), HTML(value='')))






HBox(children=(HTML(value='Running Epoch 1 of 3'), FloatProgress(value=0.0, max=23468.0), HTML(value='')))




HBox(children=(HTML(value='Running Epoch 2 of 3'), FloatProgress(value=0.0, max=23468.0), HTML(value='')))





(8799, 0.26171744821430937)

In [10]:
# Score every candidate row of the validation and test frames with the fine-tuned model.
# Each call returns a pair; the second element (raw outputs) is indexed per class in the
# following cells — presumably one score per label, TODO confirm.
preds_val_bert, raw_outputs_val_bert = model.predict(df_val_bert['text'])
preds_test_bert, raw_outputs_test_bert = model.predict(df_test_bert['text'])

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=12553.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11292.0), HTML(value='')))




In [11]:
# Keep element 1 of every raw output row (presumably the class-1 score) and persist it;
# np.save appends ".npy" to the given file name.
raw_outputs_val1 = [scores[1] for scores in raw_outputs_val_bert]
np.save('preds_bert_val1', raw_outputs_val1)

raw_outputs_test1 = [scores[1] for scores in raw_outputs_test_bert]
np.save('preds_bert_test1', raw_outputs_test1)

In [12]:
# Keep element 0 of every raw output row (presumably the class-0 score) and persist it;
# np.save appends ".npy" to the given file name.
raw_outputs_val0 = [scores[0] for scores in raw_outputs_val_bert]
np.save('preds_bert_val0', raw_outputs_val0)

raw_outputs_test0 = [scores[0] for scores in raw_outputs_test_bert]
np.save('preds_bert_test0', raw_outputs_test0)

In [14]:
from transformers import set_seed, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, Trainer, TrainingArguments
import torch
import random

# Fix the random seed through transformers' set_seed helper.
# NOTE(review): this cell repeats the first lines of the following cell — likely redundant.
set_seed(56)

  and should_run_async(code)


In [13]:
from transformers import set_seed, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, Trainer, TrainingArguments
import torch
import random

# Fix the random seed through transformers' set_seed helper before any sampling below.
set_seed(56)

import json
def preprocess_data(split, over=False, ren=True):
    """Yield ``(text, label)`` pairs built from ``rucos_<split>.jsonl``.

    For every question of every record, each distinct candidate entity of the passage is
    substituted for ``@placeholder`` in the query. The text is the passage, a space, then
    the filled-in query; the label is 1 if the candidate is one of the question's answers
    and 0 otherwise. Questions without an ``"answers"`` key yield only label-0 pairs.

    Args:
        split: split name used to build the file name ``rucos_<split>.jsonl``.
        over: when true, keep every correct candidate but only one randomly sampled
            incorrect candidate per question (none if there are no incorrect ones).
        ren: when true, prefix the passage with ``"passage: "`` and the query with
            ``"query: "``.

    Yields:
        ``(text, label)`` tuples — all correct candidates of a question first, then the
        incorrect ones.
    """
    # Read all records up front and close the file; blank lines are skipped so that a
    # trailing newline does not break json.loads. UTF-8 is stated explicitly rather than
    # relying on the platform default.
    with open(f"rucos_{split}.jsonl", encoding="utf-8") as fin:
        data = [json.loads(line) for line in fin if line.strip()]

    passage_tag = "passage: " if ren else ''
    query_tag = "query: " if ren else ''

    for par in data:
        text = par["passage"]["text"]

        # Distinct entity surface forms, in order of first appearance.
        entities = []
        for span in par["passage"]["entities"]:
            ent = text[span["start"]:span["end"]]
            if ent not in entities:
                entities.append(ent)

        for qa in par["qas"]:
            query = qa["query"]

            # Answers are also taken from passage offsets so they compare equal to entities.
            correct_answers = []
            for span in qa.get("answers", ()):
                ans = text[span["start"]:span["end"]]
                if ans not in correct_answers:
                    correct_answers.append(ans)

            correct = [ent for ent in entities if ent in correct_answers]
            incorrect = [ent for ent in entities if ent not in correct_answers]
            if over:
                # Down-sample negatives to at most one per question.
                incorrect = random.sample(incorrect, min(1, len(incorrect)))

            for label, candidates in ((1, correct), (0, incorrect)):
                for cand in candidates:
                    filled = query.replace("@placeholder", cand)
                    yield (passage_tag + text + ' ' + query_tag + filled, label)

# Checkpoint whose tokenizer is loaded here: facebook/mbart-large-cc25
print("Loading tokenizer")
model_call = "facebook/mbart-large-cc25"
# Rebinds the module-level name `tokenizer` (previously the NLTK RegexpTokenizer).
tokenizer = AutoTokenizer.from_pretrained(model_call)

# Maximum sequence length in tokens; prep_df passes it to the tokenizer as max_length.
msl = 450

def prep_df(df, tok):
    """Tokenise a sequence of ``(text, label)`` pairs and wrap the result in a ListDataset.

    Args:
        df: iterable of 2-tuples ``(text, label)``.
        tok: callable tokenizer; invoked on the list of texts with truncation and padding
            enabled and ``max_length`` taken from the module-level ``msl``.

    Returns:
        A ``ListDataset`` over the encodings and the labels.
    """
    # Transpose the pairs into two parallel lists: texts and labels.
    columns = [list(column) for column in zip(*df)]
    x, y = columns
    print("Tokenizing...")
    encodings = tok(x, truncation=True, padding=True, add_special_tokens=True, max_length=msl)
    print("To list...")
    return ListDataset(encodings, y)


class ListDataset(torch.utils.data.Dataset):
    """Dataset over tokenizer encodings and a parallel sequence of labels."""

    def __init__(self, encodings, labels):
        # Stored as given; tensors are only created on access.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field.

        A ``'labels'`` tensor is added unless the stored label is ``None``.
        """
        sample = {}
        for field in self.encodings.keys():
            sample[field] = torch.tensor(self.encodings[field][idx])
        label = self.labels[idx]
        if label is not None:
            sample['labels'] = torch.tensor(label)
        return sample

# Tokenise each split once. For train, over=True keeps every correct candidate and a single
# randomly sampled incorrect one per question; val and test keep all candidates.
print("train")
train_data = prep_df(list(preprocess_data("train", over=True)), tokenizer)
print('val')
val_data = prep_df(list(preprocess_data("val")), tokenizer)
print('test')
test_data = prep_df(list(preprocess_data("test")), tokenizer)


Loading tokenizer


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=1185.0), HTML(value='')))




HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=5069051.0), HTML(value='')))


train
Tokenizing...
To list...
val
Tokenizing...
To list...
test
Tokenizing...
To list...
Loading train...
Loading val...
Loading test...
Training transformer...
Loading tokenizer...
Loading model...


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=2444517405.0), HTML(value='')))




Some weights of the model checkpoint at facebook/mbart-large-cc25 were not used when initializing MBartForSequenceClassification: ['final_logits_bias']
- This IS expected if you are initializing MBartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MBartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MBartForSequenceClassification were not initialized from the model checkpoint at facebook/mbart-large-cc25 and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream ta

Preparing data...
Preparing train...
Preparing val...
Preparing test...


W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


Training...


KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/user/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2895, in get_loc
    return self._engine.get_loc(casted_key)
  File "pandas/_libs/index.pyx", line 70, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/index.pyx", line 101, in pandas._libs.index.IndexEngine.get_loc
  File "pandas/_libs/hashtable_class_helper.pxi", line 1675, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas/_libs/hashtable_class_helper.pxi", line 1683, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 22677

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/home/user/conda/lib/python3.7/site-packages/torch/utils/data/_utils/worker.py", line 185, in _worker_loop
    data = fetcher.fetch(index)
  File "/home/user/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/user/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/home/user/conda/lib/python3.7/site-packages/pandas/core/frame.py", line 2902, in __getitem__
    indexer = self.columns.get_loc(key)
  File "/home/user/conda/lib/python3.7/site-packages/pandas/core/indexes/base.py", line 2897, in get_loc
    raise KeyError(key) from err
KeyError: 22677


In [None]:
model_call = "facebook/mbart-large-cc25"
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import set_seed, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, Trainer, TrainingArguments
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
import logging
import mlflow
# End any MLflow run that is still active in this kernel — presumably one left over from a
# previous execution of this cell; TODO confirm it is needed.
mlflow.end_run()


def train_transformer(df, eval_df, test_df, model_name, out_dir, train_args=None, seed=56, model_nick="outputs/model"):
    """Fine-tune a sequence-classification model and write per-example scores to disk.

    Args:
        df: training data, in the form accepted by ``prep_df``.
        eval_df: validation data, same form.
        test_df: test data, same form.
        model_name: checkpoint name or path for both the tokenizer and the model.
        out_dir: output directory handed to ``TrainingArguments``.
        train_args: optional dict of attribute overrides applied to the
            ``TrainingArguments`` instance after it has been constructed.
        seed: random seed handed to ``TrainingArguments``.
        model_nick: prefix of the score files; ``<model_nick>.val.scores`` and
            ``<model_nick>.test.scores`` receive one class-1 logit per line.

    Returns:
        The fine-tuned model.
    """
    if train_args is None:
        train_args = {}
    print("Loading tokenizer...")
    # Honour the model_name argument (previously the global model_call was always used).
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print("Loading model...")
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    print("Preparing data...")
    print("Preparing train...")
    train_dataset = prep_df(df, tokenizer)
    print("Preparing val...")
    eval_dataset = prep_df(eval_df, tokenizer)
    print("Preparing test...")
    test_dataset = prep_df(test_df, tokenizer)

    training_args = TrainingArguments(
        output_dir=out_dir,  # output directory
        num_train_epochs=1,  # total number of training epochs
        per_device_train_batch_size=1,  # batch size per device during training
        per_device_eval_batch_size=1,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        learning_rate=1e-5,
        seed=seed,
        overwrite_output_dir=True,
        dataloader_num_workers=1,
        do_train=True
    )
    # Caller overrides are applied as plain attribute assignments on the built object.
    for key, value in train_args.items():
        setattr(training_args, key, value)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)
    if training_args.num_train_epochs > 0:
        print("Training...")
        trainer.train()
        trainer.save_model()
    print("Evaluating..")
    if training_args.local_rank in [-1, 0]:
        model.eval()
        with torch.no_grad():
            # Only val and test are scored; the training split is skipped.
            for dataset, split in ((eval_dataset, "val"), (test_dataset, "test")):
                scores = []
                for batch in tqdm(DataLoader(dataset, batch_size=training_args.eval_batch_size)):
                    inputs = {k: v.to(model.device) for k, v in batch.items()}
                    # Column 1 of the logits is the score of label 1.
                    scores.extend(model(**inputs).logits[:, 1].detach().cpu().tolist())
                # Context manager so the score file is flushed and closed deterministically.
                with open(f"{model_nick}.{split}.scores", 'w') as fout:
                    fout.write('\n'.join(str(score) for score in scores))
    return model


def compute_metrics(pred):
    """Compute binary classification metrics from a prediction object.

    Args:
        pred: object exposing ``label_ids`` (true labels) and ``predictions`` (per-class
            scores; the arg-max over the last axis is taken as the predicted label).

    Returns:
        Dict with keys ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # Result order is (precision, recall, f-score, support); support is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    metrics = {'accuracy': accuracy_score(y_true, y_pred)}
    metrics['f1'] = prf[2]
    metrics['precision'] = prf[0]
    metrics['recall'] = prf[1]
    return metrics


def prep_df(df, tok, d=False):
    """Tokenise the rows of a DataFrame and wrap the result in a ListDataset.

    This definition replaces the earlier ``prep_df(df, tok)`` that took a list of pairs.

    Args:
        df: DataFrame with two columns (text, label), or a single text column when ``d``
            is true.
        tok: callable tokenizer; invoked on the list of texts with truncation and padding
            enabled and ``max_length`` taken from the module-level ``msl``.
        d: when true, the frame has no label column and every label is set to 0.

    Returns:
        A ``ListDataset`` over the encodings and the labels.
    """
    rows = [tuple(row) for _, row in df.iterrows()]
    # Transpose the row tuples into one list per column.
    columns = [list(column) for column in zip(*rows)]
    if d:
        (x,) = columns
        y = [0] * len(x)
    else:
        x, y = columns
    print("Tokenizing...")
    encodings = tok(x, truncation=True, padding=True, add_special_tokens=True, max_length=msl)
    print("To list...")
    return ListDataset(encodings, y)


class ListDataset(torch.utils.data.Dataset):
    """Dataset over tokenizer encodings and a parallel sequence of labels.

    NOTE(review): the same class is also defined in an earlier cell; this definition
    rebinds the name.
    """

    def __init__(self, encodings, labels):
        # Stored as given; tensors are only created on access.
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding field.

        A ``'labels'`` tensor is added unless the stored label is ``None``.
        """
        sample = {}
        for field in self.encodings.keys():
            sample[field] = torch.tensor(self.encodings[field][idx])
        label = self.labels[idx]
        if label is not None:
            sample['labels'] = torch.tensor(label)
        return sample


def load_data(split):
    """Return the module-level dataset for ``split``.

    ``"train"`` maps to ``train_data`` and ``"val"`` to ``val_data``; any other value
    returns ``test_data``.
    """
    if split == "train":
        return train_data
    return val_data if split == "val" else test_data


# Prefix of the "<prefix>.<split>.scores" files; main() passes it to train_transformer as model_nick.
model_name = "mbartes"
# Per-device batch size; main() also derives gradient_accumulation_steps as 32 // batch.
batch = 8
# Maximum sequence length in tokens; prep_df passes it to the tokenizer as max_length.
msl = 450


def main(*args, **kwargs):
    """Seed the RNGs and fine-tune on the frames built earlier in the notebook.

    Uses the module-level ``df_train_bert`` / ``df_val_bert`` / ``df_test_bert`` frames
    rather than ``load_data``. Positional and keyword arguments are accepted but ignored.
    """
    set_seed(56)
    print("Loading train...")
    train_df = df_train_bert
    print("Loading val...")
    val_df = df_val_bert
    print("Loading test...")
    test_df = df_test_bert
    print("Training transformer...")
    # Overrides applied on top of train_transformer's default TrainingArguments;
    # batch * (32 // batch) examples are accumulated per optimiser step.
    overrides = {
        'num_train_epochs': 1,
        'fp16': True,
        'per_device_train_batch_size': batch,
        'per_device_eval_batch_size': batch,
        'gradient_accumulation_steps': 32 // batch,
        'save_steps': 1000,
        'save_total_limit': 1,
    }
    train_transformer(
        train_df,
        val_df,
        test_df,
        model_call,
        "outputs",
        train_args=overrides,
        model_nick=model_name,
    )

# Run the fine-tuning and scoring pipeline.
main()
# Disabled TPU launch path:
# import torch_xla.distributed.xla_multiprocessing as xmp
# xmp.spawn(main, args=(), nprocs=8, start_method='fork')
# NOTE(review): google.colab is only importable inside a Colab runtime — confirm before
# running this cell elsewhere.
from google.colab import files
# files.download(f"{model_name}.train.scores")
# Download the val/test score files written by train_transformer.
files.download(f"{model_name}.val.scores")
files.download(f"{model_name}.test.scores")

Loading train...
Loading val...
Loading test...
Training transformer...
Loading tokenizer...
Loading model...


Some weights of the model checkpoint at facebook/mbart-large-cc25 were not used when initializing MBartForSequenceClassification: ['final_logits_bias']
- This IS expected if you are initializing MBartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing MBartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of MBartForSequenceClassification were not initialized from the model checkpoint at facebook/mbart-large-cc25 and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream ta

Preparing data...
Preparing train...
Tokenizing...
To list...
Preparing val...
Tokenizing...
To list...
Preparing test...
Tokenizing...
To list...


W&B installed but not logged in. Run `wandb login` or set the WANDB_API_KEY env variable.


Training...




Step,Training Loss
500,0.7362


  0%|          | 0/9415 [00:00<?, ?it/s]

Evaluating..


 60%|██████    | 5682/9415 [34:28<21:34,  2.88it/s]  