In [1]:
import gc
import glob
import yaml
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pytorch_lightning as pl

from transformers import DataCollatorWithPadding
from transformers import AutoTokenizer, AutoModel, AutoConfig

%env TOKENIZERS_PARALLELISM = true

# Use the GPU when PyTorch can see one, otherwise the CPU.
# NOTE(review): `device` is not referenced again in the visible cells; the
# Kkiller section defines its own `DEVICE` with the same expression.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

env: TOKENIZERS_PARALLELISM=true


In [2]:
# Master switch: a debug run reads essays from the train folder and caps the
# number of essays; a normal run reads the test folder with no cap.
IS_DEBUG = False

TEST_ROOT = (
    "../input/feedback-prize-2021/train/"
    if IS_DEBUG
    else "../input/feedback-prize-2021/test/"
)
# Both modes currently pin this to 1 (the `None` alternative was disabled).
MAX_NUM_MODELS = 1
# `None` used as a slice bound means "keep every file".
NUM_SAMPLES = 200 if IS_DEBUG else None

print(f"TEST_ROOT: {TEST_ROOT}\nMAX_NUM_MODELS: {MAX_NUM_MODELS}\nNUM_SAMPLES: {NUM_SAMPLES}")

TEST_ROOT: ../input/feedback-prize-2021/test/
MAX_NUM_MODELS: 1
NUM_SAMPLES: None


# Crodoc

In [3]:
class TextDataset(Dataset):
    """Dataset that tokenizes each whole text once, with no sliding window.

    All texts in ``df['text']`` are tokenized eagerly in ``__init__``.
    ``self.x[i]`` holds the tokenizer output for text ``i`` with every value
    converted to a ``torch.long`` tensor, and ``self.offset_mappings[i]`` holds
    the matching token offset mapping returned by the tokenizer.

    Args:
        df: frame with ``'text'`` and ``'id'`` columns.
        tokenizer: callable tokenizer accepting ``add_special_tokens``,
            ``max_length``, ``truncation`` and ``return_offsets_mapping``.
        cfg: accepted but not read by this class.
    """

    def __init__(self, df, tokenizer, cfg):

        self.tokenizer = tokenizer
        # Fixed truncation length; not taken from `cfg`.
        self.max_length = 4096

        self.texts = df['text'].values.tolist()
        self.ids = df['id'].values.tolist()

        self.x, self.offset_mappings = [], []

        for text in self.texts:
            x, offset_mapping = self.make_item(text)
            self.x.append(x)
            self.offset_mappings.append(offset_mapping)

    def get_offset_mapping(self, text):
        """Tokenize ``text`` and return ``(offset_mapping, skip_indices)``.

        ``skip_indices`` are the token positions whose ``sequence_ids()`` entry
        is not 0. Kept for callers outside this class; ``make_item`` no longer
        needs it.
        """

        tokenized = self.tokenizer(
            text,
            add_special_tokens = True,
            max_length = self.max_length,
            truncation=True,
            return_offsets_mapping = True,
        )

        offset_mapping = tokenized['offset_mapping']
        skip_indices = np.where(np.array(tokenized.sequence_ids()) != 0)[0]

        return offset_mapping, skip_indices

    def make_item(self, text):
        """Tokenize ``text`` once and return ``(tensor_inputs, offset_mapping)``.

        The offsets are requested in the same tokenizer call that produces the
        model inputs (previously the text was tokenized a second time just to
        obtain them) and are then removed from the inputs so only model
        features are turned into tensors.
        """

        tokenized = self.tokenizer(
            text,
            add_special_tokens = True,
            max_length = self.max_length,
            truncation=True,
            return_offsets_mapping = True,
        )

        offset_mapping = tokenized['offset_mapping']
        del tokenized['offset_mapping']

        for k, v in tokenized.items():
            tokenized[k] = torch.tensor(v, dtype=torch.long)

        return tokenized, offset_mapping

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.x[idx]

class CustomCollator():
    """Batch collator that hands the batch items to ``DataCollatorWithPadding``."""

    def __init__(self, tokenizer):
        self.data_collator = DataCollatorWithPadding(tokenizer)

    def __call__(self, batch):
        """Copy the items of ``batch`` into a list and return the wrapped collator's output."""
        items = [item for item in batch]
        return self.data_collator(items)


class TextDataModule(pl.LightningDataModule):
    """Lightning data module exposing a prebuilt test dataset for prediction.

    It only stores what it is given; the dataset is constructed by the caller.
    """

    def __init__(
        self,
        test_df,
        tokenizer,
        cfg,
        test_dataset
    ):
        super().__init__()
        self.test_df = test_df
        self.tokenizer = tokenizer
        self.cfg = cfg
        self.test_dataset = test_dataset

    def setup(self, stage):
        """Nothing to prepare: the dataset is already built."""
        return None

    def predict_dataloader(self):
        """Return a DataLoader over ``test_dataset``.

        The loader options come from ``cfg["val_loader"]`` and batches are
        collated by a fresh ``CustomCollator`` built from the stored tokenizer.
        """
        loader_kwargs = self.cfg["val_loader"]
        return DataLoader(
            self.test_dataset,
            collate_fn=CustomCollator(self.tokenizer),
            **loader_kwargs,
        )

In [4]:
class CutTextDataset(Dataset):
    """Sliding-window dataset: each text is tokenized in full, then cut into windows.

    For every text the complete (untruncated) tokenization is stored in
    ``self.x`` and its offset mapping in ``self.offset_mappings``. Windows of at
    most ``cfg['max_length_valid']`` tokens are taken every ``cfg['stride']``
    tokens; the last window is shifted back so that it ends on the final token.
    ``self.x_cut`` holds the windows (these are the dataset items) and
    ``self.text_indexes[i]`` is ``(text_index, start_token)`` for window ``i``.

    Args:
        df: frame with ``'text'`` and ``'id'`` columns.
        tokenizer: callable tokenizer accepting ``add_special_tokens`` and
            ``return_offsets_mapping``.
        cfg: mapping providing ``'max_length_valid'`` and ``'stride'``.
    """

    def __init__(self, df, tokenizer, cfg):

        self.tokenizer = tokenizer
        self.max_length = cfg['max_length_valid']

        self.texts = df['text'].values.tolist()
        self.ids = df['id'].values.tolist()
        self.stride = cfg['stride']

        self.x, self.x_cut, self.offset_mappings, self.text_indexes = [], [], [], []

        for text_index, text in enumerate(self.texts):
            x, offset_mapping = self.make_item(text)

            self.x.append(x)
            self.offset_mappings.append(offset_mapping)

            total_tokens = len(offset_mapping)
            start = 0
            is_last_window = False

            while start < total_tokens and not is_last_window:

                # `>=` (not `>`): a window that ends exactly on the last token is
                # already the final one. With `>` the loop went round once more
                # and appended the same final window a second time.
                if start + self.max_length >= total_tokens:
                    start = max(0, total_tokens - self.max_length)
                    is_last_window = True

                x_cut, _ = self.get_cut_item(x, offset_mapping, start)

                self.x_cut.append(x_cut)
                self.text_indexes.append((text_index, start))

                start += self.stride

    def get_cut_element(self, tokenized_element, start, length, is_list=False):
        """Return ``tokenized_element[start:start+length]``.

        The slice is cloned unless ``is_list`` is true (lists are already
        copied by slicing; tensors would otherwise share storage).
        """

        new_tokenized_element = tokenized_element[start:start+length]
        if not is_list:
            new_tokenized_element = new_tokenized_element.clone()

        return new_tokenized_element

    def get_cut_item(self, tokenized, offset_mapping, start):
        """Cut one window starting at token ``start`` out of a full tokenization.

        Returns ``(window_inputs, window_offset_mapping)`` where every value of
        ``tokenized`` has been sliced to the same token range.
        """

        cut_length = min(self.max_length, len(offset_mapping))

        new_tokenized = {}

        for k in tokenized:
            new_tokenized[k] = self.get_cut_element(tokenized[k], start, cut_length)

        if offset_mapping is not None:
            offset_mapping = self.get_cut_element(offset_mapping, start, cut_length, is_list=True)

        return new_tokenized, offset_mapping

    def make_item(self, text):
        """Tokenize ``text`` without truncation; return ``(tensor_inputs, offset_mapping)``."""

        tokenized = self.tokenizer(
            text,
            add_special_tokens = True,
            return_offsets_mapping = True,
        )

        # Offsets are kept separately so that only model features become tensors.
        offset_mapping = tokenized['offset_mapping']
        del tokenized['offset_mapping']

        for k, v in tokenized.items():
            tokenized[k] = torch.tensor(v, dtype=torch.long)

        return tokenized, offset_mapping

    def __len__(self):
        return len(self.x_cut)

    def __getitem__(self, idx):
        return self.x_cut[idx]

In [5]:
class TextModel(pl.LightningModule):
    """Token classifier: a transformer backbone followed by one linear head.

    The backbone is built from a config object read with ``torch.load`` from
    ``config_path``. The head input is the average of five differently
    dropped-out copies of the backbone's last hidden state (rates 0.1 to 0.5).

    Args:
        cfg: mapping; ``cfg['model']['num_labels']`` sets the head width.
        config_path: path handed directly to ``torch.load``; the default
            ``None`` is passed through unchanged.
    """

    def __init__(self, cfg, config_path=None):
        super().__init__()

        self.cfg = cfg
        self.num_labels = cfg['model']['num_labels']

        self.config = torch.load(config_path)
        self.backbone = AutoModel.from_config(self.config)

        hidden_rate = self.config.hidden_dropout_prob
        self.dropout = nn.Dropout(hidden_rate)
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)

        self.fc = nn.Linear(self.config.hidden_size, self.num_labels)

    def forward(self, x):
        """Return the linear head's output for the batch mapping ``x``."""
        hidden = self.dropout(self.backbone(**x).last_hidden_state)

        # Accumulate the five dropout branches in order 1..5, then average.
        total = self.dropout1(hidden)
        for branch in (self.dropout2, self.dropout3, self.dropout4, self.dropout5):
            total = total + branch(hidden)

        return self.fc(total / 5.0)

    def predict_step(self, batch, batch_idx):
        """Return softmax scores over the last axis, detached and moved to the CPU."""
        logits = self(batch)
        return logits.softmax(dim=-1).detach().cpu()

    def configure_optimizers(self):
        """This module is only used for prediction, so no optimizer is configured."""
        return None

In [6]:
def get_test_df():
    """Read every ``*.txt`` essay under ``TEST_ROOT`` into a DataFrame.

    Returns:
        DataFrame with columns ``id`` (file name without extension) and
        ``text`` (file content with U+00A0 replaced by a space and U+0085 by a
        newline), ordered from the longest text to the shortest.

    At most ``NUM_SAMPLES`` files are read (``None`` means all of them).
    """
    import os  # function-scope import: `os` is only imported further down the notebook

    # os.path.join instead of string concatenation: TEST_ROOT is rebound to a
    # pathlib.Path later in the notebook, and str(Path) has no trailing slash,
    # which silently turned the old pattern into ".../test*.txt".
    pattern = os.path.join(str(TEST_ROOT), '*.txt')

    # glob returns files in arbitrary order; sort so that the NUM_SAMPLES slice
    # and the order of equal-length texts are reproducible.
    test_files = sorted(glob.glob(pattern))[:NUM_SAMPLES]

    values = []

    for test_file in test_files:
        test_id = os.path.splitext(os.path.basename(test_file))[0]

        with open(test_file, 'r') as f:
            text = f.read()

        # no-break space
        text = text.replace(u'\xa0', u' ')
        # next line
        text = text.replace(u'\x85', u'\n')

        values.append((test_id, text))

    # Longest texts first (stable sort, so ties keep the sorted-path order).
    values.sort(key=lambda x: -len(x[1]))

    test_df = pd.DataFrame(values, columns=['id','text'])
    return test_df

In [7]:
def text_to_words(text):
    """Split ``text`` on whitespace and locate every word in the original string.

    Returns:
        ``(words, offsets)`` where ``words`` is ``text.split()`` and
        ``offsets[i]`` is the ``(start, end)`` character span of ``words[i]``
        in ``text`` (``end`` exclusive). Words are located left to right, each
        search starting where the previous word ended.

    Raises:
        NotImplementedError: if a word cannot be found after the previous one.
    """
    word = text.split()
    word_offset = []

    cursor = 0
    for w in word:
        # Search from `cursor` instead of slicing `text[cursor:]`, which copied
        # the remaining text for every single word.
        start = text.find(w, cursor)

        if start == -1:
            raise NotImplementedError

        end = start + len(w)
        word_offset.append((start, end))
        cursor = end

    return word, word_offset

def word_probability_to_predict_df(text_to_word_probability, id):
    """Turn per-word class scores for one text into a frame of predicted spans.

    Args:
        text_to_word_probability: array with one row of class scores per word.
        id: identifier written into the ``id`` column of every span.

    Returns:
        DataFrame with columns ``id``, ``class``, ``predictionstring`` (space
        separated word indices) and ``score`` (``str`` of the list of per-word
        maximum scores inside the span).

    Scan rules (unchanged): a span starts at any word whose arg-max label is
    not ``'O'``; it is continued by the following label id when the start label
    begins with ``'B'``, otherwise by the same label id. Scanning stops as soon
    as the cursor reaches the last word, so the last word is never part of a
    span and a span that would start on the second-to-last word is dropped.

    Texts with fewer than two words previously raised ``IndexError``; they now
    yield an empty frame, which is what a two-word text already produced.
    """
    columns = ['id', 'class', 'predictionstring', 'score']

    len_word = len(text_to_word_probability)
    if len_word < 2:
        return pd.DataFrame([], columns=columns)

    word_predict = text_to_word_probability.argmax(-1)
    word_score   = text_to_word_probability.max(-1)
    predict_df = []

    t = 0
    while 1:
        if word_predict[t] not in [
            discourse_marker_to_label['O'],
        ]:
            start = t
            b_marker_label = word_predict[t]
        else:
            # Outside any span: advance, stopping once the last word is reached.
            t = t+1
            if t== len_word-1:break
            continue

        t = t+1
        if t== len_word-1: break

        # Label id that continues the span opened at `start`.
        if   label_to_discourse_marker[b_marker_label][0]=='B':
            i_marker_label = b_marker_label+1
        else:
            i_marker_label = b_marker_label

        while 1:
            if (word_predict[t] != i_marker_label) or (t ==len_word-1):
                # Span covers words [start, end); word `t` itself is excluded.
                end = t
                prediction_string = ' '.join([str(i) for i in range(start,end)])
                # Strip the two-character prefix ("B-", "I-", "X-") from the label name.
                discourse_type = label_to_discourse_marker[b_marker_label][2:]
                discourse_score = word_score[start:end].tolist()
                predict_df.append((id, discourse_type, prediction_string, str(discourse_score)))
                break
            else:
                t = t+1
                continue
        if t== len_word-1: break

    predict_df = pd.DataFrame(predict_df, columns=columns)
    return predict_df


# def do_threshold(submit_df, use=['length','probability']):
#     df = submit_df.copy()
#     df = df.fillna('')

#     if 'length' in use:
#         df['l'] = df.predictionstring.apply(lambda x: len(x.split()))
#         for key, value in min_thresh.items():
#             #value=3
#             index = df.loc[df['class'] == key].query('l<%d'%value).index
#             df.drop(index, inplace=True)

#     if 'probability' in use:
#         df['s'] = df.score.apply(lambda x: np.mean(eval(x)))
#         for key, value in proba_thresh.items():
#             index = df.loc[df['class'] == key].query('s<%f'%value).index
#             df.drop(index, inplace=True)

#     df = df[['id', 'class', 'predictionstring']]
#     return df


def do_threshold(submit_df, use=['length','probability']):
    """Filter predicted spans by length and/or mean score and add span columns.

    Args:
        submit_df: frame with ``id``, ``class``, ``predictionstring`` and
            ``score`` columns, where ``score`` is the ``str`` of a list of floats.
        use: which filters to apply. ``'length'`` drops spans of a class with
            fewer words than ``min_thresh[class]``; ``'probability'`` replaces
            ``score`` by the mean of its list and drops spans of a class whose
            mean is below ``proba_thresh[class]``.

    Returns:
        A new frame with columns ``id``, ``class``, ``predictionstring``,
        ``num_tokens``, ``score``, ``start`` and ``end`` (``end`` is the last
        word index plus one). The input frame is not modified.
    """
    import ast  # function-scope import: `ast` is not imported at the top of the notebook

    df = submit_df.copy()
    df = df.fillna('')

    # Always computed: `num_tokens` is part of the returned columns, so leaving
    # it to the 'length' branch raised KeyError whenever that filter was off.
    df['num_tokens'] = df.predictionstring.apply(lambda x: len(x.split()))

    if 'length' in use:
        for key, value in min_thresh.items():
            index = df.loc[df['class'] == key].query('num_tokens<%d'%value).index
            df = df.drop(index)

    if 'probability' in use:
        # literal_eval parses the stringified float list without executing code.
        df['score'] = df.score.apply(lambda x: np.mean(ast.literal_eval(x)))
        for key, value in proba_thresh.items():
            index = df.loc[df['class'] == key].query('score<%f'%value).index
            df = df.drop(index)

    df["start"] = df.predictionstring.apply(lambda x: int(x.split()[0]))
    df["end"] = df.predictionstring.apply(lambda x: int(x.split()[-1]) + 1)
    df = df[['id', 'class', 'predictionstring', "num_tokens", "score", "start", "end"]]
    return df

In [8]:
def reset_crodoc():
    """Reload the test essays and reset the module-level per-text accumulators.

    Rebinds the globals ``test_df``, ``num_labels`` (10), ``text_ids``,
    ``text_lenghts`` (character counts), ``text_words``, ``text_word_offsets``
    and ``text_word_preds`` (one all-zero float32 ``(n_words, num_labels)``
    array per text).
    """
    global text_words, text_word_offsets, text_word_preds, text_ids, text_lenghts, test_df, num_labels

    test_df = get_test_df()
    num_labels = 10

    text_words, text_word_offsets, text_word_preds, text_ids, text_lenghts = [], [], [], [], []

    for essay_id, essay_text in zip(test_df['id'], test_df['text']):
        text_ids.append(essay_id)
        text_lenghts.append(len(essay_text))

        words, offsets = text_to_words(essay_text)
        text_words.append(words)
        text_word_offsets.append(offsets)

        # Zero-initialised score accumulator for this essay's words.
        text_word_preds.append(np.zeros((len(words), num_labels), np.float32))

In [9]:
def update_word_preds(model_preds, offset_mappings, coef):
    """Add token-level predictions, scaled by ``coef``, to the per-word accumulators.

    For every text, each token's scores (times ``coef``) are written onto the
    characters the token covers; each word then receives the mean of the
    character scores inside its span, added in place to the global
    ``text_word_preds``. Characters covered by no token stay at zero.
    """
    for idx, row_preds in enumerate(model_preds):

        # Per-character score buffer for this text.
        character_preds = np.zeros((text_lenghts[idx], num_labels), np.float32)

        token_spans = offset_mappings[idx]
        for pos, (start, end) in enumerate(token_spans):
            character_preds[start:end] = row_preds[pos] * coef

        word_spans = text_word_offsets[idx]
        word_preds = text_word_preds[idx]
        for pos, (start, end) in enumerate(word_spans):
            word_preds[pos] += character_preds[start:end].mean(0)

In [10]:
def merge_cut_preds(model_preds, dataset):
    """Stitch per-window predictions back into one prediction array per text.

    Args:
        model_preds: one array of per-token class scores per window, in the
            same order as ``dataset.text_indexes``.
        dataset: object exposing ``text_indexes`` (``(text_index, start)`` per
            window), ``offset_mappings`` (one per text) and ``stride``.

    Returns:
        List with one ``(n_tokens, n_classes)`` array per text, in the order in
        which texts appear in ``text_indexes``.

    The first window of a text (``start == 0``) is copied from its beginning,
    clipped to the text length. Later windows skip their first
    ``stride // 2`` tokens, so that region keeps the values written by the
    previous window.
    """
    text_indexes = dataset.text_indexes
    overlap = dataset.stride // 2

    preds_tmp = []
    index = 0

    while index < len(model_preds):

        text_index, _ = text_indexes[index]
        offset_mapping = dataset.offset_mappings[text_index]

        # Class count is taken from the predictions rather than hard-coded.
        num_classes = np.shape(model_preds[index])[-1]
        preds = np.zeros((len(offset_mapping), num_classes))

        # Consume every consecutive window that belongs to this text.
        while index < len(model_preds):
            curr_text_index, start = text_indexes[index]

            if curr_text_index != text_index:
                break

            curr_preds = model_preds[index]

            if start == 0:
                length = min(len(preds), len(curr_preds))
                preds[:length] = curr_preds[:length]
            elif start + len(curr_preds) > len(offset_mapping):
                # Window reaches past the end of the text: align it to the end.
                preds[-len(curr_preds)+overlap:] = curr_preds[overlap:]
            else:
                preds[start+overlap:start+len(curr_preds)] = curr_preds[overlap:]

            index += 1

        preds_tmp.append(preds)

    return preds_tmp

In [11]:
# Label name -> class index of the word-level predictions. Only Claim and
# Evidence have separate B-/I- labels; the other discourse types use a single
# X- label each. word_probability_to_predict_df strips the two-character
# prefix to obtain the class name.
discourse_marker_to_label = {
    'O': 0,
    'B-Claim': 1,
    'I-Claim': 2,
    'B-Evidence': 3,
    'I-Evidence': 4,
    'X-Lead': 5,
    'X-Position': 6,
    'X-Counterclaim': 7,
    'X-Rebuttal': 8,
    'X-Concluding Statement': 9,
}

# Per-class minimum span length in words: do_threshold drops spans with fewer
# words than this when the 'length' filter is enabled.
min_thresh = {
    'Lead': 6,
    'Position': 4,
    'Evidence': 16,
    'Claim': 2,
    'Concluding Statement': 11,
    'Counterclaim': 7,
    'Rebuttal': 6,
}

# Per-class minimum mean word score: do_threshold drops spans whose mean score
# is below this when the 'probability' filter is enabled.
proba_thresh = {
    "Lead": 0.7,
    "Position": 0.6,
    "Evidence": 0.65,
    "Claim": 0.55,
    "Concluding Statement": 0.7,
    "Counterclaim": 0.6,
    "Rebuttal": 0.6,
}

# Inverse lookup: class index -> label name.
label_to_discourse_marker = {v: k for k, v in discourse_marker_to_label.items()}

def get_sub_crodoc():
    """Build the thresholded span frame from the global per-word accumulators.

    Each text's entry in ``text_word_preds`` is converted to spans with
    ``word_probability_to_predict_df``; the per-text frames are concatenated,
    re-indexed and passed through ``do_threshold`` with both filters enabled.
    """
    per_text_frames = [
        word_probability_to_predict_df(row_word_preds, text_ids[idx])
        for idx, row_word_preds in enumerate(text_word_preds)
    ]

    combined = pd.concat(per_text_frames).reset_index(drop=True)
    return do_threshold(combined, use=['length', 'probability'])

In [13]:
# One entry per Crodoc run: the model directory under ../input/ and, in
# model_start_ends, the [start:end) slice of that directory's checkpoint list
# to load. With the values below every run loads a single checkpoint.
model_names = ['cp-deberta-xlarge-v2', 'cp-deberta-xlarge-v2', 'cp-deberta-xlarge-v2', 'deberta-bs2', 'deberta-bs2', 'deberta-bs2']
# Per-run weights; consumed later when the per-run submissions are fused.
model_weights_crodoc = [0.60, 0.60, 0.60, 0.40, 0.40, 0.40]

model_start_ends = [(0, 1), (1, 2), (2, 3),  (2, 3), (3, 4), (4, 5)]

# One thresholded span frame per run.
subs_crodoc = []


for model_name, (start, end) in zip(model_names, model_start_ends):

    # Fresh accumulators for every run, so each frame in subs_crodoc reflects
    # only the checkpoints of that run.
    reset_crodoc()

    with open('../input/' + model_name + '/hparams.yml', 'r') as f:
        cfg = yaml.safe_load(f)

    cfg['val_loader']['num_workers'] = 2
    cfg['val_loader']['batch_size'] = 8

    config_path = '../input/' + model_name + '/config.pth'

    # glob returns paths in arbitrary order, so the list is sorted before it is
    # sliced; otherwise (start, end) could select different (or repeated)
    # checkpoints from one session to the next.
    model_paths = sorted(glob.glob('../input/' + model_name + '/*.ckpt'))[start:end]

    if not model_paths:
        # Fail here with a clear message instead of dividing by zero below.
        raise FileNotFoundError(
            f"no checkpoint found for {model_name} in slice [{start}:{end}]"
        )

    folds = len(model_paths)
    model_preds = []

    print(f"{model_name}: {folds}")

    tokenizer = AutoTokenizer.from_pretrained('../input/' + model_name + '/tokenizer/tokenizer')

    # A positive 'stride' in the hparams selects sliding-window inference.
    if 'stride' in cfg and cfg['stride'] > 0:
        test_dataset = CutTextDataset(test_df, tokenizer, cfg)
    else:
        test_dataset = TextDataset(test_df, tokenizer, cfg)

    for model_path in model_paths:

        datamodule = TextDataModule(test_df, tokenizer, cfg, test_dataset)
        trainer = pl.Trainer(logger=False, **cfg['trainer'])
        model = TextModel.load_from_checkpoint(checkpoint_path=model_path, cfg=cfg, config_path=config_path)

        fold_preds = trainer.predict(model, datamodule)

        # Flatten the batches into one array per dataset item; predictions of
        # further checkpoints are summed onto the first checkpoint's arrays.
        if not model_preds:
            for pred_batch in fold_preds:
                for pred in pred_batch:
                    model_preds.append(pred.numpy().copy())
        else:
            idx = 0
            for pred_batch in fold_preds:
                for pred in pred_batch:
                    model_preds[idx] += pred.numpy().copy()
                    idx += 1

        del fold_preds
        del trainer
        del model
        del datamodule

        gc.collect()
        torch.cuda.empty_cache()

    if 'stride' in cfg and cfg['stride'] > 0:
        model_preds = merge_cut_preds(model_preds, test_dataset)
    # 1 / folds turns the summed checkpoint predictions into their mean.
    update_word_preds(model_preds, test_dataset.offset_mappings, 1 / folds)

    del test_dataset
    del tokenizer
    del model_preds

    gc.collect()
    torch.cuda.empty_cache()

    sub = get_sub_crodoc()
    sub.drop(["predictionstring"], axis=1, inplace=True)
    subs_crodoc.append(sub)

    print(subs_crodoc[-1].shape)

Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


cp-deberta-xlarge-v2: 1


Predicting: 0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


(47, 6)
cp-deberta-xlarge-v2: 1


Predicting: 0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


(47, 6)
cp-deberta-xlarge-v2: 1


Predicting: 0it [00:00, ?it/s]

(47, 6)
deberta-bs2: 1


Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


Predicting: 0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


(43, 6)
deberta-bs2: 1


Predicting: 0it [00:00, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1304 > 512). Running this sequence through the model will result in indexing errors


(47, 6)
deberta-bs2: 1


Predicting: 0it [00:00, ?it/s]

(46, 6)


In [14]:
if not IS_DEBUG:
    # Outside debug runs, drop the Crodoc accumulators before the Kkiller models load.
    del text_words
    del text_word_offsets
    del text_word_preds
    del text_ids
    del text_lenghts
    gc.collect()
    torch.cuda.empty_cache()

# Kkiller

In [15]:
import sys, os
# Put the two dataset folders at the front of the import path; the imports in
# the next cell (mtask_v2.*) are presumably resolved from here - TODO confirm.
sys.path.insert(0, "../input/fprize-kkiller-tools/fprize")
sys.path.insert(0, "../input/weighted-boxes-fusion")
# The first cell set this variable to "true" via %env; from here on it is "false".
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [16]:
import pandas as pd, numpy as np
import torch

from tqdm.auto import tqdm
from pathlib import Path
from  datetime import datetime

import mtask_v2.src.inference as inference
import mtask_v2.src.dataset as dataset
import mtask_v2.src.configs as configs

from mtask_v2.src.dataset import read_from_id, read_train_df
from mtask_v2.src.post_processing import get_seg_from_ner
from mtask_v2.src.wbf import fusion_boxes_for_subs

In [17]:
# Weight of the Crodoc submission in the final fusion; Kkiller gets 1 - q_crodoc.
q_crodoc = 0.50
# Passed as `iou_thr` / `skip_box_thr` to every fusion_boxes_for_subs call.
iou_thr = 0.333
skip_box_thr = 0.001
# out_skip_box_thr = 0.10
# Default quantile used by q_prune_sub when it is called without `q`.
out_iou_q = 0.015

In [18]:
def q_prune_sub(sub, q=None):
    """Drop spans that fall below their class's ``q``-quantile of score or length.

    Args:
        sub: frame with ``class`` and ``score`` columns, plus either
            ``num_tokens`` or ``start``/``end`` (then ``num_tokens`` is
            ``end - start``).
        q: quantile in [0, 1]; defaults to the global ``out_iou_q``.

    Returns:
        A new frame, re-indexed from 0, that keeps the rows with
        ``score >= score_thresh`` and ``num_tokens >= num_tokens_thresh``, where
        the thresholds are the per-class quantiles (``interpolation="nearest"``).
        The threshold columns are included. The input frame is left unchanged.
    """
    if q is None:
        q = out_iou_q

    # Work on a copy: the helper columns used to be added to the caller's frame.
    sub = sub.copy()

    if "num_tokens" not in sub.columns:
        sub["num_tokens"] = sub["end"] - sub["start"]

    by_class = sub.groupby("class")
    out_iou_class_score_qs = by_class.score.quantile(q, interpolation="nearest").to_dict()
    out_iou_class_ntokens_qs = by_class.num_tokens.quantile(q, interpolation="nearest").to_dict()

    sub["score_thresh"] = sub["class"].map(out_iou_class_score_qs)
    sub["num_tokens_thresh"] = sub["class"].map(out_iou_class_ntokens_qs)

    kept = sub.query("(score >= score_thresh) & (num_tokens >= num_tokens_thresh)")
    return kept.reset_index(drop=True)

In [19]:
# Add the numeric class id (looked up in configs.Discourse2ID) to every
# per-run Crodoc frame; this modifies the frames in subs_crodoc in place.
for sub_crodoc in subs_crodoc:
    sub_crodoc["class_id"] = sub_crodoc["class"].map(configs.Discourse2ID)

# Fuse the per-run frames into one, weighted by model_weights_crodoc.
# Note that this rebinds `sub_crodoc`, which was the loop variable above.
sub_crodoc = fusion_boxes_for_subs(subs_crodoc, model_weights_crodoc, iou_thr=iou_thr, skip_box_thr=skip_box_thr)

if not IS_DEBUG:
    del subs_crodoc
    gc.collect()
    
# Shapes before and after the per-class quantile pruning (default q = out_iou_q).
print(sub_crodoc.shape)
sub_crodoc = q_prune_sub(sub_crodoc)
print(sub_crodoc.shape)
sub_crodoc.head()

  0%|          | 0/5 [00:00<?, ?it/s]

(57, 7)
(57, 10)


Unnamed: 0,id,class_id,class,score,start,end,predictionstring,num_tokens,score_thresh,num_tokens_thresh
0,0FB0700DAF44,5,Lead,0.984794,0,54,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,54,0.894573,20
1,0FB0700DAF44,6,Position,0.165986,41,57,41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56,16,0.165986,8
2,0FB0700DAF44,1,Claim,0.086306,66,84,66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 8...,18,0.082887,2
3,0FB0700DAF44,6,Position,0.651675,108,120,108 109 110 111 112 113 114 115 116 117 118 119,12,0.165986,8
4,0FB0700DAF44,4,Evidence,0.917166,121,265,121 122 123 124 125 126 127 128 129 130 131 13...,144,0.192897,53


In [20]:
# Device passed to inference.get_params below: GPU when available, else CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda


In [21]:
# From here on TEST_ROOT is a pathlib.Path rather than the string set in the
# configuration cell; str(TEST_ROOT) no longer ends with a slash.
TEST_ROOT = Path(TEST_ROOT)

# Point the project config at the test folder as well.
configs.TRAIN_ROOT = TEST_ROOT
# NOTE(review): the ids come from the pruned Crodoc frame, so an essay with no
# surviving Crodoc span is not passed to the Kkiller models - confirm intended.
uuids = sub_crodoc["id"].unique()
               
# Longest essays (by whitespace-separated word count) first.
uuids = sorted(uuids, key=lambda uuid: -len(read_from_id(uuid, root=TEST_ROOT).split()))

print(len(uuids))
uuids[:10]

5


['18409261F5C2',
 'DF920E0A7337',
 '0FB0700DAF44',
 'D72CB1C11673',
 'D46BCB48440A']

In [22]:
# One parameter set per Kkiller model family. `weight` is reused below as the
# fusion weight of every checkpoint of that family; the weights must sum to 1.
# NOTE(review): the recorded output of this cell shows 'test_batch_size': 64
# for params[0] although batch_size=14 is passed - confirm how get_params
# handles `batch_size`.
params = [
    
    # deberta-xlarge: first five checkpoints in sorted path order.
    inference.get_params(
        model_name="microsoft/deberta-xlarge",
        batch_size=14,
        maxlen=1024,
        stride=1024,
        num_workers=2,
        weight=.60,
        config_path="../input/fprize-kkiller-tools/microsoft_deberta-xlarge",
        tokenizer_path="../input/fprize-kkiller-tools/microsoft_deberta-xlarge",
        is_pickle=False,
        device=DEVICE,
        model_paths=sorted(Path("../input/cp-deberta-xlarge-kkiller").glob("*.pth"))[:5],
        root=TEST_ROOT,
        use_position_embeddings=False,
    ),
    
    
    # deberta-large: every checkpoint from index 3 onwards in sorted path order.
    inference.get_params(
        model_name="microsoft/deberta-large",
        batch_size=24,
        maxlen=1024,
        stride=1024,
        num_workers=2,
        weight=.40,
        config_path="../input/fprize-kkiller-tools/microsoft_deberta-large",
        tokenizer_path="../input/fprize-kkiller-tools/microsoft_deberta-large",
        is_pickle=False,
        device=DEVICE,
        model_paths=sorted(Path("../input/gdrive-db1l-1024-v2-v11-no-pe-weights/microsoft_deberta-large_maxlen1024_clb_mtask_msd_v2_v11_no_pe/"
                               ).glob("*.pth"))[3:],
        root=TEST_ROOT,
        use_position_embeddings=False,
    ),
    
]

# Sanity check: the family weights form a convex combination.
S = sum([param["weight"] for param in params])
assert abs(S- 1.0) < 1e-6
params[0]

{'model_name': 'microsoft/deberta-xlarge',
 'model_name_slug': 'microsoft_deberta-xlarge',
 'minlength': 5,
 'maxlen': 1024,
 'num_targets': 15,
 'stride': 1024,
 'test_batch_size': 64,
 'test_num_workers': 2,
 'config_path': '../input/fprize-kkiller-tools/microsoft_deberta-xlarge',
 'tokenizer_path': '../input/fprize-kkiller-tools/microsoft_deberta-xlarge',
 'is_pickle': False,
 'config': DebertaConfig {
   "_name_or_path": "../input/fprize-kkiller-tools/microsoft_deberta-xlarge",
   "attention_probs_dropout_prob": 0.1,
   "hidden_act": "gelu",
   "hidden_dropout_prob": 0.1,
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "layer_norm_eps": 1e-07,
   "max_position_embeddings": 512,
   "max_relative_positions": -1,
   "model_type": "deberta",
   "num_attention_heads": 16,
   "num_hidden_layers": 48,
   "pad_token_id": 0,
   "pooler_dropout": 0,
   "pooler_hidden_act": "gelu",
   "pooler_hidden_size": 1024,
   "pos_att_type": [
     "c2p",
     "p2c

In [23]:
# One submission frame per Kkiller checkpoint, with a parallel list of fusion weights.
subs_kkiller = []
model_weights_kkiller = []

for param in params:
    print("{}: {}".format(Path(param["model_paths"][0]).stem,  len(param["model_paths"])))
#     preds, preds_seg  = inference.predict_from_param(uuids=uuids, param=param, make_sub=False, oof=False, model_bar=False)

    # reduce=None: the result is iterated below as (preds, preds_seg) pairs -
    # presumably one pair per checkpoint; TODO confirm against predict_from_param.
    results  = inference.predict_from_param(uuids=uuids, param=param, make_sub=False, oof=False, model_bar=False, reduce=None)
    
    for preds, preds_seg in results:
        # Blend the returned segmentation output (60%) with the one derived
        # from `preds` by get_seg_from_ner (40%).
        preds_seg = 0.60 * preds_seg + 0.40 * get_seg_from_ner(preds)

        subs_kkiller.append(
            inference.make_sub_from_res(uuids=uuids, res=preds, res_seg=preds_seg, q=0.015, threshs=None)
        )
        
        subs_kkiller[-1].drop(["predictionstring"], inplace=True, axis=1)
    
        # Every frame of this family carries the family weight.
        model_weights_kkiller.append(param["weight"])
    
    # Shape of the last frame added (raises IndexError if nothing was added yet).
    print(subs_kkiller[-1].shape)
    
print("model_weights_kkiller:", model_weights_kkiller)

sub_kkiller = fusion_boxes_for_subs(subs_kkiller, model_weights_kkiller, iou_thr=iou_thr, skip_box_thr=skip_box_thr)

if not IS_DEBUG:
    # `preds` / `preds_seg` exist only if at least one result pair was iterated.
    del subs_kkiller, preds, preds_seg
    gc.collect()
    
# Shapes before and after the per-class quantile pruning (default q = out_iou_q).
print(sub_kkiller.shape)
# sub_kkiller = sub_kkiller.query(f"score >= {out_skip_box_thr}")
sub_kkiller = q_prune_sub(sub_kkiller)
print(sub_kkiller.shape)
sub_kkiller.head()

fprize_microsoft_deberta-xlarge_fold0_epoch_04_iov_v2_val_0.6932_20220309065910: 5


  warn("You should sort your UUIDs for faster prediction")


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  warn("The current model has NOT <position_embeddings> enabled.")


(39, 7)
fprize_microsoft_deberta-large_fold3_epoch_03_iov_v2_val_0.6976_20220310104853: 5


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

(42, 7)
model_weights_kkiller: [0.6, 0.6, 0.6, 0.6, 0.6, 0.4, 0.4, 0.4, 0.4, 0.4]


  0%|          | 0/5 [00:00<?, ?it/s]

(50, 7)
(50, 10)


Unnamed: 0,id,class_id,class,score,start,end,predictionstring,num_tokens,score_thresh,num_tokens_thresh
0,0FB0700DAF44,5,Lead,0.768978,0,84,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,84,0.70717,20
1,0FB0700DAF44,6,Position,0.701401,108,120,108 109 110 111 112 113 114 115 116 117 118 119,12,0.237378,8
2,0FB0700DAF44,4,Evidence,0.340591,122,298,122 123 124 125 126 127 128 129 130 131 132 13...,176,0.340591,74
3,0FB0700DAF44,1,Claim,0.138401,283,314,283 284 285 286 287 288 289 290 291 292 293 29...,31,0.058144,3
4,0FB0700DAF44,1,Claim,0.690662,310,341,310 311 312 313 314 315 316 317 318 319 320 32...,31,0.058144,3


In [24]:
# Shapes of the two pruned per-pipeline frames that the next cell fuses.
sub_kkiller.shape, sub_crodoc.shape

((50, 10), (57, 10))

# Box Fusion

In [26]:
# Final ensemble: fuse the Crodoc and Kkiller frames with weights
# q_crodoc and 1 - q_crodoc.
subs = [sub_crodoc, sub_kkiller]
weights = [q_crodoc, 1-q_crodoc]

sub = fusion_boxes_for_subs(subs, weights, iou_thr=iou_thr, skip_box_thr=skip_box_thr)


print(sub.shape)
# sub = sub.query(f"score >= {out_skip_box_thr}")
# This last pruning pass uses q=0.0125 rather than the default out_iou_q.
sub = q_prune_sub(sub, q=0.0125)
print(sub.shape)

# Only these three columns are written to the submission file.
sub[["id", "class", "predictionstring"]].to_csv("submission.csv", index=False)

sub.head(30)

  0%|          | 0/5 [00:00<?, ?it/s]

(57, 7)
(57, 10)


Unnamed: 0,id,class_id,class,score,start,end,predictionstring,num_tokens,score_thresh,num_tokens_thresh
0,0FB0700DAF44,5,Lead,0.876886,0,67,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...,67,0.816193,20
1,0FB0700DAF44,6,Position,0.082993,41,57,41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56,16,0.082993,8
2,0FB0700DAF44,1,Claim,0.043153,66,84,66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 8...,18,0.043153,2
3,0FB0700DAF44,6,Position,0.676538,108,120,108 109 110 111 112 113 114 115 116 117 118 119,12,0.082993,8
4,0FB0700DAF44,4,Evidence,0.628879,121,274,121 122 123 124 125 126 127 128 129 130 131 13...,153,0.096449,53
5,0FB0700DAF44,1,Claim,0.061359,196,230,196 197 198 199 200 201 202 203 204 205 206 20...,34,0.043153,2
6,0FB0700DAF44,4,Evidence,0.096449,230,283,230 231 232 233 234 235 236 237 238 239 240 24...,53,0.096449,53
7,0FB0700DAF44,1,Claim,0.24652,283,314,283 284 285 286 287 288 289 290 291 292 293 29...,31,0.043153,2
8,0FB0700DAF44,1,Claim,0.755973,313,341,313 314 315 316 317 318 319 320 321 322 323 32...,28,0.043153,2
9,0FB0700DAF44,4,Evidence,0.981122,329,560,329 330 331 332 333 334 335 336 337 338 339 34...,231,0.096449,53
