In [None]:
import shutil
from pathlib import Path
import os

# Directory of the installed `transformers` package whose files are overwritten below.
# NOTE(review): hard-coded for a Python 3.7 conda image — adjust if the runtime differs.
transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

# Input directory holding the replacement tokenizer sources.
input_dir = Path("../input/deberta-v2-3-fast-tokenizer")

# Overwrite the package-level convert_slow_tokenizer.py with the copy from input_dir.
convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path / convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

# Overwrite the DeBERTa-v2 tokenizer modules. Both filenames are copied under
# their own name; the former `startswith("deberta")` renaming branch could never
# trigger for these names and has been removed.
for filename in ["tokenization_deberta_v2.py", "tokenization_deberta_v2_fast.py"]:
    filepath = deberta_v2_path / filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir / filename, filepath)

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings

warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.set_option("display.width", 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers

from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import (
    get_linear_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)

%env TOKENIZERS_PARALLELISM=false

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def get_logger(filename="inference"):
    """Return this module's logger, writing to the console and to ``<filename>.log``.

    The logger is looked up by ``__name__``, so every call returns the same
    object. Handlers left over from a previous call (for example when the cell
    is re-run) are removed and closed first; otherwise each call would add one
    more console/file handler pair and every message would be emitted several
    times.

    Args:
        filename: Path of the log file without the ``.log`` suffix.

    Returns:
        logging.Logger: Logger at INFO level with exactly one stream handler and
        one file handler, both using the bare ``%(message)s`` format.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop handlers installed by earlier calls so output is not duplicated
    # and previously opened log files are released.
    for old_handler in list(logger.handlers):
        logger.removeHandler(old_handler)
        old_handler.close()

    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger


LOGGER = get_logger()


def seed_everything(seed=42):
    """Seed Python, NumPy and PyTorch random generators for reproducible runs.

    Args:
        seed: Integer seed applied to every generator.

    Notes:
        ``PYTHONHASHSEED`` is only read at interpreter start-up, so setting it
        here affects child processes, not the hash randomisation of the
        current process.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Cover every visible GPU, not only the current device (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Auto-tuning may pick different kernels from run to run; disable it so the
    # deterministic flag above is effective.
    torch.backends.cudnn.benchmark = False


seed_everything(seed=42)

# 4種のモデルの読み込み

## exp099 deberta 

In [None]:
def exp099_oof_char_prob():
    """Load the exp099 out-of-fold predictions and expand them to character level.

    Returns:
        tuple: ``(char_probs, oof)`` where ``oof`` is the concatenation of every
        ``*fold*.jb`` file sorted by ``id`` and ``char_probs`` holds one array
        per row of ``oof`` with ``len(pn_history)`` entries.
    """
    from transformers.models.deberta_v2.tokenization_deberta_v2_fast import (
        DebertaV2TokenizerFast,
    )
    # Not referenced directly in this function.
    import tokenizers
    import transformers

    def get_char_probs(texts, predictions, tokenizer):
        """Spread per-token predictions over the characters each token covers."""
        results = [np.zeros(len(t)) for t in texts]
        for char_prob, text, prediction in zip(results, texts, predictions):
            offsets = tokenizer(
                text, add_special_tokens=True, return_offsets_mapping=True
            )["offset_mapping"]
            last_pred, last_end = 0, -1
            for (start, end), pred in zip(offsets, prediction):
                char_prob[start:end] = pred
                if start != last_end:
                    # Characters between two tokens get the mean of both neighbours.
                    char_prob[last_end:start] = (pred + last_pred) / 2
                last_pred, last_end = pred, end
        return results

    tokenizer = DebertaV2TokenizerFast.from_pretrained("../input/get-token/tokenizer")
    # Number of integer-named prediction columns (0 .. max_len - 1) read from the frame.
    max_len = 354

    # Build the OOF frame from the per-fold dumps.
    oof_dir = Path("../input/dict-oof-exp093-099/exp099-nbme-microsoft-deberta-v3-large/")
    fold_frames = [pd.DataFrame(joblib.load(f)) for f in oof_dir.glob("*fold*.jb")]
    oof = pd.concat(fold_frames).sort_values("id").reset_index(drop=True)

    token_columns = list(range(max_len))
    char_probs = get_char_probs(
        oof["pn_history"].values, oof[token_columns].values, tokenizer
    )
    return char_probs, oof

In [None]:
exp099_char_probs, oof099 = exp099_oof_char_prob()

## exp096 roberta

In [None]:
def exp096_oof_char_prob():
    """Load the exp096 out-of-fold predictions and expand them to character level.

    Returns:
        tuple: ``(char_probs, oof)`` where ``oof`` is the concatenation of every
        ``*fold*.jb`` file sorted by ``id`` and ``char_probs`` holds one array
        per row of ``oof`` with ``len(pn_history)`` entries.
    """
    # Not referenced directly in this function.
    import tokenizers
    import transformers

    def get_char_probs(texts, predictions, tokenizer):
        """Spread per-token predictions over the characters each token covers."""
        results = [np.zeros(len(t)) for t in texts]
        for char_prob, text, prediction in zip(results, texts, predictions):
            offsets = tokenizer(
                text, add_special_tokens=True, return_offsets_mapping=True
            )["offset_mapping"]
            last_pred, last_end = 0, -1
            for (start, end), pred in zip(offsets, prediction):
                char_prob[start:end] = pred
                if start != last_end:
                    # Characters between two tokens get the mean of both neighbours.
                    char_prob[last_end:start] = (pred + last_pred) / 2
                last_pred, last_end = pred, end
        return results

    tokenizer = AutoTokenizer.from_pretrained(
        "../input/exp093-roberta-large-leakage-pseudo-labeling-ssl/tokenizer",
        trim_offsets=False,
    )
    # Number of integer-named prediction columns (0 .. max_len - 1) read from the frame.
    max_len = 321

    # Build the OOF frame from the per-fold dumps.
    oof_dir = Path("../input/dict-oof-exp093-099/exp096-nbme-roberta-large/")
    fold_frames = [pd.DataFrame(joblib.load(f)) for f in oof_dir.glob("*fold*.jb")]
    oof = pd.concat(fold_frames).sort_values("id").reset_index(drop=True)

    token_columns = list(range(max_len))
    char_probs = get_char_probs(
        oof["pn_history"].values, oof[token_columns].values, tokenizer
    )
    return char_probs, oof

In [None]:
exp096_char_probs, oof096 = exp096_oof_char_prob()

## exp098 spanbert

In [None]:
def exp098_oof_char_prob():
    """Load the exp098 out-of-fold predictions and expand them to character level.

    Returns:
        tuple: ``(char_probs, oof)`` where ``oof`` is the concatenation of every
        ``*fold*.jb`` file sorted by ``id`` and ``char_probs`` holds one array
        per row of ``oof`` with ``len(pn_history)`` entries.
    """
    # Not referenced directly in this function.
    import tokenizers
    import transformers

    def get_char_probs(texts, predictions, tokenizer):
        """Spread per-token predictions over the characters each token covers."""
        results = [np.zeros(len(t)) for t in texts]
        for char_prob, text, prediction in zip(results, texts, predictions):
            offsets = tokenizer(
                text, add_special_tokens=True, return_offsets_mapping=True
            )["offset_mapping"]
            last_pred, last_end = 0, -1
            for (start, end), pred in zip(offsets, prediction):
                char_prob[start:end] = pred
                if start != last_end:
                    # Characters between two tokens get the mean of both neighbours.
                    char_prob[last_end:start] = (pred + last_pred) / 2
                last_pred, last_end = pred, end
        return results

    tokenizer = AutoTokenizer.from_pretrained(
        "../input/exp094/tokenizer", trim_offsets=False
    )
    # Number of integer-named prediction columns (0 .. max_len - 1) read from the frame.
    max_len = 361

    # Build the OOF frame from the per-fold dumps.
    oof_dir = Path("../input/dict-oof-exp093-099/exp098/")
    fold_frames = [pd.DataFrame(joblib.load(f)) for f in oof_dir.glob("*fold*.jb")]
    oof = pd.concat(fold_frames).sort_values("id").reset_index(drop=True)

    token_columns = list(range(max_len))
    char_probs = get_char_probs(
        oof["pn_history"].values, oof[token_columns].values, tokenizer
    )
    return char_probs, oof

In [None]:
exp098_char_probs, oof098 = exp098_oof_char_prob()

## exp097 biolinkbert

In [None]:
def exp097_oof_char_prob():
    """Load the exp097 out-of-fold predictions and expand them to character level.

    Returns:
        tuple: ``(char_probs, oof)`` where ``oof`` is the concatenation of every
        ``*fold*.jb`` file sorted by ``id`` and ``char_probs`` holds one array
        per row of ``oof`` with ``len(pn_history)`` entries.
    """
    # Not referenced directly in this function.
    import tokenizers
    import transformers

    def get_char_probs(texts, predictions, tokenizer):
        """Spread per-token predictions over the characters each token covers."""
        results = [np.zeros(len(t)) for t in texts]
        for char_prob, text, prediction in zip(results, texts, predictions):
            offsets = tokenizer(
                text, add_special_tokens=True, return_offsets_mapping=True
            )["offset_mapping"]
            last_pred, last_end = 0, -1
            for (start, end), pred in zip(offsets, prediction):
                char_prob[start:end] = pred
                if start != last_end:
                    # Characters between two tokens get the mean of both neighbours.
                    char_prob[last_end:start] = (pred + last_pred) / 2
                last_pred, last_end = pred, end
        return results

    tokenizer = AutoTokenizer.from_pretrained(
        "../input/exp095/tokenizer", trim_offsets=False
    )
    # Number of integer-named prediction columns (0 .. max_len - 1) read from the frame.
    max_len = 321

    # Build the OOF frame from the per-fold dumps.
    oof_dir = Path("../input/dict-oof-exp093-099/exp097/")
    fold_frames = [pd.DataFrame(joblib.load(f)) for f in oof_dir.glob("*fold*.jb")]
    oof = pd.concat(fold_frames).sort_values("id").reset_index(drop=True)

    token_columns = list(range(max_len))
    char_probs = get_char_probs(
        oof["pn_history"].values, oof[token_columns].values, tokenizer
    )
    return char_probs, oof

In [None]:
exp097_char_probs, oof097=exp097_oof_char_prob()

## wの探索

In [None]:
def create_labels_for_scoring(df):
    """Parse the ``location`` column into per-row lists of ``[start, end]`` spans.

    Every ``location`` cell is expected to be a list of strings such as
    ``["0 1", "3 4"]``; a single string may itself contain several spans
    separated by ``;``. Rows are processed by position, so the frame does not
    need a default ``0..n-1`` index.

    Side effect:
        Adds (or overwrites) ``df["location_for_create_labels"]``: ``[]`` for a
        row without spans, otherwise a one-element list holding the
        ``;``-joined string, e.g. ``["0 1;3 4"]``.

    Args:
        df: DataFrame with a ``location`` column.

    Returns:
        list: One list per row containing ``[start, end]`` integer pairs.
    """
    joined_cells = []
    truths = []
    for lst in df["location"].tolist():
        if lst is None or len(lst) == 0:
            joined_cells.append([])
            truths.append([])
            continue
        # example: ['0 1', '3 4'] -> ['0 1;3 4']
        joined = ";".join(lst)
        joined_cells.append([joined])
        truth = []
        for piece in joined.split(";"):
            loc = piece.split()
            truth.append([int(loc[0]), int(loc[1])])
        truths.append(truth)

    # dtype=object keeps each cell as a Python list instead of being expanded.
    df["location_for_create_labels"] = pd.Series(
        joined_cells, index=df.index, dtype=object
    )
    return truths


def get_raw_location_annotation(
    char_probs: "list[np.ndarray]", pn_histories: "list[str]", th: float = 0.5
) -> "tuple[list[list[tuple[int,int]]], list[list[str]]]":
    """Extract raw (un-post-processed) spans and their text from char probabilities.

    For each probability array the indices with ``prob >= th`` are grouped into
    runs of consecutive positions; every run becomes a half-open
    ``(start, end)`` span. The matching substrings of the corresponding text
    are returned alongside.
    """
    locations = []
    for char_prob in char_probs:
        hit_idx = np.where(char_prob >= th)[0]
        spans = []
        if len(hit_idx) > 0:
            # A new run starts wherever two neighbouring hits are not adjacent.
            run_starts = np.where(np.diff(hit_idx) != 1)[0] + 1
            for run in np.split(hit_idx, run_starts):
                spans.append((run[0], run[-1] + 1))
        locations.append(spans)

    annotations = [
        [text[i:j] for i, j in spans] for text, spans in zip(pn_histories, locations)
    ]
    return locations, annotations


def remove_white_space_from_head(
    locations: "list[list[tuple[int,int]]]", annotations: "list[list[str]]"
) -> "tuple[list[list[tuple[int,int]]], list[list[str]]]":
    """Post-process span heads: strip leading spaces and line breaks.

    Every leading ``" "``, ``"\\n"`` or ``"\\r"`` is removed from a segment and
    the span start is advanced by the number of removed characters. A segment
    made only of such characters is kept as an empty string with
    ``start == end``.
    """
    blank_chars = " \n\r"
    locations2 = []
    annotations2 = []
    for annotation, location in zip(annotations, locations):
        trimmed_segments = []
        shifted_spans = []
        for segment, (begin, end) in zip(annotation, location):
            trimmed = segment.lstrip(blank_chars)
            shifted_spans.append((begin + len(segment) - len(trimmed), end))
            trimmed_segments.append(trimmed)
        annotations2.append(trimmed_segments)
        locations2.append(shifted_spans)
    return locations2, annotations2


def get_results(char_probs: "list[np.ndarray]", pn_histories: "list[str]", th=0.5):
    """Turn per-character probabilities and texts into submission-style strings.

    Spans are extracted at threshold ``th``, leading whitespace is stripped from
    each span (post-processing step 1), and every row is rendered as
    ``"start end;start end;..."`` (an empty string when there is no span).
    """
    raw_locations, raw_annotations = get_raw_location_annotation(
        char_probs, pn_histories, th=th
    )
    locations, _ = remove_white_space_from_head(raw_locations, raw_annotations)
    return [";".join(f"{i} {j}" for i, j in loc) for loc in locations]


def get_predictions(results):
    """Parse ``"start end;start end"`` strings into lists of ``[start, end]`` pairs.

    An empty string yields an empty list for that row.
    """

    def _parse(result):
        if result == "":
            return []
        spans = []
        for chunk in result.split(";"):
            fields = chunk.split()
            spans.append([int(fields[0]), int(fields[1])])
        return spans

    return [_parse(result) for result in results]


def micro_f1(preds, truths):
    """
    Micro-averaged F1 over binary arrays.

    All per-instance arrays are concatenated into one long vector each, so the
    score aggregates over every instance at once.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    flat_preds = np.concatenate(preds)
    flat_truths = np.concatenate(truths)
    return f1_score(flat_truths, flat_preds)


def spans_to_binary(spans, length=None):
    """
    Build a 0/1 mask marking which character positions fall inside any span.

    Args:
        spans (list of lists of two ints): Half-open ``[start, end)`` spans.
        length (int, optional): Size of the mask; defaults to the largest value
            found in ``spans``.

    Returns:
        np array [length]: Binarized spans.
    """
    if length is None:
        length = np.max(spans)
    mask = np.zeros(length)
    for span in spans:
        start, end = span
        mask[start:end] = 1
    return mask


def span_micro_f1(preds, truths):
    """
    Micro f1 computed on character spans.

    Each prediction/truth pair is binarised to a common length (the largest
    index appearing in either side); pairs where both sides are empty are
    skipped.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds, bin_truths = [], []
    for pred, truth in zip(preds, truths):
        if len(pred) == 0 and len(truth) == 0:
            continue
        pred_max = np.max(pred) if len(pred) else 0
        truth_max = np.max(truth) if len(truth) else 0
        length = max(pred_max, truth_max)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    """Return the span-level micro F1 of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth spans, one list of ``[start, end]`` pairs per row.
        y_pred: Predicted spans in the same layout.

    Returns:
        float: f1 score.
    """
    # span_micro_f1 is declared as (preds, truths); pass the arguments in that
    # order. F1 is symmetric, so the value is the same as before, but the roles
    # are no longer swapped should the metric ever change.
    score = span_micro_f1(y_pred, y_true)
    return score


def mix_char_probs(probs_list: "list[list[np.ndarray]]", w_list: "list[float]"):
    """Linearly combine several models' char_probs with the weights in ``w_list``.

    ``probs_list[m][r]`` is the probability array of model ``m`` for row ``r``;
    the result holds, per row, ``sum_m w_list[m] * probs_list[m][r]``.
    """
    # The weights are deliberately not validated (sum to 1, each within [0, 1],
    # one per model): such assertions could fail during the Optuna search.

    def _blend(per_model_probs):
        blended = np.zeros_like(per_model_probs[0])
        for weight, prob in zip(w_list, per_model_probs):
            blended += weight * prob
        return blended

    return [_blend(row_probs) for row_probs in zip(*probs_list)]

In [None]:
import numpy as np


def get_optimal_weight(oofs: list, in_char_probs: list) -> tuple:
    """Grid-search ensemble weights that maximise the span micro F1.

    The first ``len(oofs) - 1`` weights are taken from a 0.0 .. 0.9 grid with
    step 0.1 and the last weight is whatever remains to reach 1.0.
    Combinations that would need a negative last weight are skipped.

    Args:
        oofs: One OOF DataFrame per model; labels and texts are read from
            ``oofs[0]``.
        in_char_probs: Per-model lists of per-row probability arrays.

    Returns:
        tuple: ``(best_w, best_th, best_score)``.

    Raises:
        ValueError: If only a single model is given.
    """
    assert len(oofs) == len(in_char_probs)
    # Validate before doing any expensive work.
    if len(oofs) == 1:
        raise ValueError("oofs length should be more than 1")

    truths = create_labels_for_scoring(oofs[0])
    texts = oofs[0]["pn_history"].to_numpy()  # loop-invariant
    w_range = np.arange(0.0, 1.0, 0.1)
    w_list = list(itertools.product(w_range, repeat=len(oofs) - 1))

    best_score = -1.0
    best_th = -1.0
    best_w = None
    for free_weights in tqdm(w_list):
        w_last = 1.0 - sum(free_weights)
        if w_last < -1e-9:
            # The free weights already exceed 1.0; the last one would be negative.
            continue
        # Clamp float round-off such as -1e-17 to exactly zero.
        w_combination = list(free_weights) + [max(w_last, 0.0)]
        char_probs = mix_char_probs(in_char_probs, w_combination)
        for th in np.arange(0.5, 0.6, 0.1):
            th = np.round(th, 2)
            results = get_results(char_probs, texts, th=th)
            preds = get_predictions(results)
            score = get_score(truths, preds)
            if best_score < score:
                best_th = th
                best_score = score
                best_w = w_combination
    return best_w, best_th, best_score

In [None]:
# Store each experiment's per-character probabilities next to its OOF rows.
for oof_frame, frame_char_probs in (
    (oof099, exp099_char_probs),
    (oof096, exp096_char_probs),
    (oof098, exp098_char_probs),
    (oof097, exp097_char_probs),
):
    oof_frame["char_probs"] = frame_char_probs

In [None]:
import optuna


def get_optimal_weight_for_optuna(
    oofs: list, in_char_probs: list, w_combination, th
) -> list:
    """Score one fixed weight combination and threshold.

    The per-model probabilities are blended with ``w_combination``, spans are
    extracted at ``th`` rounded to two decimals, and the result is scored
    against the labels of ``oofs[0]``.

    Returns:
        ``(w_combination, rounded_th, score)``; the sentinel triple
        ``(None, -1.0, -1.0)`` is returned when ``score`` is not greater
        than -1.0.
    """
    assert len(oofs) == len(in_char_probs)
    assert len(oofs) == len(w_combination)

    truths = create_labels_for_scoring(oofs[0])
    if len(oofs) == 1:
        print("oofs length should be more than 1")
        raise ValueError()

    # Weighted sum of the models, accumulated row by row.
    char_probs = [np.zeros_like(template) for template in in_char_probs[0]]
    for row, blended in enumerate(char_probs):
        for w, model_probs in zip(w_combination, in_char_probs):
            blended += w * model_probs[row]

    th = np.round(th, 2)
    texts = oofs[0]["pn_history"].to_numpy()
    preds = get_predictions(get_results(char_probs, texts, th=th))
    score = get_score(truths, preds)
    if score > -1.0:
        return w_combination, th, score
    return None, -1.0, -1.0


def search_weight(oofs: list, in_char_probs: list, timeout) -> tuple:
    """Search ensemble weights and a threshold with Optuna (TPE sampler).

    One raw weight per model (named ``w1``, ``w2``, ...) and a raw threshold
    ``th`` are sampled from ``[0, 1]``. The weights are normalised to sum to 1
    and the threshold is divided by the same sum, which leaves the decision
    ``sum(w_i * p_i) >= th`` identical to the one made with the raw values.

    Args:
        oofs: One OOF DataFrame per model.
        in_char_probs: Per-model lists of per-row probability arrays.
        timeout: Passed to ``study.optimize`` as its ``timeout``.

    Returns:
        tuple: ``(best_w, best_th, score)`` re-evaluated at the best trial.
    """
    # One parameter per model; for four models this yields w1..w4 as before.
    weight_names = [f"w{k + 1}" for k in range(len(oofs))]

    def normalise(params):
        """Scale the raw weights to sum to 1 and rescale the threshold alike."""
        weight_sum = sum(params[name] for name in weight_names)
        w_combination = [params[name] / weight_sum for name in weight_names]
        return w_combination, params["th"] / weight_sum

    def objective(trial: optuna.Trial) -> float:
        # suggest_float replaces the deprecated suggest_uniform (same range).
        optuna_param = {
            name: trial.suggest_float(name, 0.0, 1.0) for name in weight_names
        }
        optuna_param["th"] = trial.suggest_float("th", 0.0, 1.0)
        w_combination, th = normalise(optuna_param)
        best_w, best_th, score = get_optimal_weight_for_optuna(
            oofs, in_char_probs, w_combination, th
        )
        print(f"w: {best_w}  th:{th:.3f}  score:{score:.5f}")
        return score

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(multivariate=True, group=True, seed=42),
    )
    study.optimize(objective, timeout=timeout)

    print(f"best parameter = {study.best_trial.params}")
    w_combination, th = normalise(study.best_trial.params)
    print(f"weight: {w_combination}, th: {th}")
    best_w, best_th, score = get_optimal_weight_for_optuna(
        oofs, in_char_probs, w_combination, th
    )
    return best_w, best_th, score

In [None]:
# Per-fold weight search: for every fold, the weights/threshold are tuned on the
# rows of the other folds and then scored on the held-out fold.
# Results are collected in `weights`, `ths` and `val_scores`.
weights = []
ths = []
val_scores = []
# NOTE(review): assumes the `fold` column takes the values 0..3 — TODO confirm.
for fold in range(4):
    print("fold", fold)
    # Split every model's OOF frame into "other folds" (train) and "this fold" (val).
    oof099_train = oof099.query("fold!=@fold").reset_index(drop=True)
    oof099_val = oof099.query("fold==@fold").reset_index(drop=True)
    oof096_train = oof096.query("fold!=@fold").reset_index(drop=True)
    oof096_val = oof096.query("fold==@fold").reset_index(drop=True)
    oof098_train = oof098.query("fold!=@fold").reset_index(drop=True)
    oof098_val = oof098.query("fold==@fold").reset_index(drop=True)
    oof097_train = oof097.query("fold!=@fold").reset_index(drop=True)
    oof097_val = oof097.query("fold==@fold").reset_index(drop=True)

    # Model order here fixes the meaning of each entry in the weight vector.
    train_oof_list = [oof099_train, oof096_train, oof098_train, oof097_train]
    train_char_probs_list = [oof["char_probs"].to_list() for oof in train_oof_list]
    val_oof_list = [oof099_val, oof096_val, oof098_val, oof097_val]
    val_char_probs_list = [oof["char_probs"].to_list() for oof in val_oof_list]

    # timeout=1200 is forwarded to the Optuna study inside search_weight.
    w, bt, bs = search_weight(train_oof_list, train_char_probs_list, timeout=1200)
    print(f"================ fold {fold} =================")
    print(w, bt, bs)
    print(f"==============================================")
    # Evaluate the tuned weights and threshold on the held-out fold.
    # Labels and texts are taken from the first frame of the list.
    truths = create_labels_for_scoring(val_oof_list[0])
    char_probs = mix_char_probs(val_char_probs_list, w)
    results = get_results(char_probs, val_oof_list[0]["pn_history"].to_numpy(), th=bt)
    preds = get_predictions(results)
    score = get_score(truths, preds)
    weights.append(w)
    ths.append(bt)
    val_scores.append(score)
    print("val score", score)

In [None]:
# Summary of the per-fold search: one weight vector, threshold and held-out score per fold.
print(f"weights: {weights}")
print(f"ths: {ths}")
print(f"val_scores: {val_scores}")

In [None]:
def search_threshold(oof: "pd.DataFrame", char_probs: list, truths, timeout) -> tuple:
    """Search the span-extraction threshold with Optuna for fixed char_probs.

    Args:
        oof: DataFrame whose ``pn_history`` column holds the texts matching
            ``char_probs`` row by row.
        char_probs: Per-row probability arrays (already ensembled).
        truths: Ground-truth spans as returned by ``create_labels_for_scoring``.
        timeout: Passed to ``study.optimize`` as its ``timeout``.

    Returns:
        tuple: ``(th, score)`` for the best threshold found.
    """
    texts = oof["pn_history"].to_numpy()  # loop-invariant, computed once

    def score_at(th):
        """Span micro F1 obtained when extracting spans at threshold ``th``."""
        preds = get_predictions(get_results(char_probs, texts, th=th))
        return get_score(truths, preds)

    def objective(trial: optuna.Trial) -> float:
        # suggest_float replaces the deprecated suggest_uniform (same range).
        th = trial.suggest_float("th", 0.0, 1.0)
        score = score_at(th)
        print(f"th:{th:.3f}  score:{score:.5f}")
        return score

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(multivariate=True, group=True, seed=42),
    )
    study.optimize(objective, timeout=timeout)

    print(f"best parameter = {study.best_trial.params}")

    th = study.best_trial.params["th"]
    print(f"best th: {th}")
    return th, score_at(th)

In [None]:
# Threshold search on the full OOF set.
# Average the per-fold weight vectors into the final ensemble weights.
w = np.zeros(len(weights[0]))
for weight in weights:
    w += np.array(weight)
w = list(w / len(weights))
char_probs = mix_char_probs(
    [exp099_char_probs, exp096_char_probs, exp098_char_probs, exp097_char_probs], w
)

# Build the labels for the full OOF set BEFORE any scoring below. Previously the
# first score was computed with the `truths` left over from the last fold of the
# weight-search loop, i.e. against a subset of rows that does not match `preds`.
# NOTE(review): assumes all four OOF frames hold the same rows in the same
# order, so labels/texts from oof096 line up with the mixed probabilities.
truths = create_labels_for_scoring(oof096)

## ===== For reference, also report the score at the mean of the fold thresholds =====
th = sum(ths) / len(ths)
results = get_results(char_probs, oof096["pn_history"].to_numpy(), th=th)
preds = get_predictions(results)
score = get_score(truths, preds)
print(f"th:{th:.3f}  score:{score:.5f}")
## ==================================================================


best_th, score = search_threshold(oof096, char_probs, truths, timeout=600)
print(f"final weights: {w}")
print(f"best_th: {best_th}, best_score: {score}")