In [1]:
import gc
import re
import os
import sys
import html
import time
import math
import json
import copy
import torch
import random
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F

from torch import Tensor
from math import ceil, floor
from sklearn.cluster import KMeans
from torch.utils.data import Dataset, DataLoader


# Wall-clock timestamp (seconds since the epoch) taken when this cell executes.
START_TIME = time.time()

In [2]:
%%time
# Install sacremoses from the local directory given below.
!pip install ../input/sacremoses/sacremoses-master/  
# Put the local transformers checkout first on sys.path so that the import
# below resolves to it before any other installed copy.
sys.path.insert(0, "../input/transformers/transformers-master/")
import transformers

Processing /kaggle/input/sacremoses/sacremoses-master
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sacremoses: filename=sacremoses-0.0.35-cp36-none-any.whl size=882724 sha256=91da4f9a0c40aff1ba26beb8eaff0106091816c07f189a1899a6af0f321aabe2
  Stored in directory: /root/.cache/pip/wheels/82/48/4b/05cb49d913a40c9d76f97931cd747d72fb17a77b0f6415cdba
Successfully built sacremoses
Installing collected packages: sacremoses
Successfully installed sacremoses-0.0.35
CPU times: user 2.53 s, sys: 535 ms, total: 3.07 s
Wall time: 35.5 s


# Params

In [3]:
# Model name -> directory holding that model's files.
# The model classes below pass the directory to `tokenizer_class.from_pretrained`
# and append 'bert_config.json' to it, so every path must keep its trailing slash.
MODEL_PATHS = {
    'bert-base-uncased': '../input/bertconfigs/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12/',
    'bert-base-cased':   '../input/bertconfigs/cased_L-12_H-768_A-12/cased_L-12_H-768_A-12/',
    'bert-large-uncased': '../input/bertconfigs/uncased_L-24_H-1024_A-16/uncased_L-24_H-1024_A-16/',
    'bert-large-cased': '../input/bertconfigs/cased_L-24_H-1024_A-16/cased_L-24_H-1024_A-16/',
    'bert-large-uncased-whole-word-masking': '../input/bertconfigs/wwm_uncased_L-24_H-1024_A-16/wwm_uncased_L-24_H-1024_A-16/',
    'bert-large-cased-whole-word-masking': '../input/bertconfigs/wwm_cased_L-24_H-1024_A-16/wwm_cased_L-24_H-1024_A-16/',
}

In [4]:
# Seed constant (presumably consumed by a seeding helper elsewhere -- TODO confirm).
SEED = 2019

# Input locations; both are used as string prefixes, hence the trailing slashes.
CP_PATH = "../input/qa-cp/"
DATA_PATH = "../input/google-quest-challenge/"

# Target names are every column of the sample submission except the first one.
sub = pd.read_csv(DATA_PATH + "sample_submission.csv")
TARGETS = list(sub.columns[1:])
NUM_TARGETS = len(TARGETS)

NUM_WORKERS = 1
VAL_BS = 64

# Token budgets for title / question / answer.
# 50 + 229 + 229 + 4 = 512, i.e. MAX_LEN equals the `4 + max_len_q + max_len_a + max_len_t`
# total that convert_text computes when given these budgets.
MAX_LEN_T = 50
MAX_LEN_Q = 229
MAX_LEN_A = 229
MAX_LEN = 512

# One "[tgt{i}]" marker token per target; convert_text_special puts them at the start of the sequence.
SPECIAL_TOKENS = [f"[tgt{i}]" for i in range(len(TARGETS))]

In [5]:
# List the entries of the pretrained-BERT input directory (displayed as the cell output).
os.listdir('../input/pretrained-bert-models-for-pytorch/')

['bert-base-chinese',
 'bert-large-cased-vocab.txt',
 'bert-base-chinese-vocab.txt',
 'bert-large-uncased-vocab.txt',
 'bert-base-multilingual-uncased',
 'bert-base-multilingual-uncased-vocab.txt',
 'bert-base-uncased',
 'bert-base-multilingual-cased',
 'bert-large-uncased',
 'bert-base-cased',
 'bert-base-cased-vocab.txt',
 'bert-large-cased',
 'bert-base-multilingual-cased-vocab.txt',
 'bert-base-uncased-vocab.txt']

## Embeddings

In [6]:
def create_emb_list(df, varname):
    """
    Build a value -> integer index lookup for one dataframe column.

    Index 0 is assigned to the key "unknown"; the distinct values of
    ``df[varname]`` are numbered from 1 in the order ``unique()`` returns them.

    Arguments:
        df -- dataframe containing the column
        varname -- name of the column to index

    Returns:
        dict mapping "unknown" and every distinct column value to an int
    """
    mapping = {"unknown": 0}
    mapping.update(
        (value, rank) for rank, value in enumerate(df[varname].unique(), start=1)
    )
    return mapping

In [7]:
# The training table is read here only to enumerate the distinct values of the
# `host` and `category` columns.
df_train = pd.read_csv(DATA_PATH + "train.csv")

# value -> index lookups (0 = "unknown"); QATestDataset.getembed reads both.
HOST_EMB_LIST = create_emb_list(df_train, "host")
CAT_EMB_LIST = create_emb_list(df_train, "category")

# Data

### Text cleaning

In [8]:
def clean_urls(x):
    """
    Replace links and @-mentions with placeholder words.

    Every run of non-space characters starting with ``http`` or ``www`` becomes
    ' URL ', and every run starting with ``@`` becomes ' USERNAME '.
    The substitutions are applied in that order.
    """
    substitutions = (
        (r'http\S+', ' URL '),
        (r'www\S+', ' URL '),
        (r'@\S+', ' USERNAME '),
    )
    for pattern, placeholder in substitutions:
        x = re.sub(pattern, placeholder, x)
    return x


def clean_apostrophes(x):
    """Replace the typographic apostrophe variants ’ ‘ ´ ` with a plain ASCII '."""
    for variant in ("’", "‘", "´", "`"):
        # Each variant is a literal character, so plain string replacement is enough.
        x = x.replace(variant, "'")
    return x


def clean_latex_tags(text):
    """
    Replace maths markup with the word 'MATHS'.

    Three delimiters are handled, in this order: ``[ math ] ... [ / math ]``,
    ``$$ ... $$`` and ``$ ... $``.

    The patterns are written as raw strings: the previous non-raw literals relied
    on invalid escape sequences such as ``'\\['`` and ``'\\$'``, which Python now
    reports as warnings. The regexes themselves are unchanged.

    NOTE(review): ``.+`` is greedy and does not cross newlines, so everything
    between the first and the last delimiter on a line collapses into a single
    'MATHS'. Kept as is so preprocessing stays identical.
    """
    text = re.sub(r'(\[ math \]).+(\[ / math \])', 'MATHS', text)
    text = re.sub(r'(\$\$).+(\$\$)', 'MATHS', text)
    text = re.sub(r'(\$).+(\$)', 'MATHS', text)
    return text


# Characters that clean_spaces turns into a regular space before collapsing
# whitespace runs.
spaces = ['\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8',
          '\u2061', '\x10', '\x7f', '\x9d', '\xad', '\xa0', '\t', '\n']


def clean_spaces(text):
    """
    Normalise whitespace.

    Every character listed in ``spaces`` is replaced by ' ', the result is
    stripped, and each remaining run of whitespace is collapsed to one space.

    The collapse pattern is a raw string: the previous ``'\\s+'`` literal
    relied on an invalid escape sequence.
    """
    for space in spaces:
        text = text.replace(space, ' ')
    text = text.strip()
    return re.sub(r'\s+', ' ', text)


def clean_numbers(text):
    """
    Normalise how numbers are written.

    1. Insert a space between a number and a letter that directly follows it
       ("5kg" -> "5 kg").
    2. Re-attach ordinal suffixes that step 1 split off ("1 st " -> "1st ").
    3. Remove commas that sit between two digits ("12,345,678" -> "12345678").

    Step 3 uses lookarounds: the previous ``(\\d+),(\\d+)`` pattern consumed the
    digits on both sides, so only every other separator was removed
    ("12,345,678" became "12345,678").
    Patterns and replacements are raw strings, because ``'\\g<1>'`` in a
    non-raw literal is an invalid escape sequence.
    """
    text = re.sub(r'(\d+)([a-zA-Z])', r'\g<1> \g<2>', text)
    text = re.sub(r'(\d+) (th|st|nd|rd) ', r'\g<1>\g<2> ', text)
    text = re.sub(r'(?<=\d),(?=\d)', '', text)
    return text


def clean_text(text):
    """
    Run the text through the cleaning steps, in order:
    apostrophes -> urls / usernames -> latex tags -> whitespace.

    html.unescape and clean_numbers are intentionally not part of the pipeline.
    """
    pipeline = (clean_apostrophes, clean_urls, clean_latex_tags, clean_spaces)
    for step in pipeline:
        text = step(text)
    return text

### Convert Texts

In [9]:
def trim_input(t, q, a, max_len_q=200, max_len_a=200, max_len_t=60):
    """
    Truncate title / question / answer token sequences to a shared budget.

    Budget a short title leaves unused is handed to the answer (rounded down)
    and the question (rounded up). Then, if the answer is shorter than its
    budget, the question may use what is left; otherwise, if the question is
    shorter than its budget, the answer may use what is left.

    Arguments:
        t, q, a -- sliceable token sequences (title, question, answer)
        max_len_q, max_len_a, max_len_t -- token budgets

    Returns:
        (t, q, a) sliced from the left to their final lengths
    """
    spare_t = max_len_t - len(t)
    if spare_t > 0:
        keep_t = len(t)
        max_len_a += floor(spare_t / 2)
        max_len_q += ceil(spare_t / 2)
    else:
        keep_t = max_len_t

    keep_q, keep_a = max_len_q, max_len_a
    if max_len_a > len(a):
        # Answer fits: the question inherits the answer's unused budget.
        keep_q += max_len_a - len(a)
        keep_a = len(a)
    elif max_len_q > len(q):
        # Question fits: the answer inherits the question's unused budget.
        keep_a += max_len_q - len(q)
        keep_q = len(q)

    return t[:keep_t], q[:keep_q], a[:keep_a]

In [10]:
def convert_text(
    title, question, answer, transformer, max_len_q=200, max_len_a=200, max_len_t=50,
):
    """
    Encode one sample as a single padded sequence
    ``[CLS] title [q] question [a] answer [SEP]``.

    Arguments:
        title, question, answer -- raw texts
        transformer -- object exposing a ``tokenizer`` with ``tokenize`` and
                       ``convert_tokens_to_ids``
        max_len_q, max_len_a, max_len_t -- token budgets given to trim_input

    Returns:
        (token ids, segment ids), both zero-padded to
        ``4 + max_len_q + max_len_a + max_len_t``.
        Segment ids are 0 for [CLS] + title, 1 for [q] + question and
        2 for [a] + answer + [SEP].
    """
    tokenizer = transformer.tokenizer
    total_len = 4 + max_len_q + max_len_a + max_len_t  # with sep tokens

    tokens_t, tokens_q, tokens_a = trim_input(
        tokenizer.tokenize(title),
        tokenizer.tokenize(question),
        tokenizer.tokenize(answer),
        max_len_t=max_len_t,
        max_len_q=max_len_q,
        max_len_a=max_len_a,
    )

    sequence = ["[CLS]", *tokens_t, "[q]", *tokens_q, "[a]", *tokens_a, "[SEP]"]
    token_ids = tokenizer.convert_tokens_to_ids(sequence)

    segment_ids = []
    for segment, width in (
        (0, 1 + len(tokens_t)),
        (1, 1 + len(tokens_q)),
        (2, 2 + len(tokens_a)),
    ):
        segment_ids += [segment] * width

    pad = [0] * (total_len - len(token_ids))

    return token_ids + pad, segment_ids + pad


def convert_text_sep(
    title,
    question,
    answer,
    transformer,
    max_len_q=512,
    max_len_a=512,
    max_len_t=50,
    use_special=True,
):
    """
    Encode the question and the answer as two separate padded sequences, each
    prefixed with the (truncated) title:

        <first> title [SEP] question [SEP]
        <first> title [SEP] answer   [SEP]

    ``<first>`` is "[q]" / "[a]" when ``use_special`` is true, "[CLS]" otherwise.

    Arguments:
        title, question, answer -- raw texts
        transformer -- object exposing a ``tokenizer`` with ``tokenize`` and
                       ``convert_tokens_to_ids``
        max_len_q, max_len_a -- output length of the question / answer sequence
        max_len_t -- maximum number of title tokens kept
        use_special -- use "[q]" / "[a]" instead of "[CLS]", and segment id 2
                       (instead of 1) for the answer body

    Returns:
        (question ids, answer ids, question segments, answer segments),
        each zero-padded to its ``max_len``.
        Segment id 0 covers the first token and the title.
    """
    tokenizer = transformer.tokenizer

    tokens_t = tokenizer.tokenize(title)[:max_len_t]
    question = tokenizer.tokenize(question)
    answer = tokenizer.tokenize(answer)

    def encode(first_token, body, max_len, body_segment):
        # Build, truncate and pad one sequence together with its segment ids.
        tokens = [first_token] + tokens_t + ["[SEP]"] + body
        tokens = tokens[: max_len - 1] + ["[SEP]"]
        ids = tokenizer.convert_tokens_to_ids(tokens)

        # Cap the title prefix at the sequence length: when truncation cuts into
        # the title, the segment list used to come out longer than the id list.
        n_prefix = min(1 + len(tokens_t), len(ids))
        segments = [0] * n_prefix + [body_segment] * (len(ids) - n_prefix)

        padding = [0] * (max_len - len(ids))
        return ids + padding, segments + padding

    first_q, first_a = ("[q]", "[a]") if use_special else ("[CLS]", "[CLS]")

    ids_q, segments_q = encode(first_q, question, max_len_q, 1)
    ids_a, segments_a = encode(first_a, answer, max_len_a, 2 if use_special else 1)

    return ids_q, ids_a, segments_q, segments_a


def convert_text_special(
    title,
    question,
    answer,
    transformer,
    max_len_q=200,
    max_len_a=200,
    max_len_t=50,
    augment=False,
    margin=20,
):
    """
    Encode one sample as a single padded sequence that starts with one marker
    token per target instead of [CLS]:

        [tgt0] ... [tgtN] title [q] question [a] answer [SEP]

    The question budget is reduced by ``len(SPECIAL_TOKENS) - 1`` to make room
    for the markers, so the total length equals the one used by convert_text.

    Arguments:
        title, question, answer -- raw texts
        transformer -- object exposing a ``tokenizer`` with ``tokenize`` and
                       ``convert_tokens_to_ids``
        max_len_q, max_len_a, max_len_t -- token budgets given to trim_input
        augment -- pass question and answer tokens through ``augment_text`` first
                   (NOTE(review): ``augment_text`` is not defined in the visible
                   part of this notebook -- confirm it exists before enabling)
        margin -- added to the budgets to form the ``min_len`` given to ``augment_text``

    Returns:
        (token ids, segment ids), both zero-padded to the same length.
        Segment ids are 0 for markers + title, 1 for [q] + question and
        2 for [a] + answer + [SEP].
    """
    nb_special = len(SPECIAL_TOKENS)
    max_len_q -= nb_special - 1

    # Markers + "[q]" + "[a]" + "[SEP]" + content budgets. This used to be a
    # hard-coded 33, which is only right for 30 targets; with more targets the
    # padding count went negative and rows came out with different lengths.
    max_len = nb_special + 3 + max_len_q + max_len_a + max_len_t

    title = transformer.tokenizer.tokenize(title)
    question = transformer.tokenizer.tokenize(question)
    answer = transformer.tokenizer.tokenize(answer)

    if augment:
        question = augment_text(question, min_len=max_len_q + margin)
        answer = augment_text(answer, min_len=max_len_a + margin)

    tokens_t, tokens_q, tokens_a = trim_input(
        title,
        question,
        answer,
        max_len_t=max_len_t,
        max_len_q=max_len_q,
        max_len_a=max_len_a,
    )

    tokens = (
        SPECIAL_TOKENS + tokens_t + ["[q]"] + tokens_q + ["[a]"] + tokens_a + ["[SEP]"]
    )
    token_ids = transformer.tokenizer.convert_tokens_to_ids(tokens)

    segments = (
        [0] * (nb_special + len(tokens_t))
        + [1] * (1 + len(tokens_q))
        + [2] * (2 + len(tokens_a))
    )

    padding = [0] * (max_len - len(token_ids))

    return token_ids + padding, segments + padding

### Datasets

In [11]:
class QATestDataset(Dataset):
    """
    Test dataset where title, question and answer are encoded as one sequence.

    Every row is encoded once in the constructor with convert_text, or with
    convert_text_special when ``special`` is true. With ``preprocess`` the three
    text columns are cleaned with clean_text on a copy of the dataframe first.

    Each item is a tuple of tensors:
    (token ids, segment ids, host index, category index, 0).
    """

    def __init__(
        self,
        df,
        transformer,
        max_len_q=200,
        max_len_a=200,
        max_len_t=60,
        special=False,
        preprocess=False,
    ):
        super().__init__()

        if preprocess:
            df = df.copy()  # leave the caller's dataframe untouched
            for column in ("question_title", "question_body", "answer"):
                df[column] = df[column].apply(clean_text)

        encode = convert_text_special if special else convert_text

        rows = zip(
            df["question_title"].values, df["question_body"].values, df["answer"].values
        )
        encoded = [
            encode(
                title,
                question,
                answer,
                transformer,
                max_len_q=max_len_q,
                max_len_a=max_len_a,
                max_len_t=max_len_t,
            )
            for title, question, answer in rows
        ]

        # `tokens` holds the token ids, `ids` the segment ids.
        self.tokens = np.array([token_ids for token_ids, _ in encoded])
        self.ids = np.array([segment_ids for _, segment_ids in encoded])
        self.df = df

    def __len__(self):
        return len(self.df)

    def getembed(self, idx):
        """Return the (host, category) indices of row ``idx``, using the "unknown" index for values missing from the lookups."""
        row = self.df.iloc[idx]

        host = HOST_EMB_LIST.get(row.host)
        cat = CAT_EMB_LIST.get(row.category)

        return (
            HOST_EMB_LIST.get("unknown") if host is None else host,
            CAT_EMB_LIST.get("unknown") if cat is None else cat,
        )

    def __getitem__(self, idx):
        host, cat = self.getembed(idx)

        # The trailing 0 fills the last slot of the tuple.
        sample = (self.tokens[idx], self.ids[idx], host, cat, 0)
        return tuple(torch.tensor(field) for field in sample)

In [12]:
class QATestDatasetSep(Dataset):
    """
    Test dataset where the question and the answer are encoded separately.

    Every row is encoded once in the constructor with convert_text_sep.
    Each item is a tuple of tensors:
    (question ids, answer ids, question segments, answer segments, 0).

    ``max_len_q`` and ``max_len_a`` are now forwarded to convert_text_sep.
    They used to be ignored in favour of a hard-coded 512, which is still the
    default, so callers relying on the defaults are unaffected.
    NOTE(review): a caller that passes other values now gets sequences of that
    length -- check against how the checkpoints were trained.
    """

    def __init__(
        self,
        df,
        transformer,
        max_len_q=512,
        max_len_a=512,
        max_len_t=60,
        special=False,
    ):
        super().__init__()
        self.df = df

        tokens_q, tokens_a, idxs_q, idxs_a = [], [], [], []

        for title, question, answer in zip(
            df["question_title"].values, df["question_body"].values, df["answer"].values
        ):
            tok_q, tok_a, idx_q, idx_a = convert_text_sep(
                title,
                question,
                answer,
                transformer,
                max_len_q=max_len_q,
                max_len_a=max_len_a,
                max_len_t=max_len_t,
                use_special=special,
            )

            tokens_q.append(tok_q)
            tokens_a.append(tok_a)
            idxs_q.append(idx_q)
            idxs_a.append(idx_a)

        self.tokens_q = np.array(tokens_q)
        self.tokens_a = np.array(tokens_a)
        self.idxs_q = np.array(idxs_q)
        self.idxs_a = np.array(idxs_a)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.tokens_q[idx]),
            torch.tensor(self.tokens_a[idx]),
            torch.tensor(self.idxs_q[idx]),
            torch.tensor(self.idxs_a[idx]),
            torch.tensor(0),  # fills the last slot of the tuple
        )

# Models

In [13]:
class BertMultiPooler(nn.Module):
    """
    Pool one token position out of several hidden-state layers.

    For each of the first ``nb_layers`` entries of ``hidden_states``, the vector
    at the requested position is passed through that layer's own
    Linear(input_size, nb_ft) + Tanh; the results are concatenated on the last
    dimension, giving a (batch, nb_layers * nb_ft) tensor.

    Arguments:
        nb_layers -- number of hidden states to pool
        input_size -- size of the hidden vectors
        nb_ft -- output size of each pooler
        drop_p -- kept for interface compatibility; no dropout is applied
        weights -- optional tensor copied into the Linear weight of every pooler
    """

    def __init__(self, nb_layers=1, input_size=768, nb_ft=768, drop_p=0.1, weights=None):
        super().__init__()

        self.nb_layers = nb_layers
        self.input_size = input_size
        self.poolers = nn.ModuleList([])

        for _ in range(nb_layers):
            pooler = nn.Sequential(
                nn.Linear(input_size, nb_ft),
                nn.Tanh(),
            )

            if weights is not None:
                with torch.no_grad():
                    pooler[0].weight = nn.Parameter(weights.clone())

            self.poolers.append(pooler)

    def forward(self, hidden_states, idx=0):
        """
        Arguments:
            hidden_states -- sequence of tensors indexed as (batch, position, input_size)
            idx -- position to pool: one int for the whole batch, or a tensor
                   with one position per sample

        Returns:
            tensor of shape (batch, nb_layers * nb_ft)
        """
        bs = hidden_states[0].size()[0]
        device = hidden_states[0].device

        if isinstance(idx, int):
            # Build the index on the device of the hidden states. The previous
            # unconditional `.cuda()` failed on CPU and could land on the wrong GPU.
            idx = torch.tensor([idx] * bs, device=device)
        else:
            idx = idx.to(device)

        # (batch,) -> (batch, 1, input_size) so that gather picks one full vector per sample.
        gather_idx = idx.view(-1, 1, 1).repeat(1, 1, self.input_size)

        outputs = []
        for pooler, state in zip(self.poolers, hidden_states[:self.nb_layers]):
            token_tensor = state.gather(1, gather_idx).view(bs, -1)
            outputs.append(pooler(token_tensor))

        return torch.cat(outputs, -1)

## Old models

In [14]:
class QA_TransformerSpecialOld(nn.Module):
    """
    Individual pooler and logit for each target.

    The BERT model is built from the JSON config only; no pretrained weights are
    loaded in this constructor. The vocabulary embedding is extended with rows
    for "[q]", "[a]" and every SPECIAL_TOKENS entry, and the token-type
    embedding is rebuilt with three rows (0, 0, 1 of the original).
    Target ``i`` is pooled at sequence position ``i``.
    """

    def __init__(self, model, nb_layers=1, pooler_ft=None):
        super().__init__()
        self.name = model

        _, tokenizer_class, _ = TRANSFORMERS[model]
        self.tokenizer = tokenizer_class.from_pretrained(MODEL_PATHS[model])

        config_file = MODEL_PATHS[model] + 'bert_config.json'
        bert_config = transformers.BertConfig.from_json_file(config_file)
        bert_config.num_labels = len(TARGETS)
        bert_config.output_hidden_states = True

        self.transformer = transformers.BertModel(bert_config)
        self.nb_features = self.transformer.pooler.dense.out_features

        embeddings = self.transformer.embeddings
        with torch.no_grad():
            base_vocab = embeddings.word_embeddings.weight

            # Two new rows for "[q]" and "[a]", copied from row 102.
            self.tokenizer.add_tokens(["[q]", "[a]"])
            sep_row = base_vocab[102].detach().view(1, -1)
            embeddings.word_embeddings = nn.Embedding.from_pretrained(
                torch.cat([base_vocab] + [sep_row] * 2)
            )

            # One new row per target marker, copied from row 101.
            self.tokenizer.add_tokens(SPECIAL_TOKENS)
            cls_row = base_vocab[101].detach().view(1, -1)
            extended_vocab = embeddings.word_embeddings.weight
            embeddings.word_embeddings = nn.Embedding.from_pretrained(
                torch.cat([extended_vocab] + [cls_row] * len(SPECIAL_TOKENS))
            )

            # Three token types: rows 0, 0 and 1 of the original table.
            type_rows = embeddings.token_type_embeddings.weight
            embeddings.token_type_embeddings = nn.Embedding.from_pretrained(
                type_rows.detach()[[0, 0, 1]].clone()
            )

        if pooler_ft is None:
            pooler_ft = self.nb_features

        self.pooler = nn.ModuleList([])
        self.logit = nn.ModuleList([])

        # Pooler and head are created alternately, one pair per target.
        for _ in range(NUM_TARGETS):
            self.pooler.append(
                BertMultiPooler(
                    nb_layers=nb_layers, input_size=self.nb_features, nb_ft=pooler_ft,
                )
            )
            self.logit.append(
                nn.Sequential(nn.Dropout(0.1), nn.Linear(pooler_ft * nb_layers, 1))
            )

    def forward(self, tokens, token_types, host, cat):
        """``host`` and ``cat`` are accepted but not used by this model."""
        _, _, hidden_states = self.transformer(
            tokens, attention_mask=(tokens > 0).long(), token_type_ids=token_types,
        )

        # Reverse so that index 0 is the last hidden state returned.
        hidden_states = hidden_states[::-1]

        per_target = [
            self.logit[i](self.pooler[i](hidden_states, idx=i))
            for i in range(NUM_TARGETS)
        ]

        return torch.cat(per_target, 1)

In [15]:
class QA_TransformerOld(nn.Module):
    """
    BERT encoder with a shared pooler at position 0 and one linear head over all targets.

    With ``use_special_tokens`` the features pooled at the "[q]" and "[a]"
    positions are concatenated to the position-0 features.

    The BERT model is built from the JSON config only; no pretrained weights are
    loaded in this constructor. When ``pooler_ft`` is None or equal to the
    hidden size, the poolers start from a copy of BERT's own pooler weight.
    """

    def __init__(self, model, nb_layers=1, pooler_ft=None, use_special_tokens=False):
        super().__init__()
        self.name = model
        ref = None
        self.use_special_tokens = use_special_tokens

        model_class, tokenizer_class, pretrained_weights = TRANSFORMERS[model]
        self.tokenizer = tokenizer_class.from_pretrained(MODEL_PATHS[model])

        bert_config = transformers.BertConfig.from_json_file(MODEL_PATHS[model] + 'bert_config.json')
        bert_config.num_labels = len(TARGETS)
        bert_config.output_hidden_states = True

        self.transformer = transformers.BertModel(bert_config)

        self.nb_features = self.transformer.pooler.dense.out_features

        with torch.no_grad():
            self.tokenizer.add_tokens(["[q]", "[a]"])
            # Remember the ids so forward() can locate the two markers itself.
            self.q_token_id, self.a_token_id = self.tokenizer.convert_tokens_to_ids(
                ["[q]", "[a]"]
            )

            # Two new vocabulary rows for "[q]" and "[a]", copied from row 102.
            w = self.transformer.embeddings.word_embeddings.weight
            sep_w = w[102].view(1, -1).detach()
            self.transformer.embeddings.word_embeddings = nn.Embedding.from_pretrained(
                torch.cat([w, sep_w.clone(), sep_w.clone()])
            )

            # Three token types: rows 0, 0 and 1 of the original table.
            w = self.transformer.embeddings.token_type_embeddings.weight
            self.transformer.embeddings.token_type_embeddings = nn.Embedding.from_pretrained(
                torch.cat(
                    [
                        w[0].view(1, -1).detach().clone(),
                        w[0].detach().view(1, -1).clone(),
                        w[1].detach().view(1, -1).clone(),
                    ],
                    0,
                )
            )

        if pooler_ft is None or pooler_ft == self.nb_features:
            pooler_ft = self.nb_features
            ref = self.transformer.pooler.dense.weight.detach().clone()

        self.pooler_all = BertMultiPooler(
            nb_layers=nb_layers,
            input_size=self.nb_features,
            nb_ft=pooler_ft,
            weights=ref,
        )

        if self.use_special_tokens:  # Use features corresponding to [q] and [a] tokens
            self.pooler_q = BertMultiPooler(
                nb_layers=nb_layers,
                input_size=self.nb_features,
                nb_ft=pooler_ft,
                weights=ref,
            )

            self.pooler_a = BertMultiPooler(
                nb_layers=nb_layers,
                input_size=self.nb_features,
                nb_ft=pooler_ft,
                weights=ref,
            )

            self.logit = nn.Linear(3 * pooler_ft * nb_layers, len(TARGETS))

        else:
            self.logit = nn.Linear(pooler_ft * nb_layers, len(TARGETS))

    @staticmethod
    def _first_position(tokens, token_id):
        # Per-row index of the first occurrence of token_id.
        # Rows that do not contain it have no meaningful position (0 on current PyTorch).
        return (tokens == token_id).long().argmax(dim=1)

    def forward(self, tokens, token_types, host, cat, q_idx=None, a_idx=None):
        """
        Arguments:
            tokens, token_types -- token ids and segment ids, one row per sample
            host, cat -- accepted but not used by this model
            q_idx, a_idx -- optional per-sample positions of "[q]" / "[a]".
                Only used with ``use_special_tokens``; when omitted they are
                located in ``tokens``. (These names used to be read without
                being defined, so that mode raised a NameError.)

        Returns:
            tensor with one column per target
        """
        _, _, hidden_states = self.transformer(
            tokens, attention_mask=(tokens > 0).long(), token_type_ids=token_types,
        )

        # Reverse so that index 0 is the last hidden state returned.
        hidden_states = hidden_states[::-1]

        ft = self.pooler_all(hidden_states, 0)

        if self.use_special_tokens:
            if q_idx is None:
                q_idx = self._first_position(tokens, self.q_token_id)
            if a_idx is None:
                a_idx = self._first_position(tokens, self.a_token_id)

            ft_q = self.pooler_q(hidden_states, q_idx)
            ft_a = self.pooler_a(hidden_states, a_idx)
            ft = torch.cat([ft, ft_q, ft_a], 1)

        return self.logit(ft)

In [16]:
class QA_TransformerMixOld(nn.Module):
    """
    One BERT encoder applied separately to the question and answer sequences,
    with one linear head per target.

    ``self.mix`` decides what each head sees: 0 = question and answer features
    concatenated, 1 = question features only, 2 = answer features only.

    The BERT model is built from the JSON config only; no pretrained weights are
    loaded in this constructor.
    """

    def __init__(self, model, nb_layers=1, pooler_ft=None, use_special_tokens=False):
        super().__init__()
        self.name = model
        self.use_special_tokens = use_special_tokens

        _, tokenizer_class, _ = TRANSFORMERS[model]
        self.tokenizer = tokenizer_class.from_pretrained(MODEL_PATHS[model])

        config_file = MODEL_PATHS[model] + 'bert_config.json'
        bert_config = transformers.BertConfig.from_json_file(config_file)
        bert_config.num_labels = len(TARGETS)
        bert_config.output_hidden_states = True

        self.transformer = transformers.BertModel(bert_config)
        self.nb_features = self.transformer.pooler.dense.out_features

        if pooler_ft is None:
            pooler_ft = self.nb_features

        embeddings = self.transformer.embeddings
        with torch.no_grad():
            # Two new vocabulary rows for "[q]" and "[a]", copied from row 101.
            self.tokenizer.add_tokens(["[q]", "[a]"])
            vocab_rows = embeddings.word_embeddings.weight
            new_row = vocab_rows[101].detach().view(1, -1)
            embeddings.word_embeddings = nn.Embedding.from_pretrained(
                torch.cat([vocab_rows] + [new_row] * 2)
            )

            # Three token types: rows 0, 0 and 1 of the original table.
            type_rows = embeddings.token_type_embeddings.weight
            embeddings.token_type_embeddings = nn.Embedding.from_pretrained(
                type_rows.detach()[[0, 0, 1]].clone()
            )

        pooler_args = dict(
            nb_layers=nb_layers, input_size=self.nb_features, nb_ft=pooler_ft,
        )
        self.pooler_a = BertMultiPooler(**pooler_args)
        self.pooler_q = BertMultiPooler(**pooler_args)

        # 0 = both, 1 = q, 2 = a
        self.mix = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 2]
        assert len(self.mix) == len(TARGETS)

        # A head that sees both sides takes twice as many input features.
        width = pooler_ft * nb_layers
        self.logit = nn.ModuleList([])
        for mode in self.mix:
            self.logit.append(nn.Linear(width if mode > 0 else 2 * width, 1))

    def forward(self, tokens_q, tokens_a, token_types_q, token_types_a):
        _, _, hidden_states_q = self.transformer(
            tokens_q, attention_mask=(tokens_q > 0).long(), token_type_ids=token_types_q,
        )

        _, _, hidden_states_a = self.transformer(
            tokens_a, attention_mask=(tokens_a > 0).long(), token_type_ids=token_types_a,
        )

        # Reverse so that index 0 is the last hidden state returned; pool position 0.
        ft_q = self.pooler_q(hidden_states_q[::-1], 0)
        ft_a = self.pooler_a(hidden_states_a[::-1], 0)

        def features_for(mode):
            if mode == 0:
                return torch.cat([ft_q, ft_a], -1)
            return ft_q if mode == 1 else ft_a

        outs = [head(features_for(mode)) for head, mode in zip(self.logit, self.mix)]

        return torch.cat(outs, -1)

## New Models

In [17]:
class QA_TransformerMix(nn.Module):
    """
    One BERT encoder applied separately to the question and answer sequences,
    with one linear head per target.

    ``self.mix`` decides what each head sees: 0 = question and answer features
    concatenated, 1 = question features only, 2 = answer features only.

    The BERT model is built from the JSON config only; no pretrained weights are
    loaded in this constructor.
    """

    def __init__(self, model, nb_layers=1, pooler_ft=None, use_special_tokens=False):
        super().__init__()
        self.name = model
        self.use_special_tokens = use_special_tokens

        _, tokenizer_class, _ = TRANSFORMERS[model]
        self.tokenizer = tokenizer_class.from_pretrained(MODEL_PATHS[model])

        config_file = MODEL_PATHS[model] + 'bert_config.json'
        bert_config = transformers.BertConfig.from_json_file(config_file)
        bert_config.num_labels = len(TARGETS)
        bert_config.output_hidden_states = True

        self.transformer = transformers.BertModel(bert_config)
        self.nb_features = self.transformer.pooler.dense.out_features

        if pooler_ft is None:
            pooler_ft = self.nb_features

        embeddings = self.transformer.embeddings
        with torch.no_grad():
            # Two new vocabulary rows for "[q]" and "[a]", copied from row 101.
            self.tokenizer.add_tokens(["[q]", "[a]"])
            vocab_rows = embeddings.word_embeddings.weight
            new_row = vocab_rows[101].detach().view(1, -1)
            embeddings.word_embeddings = nn.Embedding.from_pretrained(
                torch.cat([vocab_rows] + [new_row] * 2)
            )

            # Three token types: rows 0, 0 and 1 of the original table.
            type_rows = embeddings.token_type_embeddings.weight
            embeddings.token_type_embeddings = nn.Embedding.from_pretrained(
                type_rows.detach()[[0, 0, 1]].clone()
            )

        pooler_args = dict(
            nb_layers=nb_layers, input_size=self.nb_features, nb_ft=pooler_ft,
        )
        self.pooler_a = BertMultiPooler(**pooler_args)
        self.pooler_q = BertMultiPooler(**pooler_args)

        # 0 = both, 1 = q, 2 = a   > I have to work on this
        self.mix = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 2]
        assert len(self.mix) == len(TARGETS)

        # A head that sees both sides takes twice as many input features.
        width = pooler_ft * nb_layers
        self.logit = nn.ModuleList([])
        for mode in self.mix:
            self.logit.append(nn.Linear(width if mode > 0 else 2 * width, 1))

    def forward(self, tokens_q, tokens_a, token_types_q, token_types_a):
        _, _, hidden_states_q = self.transformer(
            tokens_q,
            attention_mask=(tokens_q > 0).long(),
            token_type_ids=token_types_q,
        )

        _, _, hidden_states_a = self.transformer(
            tokens_a,
            attention_mask=(tokens_a > 0).long(),
            token_type_ids=token_types_a,
        )

        # Reverse so that index 0 is the last hidden state returned; pool position 0.
        ft_q = self.pooler_q(hidden_states_q[::-1], 0)
        ft_a = self.pooler_a(hidden_states_a[::-1], 0)

        def features_for(mode):
            if mode == 0:
                return torch.cat([ft_q, ft_a], -1)
            return ft_q if mode == 1 else ft_a

        outs = [head(features_for(mode)) for head, mode in zip(self.logit, self.mix)]

        return torch.cat(outs, -1)


In [18]:
class QA_TransformerFt(nn.Module):
    """
    BERT encoder pooled at position 0, concatenated with learned embeddings of
    the category and host indices, followed by one linear head over all targets.

    The BERT model is built from the JSON config only; no pretrained weights are
    loaded in this constructor. ``use_special_tokens`` is accepted but not used.
    """

    def __init__(self, model, nb_layers=1, pooler_ft=None, use_special_tokens=False):
        super().__init__()
        self.name = model

        _, tokenizer_class, _ = TRANSFORMERS[model]
        self.tokenizer = tokenizer_class.from_pretrained(MODEL_PATHS[model])

        config_file = MODEL_PATHS[model] + 'bert_config.json'
        bert_config = transformers.BertConfig.from_json_file(config_file)
        bert_config.num_labels = len(TARGETS)
        bert_config.output_hidden_states = True

        self.transformer = transformers.BertModel(bert_config)
        self.nb_features = self.transformer.pooler.dense.out_features

        if pooler_ft is None:
            pooler_ft = self.nb_features

        embeddings = self.transformer.embeddings
        with torch.no_grad():
            # Two new vocabulary rows for "[q]" and "[a]", copied from row 102.
            self.tokenizer.add_tokens(["[q]", "[a]"])
            vocab_rows = embeddings.word_embeddings.weight
            sep_row = vocab_rows[102].detach().view(1, -1)
            embeddings.word_embeddings = nn.Embedding.from_pretrained(
                torch.cat([vocab_rows] + [sep_row] * 2)
            )

            # Three token types: rows 0, 0 and 1 of the original table.
            type_rows = embeddings.token_type_embeddings.weight
            embeddings.token_type_embeddings = nn.Embedding.from_pretrained(
                type_rows.detach()[[0, 0, 1]].clone()
            )

        self.pooler_all = BertMultiPooler(
            nb_layers=nb_layers, input_size=self.nb_features, nb_ft=pooler_ft,
        )

        # Embedding tables: 64 host indices and 6 category indices, 64 features each.
        # NOTE(review): presumably sized to the HOST_EMB_LIST / CAT_EMB_LIST lookups -- confirm.
        host_ft = 64
        cat_ft = 64
        self.host_emb = nn.Embedding(64, host_ft)
        self.cat_emb = nn.Embedding(6, cat_ft)

        self.logit = nn.Linear(pooler_ft * nb_layers + host_ft + cat_ft, len(TARGETS))

    def forward(self, tokens, token_types, host, cat):
        _, _, hidden_states = self.transformer(
            tokens, attention_mask=(tokens > 0).long(), token_type_ids=token_types,
        )

        # Reverse so that index 0 is the last hidden state returned; pool position 0.
        text_ft = self.pooler_all(hidden_states[::-1], 0)

        # Order of the concatenation: text, category, host.
        meta_ft = [torch.tanh(self.cat_emb(cat)), torch.tanh(self.host_emb(host))]

        return self.logit(torch.cat([text_ft, *meta_ft], 1))

In [19]:
class QA_TransformerSpecial(nn.Module):
    """
    Individual pooler and logit for each target.

    The BERT model is built from the JSON config only; no pretrained weights are
    loaded in this constructor. The vocabulary embedding is extended with rows
    for "[q]", "[a]" and every SPECIAL_TOKENS entry, and the token-type
    embedding is rebuilt with three rows (0, 0, 1 of the original).
    Target ``i`` is pooled at sequence position ``i``.
    """

    def __init__(self, model, nb_layers=1, pooler_ft=None):
        super().__init__()
        self.name = model

        _, tokenizer_class, _ = TRANSFORMERS[model]
        self.tokenizer = tokenizer_class.from_pretrained(MODEL_PATHS[model])

        config_file = MODEL_PATHS[model] + 'bert_config.json'
        bert_config = transformers.BertConfig.from_json_file(config_file)
        bert_config.num_labels = len(TARGETS)
        bert_config.output_hidden_states = True

        self.transformer = transformers.BertModel(bert_config)
        self.nb_features = self.transformer.pooler.dense.out_features

        embeddings = self.transformer.embeddings
        with torch.no_grad():
            base_vocab = embeddings.word_embeddings.weight

            # Two new rows for "[q]" and "[a]", copied from row 102.
            self.tokenizer.add_tokens(["[q]", "[a]"])
            sep_row = base_vocab[102].detach().view(1, -1)
            embeddings.word_embeddings = nn.Embedding.from_pretrained(
                torch.cat([base_vocab] + [sep_row] * 2)
            )

            # One new row per target marker, copied from row 101.
            self.tokenizer.add_tokens(SPECIAL_TOKENS)
            cls_row = base_vocab[101].detach().view(1, -1)
            extended_vocab = embeddings.word_embeddings.weight
            embeddings.word_embeddings = nn.Embedding.from_pretrained(
                torch.cat([extended_vocab] + [cls_row] * len(SPECIAL_TOKENS))
            )

            # Three token types: rows 0, 0 and 1 of the original table.
            type_rows = embeddings.token_type_embeddings.weight
            embeddings.token_type_embeddings = nn.Embedding.from_pretrained(
                type_rows.detach()[[0, 0, 1]].clone()
            )

        if pooler_ft is None:
            pooler_ft = self.nb_features

        self.pooler = nn.ModuleList([])
        self.logit = nn.ModuleList([])

        # Pooler and head are created alternately, one pair per target.
        for _ in range(NUM_TARGETS):
            self.pooler.append(
                BertMultiPooler(
                    nb_layers=nb_layers, input_size=self.nb_features, nb_ft=pooler_ft,
                )
            )
            self.logit.append(
                nn.Sequential(nn.Dropout(0.1), nn.Linear(pooler_ft * nb_layers, 1))
            )

    def forward(self, tokens, token_types, host, cat):
        """``host`` and ``cat`` are accepted but not used by this model."""
        _, _, hidden_states = self.transformer(
            tokens, attention_mask=(tokens > 0).long(), token_type_ids=token_types,
        )

        # Reverse so that index 0 is the last hidden state returned.
        hidden_states = hidden_states[::-1]

        per_target = [
            self.logit[i](self.pooler[i](hidden_states, idx=i))
            for i in range(NUM_TARGETS)
        ]

        return torch.cat(per_target, 1)

In [20]:
class QA_TransformerDouble(nn.Module):
    """Two-encoder BERT model: one encoder for the question, one for the answer.

    Each encoder's hidden states are pooled by its own ``BertMultiPooler`` and
    every target gets its own linear head. ``self.mix`` decides, per target,
    which pooled features feed the head:
    0 = question and answer concatenated, 1 = question only, 2 = answer only.

    Args:
        model: key into ``TRANSFORMERS`` / ``MODEL_PATHS``.
        nb_layers: forwarded to ``BertMultiPooler``; also scales the width of
            the head inputs (``pooler_ft * nb_layers``).
        pooler_ft: pooler feature size; defaults to the encoder's pooler
            output size.
        use_special_tokens: stored on the instance; not otherwise used here.
    """

    def __init__(self, model, nb_layers=1, pooler_ft=None, use_special_tokens=False):
        super().__init__()
        self.name = model
        self.use_special_tokens = use_special_tokens

        # Only the tokenizer class is needed from the registry: both encoders
        # are instantiated directly from the local config file below.
        _, tokenizer_class, _ = TRANSFORMERS[model]
        self.tokenizer = tokenizer_class.from_pretrained(MODEL_PATHS[model])

        bert_config = transformers.BertConfig.from_json_file(MODEL_PATHS[model] + 'bert_config.json')
        bert_config.num_labels = len(TARGETS)
        bert_config.output_hidden_states = True

        # Submodule names and creation order are kept as-is so existing
        # checkpoints keep loading.
        self.transformer = transformers.BertModel(bert_config)
        self.transformer2 = transformers.BertModel(bert_config)

        self.nb_features = self.transformer.pooler.dense.out_features

        if pooler_ft is None:
            pooler_ft = self.nb_features

        self.pooler_a = BertMultiPooler(
            nb_layers=nb_layers, input_size=self.nb_features, nb_ft=pooler_ft,
        )

        self.pooler_q = BertMultiPooler(
            nb_layers=nb_layers, input_size=self.nb_features, nb_ft=pooler_ft,
        )

        # 0 = both, 1 = q, 2 = a
        # Earlier experiments:
        # self.mix = [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 2]
        # self.mix = [1] * 21 + [2] * 9
        self.mix = [0] * 30

        assert len(self.mix) == len(TARGETS)

        # A head fed by both towers sees twice the features of a single tower.
        single_width = pooler_ft * nb_layers
        self.logit = nn.ModuleList([])
        for mode in self.mix:
            in_features = 2 * single_width if mode == 0 else single_width
            self.logit.append(nn.Linear(in_features, 1))

    def forward(self, tokens_q, tokens_a, token_types_q, token_types_a):
        """Encode question and answer separately and return per-target logits.

        Positions whose token id is 0 are masked out of attention in both
        encoders.

        Returns:
            The head outputs concatenated on the last axis, one column per
            entry of ``self.mix``.
        """
        _, _, hidden_states_q = self.transformer(
            tokens_q,
            attention_mask=(tokens_q > 0).long(),
            token_type_ids=token_types_q,
        )

        _, _, hidden_states_a = self.transformer2(
            tokens_a,
            attention_mask=(tokens_a > 0).long(),
            token_type_ids=token_types_a,
        )

        # Reverse so that index 0 is the last entry returned by each encoder.
        hidden_states_q = hidden_states_q[::-1]
        hidden_states_a = hidden_states_a[::-1]

        ft_q = self.pooler_q(hidden_states_q, 0)
        ft_a = self.pooler_a(hidden_states_a, 0)

        # The concatenated features are identical for every "both" head, so
        # build them once instead of once per target.
        ft_both = None
        if any(mode == 0 for mode in self.mix):
            ft_both = torch.cat([ft_q, ft_a], -1)

        outs = []
        for head, mode in zip(self.logit, self.mix):
            if mode == 0:
                features = ft_both
            elif mode == 1:
                features = ft_q
            else:
                features = ft_a
            outs.append(head(features))

        return torch.cat(outs, -1)

# Tools

In [21]:
def load_model_weights(model, filename, verbose=1, cp_folder='', strict=True):
    """Load a saved state dict from disk into ``model`` and return the model.

    Args:
        model: module whose ``load_state_dict`` is called.
        filename: checkpoint file name.
        verbose: when truthy, print the path being loaded.
        cp_folder: folder joined in front of ``filename``.
        strict: forwarded to ``load_state_dict``.

    Returns:
        The same ``model`` object, with the loaded weights.

    Raises:
        Whatever ``torch.load`` or ``load_state_dict`` raise (missing file,
        key mismatch under ``strict=True``, ...).
    """
    path = os.path.join(cp_folder, filename)
    if verbose:
        print(f'\n -> Loading weights from {path}\n')

    # Deserialize on CPU first; load_state_dict then copies the tensors into
    # the model's own parameters, wherever they live.
    state_dict = torch.load(path, map_location='cpu')
    model.load_state_dict(state_dict, strict=strict)
    return model

In [22]:
def predict(model, dataset, batch_size=8, sep=False, num_workers=1):
    """Return sigmoid-activated model outputs for every item of ``dataset``.

    Args:
        model: module called on each batch; switched to eval mode here.
        dataset: yields 5-tuples. With ``sep=False`` the layout is
            ``(tokens, idx, host, cat, y)``; with ``sep=True`` it is
            ``(tokens_q, tokens_a, idx_q, idx_a, y)``. The fifth element is
            ignored.
        batch_size: loader batch size.
        sep: selects which of the two batch layouts / call signatures is used.
        num_workers: loader worker count.

    Returns:
        float64 array with one row per dataset item, in dataset order. An
        empty dataset yields an array of shape ``(0, NUM_TARGETS)``.
    """
    model.eval()
    loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    # Send inputs to wherever the model lives instead of assuming CUDA.
    try:
        device = next(model.parameters()).device
    except StopIteration:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Collect per-batch arrays and concatenate once at the end; growing the
    # result inside the loop re-copies everything on each step.
    batches = []
    with torch.no_grad():
        for data in loader:
            if sep:
                tokens_q, tokens_a, idx_q, idx_a, _ = data
                y_pred = model(
                    tokens_q.to(device), tokens_a.to(device), idx_q.to(device), idx_a.to(device)
                )
            else:
                tokens, idx, host, cat, _ = data
                y_pred = model(
                    tokens.to(device), idx.to(device), host.to(device), cat.to(device)
                )

            batches.append(torch.sigmoid(y_pred).cpu().numpy())

    del loader
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if not batches:
        return np.empty((0, NUM_TARGETS))

    # Callers accumulate these into float64 buffers; keep the dtype the
    # original concatenation with a float64 seed array produced.
    return np.concatenate(batches).astype(np.float64)

In [23]:
def inference(model, dataset, weights_folder, sep=False, batch_size=VAL_BS, num_workers=NUM_WORKERS):
    """Average the predictions of every checkpoint found in ``weights_folder``.

    A file counts as a checkpoint when its name contains ``.pt`` or ``.bin``;
    exactly five such files are required. Each one is loaded into ``model``
    in turn and its predictions contribute an equal share to the result.

    Returns:
        Array of shape ``(len(dataset), NUM_TARGETS)``.
    """
    averaged = np.zeros((len(dataset), NUM_TARGETS))

    checkpoint_names = []
    for entry in os.listdir(weights_folder):
        if '.pt' in entry or '.bin' in entry:
            checkpoint_names.append(entry)
    assert len(checkpoint_names) == 5

    n_checkpoints = len(checkpoint_names)
    for checkpoint in checkpoint_names:
        model = load_model_weights(model, checkpoint, cp_folder=weights_folder)
        fold_preds = predict(
            model, dataset, sep=sep, batch_size=batch_size, num_workers=num_workers
        )
        averaged += fold_preds / n_checkpoints

    return averaged

# Inference

In [24]:
# Registry of supported checkpoints: name -> (model class, tokenizer class,
# pretrained-weights name). Every entry is a plain BERT variant, so only the
# name changes from one entry to the next.
TRANSFORMERS = {
    checkpoint: (transformers.BertModel, transformers.BertTokenizer, checkpoint)
    for checkpoint in (
        'bert-base-uncased',
        'bert-base-cased',
        'bert-large-uncased',
        'bert-large-cased',
        'bert-large-uncased-whole-word-masking',
    )
}

In [25]:
# Test split of the competition data; every model below predicts on it.
df_test = pd.read_csv(DATA_PATH + "test.csv")
# Accumulates one prediction array per model run below; averaged in the
# post-processing section.
pred_tests = []

## Bert Base - QA_TransformerOld
> '28_01': '../output/pred_oof2020-01-28_0.405.npy',

In [26]:
%%time

# Build the model skeleton on the GPU; its weights are overwritten by each
# checkpoint that `inference` loads from the folder below.
model = QA_TransformerOld('bert-base-uncased', nb_layers=8, pooler_ft=1024).cuda()
# NOTE(review): the model is passed to the dataset, presumably for its tokenizer - confirm in QATestDataset.
test_dataset = QATestDataset(df_test, model, max_len_q=MAX_LEN_Q, max_len_a=MAX_LEN_A, max_len_t=MAX_LEN_T)

# `inference` requires exactly five checkpoint files in the folder and returns
# the mean of their predictions.
pred_test = inference(model, test_dataset, '../input/specialdataset/0.405/2020-01-23/')
pred_tests.append(pred_test)


 -> Loading weights from ../input/specialdataset/0.405/2020-01-23/bert-base-uncased_2_cp.pt


 -> Loading weights from ../input/specialdataset/0.405/2020-01-23/bert-base-uncased_3_cp.pt


 -> Loading weights from ../input/specialdataset/0.405/2020-01-23/bert-base-uncased_5_cp.pt


 -> Loading weights from ../input/specialdataset/0.405/2020-01-23/bert-base-uncased_1_cp.pt


 -> Loading weights from ../input/specialdataset/0.405/2020-01-23/bert-base-uncased_4_cp.pt

CPU times: user 36.8 s, sys: 21.1 s, total: 57.8 s
Wall time: 1min 4s


## Bert Base - QA_TransformerSpecialOld
> '25_01': '../output/pred_oof2020-01-25_0.406.npy',

In [27]:
# %%time

# model = QA_TransformerSpecialOld('bert-base-uncased', nb_layers=8, pooler_ft=128).cuda()
# test_dataset_special = QATestDataset(df_test, model, max_len_q=MAX_LEN_Q, max_len_a=MAX_LEN_A, max_len_t=MAX_LEN_T, special=True)

# pred_test = inference(model, test_dataset_special, '../input/specialdataset/0.4447/2020-01-28/')
# pred_tests.append(pred_test)

## Bert Base - QA_TransformerMixOld
> '01_02': '../output/pred_oof2020-02-01_0.411.npy',

In [28]:
# %%time

# model = QA_TransformerMixOld('bert-base-uncased', nb_layers=8, pooler_ft=512).cuda()
# test_dataset_sep = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=True)

# pred_test = inference(model, test_dataset_sep, '../input/specialdataset/2020-02-01/2020-02-01/', sep=True)
# pred_tests.append(pred_test)

## Bert Base - QA_TransformerFt
> 'embed': '../output/pred_oof2020-02-06_0.4089.npy',

In [29]:
# %%time

# model = QA_TransformerFt('bert-base-uncased', nb_layers=8, pooler_ft=1024).cuda()
# test_dataset_pp = QATestDataset(df_test, model, max_len_q=MAX_LEN_Q, max_len_a=MAX_LEN_A, max_len_t=MAX_LEN_T, preprocess=True)

# pred_test = inference(model, test_dataset_pp, '../input/bertbase06022/2020-02-06/')
# pred_tests.append(pred_test)

## Bert Base - QA_TransformerFt Weighted BCE
> 'embedw' : '../output/pred_oof2020-02-07_0.4088.npy',

In [30]:
%%time

# Same pipeline as the first Bert Base run, but with the QA_TransformerFt
# architecture and a dataset built with preprocess=True.
model = QA_TransformerFt('bert-base-uncased', nb_layers=8, pooler_ft=1024).cuda()
test_dataset_pp = QATestDataset(df_test, model, max_len_q=MAX_LEN_Q, max_len_a=MAX_LEN_A, max_len_t=MAX_LEN_T, preprocess=True)

# Mean of the predictions of the five checkpoints found in the folder.
pred_test = inference(model, test_dataset_pp, '../input/qa-bert-weightedbce/2020-02-07/')
pred_tests.append(pred_test)


 -> Loading weights from ../input/qa-bert-weightedbce/2020-02-07/bert-base-uncased_2_cp.pt


 -> Loading weights from ../input/qa-bert-weightedbce/2020-02-07/bert-base-uncased_3_cp.pt


 -> Loading weights from ../input/qa-bert-weightedbce/2020-02-07/bert-base-uncased_5_cp.pt


 -> Loading weights from ../input/qa-bert-weightedbce/2020-02-07/bert-base-uncased_1_cp.pt


 -> Loading weights from ../input/qa-bert-weightedbce/2020-02-07/bert-base-uncased_4_cp.pt

CPU times: user 32.6 s, sys: 19.4 s, total: 52 s
Wall time: 54 s


## Bert Base - QA_TransformerSpecial
> 'special':   '../output/pred_oof2020-02-06_0.4059.npy',

In [31]:
# %%time

# model = QA_TransformerSpecial('bert-base-uncased', nb_layers=8, pooler_ft=128).cuda()
# test_dataset_special = QATestDataset(df_test, model, max_len_q=MAX_LEN_Q, max_len_a=MAX_LEN_A, max_len_t=MAX_LEN_T, special=True)

# pred_test = inference(model, test_dataset_special, '../input/bertbase0602/0.4059/2020-02-06/')
# pred_tests.append(pred_test)

## Bert Base - QA_TransformerSpecial Weighted BCE
> 'specialw' : '../output/pred_oof2020-02-07_0.4058.npy',

In [32]:
# %%time

# model = QA_TransformerSpecial('bert-base-uncased', nb_layers=8, pooler_ft=128).cuda()
# test_dataset_special = QATestDataset(df_test, model, max_len_q=MAX_LEN_Q, max_len_a=MAX_LEN_A, max_len_t=MAX_LEN_T, special=True)

# pred_test = inference(model, test_dataset_special, '../input/bertbase06022/0.4058/2020-02-08/')
# pred_tests.append(pred_test)

## Bert Base - QA_TransformerMix
> 'mix':     '../output/pred_oof2020-02-06_0.414.npy',

In [33]:
# %%time

# model = QA_TransformerMix('bert-base-uncased', nb_layers=8, pooler_ft=512).cuda()
# test_dataset_sep = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=True)

# pred_test = inference(model, test_dataset_sep, '../input/bertbase0602/0.414/2020-02-05/', sep=True)
# pred_tests.append(pred_test)

## Bert Base - QA_TransformerMix Weighted BCE
> 'mixw' : '../output/pred_oof2020-02-07_0.414-wbce.npy',

In [34]:
# %%time

# model = QA_TransformerMix('bert-base-uncased', nb_layers=8, pooler_ft=512).cuda()
# test_dataset_sep = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=True)

# pred_test = inference(model, test_dataset_sep, '../input/weightedbce-oof-04465/', sep=True)
# pred_tests.append(pred_test)

## Bert Base - QA_TransformerDouble Weighted BCE
> 'doublew': '../output/pred_oof2020-02-07_0.4111.npy',

In [35]:
# %%time

# model = QA_TransformerDouble('bert-base-uncased', nb_layers=8, pooler_ft=512).cuda()
# test_dataset_double = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=False)

# pred_test = inference(model, test_dataset_double, '../input/qadoubleweighted/2020-02-07/', sep=True)
# pred_tests.append(pred_test)

## Bert Large WWM - QA_TransformerMix
> 'wwm' : '../output/pred_oof2020-02-06_0.416-wwm.npy',

In [36]:
# %%time



# model = QA_TransformerMix('bert-large-uncased-whole-word-masking', nb_layers=8, pooler_ft=512).cuda()
# test_dataset_sep_wwm = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=True)

# pred_test = inference(model, test_dataset_sep_wwm, '../input/questqabertlarge/bert-large-custom/', sep=True)
# pred_tests.append(pred_test)

## Bert Large WWM - QA_TransformerMix 2
> 'wwm2' : '../output/pred_oof2020-02-08_0.416_wwm_v2.npy',

In [37]:
# %%time

# model = QA_TransformerMix('bert-large-uncased-whole-word-masking', nb_layers=8, pooler_ft=768).cuda()
# test_dataset_sep_wwm = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=True)

# pred_test = inference(model, test_dataset_sep_wwm, '../input/questqabertlargev2/bert-large-custom/', sep=True)
# pred_tests.append(pred_test)

## Bert Large WWM - QA_TransformerMix Weighted BCE
> 'wwm3': '../input/questqabertlargev3/pred_oof2020-02-08_0.415.npy'

In [38]:
%%time

# Bert Large (whole-word-masking) with the QA_TransformerMix architecture.
model = QA_TransformerMix('bert-large-uncased-whole-word-masking', nb_layers=8, pooler_ft=768).cuda()
# The "Sep" dataset is paired with sep=True below, i.e. `predict` unpacks
# separate question / answer token tensors from each batch.
test_dataset_sep_wwm = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=True)

# Mean of the predictions of the five checkpoints found in the folder.
pred_test = inference(model, test_dataset_sep_wwm, '../input/questqabertlargev3/bert-large-custom/bert-large-custom/', sep=True)
pred_tests.append(pred_test)


 -> Loading weights from ../input/questqabertlargev3/bert-large-custom/bert-large-custom/bert_pytorch_fold_1.bin


 -> Loading weights from ../input/questqabertlargev3/bert-large-custom/bert-large-custom/bert_pytorch_fold_4.bin


 -> Loading weights from ../input/questqabertlargev3/bert-large-custom/bert-large-custom/bert_pytorch_fold_2.bin


 -> Loading weights from ../input/questqabertlargev3/bert-large-custom/bert-large-custom/bert_pytorch_fold_0.bin


 -> Loading weights from ../input/questqabertlargev3/bert-large-custom/bert-large-custom/bert_pytorch_fold_3.bin

CPU times: user 2min 42s, sys: 1min 56s, total: 4min 39s
Wall time: 4min 44s


In [39]:
# Keep column 19 of the most recently computed `pred_test` (the Bert Large WWM
# weighted-BCE run when cells execute in order). It is only read by the
# optional DEBUG_QUESTION_TYPE_SPELLING post-processing further down.
pred_test_spelling = pred_test[:, 19]

## Bert Large Cased
> 'large1': '../output/pred_oof2020-02-07_0.416-bl1.npy',

In [40]:
%%time

# Bert Large cased with the QA_TransformerMix architecture, pooling over 24
# layers with a small (128) pooler feature size.
model = QA_TransformerMix('bert-large-cased', nb_layers=24, pooler_ft=128).cuda()
# Built from this model, unlike the uncased runs.
# NOTE(review): presumably because tokenization depends on the cased vocabulary - confirm in QATestDatasetSep.
test_dataset_sep_blc = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=True)

# Mean of the predictions of the five checkpoints found in the folder.
pred_test = inference(model, test_dataset_sep_blc, '../input/bertlarge1/', sep=True)
pred_tests.append(pred_test)


 -> Loading weights from ../input/bertlarge1/bert-large-cased_4_cp.pt


 -> Loading weights from ../input/bertlarge1/bert-large-cased_1_cp.pt


 -> Loading weights from ../input/bertlarge1/bert-large-cased_3_cp.pt


 -> Loading weights from ../input/bertlarge1/bert-large-cased_5_cp.pt


 -> Loading weights from ../input/bertlarge1/bert-large-cased_2_cp.pt

CPU times: user 2min 43s, sys: 1min 57s, total: 4min 40s
Wall time: 4min 45s


## Bert Large Uncased
> 'large3': '../output/pred_oof2020-02-08_0.417-bl3.npy',

In [41]:
%%time

# Bert Large uncased with the QA_TransformerMix architecture, pooling over 24
# layers with a small (128) pooler feature size.
model = QA_TransformerMix('bert-large-uncased', nb_layers=24, pooler_ft=128).cuda()
test_dataset_sep_blu = QATestDatasetSep(df_test, model, max_len_q=MAX_LEN, max_len_a=MAX_LEN, max_len_t=MAX_LEN_T, special=True)

# Mean of the predictions of the five checkpoints found in the folder.
pred_test = inference(model, test_dataset_sep_blu, '../input/bertlarge3/', sep=True)
pred_tests.append(pred_test)


 -> Loading weights from ../input/bertlarge3/bert-large-uncased_5_cp.pt


 -> Loading weights from ../input/bertlarge3/bert-large-uncased_1_cp.pt


 -> Loading weights from ../input/bertlarge3/bert-large-uncased_4_cp.pt


 -> Loading weights from ../input/bertlarge3/bert-large-uncased_2_cp.pt


 -> Loading weights from ../input/bertlarge3/bert-large-uncased_3_cp.pt

CPU times: user 2min 42s, sys: 1min 57s, total: 4min 39s
Wall time: 4min 44s


# Post processing

In [42]:
# Exponent applied element-wise to each model's predictions before they are
# averaged (1 = plain arithmetic mean).
ALPHA = 1

# True: blend all models listed in TO_ENS. False: treat the run as the single
# model named by CHOSEN.
ENSEMBLING = True

# Key into N_CLUSTERS / PRED_OOFS used when ENSEMBLING is False.
CHOSEN = 'wwm2'
# Keys into PRED_OOFS of the blended models; its length must equal the number
# of models that appended to `pred_tests` above.
TO_ENS = ['28_01', 'embedw', 'large1', 'large3', 'wwm3']

# Per-target number of k-means clusters used for the ensemble; 0 leaves that
# target's predictions untouched.
N_CLUSTER_ENS = [0, 0, 3, 0, 0, 3, 0, 0, 0, 2, 0, 6, 3, 2, 3, 3, 8, 0, 0, 3, 0, 9, 0, 9, 0, 0, 9, 0, 0, 0]

In [43]:
# Blend: raise every model's predictions to the power ALPHA, then average
# across models (axis 0 indexes the models).
pred_test = np.mean(np.array(pred_tests) ** ALPHA, axis=0)

if ENSEMBLING:
    # Guards against TO_ENS drifting out of sync with the inference cells that
    # were actually run (the OOF blend below relies on TO_ENS).
    assert len(pred_tests) == len(TO_ENS)
    print(f'Ensembling {len(pred_tests)} models')
else:
    # NOTE(review): pred_test is still the mean over *all* entries of
    # pred_tests here; nothing checks that only the CHOSEN model was run.
    print(f'Using {CHOSEN} model')

Ensembling 5 models


In [44]:
# Per-model, per-target number of k-means clusters, used by the
# post-processing loop only when ENSEMBLING is False (row selected by CHOSEN).
# A 0 leaves the target's predictions untouched. The trailing "CV" figures are
# the author's recorded scores for each setting.
N_CLUSTERS = {
    '25_01': [0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 6, 3, 6, 3, 3, 6, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # CV 0.440
    '28_01': [0, 0, 3, 0, 0, 3, 0, 0, 0, 2, 0, 6, 3, 3, 3, 3, 8, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # CV 0.4448
    '01_02': [0, 0, 6, 0, 9, 3, 0, 3, 0, 2, 0, 7, 3, 2, 3, 3, 8, 0, 0, 3, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0],  # CV 0.4372
    'embed' : [0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 6, 3, 2, 3, 3, 7, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # CV 0.4404
    'special' : [0, 0, 3, 0, 0, 3, 0, 3, 0, 2, 0, 6, 3, 3, 3, 5, 6, 0, 0, 2, 0, 0, 0, 28, 9, 29, 0, 0, 0, 0], # CV 0.4436
    'mix': [0, 0, 3, 0, 0, 3, 0, 7, 0, 2, 0, 8, 2, 2, 3, 5, 7, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13], # CV 0.4419
    'wwm' : [0, 0, 5, 0, 0, 3, 0, 8, 3, 2, 0, 7, 3, 4, 4, 3, 8, 0, 0, 6, 0, 0, 0, 0, 3, 0, 8, 0, 0, 0] , # CV 0.4455
    'mixw' : [0, 0, 3, 0, 0, 3, 0, 0, 0, 2, 0, 7, 3, 2, 3, 3, 7, 0, 0, 3, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0], # CV 0.4464
    'embedw' : [0, 0, 3, 0, 0, 3, 0, 0, 0, 3, 0, 8, 3, 2, 3, 3, 6, 0, 0, 2, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0], # CV 0.4458
    'specialw' : [0, 0, 3, 0, 0, 3, 0, 9, 0, 2, 0, 6, 2, 3, 3, 5, 8, 0, 0, 2, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0], # CV 0.444
    'doublew': [0, 0, 3, 0, 0, 3, 0, 6, 8, 3, 0, 8, 3, 3, 3, 5, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], # CV 0.4401
    'large1': [0, 0, 3, 0, 0, 3, 0, 0, 0, 2, 0, 8, 3, 3, 4, 5, 9, 0, 0, 3, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0], # CV 0.4441
    'large3': [0, 0, 3, 0, 0, 3, 8, 8, 0, 2, 0, 8, 3, 3, 5, 3, 6, 0, 0, 5, 0, 0, 0, 0, 7, 0, 8, 0, 0, 0], # CV 0.4461
    'wwm2': [0, 0, 3, 0, 0, 3, 0, 9, 6, 3, 0, 8, 3, 3, 3, 3, 7, 0, 0, 3, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0], # CV 0.4483
    'wwm3': [0, 0, 3, 0, 0, 3, 0, 6, 6, 3, 0, 8, 3, 3, 3, 3, 8, 0, 0, 3, 0, 0, 0, 22, 0, 0, 0, 0, 0, 0], # 0.4525
}

# Saved out-of-fold prediction arrays, one per model key. They are loaded with
# np.load and stacked with the test predictions when fitting the k-means
# clusters.
PRED_OOFS = {
    '25_01': '../input/quest-oof/pred_oof2020-01-25_0.406.npy',
    '28_01': '../input/quest-oof/pred_oof2020-01-28_0.405.npy',
    '01_02': '../input/quest-oof/pred_oof2020-02-01_0.411.npy',
    'embed': '../input/quest-oof/pred_oof2020-02-06_0.4089.npy',
    'special': '../input/quest-oof/pred_oof2020-02-06_0.4059.npy',
    'mix': '../input/quest-oof/pred_oof2020-02-06_0.414.npy',
    'wwm' : '../input/quest-oof/pred_oof2020-02-06_0.416-wwm.npy',
    'mixw' : '../input/quest-oof/pred_oof2020-02-07_0.414.npy',
    'embedw' : '../input/quest-oof/pred_oof2020-02-07_0.4088.npy',
    'specialw' : '../input/quest-oof/pred_oof2020-02-08_0.4058.npy',
    'doublew': '../input/quest-oof/pred_oof2020-02-07_0.4111.npy',
    'large1': '../input/quest-oof/pred_oof2020-02-07_0.416-bl1.npy',
    'large3': '../input/quest-oof/pred_oof2020-02-08_0.417-bl3.npy',
    'wwm2' : '../input/quest-oof/pred_oof2020-02-08_0.416_wwm_v2.npy',
    'wwm3': '../input/questqabertlargev3/pred_oof2020-02-08_0.415.npy'
}

In [45]:
# Out-of-fold predictions matching `pred_test`: blended with the same
# power-then-mean rule over the TO_ENS models, or the single CHOSEN model's
# array otherwise.
if ENSEMBLING:
    pred_oof = np.mean(np.array([np.load(PRED_OOFS[date]) for date in TO_ENS]) ** ALPHA, axis=0)
else:
    pred_oof = np.load(PRED_OOFS[CHOSEN])

In [46]:
# Discretize selected target columns: fit 1-D k-means on the stacked OOF and
# test predictions of a column, then replace every test prediction by the
# center of the cluster it falls in. Columns whose cluster count is 0 keep
# their raw predictions.
post_processed_preds = pred_test.copy()

for col in range(pred_test.shape[1]):  
    if ENSEMBLING:
        n_clusts = N_CLUSTER_ENS[col]
    else:
        n_clusts = N_CLUSTERS[CHOSEN][col]
    
    if n_clusts:
        # Seed value with a single unique element so the loop body runs at
        # least once.
        preds = [0]
        # Retry with one more cluster until the test predictions land in at
        # least two distinct clusters.
        # NOTE(review): if a test column is constant this condition can never
        # become false; KMeans is also created without random_state, so runs
        # are only repeatable if a global NumPy seed is set elsewhere.
        while len(np.unique(preds)) == 1: # At least 2 clusters on test data
            kmeans = KMeans(n_clusters=n_clusts)

            # Alternative kept for reference: fit on the test column only.
            #     kmeans.fit(pred_test[:, col].reshape(-1, 1))
            kmeans.fit(np.concatenate([pred_oof, pred_test])[:, col].reshape(-1, 1))
            preds = kmeans.cluster_centers_[kmeans.predict(pred_test[:, col].reshape(-1, 1))].reshape(-1)

            if len(np.unique(preds)) <= 1:
                print(f"{TARGETS[col]} took only 1 value using {n_clusts} clusters.")
            # Incremented unconditionally; only matters when the loop repeats.
            n_clusts += 1

        post_processed_preds[:, col] = preds

In [47]:
# Optional debug path: cluster only target column 19, using the predictions
# saved in `pred_test_spelling` and the 'wwm3' out-of-fold predictions instead
# of the blended ones.
DEBUG_QUESTION_TYPE_SPELLING = False

if DEBUG_QUESTION_TYPE_SPELLING:
    print('Using working preds for question spelling...')
    oof_spelling = np.load(PRED_OOFS['wwm3'])[:, 19]
    N_CLUSTS_SPELLING = 6
    # NOTE(review): this restarts from the raw blended predictions, so the
    # per-column clustering done by the previous cell is discarded when this
    # flag is on - confirm that is intended.
    post_processed_preds = pred_test.copy()

    # Seed value with a single unique element so the loop body runs at least
    # once.
    preds_spell = [0]
    n_clusts = N_CLUSTS_SPELLING
    # Retry with one more cluster until the test predictions land in at least
    # two distinct clusters.
    while len(np.unique(preds_spell)) == 1: # At least 2 clusters on test data
        kmeans = KMeans(n_clusters=n_clusts)

        kmeans.fit(np.concatenate([oof_spelling, pred_test_spelling]).reshape(-1, 1))
        preds_spell = kmeans.cluster_centers_[kmeans.predict(pred_test_spelling.reshape(-1, 1))].reshape(-1)

        # Check this cell's own result. The original tested `preds`, a
        # variable left over from the previous cell, so the warning reflected
        # another column (or raised NameError if that cell never clustered).
        if len(np.unique(preds_spell)) <= 1:
            print(f"{TARGETS[19]} took only 1 value using {n_clusts} clusters.")
        n_clusts += 1

    post_processed_preds[:, 19] = preds_spell

In [48]:
from collections import Counter
# Sanity check: how many test rows fall on each distinct value of column 19
# after post-processing.
Counter(post_processed_preds[:, 19])

Counter({0.0006080824540700975: 475, 0.024543984965188424: 1})

In [49]:
# Column-wise min-max rescale. The 1e-2 in the denominator avoids a division
# by zero on constant columns, and the trailing + 1e-2 shifts every column so
# its minimum becomes 0.01.
# NOTE(review): this cell rebinds its own input, so running it twice rescales
# twice. A column whose range exceeds 0.99 would also end up slightly above 1.
post_processed_preds = (post_processed_preds - post_processed_preds.min(0)) / (post_processed_preds.max(0) - post_processed_preds.min(0) + 1e-2) + 1e-2

In [50]:
# Per-target standard deviation of the final predictions; a value near zero
# flags a column that is almost constant.
post_processed_preds.std(0)

array([0.20024112, 0.27199909, 0.15964718, 0.14436481, 0.17210523,
       0.21936884, 0.2082748 , 0.21009018, 0.24164612, 0.19867148,
       0.20718873, 0.35191846, 0.15175598, 0.16893079, 0.1365924 ,
       0.18239254, 0.38705961, 0.21832731, 0.29400285, 0.03229463,
       0.25250739, 0.14896059, 0.17559896, 0.16229467, 0.16685342,
       0.16432833, 0.3728473 , 0.19257205, 0.29692411, 0.12792454])

In [51]:
# Fill every column from 'question_asker_intent_understanding' to the last one
# with the post-processed predictions; columns before it are left as loaded
# from the sample submission.
sub.loc[:, 'question_asker_intent_understanding':] = post_processed_preds
# Alternative: submit the raw blended predictions instead.
# sub.loc[:, 'question_asker_intent_understanding':] = pred_test

sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,qa_id,question_asker_intent_understanding,question_body_critical,question_conversational,question_expect_short_answer,question_fact_seeking,question_has_commonly_accepted_answer,question_interestingness_others,question_interestingness_self,question_multi_intent,...,question_well_written,answer_helpful,answer_level_of_information,answer_plausible,answer_relevance,answer_satisfaction,answer_type_instructions,answer_type_procedure,answer_type_reason_explanation,answer_well_written
0,39,0.853302,0.477433,0.375735,0.439522,0.350621,0.584233,0.770825,0.672719,0.663801,...,0.852609,0.706211,0.065373,0.710834,0.643477,0.386876,0.01,0.077778,0.862586,0.721979
1,46,0.529993,0.205285,0.01,0.760525,0.629067,0.993384,0.257333,0.188769,0.120047,...,0.119693,0.862692,0.418266,0.797313,0.803752,0.657086,0.998684,0.405071,0.083281,0.649691
2,70,0.712409,0.485875,0.01,0.761411,0.821999,0.993384,0.355109,0.213505,0.199167,...,0.710335,0.791976,0.195472,0.797313,0.638598,0.494461,0.01,0.140911,0.878453,0.732529
3,132,0.598202,0.124502,0.01,0.653458,0.573879,0.993384,0.186443,0.108334,0.094425,...,0.342772,0.92252,0.604636,0.873346,0.872145,0.763046,0.908874,0.486404,0.575739,0.673916
4,200,0.738152,0.086272,0.01,0.798367,0.629815,0.993384,0.543789,0.48909,0.162136,...,0.087105,0.706211,0.472729,0.797313,0.670121,0.536581,0.392128,0.394999,0.632064,0.618456


In [52]:
# Total wall-clock time since START_TIME was recorded in the first cell.
print(f'Done in {(time.time() - START_TIME) / 60 :.1f} minutes')

Done in 16.9 minutes
