In [None]:
import os

# Show which datasets are mounted under the Kaggle input directory (the returned list is the cell output).
os.listdir("/kaggle/input")

In [None]:
from cuml import ForestInference


# Discourse classes handled by the gradient-boosted sequence classifiers.
discourses = [
    'Lead',
    'Position',
    'Evidence',
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
]

# Per-class lists of fold models; filled in by the loading loop that follows.
xgb_models, lgb_models = {}, {}

# Per-class weight given to the first transformer's word predictions when they
# are blended with the second transformer's (the second gets 1 - weight).
ensemble_weights = {
    "Rebuttal": 0.65,
    "Counterclaim": 0.75,
    "Concluding Statement": 0.60,
    "Claim": 0.65,
    "Evidence": 0.60,
    "Position": 0.75,
    "Lead": 0.70,
}

# Per-class minimum true-positive probability for a candidate span to be emitted.
thresholds = {
    'Lead': 0.66,
    'Position': 0.56,
    'Evidence': 0.57,
    'Claim': 0.54,
    'Concluding Statement': 0.56,
    'Counterclaim': 0.7,
    'Rebuttal': 0.74,
}

# Per-class column indices of the sequence-feature matrix that are fed to the
# gradient-boosted models of that class.
features_dict = {
    'Lead': list(range(34)),
    'Position': list(range(34)),
    'Evidence': list(range(20)),
    'Claim': list(range(20)),
    'Concluding Statement': list(range(34)),
    'Counterclaim': list(range(17)) + list(range(27, 34)),
    'Rebuttal': list(range(17)),
}

# Number of fold checkpoints stored per discourse class (used for both the
# XGBoost and the LightGBM files).
N_XGB_FOLDS = 5

# Load every fold of both libraries into cuML's ForestInference for each class.
for d in discourses:
    xgb_models[d] = [
        ForestInference.load(f"../input/student-writing-7322/xgb_{d}_{f}.json", output_class=True, model_type="xgboost_json")
        for f in range(N_XGB_FOLDS)
    ]
    lgb_models[d] = [
        ForestInference.load(f"../input/student-writing-7322/lgb_{d}_{f}.txt", output_class=True, model_type="lightgbm")
        for f in range(N_XGB_FOLDS)
    ]

In [None]:
def get_tp_prob(testDs, disc_type):
    """Return the blended positive-class probability for every row of ``testDs.features``.

    The class-specific feature columns (``features_dict[disc_type]``) are scored by
    every model in ``xgb_models[disc_type]`` and ``lgb_models[disc_type]``; the
    per-library fold means are then averaged with equal weight.

    Args:
        testDs: object exposing a 2-D ``features`` array (rows = candidate sequences).
        disc_type: discourse class name used to look up the columns and the models.

    Returns:
        1-D array with one probability per row, or an empty array when there are no rows.
    """
    if testDs.features.shape[0] == 0:
        return np.array([])

    # The column slice and float32 cast do not depend on the model, so build the
    # input once instead of once per fold model of each library.
    model_input = testDs.features[:, features_dict[disc_type]].astype("float32")

    pred = np.mean([clf.predict_proba(model_input)[:, 1] for clf in xgb_models[disc_type]], axis=0)/2
    pred += np.mean([clf.predict_proba(model_input)[:, 1] for clf in lgb_models[disc_type]], axis=0)/2

    return pred

In [None]:
import os, sys
# DECLARE HOW MANY GPUS YOU WISH TO USE.
# KAGGLE ONLY HAS 1, BUT OFFLINE, YOU CAN USE MORE
os.environ["CUDA_VISIBLE_DEVICES"]="0" #0,1,2,3 for four gpu

# IF VARIABLE IS NONE, THEN NOTEBOOK COMPUTES TOKENS
# OTHERWISE NOTEBOOK LOADS TOKENS FROM PATH
LOAD_TOKENS_FROM = '../input/py-bigbird-v26'

# IF VARIABLE IS NONE, THEN NOTEBOOK TRAINS A NEW MODEL
# OTHERWISE IT LOADS YOUR PREVIOUSLY TRAINED MODEL
LOAD_MODEL_FROM = '../input/fp-test78'

# IF FOLLOWING IS NONE, THEN NOTEBOOK
# USES INTERNET AND DOWNLOADS HUGGINGFACE
# CONFIG, TOKENIZER, AND MODEL
DOWNLOADED_MODEL_PATH = '../input/deberta-xlarge'


# A cache of the BigBird predictions for the validation/sequence training set and the corresponding sequence dataset
KAGGLE_CACHE = '../input/feedbackcache2'

# Number of columns in the per-sequence feature matrix built for the GBM models.
N_FEATURES=34

# Fraction of the test texts to run on; None means use all of them.
TEST_PERCENT = None

# Local scratch directory. `cacheExists` records whether it was already present
# before this cell ran; `exist_ok=True` makes the creation safe to re-run and
# avoids the race between a separate existence check and the mkdir.
cache = 'cache'
cacheExists = os.path.exists(cache)
os.makedirs(cache, exist_ok=True)

In [None]:
from torch import cuda
# Run-wide settings. Only `max_length`, `valid_batch_size` and `device` are read
# by the inference code visible in this notebook.
config = dict(
    model_name='',
    max_length=2048,
    train_batch_size=4,
    valid_batch_size=4,
    epochs=5,
    learning_rates=[2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    max_grad_norm=10,
    device='cuda' if cuda.is_available() else 'cpu',
)

In [None]:
import numpy as np, os
from scipy import stats
import pandas as pd, gc
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, AdamW
from transformers import *


from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import accuracy_score
from torch.cuda import amp
import warnings

# Silence warnings whose message matches the "__floordiv__ is deprecated" pattern.
warnings.filterwarnings('ignore', '.*__floordiv__ is deprecated.*',)

In [None]:
# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
# Read every test essay into a DataFrame with one row per file (id = file name without ".txt").
test_names, test_texts = [], []
for f in list(os.listdir('../input/feedback-prize-2021/test')):
    test_names.append(f.replace('.txt', ''))
    # Use a context manager so each file handle is closed as soon as it is read.
    with open('../input/feedback-prize-2021/test/' + f, 'r') as text_file:
        test_texts.append(text_file.read())
test_texts = pd.DataFrame({'id': test_names, 'text': test_texts})

# Optional smoke-test mode: keep a reproducible random subset of the test texts.
if TEST_PERCENT is not None:
    print(f"testing and submitting with only {TEST_PERCENT} of test data")
    np.random.seed(2022)
    test_select=np.arange(len(test_texts))
    np.random.shuffle(test_select)
    test_texts=test_texts.iloc[test_select[:int(TEST_PERCENT*len(test_texts))]].reset_index()

#sort by length of texts to minimize padding in each batch
# (reset_index() without drop=True keeps the previous index as an extra column)
test_texts['len']=test_texts['text'].apply(lambda x:len(x.split()))
test_texts=test_texts.sort_values(by=['len']).reset_index()

# NOTE(review): SUBMISSION is True on both paths, so the length check below has
# no effect; the initial value was presumably meant to differ - confirm.
SUBMISSION = True
if len(test_names) > 5:
      SUBMISSION = True

test_texts.head()

# Convert Train Text to NER Labels
We will now convert all text words into NER labels and save in a dataframe.

In [None]:
# LABEL VOCABULARY AND LOOKUP TABLES SHARED BY TRAINING AND INFERENCE
output_labels = [
    'O',
    'B-Lead', 'I-Lead',
    'B-Position', 'I-Position',
    'B-Claim', 'I-Claim',
    'B-Counterclaim', 'I-Counterclaim',
    'B-Rebuttal', 'I-Rebuttal',
    'B-Evidence', 'I-Evidence',
    'B-Concluding Statement', 'I-Concluding Statement',
]

# label string -> class index, and the inverse
labels_to_ids = {label: idx for idx, label in enumerate(output_labels)}
ids_to_labels = dict(enumerate(output_labels))

# discourse type -> (index of its 'B-' label, index of its 'I-' label).
# The insertion order below is significant: code that iterates this dict
# (e.g. when building per-class global features) relies on it.
disc_type_to_ids = {
    name: (labels_to_ids['B-' + name], labels_to_ids['I-' + name])
    for name in ('Evidence', 'Claim', 'Lead', 'Position', 'Counterclaim', 'Rebuttal', 'Concluding Statement')
}

In [None]:
labels_to_ids


Define the dataset function

Below is our PyTorch dataset function. It always outputs tokens and attention. During training it also provides labels. And during inference it also provides word ids to help convert token predictions into word predictions.

Note that we use text.split() and is_split_into_words=True when we convert train text to labeled train tokens. This is how the HuggingFace tutorial does it. However, this removes characters like \n new paragraph. If you want your model to see new paragraphs, then we need to map words to tokens ourselves using return_offsets_mapping=True.

See my TensorFlow notebook here for an example: https://www.kaggle.com/cdeotte/tensorflow-longformer-ner-cv-0-617

Some of the following code comes from the example at HuggingFace here. https://huggingface.co/docs/transformers/custom_datasets#tok_ner

However I think the code at that link is wrong. The HuggingFace original code is here: https://github.com/huggingface/transformers/blob/86b40073e9aee6959c8c85fcba89e47b432c4f4d/examples/pytorch/token-classification/run_ner.py#L371

With the flag LABEL_ALL we can either label just the first subword token (when one word has more than one subword token). Or we can label all the subword tokens (with the word's label). In this notebook version, we label all the tokens.

There is a Kaggle discussion here: https://www.kaggle.com/c/feedback-prize-2021/discussion/296713

In [None]:
# Map every character offset of a text to the index of the split() word covering it
def split_mapping(unsplit):
    """Return an int array of ``len(unsplit)`` giving, for each character, the
    index of the ``unsplit.split()`` word it belongs to, or -1 for whitespace."""
    char_to_word = np.full(len(unsplit), -1)
    cursor = 0
    for word_idx, word in enumerate(unsplit.split()):
        # Words occur in order, so the next match at or after the cursor is this word.
        begin = unsplit.find(word, cursor)
        cursor = begin + len(word)
        char_to_word[begin:cursor] = word_idx
    return char_to_word

In [None]:
class dataset(Dataset):
    """Tokenises one text per item and maps each token to a ``text.split()`` word.

    Args:
        dataframe: frame with a ``text`` column (and an ``entities`` column of
            per-word labels when ``get_wids`` is False), indexable by position label.
        tokenizer: callable tokenizer returning an encoding that supports
            ``word_ids()`` and an ``'offset_mapping'`` entry.
        max_len: truncation length passed to the tokenizer.
        get_wids: when True (inference) items carry ``'wids'`` - the split-word
            index of every token, -1 where none - and no labels are built; when
            False (training) per-token label ids are built instead.
    """

    def __init__(self, dataframe, tokenizer, max_len, get_wids):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_wids = get_wids # for validation

    def __getitem__(self, index):
        """Return a dict of tensors for the text at ``index`` (every encoding
        field plus ``'labels'``, and ``'wids'`` when ``get_wids`` is set)."""
        # GET TEXT AND WORD LABELS
        text = self.data.text[index]
        word_labels = self.data.entities[index] if not self.get_wids else None

        # TOKENIZE TEXT
        encoding = self.tokenizer(text,
                             return_offsets_mapping=True,
                             padding=False,
                             truncation=True,
                             max_length=self.max_len)

        word_ids = encoding.word_ids()
        split_word_ids = np.full(len(word_ids),-1)
        offset_to_wordidx = split_mapping(text)
        offsets = encoding['offset_mapping']

        # CREATE TARGETS AND MAPPING OF TOKENS TO SPLIT() WORDS
        label_ids = []
        # Iterate in reverse to label whitespace tokens until a Begin token is encountered
        for token_idx, word_idx in reversed(list(enumerate(word_ids))):

            if word_idx is None:
                # Special token: never labelled, wid stays -1
                if not self.get_wids: label_ids.append(-100)
            else:
                if offsets[token_idx][0] != offsets[token_idx][1]:
                    #Choose the split word that shares the most characters with the token if any
                    split_idxs = offset_to_wordidx[offsets[token_idx][0]:offsets[token_idx][1]]
                    if len(np.unique(split_idxs)) > 1:
                        # scipy.stats.mode returns an array on older SciPy and a
                        # scalar for 1-D input on newer releases; atleast_1d
                        # makes the [0] lookup valid for both.
                        split_index = np.atleast_1d(stats.mode(split_idxs[split_idxs != -1]).mode)[0]
                    else:
                        split_index = split_idxs[0]

                    if split_index != -1:
                        if not self.get_wids: label_ids.append( labels_to_ids[word_labels[split_index]] )
                        split_word_ids[token_idx] = split_index
                    else:
                        # Even if we don't find a word, continue labeling 'I' tokens until a 'B' token is found
                        if label_ids and label_ids[-1] != -100 and ids_to_labels[label_ids[-1]][0] == 'I':
                            split_word_ids[token_idx] = split_word_ids[token_idx + 1]
                            if not self.get_wids: label_ids.append(label_ids[-1])
                        else:
                            if not self.get_wids: label_ids.append(-100)
                else:
                    # Zero-width token: nothing to align it with
                    if not self.get_wids: label_ids.append(-100)

        # Labels were collected back-to-front; restore token order
        encoding['labels'] = list(reversed(label_ids))

        # CONVERT TO TORCH TENSORS
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        if self.get_wids:
            item['wids'] = torch.as_tensor(split_word_ids)

        return item

    def __len__(self):
        return self.len

class CustomCollate:
    """Right-pads a list of dataset items into a batch.

    ``input_ids`` are padded with the tokenizer's ``pad_token_id``,
    ``attention_mask`` with 0 and ``wids`` (when present) with -1. When
    ``sliding_window`` is set and the longest item exceeds it, the padded length
    is rounded up to the next multiple of ``sliding_window``.
    """

    def __init__(self,tokenizer,sliding_window=None):
        self.tokenizer=tokenizer
        self.sliding_window=sliding_window

    def __call__(self,data):
        """Collate ``data`` (list of dicts of 1-D tensors) into stacked tensors.

        The ``labels``, ``BIO_labels`` and ``discourse_labels`` entries of the
        result are always empty lists; ``wids`` is only included when the first
        item has it.
        """
        lengths = [len(sample['input_ids']) for sample in data]
        target_len = max(lengths)
        window = self.sliding_window
        if window is not None and target_len > window:
            target_len = int((np.floor(target_len/window-1e-6)+1)*window)

        has_wids = 'wids' in data[0]
        pad_id = self.tokenizer.pad_token_id

        def right_pad(tensor, length, fill):
            # always pad the right side
            return torch.nn.functional.pad(tensor, (0, target_len - length), value=fill)

        batch = {
            "input_ids": torch.stack(
                [right_pad(sample['input_ids'], n, pad_id) for sample, n in zip(data, lengths)]),
            "attention_mask": torch.stack(
                [right_pad(sample['attention_mask'], n, 0) for sample, n in zip(data, lengths)]),
            "labels": [],
            "BIO_labels": [],
            "discourse_labels": [],
        }
        if has_wids:
            batch["wids"] = torch.stack(
                [right_pad(sample['wids'], n, -1) for sample, n in zip(data, lengths)])
        return batch

In [None]:
# Loader settings shared by both models. shuffle must stay False: inference()
# writes each batch's results at an offset derived from the batch index.
test_params = {'batch_size': config['valid_batch_size'],
                'shuffle': False,
                'num_workers': 2,
                'pin_memory':True
                }

tokenizer = AutoTokenizer.from_pretrained(DOWNLOADED_MODEL_PATH)


# TEST DATASET
# Batches for the sliding-window model are padded to a multiple of 512 tokens.
test_texts_set = dataset(test_texts, tokenizer, config['max_length'], True)
test_texts_loader = DataLoader(test_texts_set, **test_params,collate_fn=CustomCollate(tokenizer,512))

# The Longformer batches must be padded with the Longformer tokenizer's own
# pad_token_id, so the collate function gets `tokenizer_longformer` (it was
# previously given the first model's tokenizer).
tokenizer_longformer = AutoTokenizer.from_pretrained("../input/pytorch-longformer-large")
test_texts_set_longformer = dataset(test_texts, tokenizer_longformer, config['max_length'], True)
test_texts_loader_longformer = DataLoader(test_texts_set_longformer, **test_params,collate_fn=CustomCollate(tokenizer_longformer))

# Network

In [None]:
from transformers import *
import torch.nn as nn
import torch.nn.functional as F
# Column permutation applied to the 15 classifier outputs by the models below:
# output column 0 is taken from head column 14 and output column k (k >= 1) from
# head column k-1. Presumably this moves the class the checkpoints store last
# into position 0 to match `output_labels` - confirm against the training code.
rearrange_indices=[14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
class ResidualLSTM(nn.Module):
    """Residual recurrent block: project to half width, run a 2-layer
    unidirectional GRU/LSTM, expand through a feed-forward pair back to
    ``d_model``, add the input and layer-normalise.

    Args:
        d_model: feature width of the input and output.
        rnn: ``'GRU'`` selects ``nn.GRU``; any other value selects ``nn.LSTM``.
    """

    def __init__(self, d_model,rnn):
        super().__init__()
        half_width = d_model // 2
        self.downsample = nn.Linear(d_model, half_width)
        # The attribute keeps the name `LSTM` for both cell types.
        recurrent_cls = nn.GRU if rnn == 'GRU' else nn.LSTM
        self.LSTM = recurrent_cls(half_width, half_width, num_layers=2, bidirectional=False, dropout=0.2)
        self.dropout1 = nn.Dropout(0.2)
        self.norm1 = nn.LayerNorm(half_width)
        self.linear1 = nn.Linear(half_width, d_model*4)
        self.linear2 = nn.Linear(d_model*4, d_model)
        self.dropout2 = nn.Dropout(0.2)
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Apply the block to ``x``; the output has the same shape as ``x``."""
        hidden = self.downsample(x)
        hidden, _ = self.LSTM(hidden)
        hidden = self.norm1(self.dropout1(hidden))
        hidden = self.linear2(F.relu(self.linear1(hidden)))
        # Residual connection around the whole block, then the final norm.
        return self.norm2(x + self.dropout2(hidden))


class ConvLSTMHead(nn.Module):
    """Classification head: 1024 -> 256 projection, two Conv1d+ReLU stages with
    layer norm and a residual connection, a residual recurrent block, then a
    256 -> 1024 expansion and a 15-way linear classifier.

    Input and output are batch-first: ``(batch, seq, 1024)`` -> ``(batch, seq, 15)``.
    """

    def __init__(self):
        super(ConvLSTMHead, self).__init__()
        self.downsample=nn.Sequential(nn.Linear(1024,256))
        self.conv1=  nn.Sequential(nn.Conv1d(256,256,3,padding=1),
                                  nn.ReLU())
        self.norm1 = nn.LayerNorm(256)
        self.conv2=  nn.Sequential(nn.Conv1d(256,256,3,padding=1),
                                  nn.ReLU())
        self.norm2 = nn.LayerNorm(256)
        # ResidualLSTM requires the cell type as its second argument; without it
        # this constructor raised TypeError. 'LSTM' selects nn.LSTM.
        self.lstm=ResidualLSTM(256,'LSTM')
        self.upsample=nn.Sequential(nn.Linear(256,1024),nn.ReLU())
        self.classification_head=nn.Sequential(nn.Linear(1024,15))


    def forward(self,x):
        """Return per-token logits of shape ``(batch, seq, 15)``."""
        x=self.downsample(x)
        res=x
        # Conv1d works on (batch, channels, seq); LayerNorm on (batch, seq, channels)
        x=self.conv1(x.permute(0,2,1))
        x=self.norm1(x.permute(0,2,1)).permute(0,2,1)
        x=self.conv2(x)
        # NOTE(review): norm1 is applied a second time here and norm2 is never
        # used; left unchanged because existing checkpoints may rely on it - confirm.
        x=self.norm1(x.permute(0,2,1))
        x=x+res
        # The recurrent block is sequence-first, so swap batch and seq around it
        x=self.lstm(x.permute(1,0,2))
        x=x.permute(1,0,2)
        x=self.upsample(x)
        x=self.classification_head(x)
        return x


class TransformerModel(nn.Module):
    """Pretrained backbone followed by a residual recurrent block and a 15-way
    token classifier.

    Args:
        DOWNLOADED_MODEL_PATH: directory containing ``config.json`` and
            ``pytorch_model.bin`` for the backbone.
        rnn: cell type forwarded to ``ResidualLSTM`` (``'GRU'`` or LSTM otherwise).
    """

    def __init__(self,DOWNLOADED_MODEL_PATH, rnn='LSTM'):
        super().__init__()
        backbone_config = AutoConfig.from_pretrained(DOWNLOADED_MODEL_PATH+'/config.json')
        self.backbone = AutoModel.from_pretrained(
            DOWNLOADED_MODEL_PATH+'/pytorch_model.bin', config=backbone_config)

        self.lstm = ResidualLSTM(1024, rnn)
        self.classification_head = nn.Linear(1024, 15)

    def forward(self,x,attention_mask):
        """Return a one-element list holding the logits, shape ``(batch, seq, 15)``,
        with the class columns reordered by ``rearrange_indices``."""
        hidden = self.backbone(input_ids=x, attention_mask=attention_mask, return_dict=False)[0]

        # The recurrent block is sequence-first: swap to (seq, batch, dim) and back.
        seq_first = hidden.permute(1, 0, 2)
        hidden = self.lstm(seq_first).permute(1, 0, 2)

        logits = self.classification_head(hidden)
        return [logits[:, :, rearrange_indices]]

class SlidingWindowTransformerModel(nn.Module):
    """Backbone applied in overlapping windows, followed by a residual
    recurrent block and a 15-way token classifier.

    Sequences no longer than ``window_size`` go through the backbone in one
    pass. Longer ones are encoded window by window: consecutive windows start
    ``inner_len`` tokens apart, and the first ``edge_len`` outputs of every
    window after the first are discarded so each position is emitted once.

    Args:
        DOWNLOADED_MODEL_PATH: directory containing ``config.json`` and
            ``pytorch_model.bin`` for the backbone.
        rnn: cell type forwarded to ``ResidualLSTM``.
        window_size: number of tokens the backbone sees per pass.
        edge_len: number of context tokens on each side of a window's kept part.
    """

    def __init__(self,DOWNLOADED_MODEL_PATH, rnn, window_size=512, edge_len=64):
        super().__init__()
        backbone_config = AutoConfig.from_pretrained(DOWNLOADED_MODEL_PATH+'/config.json')
        self.backbone = AutoModel.from_pretrained(
            DOWNLOADED_MODEL_PATH+'/pytorch_model.bin', config=backbone_config)

        self.lstm = ResidualLSTM(1024, rnn)
        self.classification_head = nn.Linear(1024, 15)
        self.window_size = window_size
        self.edge_len = edge_len
        self.inner_len = window_size - edge_len*2

    def forward(self,input_ids,attention_mask):
        """Return a one-element list holding the logits, shape ``(batch, seq, 15)``,
        with the class columns reordered by ``rearrange_indices``."""
        _, seq_len = input_ids.shape

        if seq_len <= self.window_size:
            hidden = self.backbone(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)[0]
        else:
            # Number of extra windows after the first. A remainder longer than
            # edge_len needs its own window; a shorter one is absorbed by the
            # last window, but at least one extra window is always run.
            segments, remainder = divmod(seq_len - self.window_size, self.inner_len)
            if remainder > self.edge_len or segments == 0:
                segments += 1

            # The first window contributes all of its outputs.
            pieces = [self.backbone(input_ids=input_ids[:, :self.window_size],
                                    attention_mask=attention_mask[:, :self.window_size],
                                    return_dict=False)[0]]
            for seg in range(1, segments + 1):
                start = self.window_size - self.edge_len + (seg - 1)*self.inner_len
                end = min(start + self.window_size, seq_len)
                window_hidden = self.backbone(input_ids=input_ids[:, start:end],
                                              attention_mask=attention_mask[:, start:end],
                                              return_dict=False)[0]
                if seg == segments:
                    # Last window: keep everything after the leading edge.
                    kept = window_hidden[:, self.edge_len:]
                else:
                    kept = window_hidden[:, self.edge_len:self.edge_len + self.inner_len]
                pieces.append(kept)
            hidden = torch.cat(pieces, 1)

        # The recurrent block is sequence-first: swap to (seq, batch, dim) and back.
        hidden = self.lstm(hidden.permute(1, 0, 2)).permute(1, 0, 2)
        logits = self.classification_head(hidden)
        return [logits[:, :, rearrange_indices]]


# Inference

In [None]:
# Returns per-word, mean class prediction probability over all tokens corresponding to each word
def inference(data_loader, model_ids, model, path):
    """Run a fold ensemble over ``data_loader`` and reduce token probabilities to words.

    For every id in ``model_ids`` the weights ``{path}/fold{id}.pt`` are loaded into
    ``model`` and the softmax outputs of all batches are accumulated; the sum is
    then divided by the number of folds. Token probabilities are finally averaged
    per word using the ``wids`` supplied by the batches.

    Args:
        data_loader: loader whose batches provide ``input_ids``, ``attention_mask``
            and ``wids``. Results are stored at ``batch_i * config['valid_batch_size']``,
            so the loader must iterate in dataset order with that batch size.
        model_ids: fold identifiers used to build the checkpoint file names.
        model: module called as ``model(ids, attention_mask=mask)``; element 0 of
            its return value is passed through softmax over dim 2.
        path: directory that contains the ``fold{id}.pt`` files.

    Returns:
        A list with one entry per text; each entry is a list of float32
        probability vectors, one per word.
    """

    gc.collect()
    torch.cuda.empty_cache()

    # Buffers sized for the longest allowed sequence; -100 in `wids` marks
    # positions that no batch ever wrote (beyond that batch's padded length).
    ensemble_preds = np.zeros((len(data_loader.dataset), config['max_length'], len(labels_to_ids)), dtype=np.float32)
    wids = np.full((len(data_loader.dataset), config['max_length']), -100)
    for model_i, model_id in enumerate(model_ids):

        model.load_state_dict(torch.load(f'{path}/fold{model_id}.pt', map_location=config['device']))

        # put model in evaluation mode
        model.eval()
        for batch_i, batch in tqdm(enumerate(data_loader)):

            # Word ids are identical for every fold, so record them only once
            if model_i == 0: wids[batch_i*config['valid_batch_size']:(batch_i+1)*config['valid_batch_size'],:batch['wids'].shape[1]] = batch['wids'].numpy()

            # MOVE BATCH TO GPU AND INFER
            ids = batch["input_ids"].to(config['device'])
            mask = batch["attention_mask"].to(config['device'])
            with torch.no_grad():
                #with amp.autocast():
                outputs = model(ids, attention_mask=mask)
            all_preds = torch.nn.functional.softmax(outputs[0], dim=2).cpu().detach().numpy()
            ensemble_preds[batch_i*config['valid_batch_size']:(batch_i+1)*config['valid_batch_size'],:all_preds.shape[1]] += all_preds

            # Drop references before the next batch so cached GPU memory can be released
            del ids
            del mask
            del outputs
            del all_preds

        gc.collect()
        torch.cuda.empty_cache()

    # Turn the accumulated sum into the mean over folds
    ensemble_preds /= len(model_ids)
    predictions = []
    # ITERATE THROUGH EACH TEXT AND GET PRED
    for text_i in range(ensemble_preds.shape[0]):
        token_preds = ensemble_preds[text_i]

        prediction = []
        previous_word_idx = -1
        prob_buffer = []
        word_ids = wids[text_i][wids[text_i] != -100]
        for idx,word_idx in enumerate(word_ids):
            if word_idx == -1:
                # Token not mapped to any word (special token or padding)
                pass
            elif word_idx != previous_word_idx:
                # A new word starts: flush the mean of the previous word's tokens
                if prob_buffer:
                    prediction.append(np.mean(prob_buffer, dtype=np.float32, axis=0))
                    prob_buffer = []
                prob_buffer.append(token_preds[idx])
                previous_word_idx = word_idx
            else:
                prob_buffer.append(token_preds[idx])
        # Flush the last word. NOTE(review): if no token was mapped to a word,
        # prob_buffer is empty here and np.mean yields NaN - confirm that cannot occur.
        prediction.append(np.mean(prob_buffer, dtype=np.float32, axis=0))
        predictions.append(prediction)

    gc.collect()
    torch.cuda.empty_cache()
    return predictions

In [None]:
# First model: sliding-window network on the backbone at DOWNLOADED_MODEL_PATH with a GRU block;
# word predictions are the mean over the listed fold checkpoints in LOAD_MODEL_FROM.
model = SlidingWindowTransformerModel(DOWNLOADED_MODEL_PATH,'GRU').to(config['device'])
test_word_preds = inference(test_texts_loader, [0, 1, 3, 4, 6, 7], model, LOAD_MODEL_FROM)

In [None]:
# Second model: full-sequence network on the Longformer backbone with a GRU block;
# word predictions are the mean over the listed fold checkpoints in ../input/fp-test63.
# Rebinding `model` drops the reference to the first network.
model = TransformerModel("../input/pytorch-longformer-large",'GRU').to(config['device'])
test_word_preds2 = inference(test_texts_loader_longformer, [0, 2, 3, 4, 5, 6], model, "../input/fp-test63")

# Sequence Datasets

We will create datasets that, instead of describing individual words or tokens, describe sequences of words. Within some heuristic constraints, every possible sub-sequence of words in a text will be converted to a dataset sample with the following attributes:
- features- sequence length, position, and various kinds of class probability predictions/statistics
- labels- whether the sequence matches exactly a discourse instance
- truePos- whether the sequence matches a discourse instance by competition criteria for true positive
- groups- the integer index of the text where the sequence is found
- wordRanges- the start and end word index of the sequence in the text

Sequence datasets are generated for each discourse type and for validation and submission datasets.


In [None]:
from collections import Counter
from bisect import bisect_left

# Percentile code taken from https://www.kaggle.com/vuxxxx/tensorflow-longformer-ner-postprocessing
# Thank Vu!
#
# The 99.5th percentile of the training span lengths (in words) of each discourse
# type is used as the maximum candidate length for that type.
# Increasing this constraint makes this step slower but generally increases performance.
train_df = pd.read_csv("../input/feedback-prize-2021/train.csv")
train_df['len'] = train_df['predictionstring'].map(lambda pred_string: len(pred_string.split()))
max_lens = train_df.groupby('discourse_type')['len'].quantile(.995)
MAX_SEQ_LEN = {name: int(max_lens[name]) for name in disc_type_to_ids}

# Per-class minimum 'B'egin probability a word needs before candidate sequences
# starting at that word are evaluated.
MIN_BEGIN_PROB = {
    'Claim': .35*0.8,
    'Concluding Statement': .15*1.0,
    'Counterclaim': .04*1.25,
    'Evidence': .1*0.8,
    'Lead': .32*1.0,
    'Position': .25*0.8,
    'Rebuttal': .01*1.25,
}

class SeqDataset(object):
    """Column-wise container for candidate word sequences.

    Every constructor argument is converted to a NumPy array and stored under
    the same name: ``features`` as float32, ``groups`` and ``wordRanges`` as
    int16, ``labels`` and ``truePos`` with an inferred dtype.
    """

    def __init__(self, features, labels, groups, wordRanges, truePos):
        columns = (
            ("features", features, np.float32),
            ("labels", labels, None),
            ("groups", groups, np.int16),
            ("wordRanges", wordRanges, np.int16),
            ("truePos", truePos, None),
        )
        for attr_name, values, dtype in columns:
            setattr(self, attr_name, np.array(values, dtype=dtype))

# Adapted from https://stackoverflow.com/questions/60467081/linear-interpolation-in-numpy-quantile
# Works on input that is already sorted, so a growing sequence does not have to be re-sorted for every quantile query.
def sorted_quantile(array, q):
    """Linearly interpolated quantile(s) ``q`` of the ascending-sorted ``array``.

    ``q`` must be a NumPy scalar or array with values in [0, 1]; the result has
    the shape of ``q``.
    """
    values = np.array(array)
    position = (len(values) - 1) * q
    lower = np.floor(position).astype(int)
    weight = position - lower
    # Step to the next element only when the position falls between two samples.
    upper = lower + (weight > 0).astype(int)
    low_val, high_val = values[lower], values[upper]
    return low_val + (high_val - low_val) * weight

def seq_dataset(disc_type, pred_indices=None, submit=False):
    """Build the candidate-sequence dataset for one discourse type.

    For every selected text the word-level predictions of the two models
    (globals ``test_word_preds`` and ``test_word_preds2``) are blended with
    ``ensemble_weights[disc_type]``. Every word whose 'B' probability for
    ``disc_type`` exceeds ``MIN_BEGIN_PROB[disc_type]`` starts candidate spans of
    1..``MAX_SEQ_LEN[disc_type]`` words, and each span becomes one row of
    ``N_FEATURES`` features.

    Args:
        disc_type: discourse class name (a key of ``disc_type_to_ids``).
        pred_indices: text indices to process; all texts when falsy.
        submit: when True no ground-truth labels are computed (``labels`` and
            ``truePos`` hold None). NOTE(review): the ``submit=False`` path reads
            ``valid_word_preds``, ``valid`` and ``test_dataset``, none of which is
            defined in the visible notebook - confirm before using it.

    Returns:
        SeqDataset with features, exact-match labels, text index per row
        (``groups``), (start, end) word range per row and true-positive flags.
    """
    # Label ids counted as "a new segment starts here": 'O' plus every 'B-' class
    begin_class_ids = [0, 1, 3, 5, 7, 9, 11, 13]
    # Only used below to size the default window; the per-text predictions are
    # always read from test_word_preds / test_word_preds2.
    word_preds = valid_word_preds if not submit else test_word_preds
    w = ensemble_weights[disc_type]


    window = pred_indices if pred_indices else range(len(word_preds))
    # Preallocated feature buffer; grown by doubling further down when full
    X = np.empty((int(1e6),N_FEATURES), dtype=np.float32)
    X_ind = 0
    y = []
    truePos = []
    wordRanges = []
    groups = []
    for text_i in window:
        text_preds, text_preds2 = np.array(test_word_preds[text_i]), np.array(test_word_preds2[text_i])

        # Blend the two models over the words both cover; when the first model
        # has more words, its extra tail is kept unblended.
        if len(text_preds) <= len(text_preds2):
            text_preds = w*text_preds + (1-w)*text_preds2[:len(text_preds)]
        else:
            text_preds[:len(text_preds2)] = w*text_preds[:len(text_preds2)] + (1-w)*text_preds2

        num_words = len(text_preds)

        # Text-level features, one pair per discourse type in disc_type_to_ids
        # order: mean B+I probability and relative position of the strongest 'B'.
        global_features, global_locs = [], []

        for dt in disc_type_to_ids:
            disc_begin, disc_inside = disc_type_to_ids[dt]

            gmean = (text_preds[:, disc_begin] + text_preds[:, disc_inside]).mean()
            global_features.append(gmean)
            global_locs.append(np.argmax(text_preds[:, disc_begin])/float(num_words))

        disc_begin, disc_inside = disc_type_to_ids[disc_type]

        # The probability that a word corresponds to either a 'B'-egin or 'I'-nside token for a class
        # (the lambda's parameter shadows the outer `word_preds` name)
        prob_or = lambda word_preds: word_preds[:,disc_begin] + word_preds[:,disc_inside]

        if not submit:
            gt_idx = set()
            gt_arr = np.zeros(num_words, dtype=int)
            text_gt = valid.loc[valid.id == test_dataset.id.values[text_i]]
            disc_gt = text_gt.loc[text_gt.discourse_type == disc_type]

            # Represent the discourse instance locations in a hash set and an integer array for speed
            for row_i, row in enumerate(disc_gt.iterrows()):
                splt = row[1]['predictionstring'].split()
                start, end = int(splt[0]), int(splt[-1]) + 1
                gt_idx.add((start, end))
                gt_arr[start:end] = row_i + 1
            gt_lens = np.bincount(gt_arr)

        # Iterate over every sub-sequence in the text
        quants = np.linspace(0,1,7)
        prob_begins = np.copy(text_preds[:,disc_begin])
        min_begin = MIN_BEGIN_PROB[disc_type]
        for pred_start in range(num_words):
            prob_begin = prob_begins[pred_start]
            if prob_begin > min_begin:
                # Sorted B+I probabilities of the words in the current span,
                # extended by one word per inner iteration
                begin_or_inside = []
                for pred_end in range(pred_start+1,min(num_words+1, pred_start+MAX_SEQ_LEN[disc_type]+1)):

                    # new_prob is a one-element array for the word just added
                    new_prob = prob_or(text_preds[pred_end-1:pred_end])
                    insert_i = bisect_left(begin_or_inside, new_prob)
                    begin_or_inside.insert(insert_i, new_prob[0])

                    # Generate features for a word sub-sequence

                    # The length and position of start/end of the sequence
                    features = [pred_end - pred_start, pred_start / float(num_words), pred_end / float(num_words)]

                    # 7 evenly spaced quantiles of the distribution of relevant class probabilities for this sequence
                    features.extend(list(sorted_quantile(begin_or_inside, quants)))

                    # The probability that words on either edge of the current sub-sequence belong to the class of interest
                    features.append(prob_or(text_preds[pred_start-1:pred_start])[0] if pred_start > 0 else 0)
                    features.append(prob_or(text_preds[pred_end:pred_end+1])[0] if pred_end < num_words else 0)
                    features.append(prob_or(text_preds[pred_start-2:pred_start-1])[0] if pred_start > 1 else 0)
                    features.append(prob_or(text_preds[pred_end+1:pred_end+2])[0] if pred_end < (num_words-1) else 0)

                    # The probability that the first word corresponds to a 'B'-egin token
                    features.append(text_preds[pred_start,disc_begin])
                    # 'B' probability of the preceding word. NOTE(review): for
                    # pred_start == 0 the index -1 reads the LAST word; left as is
                    # because the trained models presumably saw the same value - confirm.
                    features.append(text_preds[pred_start-1,disc_begin])

                    # Probability mass on 'O' or any 'B' class for the word right after
                    # the span (1.0 when the span ends the text)
                    if pred_end < num_words:
                        features.append(text_preds[pred_end, begin_class_ids].sum())
                    else:
                        features.append(1.0)

                    s = prob_or(text_preds[pred_start:pred_end])
                    features.append(np.argmax(s)/features[0]) # maximum point location on sequence
                    features.append(np.argmin(s)/features[0]) # minimum point location on sequence
                    # Mean squared difference between neighbouring words' probabilities
                    instability = 0
                    if len(s) > 1:
                        instability = (np.diff(s)**2).mean()
                    features.append(instability)

                    # Text-level features; the locations are made relative to the span start
                    features.extend(list(global_features))
                    features.extend(list([loc - features[1] for loc in global_locs]))

                    exact_match = (pred_start, pred_end) in gt_idx if not submit else None

                    # True positive: the span and a ground-truth instance each
                    # cover at least half of the other
                    if not submit:
                        true_pos = False
                        for match_cand, count in Counter(gt_arr[pred_start:pred_end]).most_common(2):
                            if match_cand != 0 and count / float(pred_end - pred_start) >= .5 and float(count) / gt_lens[match_cand] >= .5: true_pos = True
                    else: true_pos = None

                    # For efficiency, use a numpy array instead of a list that doubles in size when full to conserve constant "append" time complexity
                    if X_ind >= X.shape[0]:
                        new_X = np.empty((X.shape[0]*2,N_FEATURES), dtype=np.float32)
                        new_X[:X.shape[0]] = X
                        X = new_X
                    X[X_ind] = features
                    X_ind += 1

                    y.append(exact_match)
                    truePos.append(true_pos)
                    wordRanges.append((np.int16(pred_start), np.int16(pred_end)))
                    groups.append(np.int16(text_i))

    # Only the filled part of the buffer is handed to the dataset
    return SeqDataset(X[:X_ind], y, groups, wordRanges, truePos)

# predict strings and submit

In [None]:
from joblib import Parallel, delayed
from multiprocessing import Manager
from sklearn.model_selection import cross_val_score, GroupKFold
from sklearn.ensemble import GradientBoostingClassifier
from skopt.space import Real
from skopt import gp_minimize
import sys
import xgboost

# NOTE(review): NUM_FOLDS, prob_cache and clfs are not read by any code visible
# in this notebook; presumably leftovers from the training/tuning version.
NUM_FOLDS = 8

# Silence warnings whose message matches the "ragged nested sequences" pattern.
warnings.filterwarnings('ignore', '.*ragged nested sequences*',)

prob_cache = {} # Cache each fold's probability predictions for speed
clfs = []  # Each fold will add its classifier here
# Predict sub-sequences for a discourse type and set of train/test texts
def predict_strings(disc_type, probThresh, test_groups, train_ind=None, submit=False):
    """Select non-overlapping word spans of ``disc_type`` for the given texts.

    For each text index in ``test_groups`` all candidate spans are scored, then
    taken greedily from the highest probability down until the score falls below
    ``probThresh``. A span is accepted only if at most 15% of its words are
    already covered by previously accepted spans of this call.

    ``train_ind`` and ``submit`` are accepted for compatibility but not used;
    candidates are always built in submit mode from ``test_texts``.

    Returns:
        List of ``(text id, disc_type, predictionstring)`` tuples, where the
        prediction string is the space-joined word indices of the span.
    """
    accepted = []
    texts = test_texts
    text_ids = texts.id.values

    for text_idx in tqdm(test_groups):
        # Candidate spans of this text with their true-positive probabilities
        testDs = seq_dataset(disc_type, pred_indices=[text_idx], submit=True)
        span_probs = get_tp_prob(testDs, disc_type)
        span_ranges = testDs.wordRanges[testDs.groups == text_idx]

        text_id = text_ids[text_idx]
        num_words = len(texts.loc[texts.id == text_id].iloc[0].text.split())
        # 1 where a word already belongs to an accepted span
        covered = np.zeros(num_words)

        # Visit the candidates from most to least probable
        ranked = reversed(sorted(zip(span_probs, [tuple(span) for span in span_ranges])))
        for prob, (start, end) in ranked:
            # Everything after this point scores even lower
            if prob < probThresh:
                break

            overlap = np.sum(covered[start:end])
            if overlap / (end - start) <= 0.15:
                covered[start:end] = 1
                accepted.append((text_id, disc_type, ' '.join(map(str, range(start, end)))))
    return accepted

def sub_df(string_preds):
    """Wrap ``(id, class, predictionstring)`` tuples in a submission-shaped DataFrame."""
    submission_columns = ['id','class','predictionstring']
    return pd.DataFrame(data=string_preds, columns=submission_columns)

Load the tuned probability thresholds from tuning result files, and make sub-sequence predictions!

In [None]:
# One group per test text, in the order of the word-prediction lists.
uniqueSubmitGroups = range(len(test_word_preds))

# Predict spans separately for every discourse type with its own threshold and
# stack the per-type frames into one submission table.
sub = pd.concat([sub_df(predict_strings(disc_type, thresholds[disc_type],
                                        uniqueSubmitGroups, submit=True)) for disc_type in disc_type_to_ids ]).reset_index(drop=True)

In [None]:
# Write the submission file (without the index column) and preview its first rows.
sub.to_csv("submission.csv", index=False)
sub.head()