In [1]:
!pip install -q "../input/autocorrect/autocorrect-2.6.1.tar"

from transformers import AutoConfig, AutoModel, AutoTokenizer
from torch.utils.data.sampler import *
from joblib import Parallel, delayed
from autocorrect import Speller
from tqdm import tqdm
import torch.nn as nn
import glob
import pandas as pd
import numpy as np
import torch
import sys
import os

# Shared autocorrect speller (English, `fast=True`); it is applied to essay text
# by `_prepare_test_data_helper` when `spell_check` is requested.
spell_correct = Speller(lang='en', fast=True)
# Put the `tez` library directory on the import path before importing it.
sys.path.append("../input/tez-lib/")
import tez
import gc
# Make sure automatic garbage collection is switched on; the inference loop
# additionally calls gc.collect() after every fold.
gc.enable()



In [2]:
# ARGS
# BIO label scheme: every discourse type gets a "B-" (begin) id followed by an
# "I-" (inside) id, in the order listed below; "O" marks tokens outside any
# span and "PAD" carries the -100 id.
target_id_map = {
    f"{tag}-{discourse}": 2 * position + shift
    for position, discourse in enumerate(
        (
            "Lead",
            "Position",
            "Evidence",
            "Claim",
            "Concluding Statement",
            "Counterclaim",
            "Rebuttal",
        )
    )
    for shift, tag in enumerate(("B", "I"))
}
target_id_map["O"] = 14
target_id_map["PAD"] = -100

# Reverse lookup: class id -> label string.
id_target_map = {class_id: label for label, class_id in target_id_map.items()}
class args1:
    """Settings for the Longformer ensemble member."""

    # Competition data root; test essays are read from its "test" sub-folder.
    input_path = "../input/feedback-prize-2021/"
    output = "."

    # Backbone config/tokenizer directory and the fold-checkpoint directory.
    model = "../input/longformerlarge4096/longformer-large-4096"
    tez_model = "../input/spellchecker-fold0"

    # Token budget per essay (special tokens included) and inference batch size.
    max_len = 1600
    batch_size = 4
    
class args2:
    """Settings for the Longformer (TriviaQA-finetuned backbone) ensemble member."""

    # Competition data root; test essays are read from its "test" sub-folder.
    input_path = "../input/feedback-prize-2021/"
    output = "."

    # Backbone config/tokenizer directory and the fold-checkpoint directory.
    model = "../input/triviabase/longformer-large-4096-finetuned-triviaqa"
    tez_model = "../input/trivia4096"

    # Token budget per essay (special tokens included) and inference batch size.
    max_len = 1600
    batch_size = 4

class args3:
    """Settings for the DeBERTa-xlarge ensemble member."""

    # Competition data root; test essays are read from its "test" sub-folder.
    input_path = "../input/feedback-prize-2021/"
    output = "."

    # Backbone config/tokenizer directory and the fold-checkpoint directory.
    model = "../input/deberta-xlarge/"
    tez_model = "../input/at-model-deberta-xlarge/data_deberta"

    # Token budget per essay (special tokens included) and inference batch size.
    max_len = 1600
    batch_size = 4
    
class args4:
    """Settings for the BigBird ensemble member."""

    # Competition data root; test essays are read from its "test" sub-folder.
    input_path = "../input/feedback-prize-2021/"
    output = "."

    # Backbone config/tokenizer directory and the fold-checkpoint directory.
    model = "../input/bigbirdrobertalarge/bigbird-roberta-large/"
    tez_model = "../input/at-model-sufferin-bird/bigbird"

    # Token budget per essay (special tokens included) and inference batch size.
    max_len = 1536
    batch_size = 2
    
class args5:
    """Settings for the Funnel ensemble member."""

    # Competition data root; test essays are read from its "test" sub-folder.
    input_path = "../input/feedback-prize-2021/"
    output = "."

    # Backbone config/tokenizer directory and the fold-checkpoint directory.
    model = "../input/funnelbasefiles/large"
    tez_model = "../input/funnellarge"

    # Token budget per essay (special tokens included) and inference batch size.
    max_len = 1536
    batch_size = 2

# class args6: #crawl model
#     input_path = "../input/feedback-prize-2021/"
#     model = "../input/longformerlarge4096/longformer-large-4096/"
#     tez_model= "../input/at-model-diff"
#     output = "."
#     batch_size = 4
#     max_len = 1536
    
# class args7: #at model different data
#     input_path = "../input/feedback-prize-2021/"
#     model = "../input/triviaqa/longformer-large-4096-finetuned-triviaqa"
#     tez_model= "../input/at-model-trivia-2/different_data/"
#     output = "."
#     batch_size = 4
#     max_len = 1536
    
class args8:
    """Settings for the DeBERTa-base ensemble member."""

    # Competition data root; test essays are read from its "test" sub-folder.
    input_path = "../input/feedback-prize-2021/"
    output = "."

    # Backbone config/tokenizer directory and the fold-checkpoint directory.
    model = "../input/debertabasehf/deberta-base"
    tez_model = "../input/debertaallfolds"

    # Token budget per essay (special tokens included) and inference batch size.
    max_len = 1600
    batch_size = 4
    
class args9:
    """Settings for the DeBERTa-large ensemble member."""

    # Competition data root; test essays are read from its "test" sub-folder.
    input_path = "../input/feedback-prize-2021/"
    output = "."

    # Backbone config/tokenizer directory and the fold-checkpoint directory.
    model = "../input/debertalargesample/deberta-large"
    tez_model = "../input/debertalarge"

    # Token budget per essay (special tokens included) and inference batch size.
    max_len = 1600
    batch_size = 4
    
class args10:
    """Settings for the second DeBERTa-large ensemble member.

    Shares the `args9` backbone but loads checkpoints from the
    `debertasmoothlarge` directory (presumably a label-smoothing variant,
    going by the directory name -- confirm with the training notebook).
    """

    # Competition data root; test essays are read from its "test" sub-folder.
    input_path = "../input/feedback-prize-2021/"
    output = "."

    # Backbone config/tokenizer directory and the fold-checkpoint directory.
    model = "../input/debertalargesample/deberta-large"
    tez_model = "../input/debertasmoothlarge"

    # Token budget per essay (special tokens included) and inference batch size.
    max_len = 1600
    batch_size = 4

In [3]:
# GEN UTILS
class FeedbackDataset:
    """Index-addressable view over tokenized essays for inference.

    Each item is the essay's token ids wrapped in the tokenizer's CLS/SEP
    ids and cut so the result never exceeds `max_len` tokens, together with
    an all-ones attention mask of the same length.
    """

    def __init__(self, samples, max_len, tokenizer):
        self.samples, self.max_len, self.tokenizer = samples, max_len, tokenizer
        self.length = len(samples)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        prefix = [self.tokenizer.cls_token_id]
        token_ids = prefix + self.samples[idx]["input_ids"]
        # Leave one slot free for the closing SEP token.
        token_ids = token_ids[: self.max_len - 1]
        token_ids = token_ids + [self.tokenizer.sep_token_id]
        return {"ids": token_ids, "mask": [1] * len(token_ids)}
    
class Collate:
    """Batch collator that pads variable-length samples to a common width.

    Padding goes on the side named by `tokenizer.padding_side`; ids are
    filled with `tokenizer.pad_token_id` and masks with 0. Both outputs are
    returned as `torch.long` tensors under the keys "ids" and "mask".
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        id_rows = [item["ids"] for item in batch]
        mask_rows = [item["mask"] for item in batch]
        width = max(len(row) for row in id_rows)
        pad_on_right = self.tokenizer.padding_side == "right"

        def _pad(row, filler):
            # Extend one row to the batch width on the configured side.
            padding = (width - len(row)) * [filler]
            return row + padding if pad_on_right else padding + row

        padded_ids = [_pad(row, self.tokenizer.pad_token_id) for row in id_rows]
        padded_masks = [_pad(row, 0) for row in mask_rows]
        return {
            "ids": torch.tensor(padded_ids, dtype=torch.long),
            "mask": torch.tensor(padded_masks, dtype=torch.long),
        }
    
def _prepare_test_data_helper(args, tokenizer, ids, spell_check):
    test_samples = []
    for idx in ids:
        filename = os.path.join(args.input_path, "test", idx + ".txt")
        with open(filename, "r") as f:
            text = f.read()
        if spell_check:
            text = spell_correct(text)
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=False,
            return_offsets_mapping=True,
        )
        input_ids = encoded_text["input_ids"]
        offset_mapping = encoded_text["offset_mapping"]

        sample = {
            "id": idx,
            "input_ids": input_ids,
            "text": text,
            "offset_mapping": offset_mapping,
        }
        test_samples.append(sample)
    return test_samples

def prepare_test_data(df, tokenizer, args, spell_check = False):
    """Tokenize every unique essay id in `df["id"]` using four worker processes.

    The unique ids are split into four chunks, each chunk is handed to
    `_prepare_test_data_helper` in its own multiprocessing job, and the
    per-chunk sample lists are concatenated in chunk order.
    """
    id_chunks = np.array_split(df["id"].unique(), 4)
    jobs = (
        delayed(_prepare_test_data_helper)(args, tokenizer, chunk, spell_check)
        for chunk in id_chunks
    )
    per_chunk = Parallel(n_jobs=4, backend="multiprocessing")(jobs)
    return [sample for chunk_samples in per_chunk for sample in chunk_samples]

In [4]:
# MODELS
class FeedbackModelLongformer(tez.Model):
    """Token classifier: Longformer backbone followed by a linear head.

    The backbone is instantiated from its config only (`AutoModel.from_config`),
    so weights are expected to come from a checkpoint loaded afterwards.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        backbone_config = AutoConfig.from_pretrained(model_name)
        backbone_config.update(
            dict(
                output_hidden_states=True,
                hidden_dropout_prob=0.1,
                layer_norm_eps=1e-7,
                add_pooling_layer=False,
            )
        )
        self.transformer = AutoModel.from_config(backbone_config)
        self.output = nn.Linear(backbone_config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return per-token class probabilities plus placeholder loss/metrics.

        The 3-tuple `(probs, 0, {})` presumably follows tez's
        (output, loss, metrics) convention -- confirm against tez.Model.
        """
        hidden_states = self.transformer(ids, mask).last_hidden_state
        probs = torch.softmax(self.output(hidden_states), dim=-1)
        return probs, 0, {}
    
class FeedbackModelDeberta(tez.Model):
    """Token classifier: DeBERTa backbone followed by a linear head.

    The backbone is instantiated from its config only (`AutoModel.from_config`),
    so weights are expected to come from a checkpoint loaded afterwards.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        backbone_config = AutoConfig.from_pretrained(model_name)
        backbone_config.update(
            dict(
                output_hidden_states=True,
                hidden_dropout_prob=0.1,
                layer_norm_eps=1e-7,
                add_pooling_layer=False,
            )
        )
        self.transformer = AutoModel.from_config(backbone_config)
        self.output = nn.Linear(backbone_config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return per-token class probabilities plus placeholder loss/metrics.

        The 3-tuple `(probs, 0, {})` presumably follows tez's
        (output, loss, metrics) convention -- confirm against tez.Model.
        """
        hidden_states = self.transformer(ids, mask).last_hidden_state
        probs = torch.softmax(self.output(hidden_states), dim=-1)
        return probs, 0, {}

class FeedbackModelBigbird(tez.Model):
    """Token classifier: BigBird backbone followed by a linear head.

    Differs from the sibling model classes only in setting
    `attention_type="original_full"` on the backbone config. The backbone is
    instantiated from its config only (`AutoModel.from_config`), so weights
    are expected to come from a checkpoint loaded afterwards.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        backbone_config = AutoConfig.from_pretrained(model_name)
        backbone_config.update(
            dict(
                output_hidden_states=True,
                hidden_dropout_prob=0.1,
                layer_norm_eps=1e-7,
                add_pooling_layer=False,
                attention_type="original_full",
            )
        )
        self.transformer = AutoModel.from_config(backbone_config)
        self.output = nn.Linear(backbone_config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return per-token class probabilities plus placeholder loss/metrics.

        The 3-tuple `(probs, 0, {})` presumably follows tez's
        (output, loss, metrics) convention -- confirm against tez.Model.
        """
        hidden_states = self.transformer(ids, mask).last_hidden_state
        probs = torch.softmax(self.output(hidden_states), dim=-1)
        return probs, 0, {}
    
class FeedbackModelFunnel(tez.Model):
    """Token classifier: Funnel backbone followed by a linear head.

    The backbone is instantiated from its config only (`AutoModel.from_config`),
    so weights are expected to come from a checkpoint loaded afterwards.
    """

    def __init__(self, model_name, num_labels):
        super().__init__()
        self.model_name = model_name
        self.num_labels = num_labels

        backbone_config = AutoConfig.from_pretrained(model_name)
        backbone_config.update(
            dict(
                output_hidden_states=True,
                hidden_dropout_prob=0.1,
                layer_norm_eps=1e-7,
                add_pooling_layer=False,
            )
        )
        self.transformer = AutoModel.from_config(backbone_config)
        self.output = nn.Linear(backbone_config.hidden_size, self.num_labels)

    def forward(self, ids, mask):
        """Return per-token class probabilities plus placeholder loss/metrics.

        The 3-tuple `(probs, 0, {})` presumably follows tez's
        (output, loss, metrics) convention -- confirm against tez.Model.
        """
        hidden_states = self.transformer(ids, mask).last_hidden_state
        probs = torch.softmax(self.output(hidden_states), dim=-1)
        return probs, 0, {}

In [5]:
# BLEND
def _build_test_inputs(frame, cfg, spell_check=False):
    """Create (tokenizer, samples, collate_fn, dataset) for one model config.

    Loads the tokenizer from `cfg.model`, tokenizes the essays listed in
    `frame`, and wraps the result in a `Collate` / `FeedbackDataset` pair
    limited to `cfg.max_len` tokens.
    """
    tok = AutoTokenizer.from_pretrained(cfg.model)
    samples = prepare_test_data(frame, tok, cfg, spell_check=spell_check)
    return tok, samples, Collate(tokenizer=tok), FeedbackDataset(samples, cfg.max_len, tok)


# The essay ids to score come from the sample submission file.
# (Alternative source used for local validation:
#  df = pd.read_csv("../input/creating-folds-properly-hopefully-p/train_folds.csv"))
df = pd.read_csv(os.path.join("../input/feedback-prize-2021/", "sample_submission.csv"))
df_ids = df["id"].unique()

# Longformer inputs (spell-corrected text); `test_samples` is also the sample
# list that the decoding and submission cells write to and read from.
tokenizer, test_samples, collate, test_dataset = _build_test_inputs(df, args1, spell_check=True)

# DeBERTa-xlarge inputs (spell-corrected text).
tokenizer3, test_samples3, collate3, test_dataset3 = _build_test_inputs(df, args3, spell_check=True)

# Disabled ensemble members (BigBird / Funnel); re-enable together with their `models` entries:
# tokenizer4, test_samples4, collate4, test_dataset4 = _build_test_inputs(df, args4, spell_check=True)
# tokenizer5, test_samples5, collate5, test_dataset5 = _build_test_inputs(df, args5, spell_check=True)

# DeBERTa-base and DeBERTa-large inputs (no spell correction).
tokenizer8, test_samples8, collate8, test_dataset8 = _build_test_inputs(df, args8)
tokenizer9, test_samples9, collate9, test_dataset9 = _build_test_inputs(df, args9)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Ensemble table: member id -> folds to load and the member's blend weight.
# A member's weight is split evenly across its folds; the weights of all
# active members must sum to 1.0. Iteration order matters: the first
# member/fold initialises the accumulator in the inference loop.
models = {
    1: {"folds_used": [0, 3], "weight": 0.10},  # longformer
    2: {"folds_used": [1, 3], "weight": 0.10},  # longformer trivia
    3: {"folds_used": [1, 3, 4], "weight": 0.30},  # deberta
    # Disabled members, kept for reference:
    # 4: {"folds_used": [0], "weight": 1/4},       # bigbird
    # 5: {"folds_used": [0], "weight": 1/4},       # funnel
    # 6: {"folds_used": [2]},                      # crawl model
    # 7: {"folds_used": [2]},                      # at model diff data
    # 8: {"folds_used": [1, 3], "weight": .05},    # deberta base
    9: {"folds_used": [0, 2, 4], "weight": 0.20},  # deberta large
    10: {"folds_used": [0, 1, 3, 4], "weight": 0.30},  # deberta large, smooth
}

# Weighted ensemble inference.
# For every (member, fold) pair in `models`: build the network, load that
# fold's checkpoint, predict on the test set batch by batch, scale the
# probabilities by weight / number_of_folds and add them into `raw_preds`.
#
# raw_preds : list with one array per batch holding the running weighted sum.
# checksum  : running sum of the per-fold weights (printed in the next cell;
#             it should come out at ~1.0 when the member weights sum to 1.0).
raw_preds = []
checksum = 0
total_folds = 0 
for model in models.keys():
    total_folds += len(models[model]["folds_used"])
print("Total folds: ", total_folds)
# NOTE(review): `results` is not used anywhere in this cell.
results = []
for i, model_ in enumerate(models.keys()):
    folds_used = models[model_]["folds_used"]
    for j, fold_ in enumerate(folds_used):
        current_idx = 0
        # Member dispatch. These are independent `if`s, so every condition is
        # evaluated, but at most one matches a given member id.
        if model_ == 1:
            model = FeedbackModelLongformer(model_name=args1.model, num_labels=len(target_id_map) - 1)
            # Folds 0 and 3 live under non-standard file names.
            if (fold_) == 0:
                model.load('../input/spellchecker-fold0/model_0 (1).bin', weights_only=True)
            elif (fold_) == 3:
                model.load('../input/spellchecker-fold0/drive-download-20220131T060741Z-002/model_3.bin', weights_only=True)
            else:
                model.load(os.path.join(args1.tez_model, f"model_{fold_}.bin"), weights_only=True)
            preds_iter = model.predict(test_dataset, batch_size=args1.batch_size, n_jobs=-1, collate_fn=collate)
                
        if model_ == 2:
            model = FeedbackModelLongformer(model_name=args2.model, num_labels=len(target_id_map) - 1)
            if fold_ == 0:
                model.load(os.path.join(args2.tez_model, f"model_{fold_} (2).bin"), weights_only=True)
            else:
                model.load(os.path.join(args2.tez_model, f"model_{fold_}.bin"), weights_only=True)
            # NOTE(review): uses args1.batch_size (same value as args2.batch_size today).
            preds_iter = model.predict(test_dataset, batch_size=args1.batch_size, n_jobs=-1, collate_fn=collate)
                
        if model_ == 3:
            model = FeedbackModelDeberta(model_name=args3.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(args3.tez_model, f"model_1024_debberta_{fold_}.bin"), weights_only=True)
            preds_iter = model.predict(test_dataset3, batch_size=args3.batch_size, n_jobs=-1, collate_fn=collate3)
            
        # NOTE(review): the branches for members 4-7 reference names that only
        # exist in commented-out code (test_dataset4/collate4, test_dataset5/
        # collate5, args6, args7). They are unreachable while those ids are
        # absent from `models`, and would raise NameError if re-enabled as-is.
        if model_ == 4:
            model = FeedbackModelBigbird(model_name=args4.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(args4.tez_model, f"model_1024_bigbird_10_{fold_}.bin"), weights_only=True)
            preds_iter = model.predict(test_dataset4, batch_size=args4.batch_size, n_jobs=-1, collate_fn=collate4)

        if model_ == 5:
            model = FeedbackModelFunnel(model_name=args5.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(args5.tez_model, f"funnel{fold_}.bin"), weights_only=True)
            preds_iter = model.predict(test_dataset5, batch_size=args5.batch_size, n_jobs=-1, collate_fn=collate5)
        
        if model_ == 6:
            model = FeedbackModelLongformer(model_name=args6.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(args6.tez_model, f"crawl-model_1048_10_{fold_}.bin"), weights_only=True)
            preds_iter = model.predict(test_dataset, batch_size=args6.batch_size, n_jobs=-1, collate_fn=collate)
        
        if model_ == 7:
            model = FeedbackModelLongformer(model_name=args7.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(args7.tez_model, f"model_1400_trivia_{fold_}.bin"), weights_only=True)
            preds_iter = model.predict(test_dataset, batch_size=args7.batch_size, n_jobs=-1, collate_fn=collate)
        
        # NOTE(review): members 8, 9 and 10 predict on `test_dataset3` (the
        # spell-corrected DeBERTa-xlarge inputs), not on test_dataset8 /
        # test_dataset9, which are built but never consumed. The element-wise
        # blend below needs identical token alignment across members, so this
        # may be deliberate -- confirm before changing.
        if model_ == 8:
            model = FeedbackModelDeberta(model_name=args8.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(args8.tez_model, f"deberta{fold_}.bin"), weights_only=True)
            preds_iter = model.predict(test_dataset3, batch_size=args8.batch_size, n_jobs=-1, collate_fn=collate8)
        
        if model_ == 9:
            model = FeedbackModelDeberta(model_name=args9.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(args9.tez_model, f"debertaLarge{fold_}.bin"), weights_only=True)
            preds_iter = model.predict(test_dataset3, batch_size=args9.batch_size, n_jobs=-1, collate_fn=collate9)
            
        if model_ == 10:
            model = FeedbackModelDeberta(model_name=args10.model, num_labels=len(target_id_map) - 1)
            model.load(os.path.join(args10.tez_model, f"debertaSmoothlLarge{fold_}.bin"), weights_only=True)
            # NOTE(review): uses args9.batch_size (same value as args10.batch_size today).
            preds_iter = model.predict(test_dataset3, batch_size=args9.batch_size, n_jobs=-1, collate_fn=collate9)
            
        # Each fold contributes weight / number_of_folds of its member.
        print(f"Predicting Model: {model_}\tFold: {fold_}\tWeight: {(1/len(folds_used)) * models[model_]['weight']}")
        checksum += (1/len(folds_used)) * models[model_]['weight']
        current_idx = 0
        for preds in preds_iter:
            preds = preds.astype(np.float32)
#             preds = preds / total_folds
            preds = preds * (1/len(folds_used)) * models[model_]['weight']
            # The very first member/fold creates one accumulator per batch;
            # every later one adds into the accumulator at the same batch index.
            # Assumes all members yield the same number of batches with
            # broadcast-compatible shapes -- TODO confirm (nothing checks it).
            if i==0 and j==0:
                raw_preds.append(preds)
            else:
#                 if preds.shape[1]>np.array(raw_preds,dtype=object).shape[2]:
#                     preds = preds[:,:np.array(raw_preds,dtype=object).shape[2],:]
                raw_preds[current_idx] += preds
                current_idx += 1
        # Release cached GPU memory and Python garbage before the next fold.
        torch.cuda.empty_cache()
        gc.collect()

Token indices sequence length is longer than the specified maximum sequence length for this model (1299 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (742 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (799 > 512). Running this sequence through the model will result in indexing errors


Total folds:  14
Predicting Model: 1	Fold: 0	Weight: 0.05


100%|██████████| 2/2 [00:02<00:00,  1.24s/it, stage=test]


Predicting Model: 1	Fold: 3	Weight: 0.05


100%|██████████| 2/2 [00:01<00:00,  1.40it/s, stage=test]


Predicting Model: 2	Fold: 1	Weight: 0.05


100%|██████████| 2/2 [00:01<00:00,  1.39it/s, stage=test]


Predicting Model: 2	Fold: 3	Weight: 0.05


100%|██████████| 2/2 [00:01<00:00,  1.40it/s, stage=test]


Predicting Model: 3	Fold: 1	Weight: 0.09999999999999999


100%|██████████| 2/2 [00:03<00:00,  1.76s/it, stage=test]


Predicting Model: 3	Fold: 3	Weight: 0.09999999999999999


100%|██████████| 2/2 [00:03<00:00,  1.70s/it, stage=test]


Predicting Model: 3	Fold: 4	Weight: 0.09999999999999999


100%|██████████| 2/2 [00:03<00:00,  1.72s/it, stage=test]


Predicting Model: 9	Fold: 0	Weight: 0.06666666666666667


100%|██████████| 2/2 [00:01<00:00,  1.09it/s, stage=test]


Predicting Model: 9	Fold: 2	Weight: 0.06666666666666667


100%|██████████| 2/2 [00:01<00:00,  1.05it/s, stage=test]


Predicting Model: 9	Fold: 4	Weight: 0.06666666666666667


100%|██████████| 2/2 [00:01<00:00,  1.09it/s, stage=test]


Predicting Model: 10	Fold: 0	Weight: 0.075


100%|██████████| 2/2 [00:01<00:00,  1.08it/s, stage=test]


Predicting Model: 10	Fold: 1	Weight: 0.075


100%|██████████| 2/2 [00:01<00:00,  1.10it/s, stage=test]


Predicting Model: 10	Fold: 3	Weight: 0.075


100%|██████████| 2/2 [00:01<00:00,  1.08it/s, stage=test]


Predicting Model: 10	Fold: 4	Weight: 0.075


100%|██████████| 2/2 [00:01<00:00,  1.10it/s, stage=test]


In [6]:
# Sanity checks on the blend: the accumulated fold weights should be ~1.0 ...
print(checksum)
# print(np.array(preds).shape)
# ... and so should the class probabilities of one blended token
# (first batch, first essay, first token position).
print(np.array(raw_preds,dtype=object)[0][0][0].sum())

0.9999999999999998
1.0000001


In [7]:
# Turn the blended per-batch probabilities into per-essay label sequences:
# the arg-max class id and its probability are taken for every token position.
final_preds = []
final_scores = []
for batch_probs in raw_preds:
    final_preds.extend(np.argmax(batch_probs, axis=2).tolist())
    final_scores.extend(np.max(batch_probs, axis=2).tolist())

# Attach the decoded labels/scores to each sample. Position 0 is dropped
# because it belongs to the CLS token that the dataset prepends.
for j, sample in enumerate(test_samples):
    sample["preds"] = [id_target_map[class_id] for class_id in final_preds[j][1:]]
    sample["pred_scores"] = final_scores[j][1:]

In [8]:
# SUBMISSION
def link_(oof, claxx = 'Evidence'):
    """Merge neighbouring predicted spans of one discourse class per essay.

    Within each essay, the spans of class `claxx` are walked in row order.
    A span is appended to the group currently being built unless

    * its first word index lies more than 1 past the group's last word, or
    * its first word index lies more than 26 past the group's first word,

    in which case the group is emitted and a new one is started.

    Args:
        oof: DataFrame with columns "id", "class" and "predictionstring"
            (space-separated word indices).
        claxx: discourse class whose spans are linked.

    Returns:
        DataFrame with the linked `claxx` rows outer-merged with the
        untouched rows of every other class. An empty `oof` is returned
        unchanged.

    Changes relative to the previous implementation:
        * Merged prediction strings no longer contain the internal "-1"
          separator that was used while concatenating spans.
        * Rows with an empty prediction string are skipped rather than being
          able to index past the end of the working list.
        * The unused index assignment on the filtered frame and the
          single-iteration threshold/class loops were removed.
    """
    if not len(oof):
        return oof

    gap_thresh = 1    # max allowed gap between consecutive spans
    span_thresh = 26  # max distance from a group's first word to a new span's start

    is_target = oof['class'] == claxx
    target_rows = oof[is_target]
    other_rows = oof[~is_target]

    linked = []
    for idv in tqdm(oof['id'].unique(), desc='link_evidence', leave=False):
        doc_rows = target_rows[target_rows['id'] == idv]
        if len(doc_rows) == 0:
            continue

        spans = [[int(x) for x in ps.split()] for ps in doc_rows['predictionstring']]
        spans = [span for span in spans if span]
        if not spans:
            continue

        group = list(spans[0])
        for span in spans[1:]:
            too_far = span[0] > group[-1] + gap_thresh
            too_long = span[0] - group[0] > span_thresh
            if too_far or too_long:
                linked.append((idv, claxx, " ".join(str(x) for x in group)))
                group = list(span)
            else:
                group.extend(span)
        linked.append((idv, claxx, " ".join(str(x) for x in group)))

    roof = pd.DataFrame(linked, columns=['id', 'class', 'predictionstring'])
    roof = roof.merge(other_rows, how='outer')
    return roof

# proba_thresh = {
#     "Lead": 0.65,
#     "Position": 0.55,
#     "Evidence": 0.6,
#     "Claim": 0.55,
#     "Concluding Statement": 0.7,
#     "Counterclaim": 0.5,
#     "Rebuttal": 0.55,
# }
# Per-class span filters applied when building the submission:
# (discourse class, minimum mean token probability, minimum number of words).
_SPAN_FILTERS = (
    ("Lead", 0.687, 9),
    ("Position", 0.537, 5),
    ("Evidence", 0.637, 14),
    ("Claim", 0.537, 3),
    ("Concluding Statement", 0.687, 11),
    ("Counterclaim", 0.537, 6),
    ("Rebuttal", 0.537, 4),
)

# Minimum mean token probability a span needs to be kept.
proba_thresh = {label: min_proba for label, min_proba, _ in _SPAN_FILTERS}

# Minimum number of words a span needs to be kept.
min_thresh = {label: min_words for label, _, min_words in _SPAN_FILTERS}

def _extract_phrases(preds, scores, offset_mapping, text):
    """Group consecutive tokens into labelled phrases.

    A phrase starts at any token; its label is the token's label with the
    "B-"/"I-" prefix removed ("O" stays "O"). It is extended while the
    following tokens carry the matching continuation label ("I-<label>", or
    "O" for an "O" phrase).

    Returns a list of (phrase_text, char_start, char_end, label, token_scores).

    The end offset is initialised from the phrase's first token. Previously
    it was only assigned when a continuation token was found, so a
    single-token phrase reused the end offset left over from an earlier
    phrase (or an earlier essay), and the very first such phrase was dropped.
    """
    phrases = []
    n_tokens = len(offset_mapping)
    idx = 0
    while idx < n_tokens:
        start, end = offset_mapping[idx]
        label = preds[idx][2:] if preds[idx] != "O" else "O"
        continuation = "O" if label == "O" else f"I-{label}"
        phrase_scores = [scores[idx]]
        idx += 1
        while idx < n_tokens and preds[idx] == continuation:
            _, end = offset_mapping[idx]
            phrase_scores.append(scores[idx])
            idx += 1
        phrases.append((text[start:end], start, end, label, phrase_scores))
    return phrases


# Convert token-level predictions into word-index spans, one frame per essay.
submission = []
for sample in test_samples:
    preds = sample["preds"]
    offset_mapping = sample["offset_mapping"]
    sample_id = sample["id"]
    sample_text = sample["text"]
    sample_pred_scores = sample["pred_scores"]

    # Essays truncated to max_len have fewer predictions than tokens; treat
    # the uncovered tail as "O" with a score of 0.
    if len(preds) < len(offset_mapping):
        preds = preds + ["O"] * (len(offset_mapping) - len(preds))
        sample_pred_scores = sample_pred_scores + [0] * (len(offset_mapping) - len(sample_pred_scores))

    phrase_preds = _extract_phrases(preds, sample_pred_scores, offset_mapping, sample_text)

    total_words = len(sample_text.split())
    temp_df = []
    for phrase, start, end, label, phrase_scores in phrase_preds:
        if label == "O":
            continue
        # Map character offsets to whitespace-delimited word indices.
        word_start = len(sample_text[:start].split())
        word_end = min(word_start + len(sample_text[start:end].split()), total_words)
        word_ids = range(word_start, word_end)
        ps = " ".join(str(x) for x in word_ids)
        mean_score = sum(phrase_scores) / len(phrase_scores)
        # Keep only spans that are confident enough and long enough for their class.
        if mean_score >= proba_thresh[label] and len(word_ids) >= min_thresh[label]:
            temp_df.append((sample_id, label, ps))

    temp_df = pd.DataFrame(temp_df, columns=["id", "class", "predictionstring"])
    submission.append(temp_df)

submission = pd.concat(submission).reset_index(drop=True)

# Link neighbouring spans class by class ("Claim" is intentionally left unlinked).
for discourse_class in (
    'Evidence',
    'Lead',
    'Position',
    'Concluding Statement',
    'Counterclaim',
    'Rebuttal',
):
    submission = link_(submission, discourse_class)

submission.to_csv("submission.csv", index=False)

                                                    

In [9]:
# Preview the first rows of the submission frame that was written to submission.csv.
submission.head()

Unnamed: 0,id,class,predictionstring
0,18409261F5C2,Concluding Statement,989 990 991 992 993 994 995 996 997 998 999 10...
1,D46BCB48440A,Concluding Statement,306 307 308 309 310 311 312 313 314 315 316 31...
2,0FB0700DAF44,Concluding Statement,560 561 562 563 564 565 566 567 568 569 570 57...
3,D72CB1C11673,Concluding Statement,364 365 366 367 368 369 370 371 372 373 374 37...
4,DF920E0A7337,Concluding Statement,620 621 622 623 624 625 626 627 628 629 630 63...
