<h2>chaii QA - 5 Fold XLMRoberta Inference in Torch w/o Trainer API</h2>
    
<h3><span style="color:#444">Introduction</span></h3>

The kernel implements inference for 5-Fold XLMRoberta QA Model without using the Trainer API from HuggingFace.

This is a three part kernel, for more information visit Finetuning (FIT) notebook,

- [External Data - MLQA, XQUAD Preprocessing](https://www.kaggle.com/rhtsingh/external-data-mlqa-xquad-preprocessing) which preprocesses the Hindi Corpus of MLQA and XQUAD. I have used these data for training.

- [chaii QA - 5 Fold XLMRoberta Torch | FIT](https://www.kaggle.com/rhtsingh/chaii-qa-5-fold-xlmroberta-torch-fit/edit) This kernel showcases Finetuning (FIT) combining several techniques on competition + external data.

- [chaii QA - 5 Fold XLMRoberta Torch | Infer](#xyz) The Inference kernel where we ensemble our 5 Fold XLMRoberta Models and do the submission.

### Import Dependencies

In [1]:
import os
import gc
gc.enable()
import math
import json
import time
import random
import multiprocessing
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn import model_selection
from sklearn.neighbors import LocalOutlierFactor

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
import torch.optim as optim
from torch.utils.data import (
    Dataset, DataLoader,
    SequentialSampler, RandomSampler
)
from torch.utils.data.distributed import DistributedSampler

try:
    from apex import amp
    APEX_INSTALLED = True
except ImportError:
    APEX_INSTALLED = False

import transformers
from transformers import (
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
    logging,
    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
)
logging.set_verbosity_warning()
logging.set_verbosity_error()

def fix_all_seeds(seed):
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

def optimal_num_of_loader_workers():
    num_cpus = multiprocessing.cpu_count()
    num_gpus = torch.cuda.device_count()
    optimal_value = min(num_cpus, num_gpus*4) if num_gpus else num_cpus - 1
    return optimal_value

print(f"Apex AMP Installed :: {APEX_INSTALLED}")
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

Apex AMP Installed :: False


### Configuration

In [2]:
class Config:
    # model
    model_type = 'xlm_roberta'
    model_name_or_path = "deepset/xlm-roberta-large-squad2"
    config_name = "deepset/xlm-roberta-large-squad2"
    fp16 = True if APEX_INSTALLED else False
    fp16_opt_level = "O1"
    gradient_accumulation_steps = 2

    # tokenizer
    tokenizer_name = "deepset/xlm-roberta-large-squad2"
    max_seq_length = 310
    doc_stride = 256

    # train
    epochs = 1
    train_batch_size = 4
    eval_batch_size = 4

    # optimizer
    optimizer_type = 'AdamW'
    learning_rate = 1.5e-5
    weight_decay = 1e-2
    epsilon = 1e-8
    max_grad_norm = 1.0

    # scheduler
    decay_name = 'linear-warmup'
    warmup_ratio = 0.1

    # logging
    logging_steps = 10
    
    # evaluate
    output_dir = 'output'
    seed = 2021

### Dataset Retriever

In [3]:
class DatasetRetriever(Dataset):
    def __init__(self, features, mode='train'):
        super(DatasetRetriever, self).__init__()
        self.features = features
        self.mode = mode
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, item):   
        feature = self.features[item]
        if self.mode == 'train':
            return {
                'input_ids':torch.tensor(feature['input_ids'], dtype=torch.long),
                'attention_mask':torch.tensor(feature['attention_mask'], dtype=torch.long),
                'offset_mapping':torch.tensor(feature['offset_mapping'], dtype=torch.long),
                'start_position':torch.tensor(feature['start_position'], dtype=torch.long),
                'end_position':torch.tensor(feature['end_position'], dtype=torch.long)
            }
        else:
            return {
                'input_ids':torch.tensor(feature['input_ids'], dtype=torch.long),
                'attention_mask':torch.tensor(feature['attention_mask'], dtype=torch.long),
                'offset_mapping':feature['offset_mapping'],
                'sequence_ids':feature['sequence_ids'],
                'id':feature['example_id'],
                'context': feature['context'],
                'question': feature['question']
            }

### Model

In [4]:
class Model(nn.Module):
    def __init__(self, modelname_or_path, config):
        super(Model, self).__init__()
        self.config = config
        self.xlm_roberta = AutoModel.from_pretrained(modelname_or_path, config=config)
        self.qa_outputs = nn.Linear(config.hidden_size, 2)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self._init_weights(self.qa_outputs)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()

    def forward(
        self, 
        input_ids, 
        attention_mask=None, 
    ):
        outputs = self.xlm_roberta(
            input_ids,
            attention_mask=attention_mask,
        )

        sequence_output = outputs[0]
        pooled_output = outputs[1]
        
        # sequence_output = self.dropout(sequence_output)
        qa_logits = self.qa_outputs(sequence_output)
        
        start_logits, end_logits = qa_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
    
        return start_logits, end_logits

### Utilities

In [5]:
def make_model(args):
    config = AutoConfig.from_pretrained(args.config_name)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)
    model = Model(args.model_name_or_path, config=config)
    return config, tokenizer, model

### Covert Examples to Features (Preprocess)

In [6]:
def prepare_test_features(args, example, tokenizer):
    example["question"] = example["question"].lstrip()
    
    tokenized_example = tokenizer(
        example["question"],
        example["context"],
        truncation="only_second",
        max_length=args.max_seq_length,
        stride=args.doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    features = []
    for i in range(len(tokenized_example["input_ids"])):
        feature = {}
        feature["example_id"] = example['id']
        feature['context'] = example['context']
        feature['question'] = example['question']
        feature['input_ids'] = tokenized_example['input_ids'][i]
        feature['attention_mask'] = tokenized_example['attention_mask'][i]
        feature['offset_mapping'] = tokenized_example['offset_mapping'][i]
        feature['sequence_ids'] = [0 if i is None else i for i in tokenized_example.sequence_ids(i)]
        features.append(feature)
    return features

### Postprocess QA Predictions

In [7]:
import collections

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size = 20, max_answer_length = 30):
    all_start_logits, all_end_logits = raw_predictions
    
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    for example_index, example in examples.iterrows():
        feature_indices = features_per_example[example_index]

        min_null_score = None
        valid_answers = []
        
        context = example["context"]
        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            sequence_ids = features[feature_index]["sequence_ids"]
            context_index = 1

            features[feature_index]["offset_mapping"] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(features[feature_index]["offset_mapping"])
            ]
            offset_mapping = features[feature_index]["offset_mapping"]
            cls_index = features[feature_index]["input_ids"].index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
            end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    if (start_index==cls_index) & (end_index==cls_index):
#                         start_char = offset_mapping[start_index][0]
#                         end_char = offset_mapping[end_index][1]
                        valid_answers.append(
                            {
                                "score": start_logits[start_index] + end_logits[end_index],
                                "answer": ''
                            }
                        )
                        break
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or offset_mapping[end_index] is None
                    ):
                        continue
                    # Don't consider answers with a length that is either < 0 or > max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "answer": context[start_char: end_char]
                        }
                    )
        
        if len(valid_answers) > 0:
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            best_answer = {"answer": "", "score": 0.0}
        
        predictions[example["id"]] = best_answer["answer"]
        
        
    return predictions

### Data Factory

In [8]:
import re

def clean_sentence(sentence):
    sentence = arToPersianChar(sentence)
    sentence = arToPersianNumb(sentence)
    # more_normalization_function()
    return sentence


def arToPersianNumb(number):
    dic = {
        '١': '۱',
        '٢': '۲',
        '٣': '۳',
        '٤': '۴',
        '٥': '۵',
        '٦': '۶',
        '٧': '۷',
        '٨': '۸',
        '٩': '۹',
        '٠': '۰',
    }
    return multiple_replace(dic, number)


def arToPersianChar(userInput):
    dic = {
        'ك': 'ک',
        'دِ': 'د',
        'بِ': 'ب',
        'زِ': 'ز',
        'ذِ': 'ذ',
        'شِ': 'ش',
        'سِ': 'س',
        'ى': 'ی',
        'ي': 'ی'
    }
    return multiple_replace(dic, userInput)


def multiple_replace(dic, text):
    pattern = "|".join(map(re.escape, dic.keys()))
    return re.sub(pattern, lambda m: dic[m.group()], str(text))

In [9]:
test = pd.read_csv('qa_test.csv')

test['context'] = test['context'].apply(clean_sentence)
test['question'] = test['question'].apply(clean_sentence)
test['answer1'] = test['answer1'].apply(clean_sentence)
test['answer2'] = test['answer2'].apply(clean_sentence)
test['answer3'] = test['answer3'].apply(clean_sentence)
test.head()

Unnamed: 0,context,question,answer1,answer2,answer3,start,end,id
0,ای . اچ . کار علم تاریخ را علم شناخت و تفسیر گ...,ای . اچ . کار علم تاریخ را علم چه چیزی در پرتو...,شناخت و تفسیر گذشته انسان ها,شناخت و تفسیر گذشته انسان ها,شناخت و تفسیر گذشته انسان ها,31,59,0
1,نگرش فلسفی به تاریخ در ایران با کتاب دیباچه ای...,کتاب دیباچه ای بر فلسفه تاریخ ایران نوشته کیست؟,ارسلان پوریا,ارسلان پوریا,ارسلان پوریا,74,86,1
2,ماقبل تاریخ یا پیشا تاریخ به دوره قبل از تاریخ...,ماقبل تاریخ یا پیشا تاریخ به چه دوره ای اطلاق ...,قبل از تاریخ مکتوب,دوره قبل از تاریخ مکتوب,دوره قبل از تاریخ مکتوب,34,52,2
3,ماقبل تاریخ یا پیشا تاریخ به دوره قبل از تاریخ...,چه عصری انسان راست قامت یا انسان مدرن در سراسر...,پارینه سنگی زیرین,عصر پارینه سنگی زیرین,عصر پارینه سنگی زیرین,101,118,3
4,ماقبل تاریخ یا پیشا تاریخ به دوره قبل از تاریخ...,استفاده کنترل شده از آتش اولین بار در کدام عصر...,پارینه سنگی میانی,عصر پارینه سنگی میانی,عصر پارینه سنگی میانی,247,264,4


In [10]:
test['id'] = range(len(test))
base_model_path = '.../input/notebook07a03d3908/'
# base_model_path = '../input/qa-training-parsbert'

tokenizer = AutoTokenizer.from_pretrained(Config().tokenizer_name)

test_features = []
for i, row in test.iterrows():
    test_features += prepare_test_features(Config(), row, tokenizer)

args = Config()
test_dataset = DatasetRetriever(test_features, mode='test')
test_dataloader = DataLoader(
    test_dataset,
    batch_size=args.eval_batch_size, 
    sampler=SequentialSampler(test_dataset),
    num_workers=optimal_num_of_loader_workers(),
    pin_memory=True, 
    drop_last=False
)

Downloading:   0%|          | 0.00/606 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/179 [00:00<?, ?B/s]

### Intialize Inference

In [11]:
def get_predictions(checkpoint_path):
    config, tokenizer, model = make_model(Config())
    model.cuda();
    model.load_state_dict(
        torch.load(checkpoint_path)
    );
    
    start_logits = []
    end_logits = []
    for batch in test_dataloader:
        with torch.no_grad():
            outputs_start, outputs_end = model(batch['input_ids'].cuda(), batch['attention_mask'].cuda())
            start_logits.append(outputs_start.cpu().numpy().tolist())
            end_logits.append(outputs_end.cpu().numpy().tolist())
            del outputs_start, outputs_end
    del model, tokenizer, config
    gc.collect()
    return np.vstack(start_logits), np.vstack(end_logits)

### Ensemble 5-Folds

In [12]:
start_logits1, end_logits1 = get_predictions('../input/notebook07a03d3908/output/checkpoint-fold-0/pytorch_model.bin')
start_logits2, end_logits2 = get_predictions('../input/notebook07a03d3908/output/checkpoint-fold-1/pytorch_model.bin')
start_logits3, end_logits3 = get_predictions('../input/notebook07a03d3908/output/checkpoint-fold-2/pytorch_model.bin')
start_logits4, end_logits4 = get_predictions('../input/notebook07a03d3908/output/checkpoint-fold-3/pytorch_model.bin')
start_logits5, end_logits5 = get_predictions('../input/notebook07a03d3908/output/checkpoint-fold-4/pytorch_model.bin')
start_logits6, end_logits6 = get_predictions('../input/notebook07a03d3908/output/checkpoint-fold-5/pytorch_model.bin')
start_logits7, end_logits7 = get_predictions('../input/notebook07a03d3908/output/checkpoint-fold-6/pytorch_model.bin')
start_logits8, end_logits8 = get_predictions('../input/notebook07a03d3908/output/checkpoint-fold-7/pytorch_model.bin')

start_logits = np.zeros([len(start_logits1),400])
end_logits = np.zeros([len(end_logits1),400])

for i in range(len(start_logits1)):
    start_logits = (start_logits1 + start_logits2 + start_logits3 + start_logits4 + start_logits5 + start_logits6 + start_logits7 + start_logits8) / 8
    end_logits = (end_logits1 + end_logits2 + end_logits3 + end_logits4 + end_logits5 + end_logits6 + end_logits7 + end_logits8) / 8

predictions = postprocess_qa_predictions(test, test_features, (start_logits, end_logits))
predictions1 = postprocess_qa_predictions(test, test_features, (start_logits1, end_logits1))
predictions2 = postprocess_qa_predictions(test, test_features, (start_logits2, end_logits2))
predictions3 = postprocess_qa_predictions(test, test_features, (start_logits3, end_logits3))
predictions4 = postprocess_qa_predictions(test, test_features, (start_logits4, end_logits4))
predictions5 = postprocess_qa_predictions(test, test_features, (start_logits5, end_logits5))
predictions6 = postprocess_qa_predictions(test, test_features, (start_logits6, end_logits6))
predictions7 = postprocess_qa_predictions(test, test_features, (start_logits7, end_logits7))
predictions8 = postprocess_qa_predictions(test, test_features, (start_logits8, end_logits8))

test['PredictionString'] = test['id'].map(predictions)
test['PredictionString1'] = test['id'].map(predictions1)
test['PredictionString2'] = test['id'].map(predictions2)
test['PredictionString3'] = test['id'].map(predictions3)
test['PredictionString4'] = test['id'].map(predictions4)
test['PredictionString5'] = test['id'].map(predictions5)
test['PredictionString6'] = test['id'].map(predictions6)
test['PredictionString7'] = test['id'].map(predictions7)
test['PredictionString8'] = test['id'].map(predictions8)

# test[['id', 'PredictionString']].to_csv('submission.csv', index=False)

# print(test[['id', 'PredictionString']])

Downloading:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Post-processing 3182 example predictions split into 3629 features.
Post-processing 3182 example predictions split into 3629 features.
Post-processing 3182 example predictions split into 3629 features.
Post-processing 3182 example predictions split into 3629 features.
Post-processing 3182 example predictions split into 3629 features.
Post-processing 3182 example predictions split into 3629 features.
Post-processing 3182 example predictions split into 3629 features.
Post-processing 3182 example predictions split into 3629 features.
Post-processing 3182 example predictions split into 3629 features.


In [13]:
test["answer1"] = test["answer1"].apply(lambda x: str(x).strip())
test["answer2"] = test["answer2"].apply(lambda x: str(x).strip())
test["answer3"] = test["answer3"].apply(lambda x: str(x).strip())
test["PredictionString"] = test["PredictionString"].apply(lambda x: x.strip())
test["PredictionString1"] = test["PredictionString1"].apply(lambda x: x.strip())
test["PredictionString2"] = test["PredictionString2"].apply(lambda x: x.strip())
test["PredictionString3"] = test["PredictionString3"].apply(lambda x: x.strip())
test["PredictionString4"] = test["PredictionString4"].apply(lambda x: x.strip())
test["PredictionString5"] = test["PredictionString5"].apply(lambda x: x.strip())
test["PredictionString6"] = test["PredictionString6"].apply(lambda x: x.strip())
test["PredictionString7"] = test["PredictionString7"].apply(lambda x: x.strip())
test["PredictionString8"] = test["PredictionString8"].apply(lambda x: x.strip())

test.PredictionString = test.PredictionString.fillna('')
test.PredictionString1 = test.PredictionString1.fillna('')
test.PredictionString2 = test.PredictionString2.fillna('')
test.PredictionString3 = test.PredictionString3.fillna('')
test.PredictionString4 = test.PredictionString4.fillna('')
test.PredictionString5 = test.PredictionString5.fillna('')
test.PredictionString6 = test.PredictionString6.fillna('')
test.PredictionString7 = test.PredictionString7.fillna('')
test.PredictionString8 = test.PredictionString8.fillna('')

test.answer1[test.answer1=='-1'] = ''
test.answer2[test.answer2=='-1'] = ''
test.answer3[test.answer3=='-1'] = ''
test.answer1[test.answer1=='nan'] = ''
test.answer2[test.answer2=='nan'] = ''
test.answer3[test.answer3=='nan'] = ''

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [14]:
def jaccard_score(str1, str2):
    if str1==str2:
        return 1
    a = set(str1.lower().split()) 
    b = set(str2.lower().split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

def compute_f1(truth, prediction):
    pred_tokens = prediction.split()
    truth_tokens = truth.split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec)

In [15]:
test["jaccard_score"] = 0.0
test["f1_score"] = 0.0
test["jaccard_score1"] = 0.0
test["f1_score1"] = 0.0
test["jaccard_score2"] = 0.0
test["f1_score2"] = 0.0
test["jaccard_score3"] = 0.0
test["f1_score3"] = 0.0
test["jaccard_score4"] = 0.0
test["f1_score4"] = 0.0
test["jaccard_score5"] = 0.0
test["f1_score5"] = 0.0
test["jaccard_score6"] = 0.0
test["f1_score6"] = 0.0
test["jaccard_score7"] = 0.0
test["f1_score7"] = 0.0
test["jaccard_score8"] = 0.0
test["f1_score8"] = 0.0
for i in test.index:
    j1 = jaccard_score(test["answer1"][i], test["PredictionString"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString"][i])
    test["jaccard_score"][i] = np.max([j1,j2,j3])
    test['f1_score'][i] = np.max([f1,f2,f3])
    
    j1 = jaccard_score(test["answer1"][i], test["PredictionString1"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString1"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString1"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString1"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString1"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString1"][i])
    test["jaccard_score1"][i] = np.max([j1,j2,j3])
    test['f1_score1'][i] = np.max([f1,f2,f3])
    
    j1 = jaccard_score(test["answer1"][i], test["PredictionString2"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString2"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString2"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString2"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString2"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString2"][i])
    test["jaccard_score2"][i] = np.max([j1,j2,j3])
    test['f1_score2'][i] = np.max([f1,f2,f3])
    
    j1 = jaccard_score(test["answer1"][i], test["PredictionString3"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString3"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString3"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString3"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString3"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString3"][i])
    test["jaccard_score3"][i] = np.max([j1,j2,j3])
    test['f1_score3'][i] = np.max([f1,f2,f3])
    
    j1 = jaccard_score(test["answer1"][i], test["PredictionString4"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString4"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString4"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString4"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString4"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString4"][i])
    test["jaccard_score4"][i] = np.max([j1,j2,j3])
    test['f1_score4'][i] = np.max([f1,f2,f3])
    
    j1 = jaccard_score(test["answer1"][i], test["PredictionString5"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString5"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString5"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString5"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString5"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString5"][i])
    test["jaccard_score5"][i] = np.max([j1,j2,j3])
    test['f1_score5'][i] = np.max([f1,f2,f3])
    
    j1 = jaccard_score(test["answer1"][i], test["PredictionString6"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString6"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString6"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString6"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString6"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString6"][i])
    test["jaccard_score6"][i] = np.max([j1,j2,j3])
    test['f1_score6'][i] = np.max([f1,f2,f3])
    
    j1 = jaccard_score(test["answer1"][i], test["PredictionString7"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString7"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString7"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString7"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString7"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString7"][i])
    test["jaccard_score7"][i] = np.max([j1,j2,j3])
    test['f1_score7'][i] = np.max([f1,f2,f3])
    
    j1 = jaccard_score(test["answer1"][i], test["PredictionString8"][i])
    j2 = jaccard_score(test["answer2"][i], test["PredictionString8"][i])
    j3 = jaccard_score(test["answer3"][i], test["PredictionString8"][i])
    f1 = compute_f1(test["answer1"][i], test["PredictionString8"][i])
    f2 = compute_f1(test["answer2"][i], test["PredictionString8"][i])
    f3 = compute_f1(test["answer3"][i], test["PredictionString8"][i])
    test["jaccard_score8"][i] = np.max([j1,j2,j3])
    test['f1_score8'][i] = np.max([f1,f2,f3])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [16]:
# test

In [17]:
print('jaccard: ', np.mean(test.jaccard_score))
print('exact match: ', len(test[(test.answer1==test.PredictionString) | (test.answer2==test.PredictionString) | (test.answer3==test.PredictionString)]) / len(test))
print('f1_score: ', np.mean(test.f1_score))

jaccard:  0.9391375310007204
exact match:  0.8984915147705845
f1_score:  0.9364871897837335


In [18]:
print('********** fold1 **********')
print('jaccard: ', np.mean(test.jaccard_score1))
print('exact match: ', len(test[(test.answer1==test.PredictionString1) | (test.answer2==test.PredictionString1) | (test.answer3==test.PredictionString1)]) / len(test))
print('f1_score: ', np.mean(test.f1_score1))

print('********** fold2 **********')
print('jaccard: ', np.mean(test.jaccard_score2))
print('exact match: ', len(test[(test.answer1==test.PredictionString2) | (test.answer2==test.PredictionString2) | (test.answer3==test.PredictionString2)]) / len(test))
print('f1_score: ', np.mean(test.f1_score2))

print('********** fold3 **********')
print('jaccard: ', np.mean(test.jaccard_score3))
print('exact match: ', len(test[(test.answer1==test.PredictionString3) | (test.answer2==test.PredictionString3) | (test.answer3==test.PredictionString3)]) / len(test))
print('f1_score: ', np.mean(test.f1_score3))

print('********** fold4 **********')
print('jaccard: ', np.mean(test.jaccard_score4))
print('exact match: ', len(test[(test.answer1==test.PredictionString4) | (test.answer2==test.PredictionString4) | (test.answer3==test.PredictionString4)]) / len(test))
print('f1_score: ', np.mean(test.f1_score4))

print('********** fold5 **********')
print('jaccard: ', np.mean(test.jaccard_score5))
print('exact match: ', len(test[(test.answer1==test.PredictionString5) | (test.answer2==test.PredictionString5) | (test.answer3==test.PredictionString5)]) / len(test))
print('f1_score: ', np.mean(test.f1_score5))

print('********** fold6 **********')
print('jaccard: ', np.mean(test.jaccard_score6))
print('exact match: ', len(test[(test.answer1==test.PredictionString6) | (test.answer2==test.PredictionString6) | (test.answer3==test.PredictionString6)]) / len(test))
print('f1_score: ', np.mean(test.f1_score6))

print('********** fold7 **********')
print('jaccard: ', np.mean(test.jaccard_score7))
print('exact match: ', len(test[(test.answer1==test.PredictionString7) | (test.answer2==test.PredictionString7) | (test.answer3==test.PredictionString7)]) / len(test))
print('f1_score: ', np.mean(test.f1_score7))

print('********** fold8 **********')
print('jaccard: ', np.mean(test.jaccard_score8))
print('exact match: ', len(test[(test.answer1==test.PredictionString8) | (test.answer2==test.PredictionString8) | (test.answer3==test.PredictionString8)]) / len(test))
print('f1_score: ', np.mean(test.f1_score8))

********** fold1 **********
jaccard:  0.928882494741766
exact match:  0.8805782526712759
f1_score:  0.9280400540217271
********** fold2 **********
jaccard:  0.9281876954401318
exact match:  0.88340666247643
f1_score:  0.9274420808521231
********** fold3 **********
jaccard:  0.9237951569912358
exact match:  0.8730358265241986
f1_score:  0.9241795710613486
********** fold4 **********
jaccard:  0.9276658026959771
exact match:  0.8796354494028913
f1_score:  0.9264380031740689
********** fold5 **********
jaccard:  0.9271703255752144
exact match:  0.8830923947203017
f1_score:  0.9254450665678515
********** fold6 **********
jaccard:  0.926402049968566
exact match:  0.8758642363293526
f1_score:  0.9257315991642112
********** fold7 **********
jaccard:  0.9313755874177019
exact match:  0.8859208045254557
f1_score:  0.9304448838164
********** fold8 **********
jaccard:  0.9192802970287194
exact match:  0.8727215587680704
f1_score:  0.9190659413470841


In [19]:
test.to_csv('./submission.csv', index=False)