In [8]:
import os
import torch
import argparse
import glob
import random
import numpy as np
import pandas as pd
import tqdm as tqdm
from scipy.special import softmax

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display, HTML
import json
from importlib import reload
import sys
import datasets
import pickle
import pathlib

from tqdm.auto import tqdm
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from transformers.trainer_pt_utils import LengthGroupedSampler

# Put the notebook's parent directory on the import path; the project-local
# `GlobEnc` package imported further down is presumably resolved from there.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
print("sys.path:", sys.path)

from GlobEnc.src.modeling.globenc_utils import GlobencConfig
from GlobEnc.src.modeling.modeling_bert_v3 import BertForSequenceClassification
from transformers import DefaultDataCollator
from transformers import AutoModelForQuestionAnswering

import datasets
from datasets import load_dataset, load_metric 
from datasets import list_datasets, list_metrics

sys.path: ['/home/modaresi/projects/globenc_analysis/notebooks/v3', '/home/modaresi/.conda/envs/globenc-venv/lib/python37.zip', '/home/modaresi/.conda/envs/globenc-venv/lib/python3.7', '/home/modaresi/.conda/envs/globenc-venv/lib/python3.7/lib-dynload', '', '/home/modaresi/.conda/envs/globenc-venv/lib/python3.7/site-packages', '/home/modaresi/.conda/envs/globenc-venv/lib/python3.7/site-packages/IPython/extensions', '/home/modaresi/.ipython', '/home/modaresi/projects/globenc_analysis/notebooks', '/opt/huggingface/modules']


In [3]:
# Single source of truth for the checkpoint so the model and its tokenizer
# cannot drift apart.
MODEL_NAME = "deepset/bert-base-uncased-squad2"
# Fall back to CPU so the notebook still runs on machines without a CUDA device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME).to(device)
model.eval()  # inference only: switch off dropout and other train-time behaviour
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/415M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/302 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
# Load SQuAD v2 and keep only the examples that carry at least one answer
# text; the preprocessing below reads the first answer of every example.
squad = load_dataset("squad_v2").filter(
    lambda example: len(example["answers"]["text"]) > 0
)

Reusing dataset squad_v2 (/opt/huggingface/datasets/squad_v2/squad_v2/2.0.0/09187c73c1b837c95d9a249cd97c2c3f1cebada06efe667b4427714b27639b1d)


  0%|          | 0/2 [00:00<?, ?it/s]



  0%|          | 0/131 [00:00<?, ?ba/s]

  0%|          | 0/12 [00:00<?, ?ba/s]

In [19]:
# Index the row first: this reads a single example instead of materializing
# the entire "context" column just to pick one string out of it.
squad['train'][1]['context']

'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".'

In [12]:
def preprocess_function(examples, max_length=384):
    """Tokenize question/context pairs and label them with answer token spans.

    Uses the notebook-level ``tokenizer``. Questions are stripped of
    surrounding whitespace, and only the context (the second sequence) is
    truncated to fit ``max_length``. No padding is applied here.

    Args:
        examples: A batch with the keys ``"question"``, ``"context"`` and
            ``"answers"``; each answer is a mapping with ``"text"`` and
            ``"answer_start"`` lists. Only the first answer is used.
        max_length: Maximum number of tokens per encoded pair.

    Returns:
        The tokenizer output without ``"offset_mapping"`` and with two extra
        lists, ``"start_positions"`` and ``"end_positions"``, holding the
        token indices of the first and last answer token. Examples that have
        no answer, or whose answer is not completely contained in the
        (possibly truncated) context, are labelled ``(0, 0)``.
    """
    questions = [q.strip() for q in examples["question"]]

    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_offsets_mapping=True,
        padding=False,
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # Unanswerable example: there is no span to locate.
        if not answer["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # First and last token that belong to the context (sequence id 1).
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # If the answer is not fully inside the context, label it (0, 0).
        # Both ends must be covered: an answer cut off by truncation would
        # otherwise get an end position past the last context token.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Start: the last context token that begins at or before the answer start.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # End: the first context token that finishes at or after the answer end.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [13]:
# Run the batched preprocessing over every split. The original columns are
# removed so only the values returned by `preprocess_function` remain.
raw_columns = squad["train"].column_names
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_columns,
)

  0%|          | 0/87 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

In [14]:
# Show the resulting splits with their columns and row counts.
tokenized_squad

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 86821
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 5928
    })
})

In [16]:
test_num = 400
SAMPLE_SEED = 42  # fixed so the same validation subset is drawn on every run
# A local generator keeps the draw reproducible without touching NumPy's
# global random state; re-running this cell yields the same indices.
sample_rng = np.random.default_rng(SAMPLE_SEED)

# Sample without replacement so no validation example is picked twice.
to_test_idx = sample_rng.choice(len(tokenized_squad['validation']), test_num, replace=False)
to_test = np.array(tokenized_squad['validation'])[to_test_idx]
len(to_test)

400