In [1]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: dill, responses, 

In [2]:
pip install transformers[torch]

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2


In [3]:
import os
import torch
import evaluate
import datasets
import collections
import transformers
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from tqdm.auto import tqdm
from time import perf_counter
from torch.utils.data import DataLoader
from datasets import (
    load_dataset,
    disable_progress_bar
)
from transformers import (
    pipeline,
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    IntervalStrategy
)

In [4]:
# Base checkpoint that gets fine-tuned for extractive question answering.
model_checkpoint = "distilbert-base-uncased"
# Dataset identifier; the same value is later reused to load the metric and
# to build the fine-tuned model's output directory name.
task_name = "squad" # "squad_v2"


In [5]:
# cache_dir is forwarded unchanged to every loader in this notebook
# (dataset, tokenizer, model, metric).
cache_dir = None
# NOTE(review): this rebinds `datasets`, shadowing the `datasets` module
# imported at the top of the notebook; later cells rely on this name being
# the loaded DatasetDict.
datasets = load_dataset(task_name, cache_dir=cache_dir)
datasets

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [6]:
# Inspect a single raw training record (id, title, context, question, answers).
datasets["train"][0]

{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}

In [7]:
# Tokenizer matching the base checkpoint; used as a notebook-level global by
# the preprocessing functions defined below.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, cache_dir=cache_dir)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [8]:
# Tokenize one word and request the offset mapping, i.e. the (start, end)
# character span in `word` that each produced token covers.
word = "@huggingface"
tokenized = tokenizer(word, return_offsets_mapping=True)
tokenized

{'input_ids': [101, 1030, 17662, 12172, 102], 'attention_mask': [1, 1, 1, 1, 1], 'offset_mapping': [(0, 0), (0, 1), (1, 8), (8, 12), (0, 0)]}

In [9]:
def convert_id_to_string(tokenizer, input_ids):
    """Translate token ids into token strings.

    Each id in ``input_ids`` is passed individually to
    ``tokenizer.convert_ids_to_tokens`` and the results are returned as a
    list in the same order.
    """
    return [tokenizer.convert_ids_to_tokens(token_id) for token_id in input_ids]


def convert_offset_mapping_to_string(tokenized, offset_mapping, word):
    """Slice ``word`` by every offset whose end index is non-zero.

    Offsets ending at 0 (the spans reported for special tokens in the demo
    above) are skipped. The first argument is accepted for call
    compatibility but is not used.
    """
    return [word[span[0]:span[1]] for span in offset_mapping if span[1] != 0]

In [10]:
# Apart from the special tokens, both views should yield the same pieces:
# the first via the vocabulary lookup, the second by slicing the raw word.
strings = convert_id_to_string(tokenizer, tokenized["input_ids"])
print("input ids' string: ", strings)

# The helper's first parameter is unused, so passing the tokenizer is harmless.
strings = convert_offset_mapping_to_string(tokenizer, tokenized["offset_mapping"], word)
print("offset mapping string: ", strings)

input ids' string:  ['[CLS]', '@', 'hugging', '##face', '[SEP]']
offset mapping string:  ['@', 'hugging', 'face']


In [11]:
# Demonstrate overflow handling: with a tiny max_length every sentence is
# split into several features, consecutive features overlapping by `stride`
# tokens.
examples = [
    "We are going to split this sentence",
    "This sentence is longer, we are also going to split it"
]
tokenized = tokenizer(
    examples,
    truncation=True,
    return_overflowing_tokens=True,
    max_length=6,
    stride=2
)
print("number of examples: ", len(examples))
print("number of tokenized features: ", len(tokenized["input_ids"]))
tokenized

number of examples:  2
number of tokenized features:  8


{'input_ids': [[101, 2057, 2024, 2183, 2000, 102], [101, 2183, 2000, 3975, 2023, 102], [101, 3975, 2023, 6251, 102], [101, 2023, 6251, 2003, 2936, 102], [101, 2003, 2936, 1010, 2057, 102], [101, 1010, 2057, 2024, 2036, 102], [101, 2024, 2036, 2183, 2000, 102], [101, 2183, 2000, 3975, 2009, 102]], 'attention_mask': [[1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1]], 'overflow_to_sample_mapping': [0, 0, 0, 1, 1, 1, 1, 1]}

In [12]:
# Decode every feature and show it next to the sentence it was cut from;
# `overflow_to_sample_mapping` gives the index of the originating example.
for chunk_ids, source_index in zip(tokenized["input_ids"], tokenized["overflow_to_sample_mapping"]):
    print("Chunk: ", tokenizer.decode(chunk_ids))
    print("Orignal input: ", examples[source_index])

Chunk:  [CLS] we are going to [SEP]
Orignal input:  We are going to split this sentence
Chunk:  [CLS] going to split this [SEP]
Orignal input:  We are going to split this sentence
Chunk:  [CLS] split this sentence [SEP]
Orignal input:  We are going to split this sentence
Chunk:  [CLS] this sentence is longer [SEP]
Orignal input:  This sentence is longer, we are also going to split it
Chunk:  [CLS] is longer, we [SEP]
Orignal input:  This sentence is longer, we are also going to split it
Chunk:  [CLS], we are also [SEP]
Orignal input:  This sentence is longer, we are also going to split it
Chunk:  [CLS] are also going to [SEP]
Orignal input:  This sentence is longer, we are also going to split it
Chunk:  [CLS] going to split it [SEP]
Orignal input:  This sentence is longer, we are also going to split it


In [13]:
# Encode a (question, context) pair; sequence_ids marks each token with the
# index of the sequence it came from (None for special tokens, see output).
tokenized = tokenizer(
    ["question section"],
    ["context section"]
)
tokenized.sequence_ids(0)

[None, 0, 0, None, 1, 1, None]

In [14]:
# Maximum token length of one feature (question and context together).
max_length = 384
# Number of overlapping tokens between two consecutive chunks of a context
# (passed to the tokenizer as `stride`).
doc_stride = 128

In [15]:
def prepare_qa_train(examples):
    """Tokenize a batch of QA examples and attach answer-span labels.

    Reads the notebook-level ``tokenizer``, ``max_length`` and ``doc_stride``.
    Long contexts overflow into several features; every feature receives a
    ``start_positions`` / ``end_positions`` label holding the token indices of
    the first answer, or the CLS index (0) when the example has no answer or
    the answer is not fully contained in that feature's context window.

    Parameters
    ----------
    examples : mapping with "question", "context" and "answers" batch columns.
        ``examples["question"]`` is replaced in place by its stripped version.

    Returns
    -------
    The tokenizer output with ``start_positions`` and ``end_positions`` added.
    """
    answers = examples["answers"]
    examples["question"] = [question.strip() for question in examples["question"]]

    # Truncate only the context, keep the overflow as extra features that
    # overlap by `doc_stride` tokens, and pad every feature to `max_length`.
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=doc_stride,
        padding="max_length"
    )
    sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
    offset_mapping = tokenized_examples["offset_mapping"]

    # Impossible / out-of-window answers are labelled with the CLS token's index.
    cls_index = 0

    start_positions = []
    end_positions = []
    for i, offset in enumerate(offset_mapping):
        answer = answers[sample_mapping[i]]

        # No answer given for this example.
        if len(answer["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        sequence_ids = tokenized_examples.sequence_ids(i)

        # First and last token index belonging to the context (sequence id 1).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1

        context_end = len(sequence_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # Answer not fully inside this feature's context window.
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Both scans are confined to [context_start, context_end]. Tokens after
        # the context ([SEP], padding) carry offset (0, 0); without the bound
        # an answer starting on the last context token would make the forward
        # scan run through them and label the final padding position.
        token_start_index = context_start
        while token_start_index <= context_end and offset[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while token_end_index >= context_start and offset[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    # start_positions / end_positions are the labels for extractive QA.
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples

In [None]:
# Sanity check: decoding the labelled token span of each feature should give
# back the annotated answer text (lower-cased by the uncased tokenizer).
examples = datasets["train"][0:2]
answers = examples["answers"]

tokenized_examples = prepare_qa_train(examples)

start_positions = tokenized_examples["start_positions"]
end_positions = tokenized_examples["end_positions"]
# One example can overflow into several features, so the expected answer must
# be looked up through the feature -> example mapping, not by feature index.
sample_mapping = tokenized_examples["overflow_to_sample_mapping"]
for i, input_ids in enumerate(tokenized_examples["input_ids"]):
    start = start_positions[i]
    end = end_positions[i] + 1
    string = tokenizer.decode(input_ids[start:end])
    print("expected answer:", answers[sample_mapping[i]]["text"][0])
    print("preprocessing answer:", string)

expected answer: Saint Bernadette Soubirous
preprocessing answer: saint bernadette soubirous
expected answer: a copper statue of Christ
preprocessing answer: a copper statue of christ


In [None]:
# Keep the progress bars from flooding the notebook output.
disable_progress_bar()

# Apply the training preprocessing to every split. The raw columns are
# dropped because one example may turn into several features, so the row
# count changes (see the num_rows in the output).
tokenized_datasets = datasets.map(
    prepare_qa_train,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=8
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
        num_rows: 88524
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'start_positions', 'end_positions'],
        num_rows: 10784
    })
})

In [None]:
# Directory name under which the fine-tuned model is saved and reloaded.
model_name = model_checkpoint.split("/")[-1]
fine_tuned_model_checkpoint = f"{model_name}-fine_tuned-{task_name}"

# Reuse an existing fine-tuned checkpoint directory when present and skip
# training; otherwise start from the base checkpoint and train.
if os.path.isdir(fine_tuned_model_checkpoint):
    do_train = False
    model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_checkpoint, cache_dir=cache_dir)
else:
    do_train = True
    model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint, cache_dir=cache_dir)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# NOTE(review): presumably prevents the Trainer from attaching its MLflow
# reporting integration — confirm against the transformers documentation.
os.environ['DISABLE_MLFLOW_INTEGRATION'] = 'TRUE'

# Checkpoints are written to the same directory the model-loading cell checks
# for, so a completed run is picked up on the next execution.
args = TrainingArguments(
    output_dir=fine_tuned_model_checkpoint,
    learning_rate=0.0001,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    # evaluate/save once per epoch to avoid flooding the console
    evaluation_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    save_total_limit=2,
    do_train=do_train
)

trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Train only when no fine-tuned checkpoint directory was found earlier.
if trainer.args.do_train:
    train_output = trainer.train()
    # saving the model which allows us to leverage
    # .from_pretrained(model_path)
    trainer.save_model(fine_tuned_model_checkpoint)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,1.3141,1.164167
2,0.8205,1.143494
3,0.5197,1.247746


In [None]:
# Smoke test: load the saved model through the high-level pipeline and check
# its answer for the first validation example against the first reference.
example = datasets["validation"][0]
qa_pipeline = pipeline(
    "question-answering",
    model=fine_tuned_model_checkpoint,
    tokenizer=fine_tuned_model_checkpoint
)

output = qa_pipeline({
    "question": example["question"],
    "context": example["context"]
})
# `example` and `answer_text` are reused by the post-processing check below.
answer_text = example["answers"]["text"][0]
print("output answer matches expected answer: ", output["answer"] == answer_text)
output

output answer matches expected answer:  True


{'score': 0.9385336637496948,
 'start': 177,
 'end': 191,
 'answer': 'Denver Broncos'}

In [None]:
def prepare_qa_test(examples):
    """Tokenize a batch of QA examples for inference.

    Uses the notebook-level ``tokenizer``, ``max_length`` and ``doc_stride``.
    Each returned feature additionally carries

    * ``example_id`` - the id of the example the feature was cut from, and
    * an ``offset_mapping`` in which every position outside the context
      (sequence id other than 1) is replaced by ``None``.

    ``examples["question"]`` is overwritten with its stripped version.
    """
    examples["question"] = [text.strip() for text in examples["question"]]

    # Only the context is truncated; the overflow is kept as extra features
    # that overlap the previous one by `doc_stride` tokens so that an answer
    # span is not chopped off at a chunk boundary.
    encoded = tokenizer(
        examples["question"],
        examples["context"],
        max_length=max_length,
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=doc_stride
    )
    feature_to_example = encoded["overflow_to_sample_mapping"]

    encoded["example_id"] = []
    for feature_index in range(len(encoded["input_ids"])):
        segment_ids = encoded.sequence_ids(feature_index)
        source_index = feature_to_example[feature_index]
        encoded["example_id"].append(examples["id"][source_index])

        # Blank out offsets of non-context tokens so that "is this position
        # part of the context?" becomes a simple `is None` check later on.
        encoded["offset_mapping"][feature_index] = [
            span if segment_ids[position] == 1 else None
            for position, span in enumerate(encoded["offset_mapping"][feature_index])
        ]

    return encoded

In [None]:
# Inference-time features for the validation split; they keep `example_id`
# and the masked `offset_mapping` needed to map predictions back to text.
validation_features = datasets["validation"].map(
    prepare_qa_test,
    batched=True,
    remove_columns=datasets["validation"].column_names,
    num_proc=8
)
validation_features

Dataset({
    features: ['input_ids', 'attention_mask', 'offset_mapping', 'overflow_to_sample_mapping', 'example_id'],
    num_rows: 10784
})

In [None]:
# Run the model over every validation feature; `.predictions` is a pair of
# arrays (start logits, end logits), as shown in the output below.
raw_predictions = trainer.predict(validation_features)
raw_predictions.predictions

(array([[  -9.5625   ,  -10.859375 ,  -11.0078125, ..., -100.       ,
         -100.       , -100.       ],
        [ -10.6171875,  -11.0859375,  -11.2265625, ..., -100.       ,
         -100.       , -100.       ],
        [ -10.4296875,   -9.984375 ,  -10.8984375, ..., -100.       ,
         -100.       , -100.       ],
        ...,
        [  -6.375    ,  -10.4609375,  -11.2109375, ..., -100.       ,
         -100.       , -100.       ],
        [  -6.265625 ,  -10.5703125,  -10.9765625, ..., -100.       ,
         -100.       , -100.       ],
        [  -4.9101562,  -10.4296875,  -10.7578125, ..., -100.       ,
         -100.       , -100.       ]], dtype=float32),
 array([[  -7.7617188,  -10.1953125,   -9.6328125, ..., -100.       ,
         -100.       , -100.       ],
        [  -9.3671875,  -10.1796875,   -9.78125  , ..., -100.       ,
         -100.       , -100.       ],
        [  -9.8515625,   -9.6484375,   -9.8359375, ..., -100.       ,
         -100.       , -100.       ]

In [None]:
def postprocess_qa_predictions(
    examples,
    features,
    raw_predictions,
    n_best_size = 20,
    max_answer_length = 30,
    no_answer = False
):
    """Turn per-feature start/end logits into one answer string per example.

    Parameters
    ----------
    examples : dataset-like
        Supports ``len()``, ``examples["id"]`` and iteration yielding records
        with "id" and "context".
    features : sequence of records
        Each has "example_id" and an "offset_mapping" whose non-context
        positions are ``None`` (as produced by ``prepare_qa_test``).
    raw_predictions : pair
        ``(all_start_logits, all_end_logits)``, indexable by feature index.
    n_best_size : int
        Number of top start and top end indices combined per feature.
    max_answer_length : int
        Maximum answer length in tokens.
    no_answer : bool
        When True, return "" for an example whose best span does not score
        higher than the minimum null (CLS) score over its features.

    Returns
    -------
    collections.OrderedDict mapping example id -> predicted answer text.
    """
    print(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    all_start_logits, all_end_logits = raw_predictions

    # example index -> indices of the features/chunks generated from it
    example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    cls_index = 0
    predictions = collections.OrderedDict()

    # for each example, search all of its features/chunks for the best span
    for example_index, example in enumerate(tqdm(examples)):
        feature_indices = features_per_example[example_index]

        min_null_score = None
        valid_answers = []
        context = example["context"]
        for feature_index in feature_indices:
            # model prediction for this feature
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]

            offset_mapping = features[feature_index]["offset_mapping"]

            # Track the MINIMUM null score across the example's features.
            # The comparison was previously inverted and kept the maximum.
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or feature_null_score < min_null_score:
                min_null_score = feature_null_score

            # combine the `n_best_size` highest start and end logits
            start_indexes = np.argsort(start_logits)[-1:-n_best_size - 1:-1].tolist()
            end_indexes = np.argsort(end_logits)[-1:-n_best_size - 1:-1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip out-of-scope spans: indices out of bounds or
                    # positions that are not part of the context section.
                    if (
                        start_index >= len(offset_mapping)
                        or end_index >= len(offset_mapping)
                        or offset_mapping[start_index] is None
                        or len(offset_mapping[start_index]) < 2
                        or offset_mapping[end_index] is None
                        or len(offset_mapping[end_index]) < 2
                    ):
                        continue
                    # Skip spans with negative length or longer than max_answer_length.
                    if end_index < start_index or end_index - start_index + 1 > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "text": context[start_char:end_char],
                            "score": start_logits[start_index] + end_logits[end_index]
                        }
                    )

        if len(valid_answers) > 0:
            best_answer = max(valid_answers, key=lambda x: x["score"])
        else:
            # In the very rare edge case we have not a single non-null prediction,
            # we create a fake prediction to avoid failure.
            best_answer = {"text": "", "score": 0.0}

        example_id = example["id"]
        if no_answer:
            # min_null_score stays None when the example has no features at
            # all; there is then nothing to compare against (and the fallback
            # text is already empty), so avoid comparing a float with None.
            beats_null = min_null_score is None or best_answer["score"] > min_null_score
            predictions[example_id] = best_answer["text"] if beats_null else ""
        else:
            predictions[example_id] = best_answer["text"]

    return predictions

In [None]:
# Convert the raw logits into one answer string per validation example.
final_predictions = postprocess_qa_predictions(
    datasets["validation"],
    validation_features,
    raw_predictions.predictions
)
# `example` and `answer_text` come from the pipeline smoke-test cell above.
print("output answer matches expected answer: ", final_predictions[example["id"]] == answer_text)

Post-processing 10570 example predictions split into 10784 features.


  0%|          | 0/10570 [00:00<?, ?it/s]

output answer matches expected answer:  True


In [None]:
# The metric is loaded under the same name as the dataset (`task_name`).
squad_metric = evaluate.load(task_name, cache_dir=cache_dir)
# Pair each prediction with its example id, and each example id with its
# reference answers, then score them.
formatted_predictions = [
    {"id": example_id, "prediction_text": answer}
    for example_id, answer in final_predictions.items()
]
references = [{"id": example["id"], "answers": example["answers"]} for example in datasets["validation"]]
squad_metric.compute(predictions=formatted_predictions, references=references)

Downloading builder script:   0%|          | 0.00/4.53k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.32k [00:00<?, ?B/s]

{'exact_match': 77.01986754966887, 'f1': 85.34776167544653}

In [None]:
####################### DistilBERT + MAML

In [None]:
#############################################################################

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.5 MB/s[0m eta [36m0:00:00[0m
Col

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the task default: the
# library warns against pipelines without a model name, and this is the model
# the default resolved to in the recorded run.
question_answerer = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
)
context = """
🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch, and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
question = "Which deep learning libraries back 🤗 Transformers?"
question_answerer(question=question, context=context)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading (…)lve/main/config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

{'score': 0.9802603125572205,
 'start': 78,
 'end': 106,
 'answer': 'Jax, PyTorch, and TensorFlow'}

In [None]:
# A context long enough that question + context no longer fit into a single
# 384-token feature (a later cell reports 461 tokens). The answer sits in the
# final paragraph.
long_context = """
🤗 Transformers: State of the Art NLP

🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction,
question answering, summarization, translation, text generation and more in over 100 languages.
Its aim is to make cutting-edge NLP easier to use for everyone.

🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and
then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and
can be modified to enable quick research experiments.

Why should I use transformers?

1. Easy-to-use state-of-the-art models:
  - High performance on NLU and NLG tasks.
  - Low barrier to entry for educators and practitioners.
  - Few user-facing abstractions with just three classes to learn.
  - A unified API for using all our pretrained models.
  - Lower compute costs, smaller carbon footprint:

2. Researchers can share trained models instead of always retraining.
  - Practitioners can reduce compute time and production costs.
  - Dozens of architectures with over 10,000 pretrained models, some in more than 100 languages.

3. Choose the right framework for every part of a model's lifetime:
  - Train state-of-the-art models in 3 lines of code.
  - Move a single model between TF2.0/PyTorch frameworks at will.
  - Seamlessly pick the right framework for training, evaluation and production.

4. Easily customize a model or an example to your needs:
  - We provide examples for each architecture to reproduce the results published by its original authors.
  - Model internals are exposed as consistently as possible.
  - Model files can be used independently of the library for quick experiments.

🤗 Transformers is backed by the three most popular deep learning libraries — Jax, PyTorch and TensorFlow — with a seamless integration
between them. It's straightforward to train your models with one before loading them for inference with the other.
"""
# Same question as before, now asked against the long context.
question_answerer(question=question, context=long_context)


{'score': 0.9714871048927307,
 'start': 1892,
 'end': 1919,
 'answer': 'Jax, PyTorch and TensorFlow'}

In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# NOTE: this rebinds `model_checkpoint`, `tokenizer` and `model` from the
# fine-tuning part of the notebook to the already-distilled SQuAD checkpoint.
model_checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

# Run the (question, short context) pair through the model by hand.
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)


In [None]:
# One start logit and one end logit per input token (see the shapes printed below).
start_logits = outputs.start_logits
end_logits = outputs.end_logits
print(start_logits.shape, end_logits.shape)


torch.Size([1, 67]) torch.Size([1, 67])


In [None]:
import torch

sequence_ids = inputs.sequence_ids()
# Mask every token that is not part of the context.
mask = [i != 1 for i in sequence_ids]
# Do not mask the [CLS] token.
mask[0] = False
# Add a leading batch dimension so the mask lines up with the logits.
mask = torch.tensor(mask)[None]

# Overwrite masked positions with a large negative value, in place.
start_logits[mask] = -10000
end_logits[mask] = -10000


In [None]:
# Softmax over the token dimension; [0] selects the single batch element.
start_probabilities = torch.nn.functional.softmax(start_logits, dim=-1)[0]
end_probabilities = torch.nn.functional.softmax(end_logits, dim=-1)[0]


In [None]:
# Token count of question + long context without any truncation.
inputs = tokenizer(question, long_context)
print(len(inputs["input_ids"]))


461


In [None]:
# Truncate only the context to fit 384 tokens and decode the result; the
# decoded text below is cut off before the final paragraph with the answer.
inputs = tokenizer(question, long_context, max_length=384, truncation="only_second")
print(tokenizer.decode(inputs["input_ids"]))


[CLS] Which deep learning libraries back [UNK] Transformers? [SEP] [UNK] Transformers : State of the Art NLP [UNK] Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation and more in over 100 languages. Its aim is to make cutting - edge NLP easier to use for everyone. [UNK] Transformers provides APIs to quickly download and use those pretrained models on a given text, fine - tune them on your own datasets and then share them with the community on our model hub. At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments. Why should I use transformers? 1. Easy - to - use state - of - the - art models : - High performance on NLU and NLG tasks. - Low barrier to entry for educators and practitioners. - Few user - facing abstractions with just three classes to learn. - A unified A

In [None]:
# Split one sentence into overlapping chunks: at most 6 tokens each, with
# consecutive chunks sharing 2 tokens (stride).
sentence = "This sentence is not too long but we are going to split it anyway."
inputs = tokenizer(
    sentence, truncation=True, return_overflowing_tokens=True, max_length=6, stride=2
)
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))


[CLS] This sentence is not [SEP]
[CLS] is not too long [SEP]
[CLS] too long but we [SEP]
[CLS] but we are going [SEP]
[CLS] are going to split [SEP]
[CLS] to split it anyway [SEP]
[CLS] it anyway. [SEP]


In [None]:
# Besides ids and mask, the encoding carries the chunk -> sample mapping.
print(inputs.keys())


dict_keys(['input_ids', 'attention_mask', 'overflow_to_sample_mapping'])


In [None]:
# Every chunk maps back to sample 0 because only one sentence was tokenized.
print(inputs["overflow_to_sample_mapping"])


[0, 0, 0, 0, 0, 0, 0]


In [None]:
# Same chunking settings applied to a shorter sentence.
sentence = "This sentence is shorter but will still get split."
inputs = tokenizer(
    sentence, truncation=True, return_overflowing_tokens=True, max_length=6, stride=2
)
for ids in inputs["input_ids"]:
    print(tokenizer.decode(ids))


[CLS] This sentence is shorter [SEP]
[CLS] is shorter but will [SEP]
[CLS] but will still get [SEP]
[CLS] still get split. [SEP]


In [None]:
# Tokenize both sentences in one batch; the mapping records, for each chunk,
# the index of the sentence it was cut from.
sentences = [
    "This sentence is not too long but we are going to split it anyway.",
    "This sentence is shorter but will still get split.",
]
inputs = tokenizer(
    sentences, truncation=True, return_overflowing_tokens=True, max_length=6, stride=2
)

print(inputs["overflow_to_sample_mapping"])


[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1]


In [None]:
from transformers import AutoTokenizer

# NOTE: rebinds `tokenizer` (now bert-base-cased) and `example` (now a plain
# string) used by earlier cells.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
example = "My name is Sylvain and I work at Hugging Face in Brooklyn."
encoding = tokenizer(example)
# Decode the ids one at a time so each word piece is printed separately.
for ids in encoding["input_ids"]:
    print(tokenizer.decode(ids),end=' ')
print(encoding)

[CLS] My name is S ##yl ##va ##in and I work at Hu ##gging Face in Brooklyn . [SEP] {'input_ids': [101, 1422, 1271, 1110, 156, 7777, 2497, 1394, 1105, 146, 1250, 1120, 20164, 10932, 10289, 1107, 6010, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
