In [1]:
from datasets import load_dataset, load_metric

from glob import glob
import os
from tqdm import tqdm
import json
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
!pwd

/home/mdorosan/2022/addiction_chatbot/qa_pipeline


In [3]:
from huggingface_hub import notebook_login
import transformers
transformers.__version__

'4.15.0'

In [4]:
from huggingface_hub import notebook_login

notebook_login()

Login successful
Your token has been saved to /home/mdorosan/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [23]:
# !pip install git-lfs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting git-lfs
  Downloading git_lfs-1.6-py2.py3-none-any.whl (5.6 kB)
Installing collected packages: git-lfs
Successfully installed git-lfs-1.6


## Read data

In [5]:
%%time
data_files = {'../data/qa-2/general-conference/*.json'}
ar = load_dataset("json", data_files=data_files)

Resolving data files:   0%|          | 0/26917 [00:00<?, ?it/s]

Using custom data configuration default-ae0363a304eda62b
Reusing dataset json (/home/mdorosan/.cache/huggingface/datasets/json/default-ae0363a304eda62b/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/1 [00:00<?, ?it/s]

CPU times: user 18.3 s, sys: 9.96 s, total: 28.2 s
Wall time: 21.6 s


In [6]:
# split
ar = ar['train'].train_test_split(test_size=0.2)

# inspect dataset
ar

DatasetDict({
    train: Dataset({
        features: ['page', 'index', 'title', 'context', 'question', 'answers'],
        num_rows: 21533
    })
    test: Dataset({
        features: ['page', 'index', 'title', 'context', 'question', 'answers'],
        num_rows: 5384
    })
})

## Preprocessing

In [7]:
from transformers import AutoTokenizer
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [8]:
# checlk if tokenizer is fast tokenizer
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [9]:
def preprocess_function(examples):
    """Understand this!"""
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
%%time
tokenized_ar = ar.map(preprocess_function, batched=True, remove_columns=ar["train"].column_names)

  0%|          | 0/22 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

CPU times: user 1min 16s, sys: 45.6 ms, total: 1min 16s
Wall time: 12.6 s


In [11]:
# inspect
x = tokenized_ar["train"][0]["input_ids"]
tokenizer.decode(x)

'[CLS] do you realize how deeply the lord loves and cherishes you, even now? [SEP] they need to know the dangers of pornography and how it overtakes lives, causing loss of the spirit, distorted feelings, deceit, damaged relationships, loss of self - control, and nearly total consumption of time, thought, and energy. pornography is more vile, evil, and graphic than ever before. as we counsel with our children, together we can create a family plan with standards and boundaries, being proactive to protect our homes with filters on electronic devices. parents, are we aware that mobile devices with internet capacity, not computers, are the biggest culprit? 2 young people and adults, if you are caught in satan ’ s trap of pornography, remember how merciful our beloved savior is. our savior has the power to cleanse and heal you. he can remove the pain and sorrow you feel and make you clean again through the power of his atonement. we as leaders are also greatly concerned about the spouses and

In [12]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

In [13]:
tf_train_set = tokenized_ar["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "start_positions", "end_positions"],
    dummy_labels=True,
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_set = tokenized_ar["test"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "start_positions", "end_positions"],
    dummy_labels=True,
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

2022-03-26 02:16:56.263162: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-26 02:16:57.888205: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 10419 MB memory:  -> device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:04:00.0, compute capability: 6.1
2022-03-26 02:16:57.889954: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 10419 MB memory:  -> device: 1, name: GeForce GTX 1080 Ti, pci bus id: 0000:84:00.0, compute capability: 6.1


In [14]:
from transformers import create_optimizer

batch_size = 16
num_epochs = 2
total_train_steps = (len(tokenized_ar["train"]) // batch_size) * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [15]:
from transformers import TFAutoModelForQuestionAnswering
model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)

2022-03-26 02:17:01.737807: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForQuestionAnswering: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-u

In [16]:
import tensorflow as tf

model.compile(optimizer=optimizer)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as the 'labels' key of the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [17]:
model.summary()

Model: "tf_distil_bert_for_question_answering"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
qa_outputs (Dense)           multiple                  1538      
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,364,418
Trainable params: 66,364,418
Non-trainable params: 0
_________________________________________________________________


In [29]:
from transformers.keras_callbacks import PushToHubCallback
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint


tensorboard_callback = TensorBoard(log_dir="./qa_model_save/logs")

callbacks = [tensorboard_callback]

# portless : 7 mins for each epoch
model.fit(
    x=tf_train_set, 
    validation_data=tf_validation_set, 
    epochs=3,
    callbacks=callbacks,
)

2022-03-26 02:32:11.712298: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-03-26 02:32:11.712396: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-03-26 02:32:11.712689: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1666] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI could not be loaded or symbol could not be found.
2022-03-26 02:32:11.712810: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-03-26 02:32:11.712872: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1757] function cupti_interface_->Finalize()failed with error CUPTI could not be loaded or symbol could not be found.


Epoch 1/3
   1/1345 [..............................] - ETA: 9:46 - loss: 5.3270

2022-03-26 02:32:12.449867: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-03-26 02:32:12.449939: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-03-26 02:32:12.450059: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1666] function cupti_interface_->Subscribe( &subscriber_, (CUpti_CallbackFunc)ApiCallback, this)failed with error CUPTI could not be loaded or symbol could not be found.


   2/1345 [..............................] - ETA: 12:43 - loss: 5.3740

2022-03-26 02:32:12.829865: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-03-26 02:32:12.830018: E tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1757] function cupti_interface_->Finalize()failed with error CUPTI could not be loaded or symbol could not be found.
2022-03-26 02:32:12.843349: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:673]  GpuTracer has collected 0 callback api events and 0 activity events. 
2022-03-26 02:32:12.849914: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-03-26 02:32:12.854480: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ./qa_model_save/logs/train/plugins/profile/2022_03_26_02_32_12

2022-03-26 02:32:12.856553: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ./qa_model_save/logs/train/plugins/profile/2022_03_26_02_32_12/jupyter-mdorosan.trace.json.gz
2022-03-2



NotImplementedError: Saving the model to HDF5 format requires the model to be a Functional model or a Sequential model. It does not work for subclassed models, because such models are defined via the body of a Python method, which isn't safely serializable. Consider saving to the Tensorflow SavedModel format (by setting save_format="tf") or using `save_weights`.

In [35]:
# set up the git lfs thing in local machine
answer_size = 2
# initialize callback for best model save
if not os.path.exists('../models'):
    os.mkdir('../models')
model_savepath = f'../models/{model_checkpoint}-qa-{answer_size}-ft-ar'

model.save_pretrained(model_savepath)

# tf.keras.models.save_model(
#     model,
#     filepath=model_savepath,
#     overwrite=True,
#     save_format="tf",
# )

### Validation

In [63]:
batch = next(iter(tf_validation_set))
output = model.predict_on_batch(batch)
output.keys()

odict_keys(['start_logits', 'end_logits'])

In [64]:
import numpy as np

np.argmax(output.start_logits, -1), np.argmax(output.end_logits, -1)

(array([138, 118, 104, 148,  97, 117, 147, 191,  97, 123, 118,  86, 162,
         86, 179, 146]),
 array([181, 143, 159, 196, 137, 160, 189, 233, 137, 139, 143, 162, 196,
        162, 211, 205]))

In [65]:
n_best_size = 20

In [66]:
import numpy as np

start_logits = output.start_logits[0]
end_logits = output.end_logits[0]
# Gather the indices the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        if (
            start_index <= end_index
        ):  # We need to refine that test to check the answer is inside the context
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": "",  # We need to find a way to get back the original substring corresponding to the answer in the context
                }
            )

In [67]:
def prepare_validation_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possible giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)
        context_index = 1 if pad_on_right else 0

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        tokenized_examples["example_id"].append(examples["id"][sample_index])

        # Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
        # position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples

In [68]:
validation_features = ar["test"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=ar["test"].column_names,
)

validation_dataset = validation_features.to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

  0%|          | 0/6 [00:00<?, ?ba/s]

NameError: name 'pad_on_right' is not defined

## Code for Run Time

In [40]:
question = "How do I stop smoking?"

text = "Laboratory studies have also shown that emphysema and chronic bronchitis result from cigarette smoking. Even high school students who smoke have poorer lung function than nonsmokers; autopsies have shown lung damage in adults without breathing problems who stopped smoking a decade before death. Smoke causes malfunction of special white blood cells in the windpipe that guard against invading germs; smoke also slows down the microscopic hairs along the windpipe that move trapped particles up away from the lungs. Thus, it is not surprising that smokers take longer to recover from mild respiratory virus infections than nonsmokers."

In [57]:
model_base = TFAutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
input_dict = tokenizer(question, text, return_tensors="tf")
outputs = model_base(input_dict)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = " ".join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0] + 1])
answer

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForQuestionAnswering: ['vocab_layer_norm', 'vocab_transform', 'vocab_projector', 'activation_13']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs', 'dropout_79']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
outputs.keys()

odict_keys(['start_logits', 'end_logits'])

In [43]:
input_dict = tokenizer(question, text, return_tensors="tf")
outputs = model(input_dict)

start_logits = outputs.start_logits
end_logits = outputs.end_logits

all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
answer = " ".join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0] + 1])

In [44]:
answer

''

In [48]:
all_tokens

['[CLS]',
 'how',
 'do',
 'i',
 'stop',
 'smoking',
 '?',
 '[SEP]',
 'laboratory',
 'studies',
 'have',
 'also',
 'shown',
 'that',
 'em',
 '##phy',
 '##se',
 '##ma',
 'and',
 'chronic',
 'bro',
 '##nch',
 '##itis',
 'result',
 'from',
 'cigarette',
 'smoking',
 '.',
 'even',
 'high',
 'school',
 'students',
 'who',
 'smoke',
 'have',
 'poorer',
 'lung',
 'function',
 'than',
 'non',
 '##smo',
 '##kers',
 ';',
 'auto',
 '##ps',
 '##ies',
 'have',
 'shown',
 'lung',
 'damage',
 'in',
 'adults',
 'without',
 'breathing',
 'problems',
 'who',
 'stopped',
 'smoking',
 'a',
 'decade',
 'before',
 'death',
 '.',
 'smoke',
 'causes',
 'mal',
 '##fu',
 '##nction',
 'of',
 'special',
 'white',
 'blood',
 'cells',
 'in',
 'the',
 'wind',
 '##pipe',
 'that',
 'guard',
 'against',
 'invading',
 'ge',
 '##rm',
 '##s',
 ';',
 'smoke',
 'also',
 'slow',
 '##s',
 'down',
 'the',
 'microscopic',
 'hairs',
 'along',
 'the',
 'wind',
 '##pipe',
 'that',
 'move',
 'trapped',
 'particles',
 'up',
 'away',


In [54]:
outputs.start_logits.shape

TensorShape([1, 131])

In [55]:
tf.math.argmax(start_logits, 0)[0]

<tf.Tensor: shape=(), dtype=int64, numpy=0>

In [56]:
 tf.math.argmax(end_logits, 0)[0]

<tf.Tensor: shape=(), dtype=int64, numpy=0>