In [10]:
# https://keras.io/examples/nlp/question_answering/

In [11]:
# ! pip install datasets

In [12]:
import pandas as pd
from datasets import Dataset
import json

profiles = []
with open("data/courses_exams/data/profiles.json", "r") as json_file:
    profiles = json.load(json_file)
for item in profiles:
    file_name = item["context_file"]
    answers = item["answers"]["text"]
    file_name = f"data/courses_exams/{file_name}"
    context = ""
    with open(file_name, "r") as f:
        context = f.read()
    item['context'] = context
    
    indexes = []
    for answer in answers:
        index = context.index(answer)
        indexes.append(index)
    item["answers"]["answer_start"] = indexes

with open("courses_exams_profiles_dump.json", "w") as json_file:
    json.dump(profiles, json_file)

datasets = Dataset.from_list(profiles)

In [13]:
from transformers import AutoTokenizer

# model_checkpoint = "bert-large-cased-whole-word-masking"
# model_checkpoint = "bert-large-cased"
# model_checkpoint = "bert-base-cased"
model_checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
max_length = 1280  # The maximum length of a feature (question and context)
doc_stride = (
    128  # The authorized overlap between two part of the context when splitting
)

In [15]:
# from transformers import AutoTokenizer

# model_checkpoint = "distilbert-base-cased"

# # tokenizer2 = AutoTokenizer.from_pretrained(model_checkpoint)
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# max_length = 384  # The maximum length of a feature (question and context)
# doc_stride = (
#     128  # The authorized overlap between two part of the context when splitting
# )

def prepare_train_features(examples):
    # Tokenize our examples with truncation and padding, but keep the overflows using a
    # stride. This results in one example possible giving several features when a context is long,
    # each of those features having a context that overlaps a bit the context of the previous
    # feature.
    examples["question"] = [q.lstrip() for q in examples["question"]]
    examples["context"] = [c.lstrip() for c in examples["context"]]
    
    from transformers import AutoTokenizer

    model_checkpoint = "distilbert-base-uncased"
    # model_checkpoint = "bert-base-cased"
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    max_length = 1280  # The maximum length of a feature (question and context)
    doc_stride = (
        128  # The authorized overlap between two part of the context when splitting
    )
        
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a
    # map from a feature to its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original
    # context. This will help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what
        # is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this
        # span of text.
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answers["answer_start"]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the
            # CLS index).
            if not (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            ):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the
                # answer.
                # Note: we could go after the last offset if the answer is the last word (edge
                # case).
                while (
                    token_start_index < len(offsets)
                    and offsets[token_start_index][0] <= start_char
                ):
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [16]:
tokenized_datasets = datasets.map(
    prepare_train_features,
    batched=True,
    remove_columns=datasets.column_names,
    num_proc=3,
)

                                                                      

In [17]:
train_set = tokenized_datasets.with_format("numpy")[:]  # Load the whole dataset as a dict of numpy arrays
validation_set = train_set

In [18]:
from transformers import TFAutoModelForQuestionAnswering, TFDistilBertForQuestionAnswering

# model = TFAutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
model = TFDistilBertForQuestionAnswering.from_pretrained(model_checkpoint)

In [None]:
import tensorflow as tf
from tensorflow import keras

optimizer = keras.optimizers.Adam(learning_rate=5e-5)

In [None]:
# Optionally uncomment the next line for float16 training. Make sence for a GPU what has compute capability of at least 7.0.
keras.mixed_precision.set_global_policy("mixed_float16")

model.compile(optimizer=optimizer)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3060 Laptop GPU, compute capability 8.6


No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [None]:
# model.fit(train_set, validation_data=validation_set, epochs=100, batch_size=1)
# # max_length = 896  # The maximum length of a feature (question and context)
# # doc_stride = (
# #     128  # The authorized overlap between two part of the context when splitting
# # )
# model.fit(train_set, validation_data=validation_set, epochs=250, batch_size=1)
model.fit(train_set, validation_data=validation_set, epochs=250, batch_size=1)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

<keras.callbacks.History at 0x20c066a1370>

In [None]:
def get_answer(context, question):
    inputs = tokenizer([context], [question], return_tensors="np")
    outputs = model(inputs)
    start_position = tf.argmax(outputs.start_logits, axis=1)
    end_position = tf.argmax(outputs.end_logits, axis=1)
    answer = inputs["input_ids"][0, int(start_position) : int(end_position) + 1]
    return tokenizer.decode(answer)

In [None]:
import os

for x in os.listdir("data/courses_exams/"):
    if not x.endswith(".txt"):
        continue
    print(f"{'='*15}{x}")
    # Prints only text file present in My Folder
    with open(f"data/courses_exams/{x}", "r") as f:
        context = f.read()
        print(f'{"-"*10}Courses:')
        print(get_answer(context, "what courses are listed?"))

----------Courses:
2021 mipt school of deep learning fpmi mipt, deep learning 2019 mipt mckinsey, data science in consulting 2019 stepik. org samsung research center, neural networks and computer vision 2019 stepik. org bioinformatics institute, introduction to data science and machine learning
----------Courses:
2018 hse fkn, minor " data mining " 2018 coursera hse & university of california, data structures and algorithms 2017 sololearn - introduction in sql 2016 coursera hse introduction in python
----------Courses:
2018 fast. ai fast. ai, data science 2017 changellenge summer school changellenge, consultant 2017 estiem international student organization, coordinator 2017 machine learning courses data camp, machine learning 2016 tinkoff data tinkoff bank, data analysis 2016 preparation for ielts windsor english 2016 data analysis and machine learning yandex, data scientist 2015 best group moscow international student organization, organization
----------Courses:
2020 national resear

In [None]:
from datetime import datetime
output_folder_name = "models/" + datetime.now().strftime("%Y%m%d-%H%M%S")

TFDistilBertForQuestionAnswering.save_pretrained(model, output_folder_name)
tokenizer.save_pretrained(output_folder_name)

('models/20230703-211917\\tokenizer_config.json',
 'models/20230703-211917\\special_tokens_map.json',
 'models/20230703-211917\\vocab.txt',
 'models/20230703-211917\\added_tokens.json',
 'models/20230703-211917\\tokenizer.json')