In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast
from transformers import TrainingArguments, Trainer
from transformers import DefaultDataCollator
from transformers import ElectraForQuestionAnswering, ElectraTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset

In [2]:
context = r'''Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics, 
        is the process of deriving high-quality information from text. It involves 
        "the discovery by computer of new, previously unknown information, 
        by automatically extracting information from different written resources." 
        Written resources may include websites, books, emails, reviews, and articles. 
        High-quality information is typically obtained by devising patterns and trends 
        by means such as statistical pattern learning. According to Hotho et al. (2005)
        we can distinguish between three different perspectives of text mining: 
        information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process.'''

In [3]:
question = "What is text mining?"
question2 = "What are the perspectives of text mining?"

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = AutoModelForQuestionAnswering.from_pretrained('distilbert-base-cased-distilled-squad')
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

In [5]:
inputs = tokenizer(question, context, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = model(**inputs)

In [6]:
type(tokenizer), type(model), type(outputs)

(transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast,
 transformers.models.distilbert.modeling_distilbert.DistilBertForQuestionAnswering,
 transformers.modeling_outputs.QuestionAnsweringModelOutput)

In [9]:
answer_start_scores = outputs.start_logits
answer_end_scores = outputs.end_logits
answer_start = torch.argmax(answer_start_scores, dim=-1)
answer_end = torch.argmax(answer_end_scores, dim=-1) + 1
answer_start, answer_end

(tensor([35]), tensor([46]))

In [13]:
input_ids = inputs['input_ids'].tolist()[0]
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
print(question, ':', answer)

What is text mining? : the process of deriving high - quality information from text


In [14]:
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [15]:
context = """The city is the birthplace of many cultural movements, including the Harlem 
        Renaissance in literature and visual art; abstract expressionism 
        (also known as the New York School) in painting; and hip hop, punk, salsa, disco, 
        freestyle, Tin Pan Alley, and Jazz in music. New York City has been considered 
        the dance capital of the world. The city is also widely celebrated in popular lore, 
        frequently the setting for books, movies (see List of films set in New York City), 
        and television programs."""
question = "The dance capital of the world is what city in the US?"

In [17]:
squad = load_dataset('squad', split='train[:5000]')
squad = squad.train_test_split(test_size=0.2)
print(squad['train'][0])

{'id': '56ceddd1aab44d1400b88b58', 'title': 'Spectre_(2015_film)', 'context': 'In September 2015 it was announced that Sam Smith and regular collaborator Jimmy Napes had written the film\'s title theme, "Writing\'s on the Wall", with Smith performing it for the film. Smith said the song came together in one session and that he and Napes wrote it in under half an hour before recording a demo. Satisfied with the quality, the demo was used in the final release.', 'question': 'What was the name of the song played during the opening credits?', 'answers': {'text': ["Writing's on the Wall"], 'answer_start': [124]}}


In [18]:
def preprocess(data):
    questions = [q.strip() for q in data["question"]]
    inputs = tokenizer(
        questions,
        data["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = data["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [19]:
tokenized_squad = squad.map(preprocess, batched=True, remove_columns=squad["train"].column_names)
tokenized_squad

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1000
    })
})

In [20]:
squad['train'][0]['answers']['answer_start'], squad['train'][0]['answers']['text']

([124], ["Writing's on the Wall"])

In [21]:
data_collator = DefaultDataCollator()

In [22]:
training_args = TrainingArguments(
    output_dir="./QandA",
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 250
  Number of trainable parameters = 66364418


  0%|          | 0/250 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 16


  0%|          | 0/63 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 5.531589508056641, 'eval_runtime': 419.3744, 'eval_samples_per_second': 2.385, 'eval_steps_per_second': 0.15, 'epoch': 1.0}
{'train_runtime': 4610.0212, 'train_samples_per_second': 0.868, 'train_steps_per_second': 0.054, 'train_loss': 5.36508154296875, 'epoch': 1.0}


TrainOutput(global_step=250, training_loss=5.36508154296875, metrics={'train_runtime': 4610.0212, 'train_samples_per_second': 0.868, 'train_steps_per_second': 0.054, 'train_loss': 5.36508154296875, 'epoch': 1.0})

In [23]:
inputs = tokenizer(question, context, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = model(**inputs)

In [24]:
answer_start_logits = outputs.start_logits
answer_end_logits = outputs.end_logits
answer_start = torch.argmax(answer_start_logits)
answer_end = torch.argmax(answer_end_logits) + 1

In [25]:
input_ids = inputs['input_ids'].tolist()[0]
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
print(question, ':', answer)

The dance capital of the world is what city in the US? : frequently


In [26]:
question = "수원 화성은 언제 완성되었는가?"
context = """수원 화성은 조선시대 화성유수부 시가지를 둘러싼 성곽이다. 
1789년(정조 13) 수원을 팔달산 동쪽 아래로 옮기고, 
1794년(정조 18) 축성을 시작해 1796년에 완성했다."""
context = context.strip().replace("\n","")

In [27]:
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-small-v2-distilled-korquad-384")
model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-small-v2-distilled-korquad-384")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.to(device)

loading file vocab.txt from cache at C:\Users\admin/.cache\huggingface\hub\models--monologg--koelectra-small-v2-distilled-korquad-384\snapshots\256efd8763caf2d3936e141107f113e2ab51f653\vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at C:\Users\admin/.cache\huggingface\hub\models--monologg--koelectra-small-v2-distilled-korquad-384\snapshots\256efd8763caf2d3936e141107f113e2ab51f653\special_tokens_map.json
loading file tokenizer_config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--monologg--koelectra-small-v2-distilled-korquad-384\snapshots\256efd8763caf2d3936e141107f113e2ab51f653\tokenizer_config.json
loading configuration file config.json from cache at C:\Users\admin/.cache\huggingface\hub\models--monologg--koelectra-small-v2-distilled-korquad-384\snapshots\256efd8763caf2d3936e141107f113e2ab51f653\config.json
Model config ElectraConfig {
  "_name_or_path": "monologg/koelectra-small-v2-distilled-korquad-384",

In [28]:
inputs = tokenizer(question, context, return_tensors='pt').to(device)
with torch.no_grad():
    outputs = model(**inputs)

In [29]:
answer_start_logits = outputs.start_logits
answer_end_logits = outputs.end_logits
answer_start = torch.argmax(answer_start_logits)
answer_end = torch.argmax(answer_end_logits) + 1

In [30]:
input_ids = inputs['input_ids'].tolist()[0]
answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
print(question, ':', answer)

수원 화성은 언제 완성되었는가? : 1796년
