In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast
from transformers import TrainingArguments, Trainer
from transformers import DefaultDataCollator
from transformers import ElectraForQuestionAnswering, ElectraTokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import load_dataset

In [2]:
# Passage the question-answering model will read its answers from.
# The literal spans several lines, so the embedded newlines and the leading
# indentation of the continuation lines are part of the string value.
# The r prefix has no effect here because the text contains no backslashes.
context = r'''Text mining, also referred to as text data mining (abbr.: TDM), similar to text analytics, 
        is the process of deriving high-quality information from text. It involves 
        "the discovery by computer of new, previously unknown information, 
        by automatically extracting information from different written resources." 
        Written resources may include websites, books, emails, reviews, and articles. 
        High-quality information is typically obtained by devising patterns and trends 
        by means such as statistical pattern learning. According to Hotho et al. (2005)
        we can distinguish between three different perspectives of text mining: 
        information extraction, data mining, and a KDD (Knowledge Discovery in Databases) process.'''

In [3]:
# Two questions about the passage, bound in a single statement.
question, question2 = (
    "What is text mining?",
    "What are the perspectives of text mining?",
)

In [4]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and question-answering model are loaded from the same checkpoint;
# the model is placed on the selected device as soon as it is loaded.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased-distilled-squad')
model = AutoModelForQuestionAnswering.from_pretrained(
    'distilbert-base-cased-distilled-squad'
).to(device)

In [5]:
# Encode the (question, context) pair as PyTorch tensors, then move the
# encoding to the device the model lives on.
encoded = tokenizer(question, context, return_tensors='pt')
inputs = encoded.to(device)

# Forward pass only: gradient tracking is switched off for this call.
with torch.no_grad():
    outputs = model(**inputs)

In [6]:
# Show the concrete classes behind the tokenizer, the model and the model output.
tuple(map(type, (tokenizer, model, outputs)))

(transformers.models.distilbert.tokenization_distilbert_fast.DistilBertTokenizerFast,
 transformers.models.distilbert.modeling_distilbert.DistilBertForQuestionAnswering,
 transformers.modeling_outputs.QuestionAnsweringModelOutput)

In [9]:
# Per-token scores for where the answer starts and where it ends.
answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits

# Highest-scoring position along the last dimension for each boundary.
# One is added to the end position so it can serve as an exclusive slice bound.
answer_start = answer_start_scores.argmax(dim=-1)
answer_end = answer_end_scores.argmax(dim=-1) + 1

(answer_start, answer_end)

(tensor([35]), tensor([46]))

In [13]:
# Token ids of the first (and only) encoded sequence, as a plain Python list.
input_ids = inputs['input_ids'][0].tolist()

# Slice out the predicted span, map the ids back to tokens, and join the
# tokens into a readable string.
answer_tokens = tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
answer = tokenizer.convert_tokens_to_string(answer_tokens)
print(question, ':', answer)

What is text mining? : the process of deriving high - quality information from text


In [14]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebind `tokenizer` and `model` to the base (uncased) DistilBERT checkpoint.
# The load warning reports the `qa_outputs` weights as newly initialized, so
# this model is meant to be trained before its answers are used.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased").to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [15]:
# New passage and question; these rebind the `context` and `question` names
# that were assigned earlier in the notebook.
# The literal spans several lines, so the embedded newlines and the leading
# indentation of the continuation lines are part of the string value.
context = """The city is the birthplace of many cultural movements, including the Harlem 
        Renaissance in literature and visual art; abstract expressionism 
        (also known as the New York School) in painting; and hip hop, punk, salsa, disco, 
        freestyle, Tin Pan Alley, and Jazz in music. New York City has been considered 
        the dance capital of the world. The city is also widely celebrated in popular lore, 
        frequently the setting for books, movies (see List of films set in New York City), 
        and television programs."""
question = "The dance capital of the world is what city in the US?"

In [17]:
# Load the first 5,000 examples of the SQuAD training split, then hold out
# 20% of them as a test split.
squad = load_dataset('squad', split='train[:5000]')

# train_test_split shuffles before splitting; without a fixed seed the split
# (and the example printed below) changes on every run. Pinning the seed keeps
# the notebook reproducible under Restart & Run All.
squad = squad.train_test_split(test_size=0.2, seed=42)

# Peek at one training record to see the available fields.
print(squad['train'][0])

{'id': '56ceddd1aab44d1400b88b58', 'title': 'Spectre_(2015_film)', 'context': 'In September 2015 it was announced that Sam Smith and regular collaborator Jimmy Napes had written the film\'s title theme, "Writing\'s on the Wall", with Smith performing it for the film. Smith said the song came together in one session and that he and Napes wrote it in under half an hour before recording a demo. Satisfied with the quality, the demo was used in the final release.', 'question': 'What was the name of the song played during the opening credits?', 'answers': {'text': ["Writing's on the Wall"], 'answer_start': [124]}}
