In [16]:
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import json
import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [17]:
with open('../Preprocessor/output_preprocessing/corpus.json', 'r') as corpusfile:
    corpus = json.load(corpusfile)
corpus[9]['products'][0]['languages']['en']

[{'headerId': 1,
  'paragraphText': '• The appliance is not intended for use by persons (including children) with reduced physical, sensory or mental capabilities, or lack of experience and knowledge, unless they have been given supervision or instruction concerning use of the appliance by a person responsible for their safety. • Children should be supervised to ensure that they do not play with the appliance. • Designated use: this appliance is designed and made to prepare coffee and milk based beverages and hot water. Any other use is considered improper and thus dan- gerous. The manufacturer is not liable for damage deriving from improper use of the appliance. • The surface of the heating element remains hot after use and the outside of the appli- ance may retain the heat for several minutes depending on use. • To clean, follow the instructions in the section “14. Cleaning”. • Cleaning and user maintenance shall not be made by children without supervision. • Never clean by immersing

In [18]:
manufacturer = 'Delonghi'
product_id = 'ecam22110b_magnifica_s_kaffeevollautomat'
language = 'en'
question_strings = [
    "Does the machine get hot?",
    "Should my children supervise the machine?",
    "Should I submerge the machine in water?",
    "Is the appliance suited for my office?"
]

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
nlp  = pipeline("question-answering", model="deepset/roberta-base-squad2")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [19]:
embeddings = []
for header in corpus[9]['products'][0]['languages']['en']:
  embedding_dict = {}

  embedding_dict['id'] = header['headerId']
  embedding_dict['embedding'] = model.encode(header['paragraphText'], convert_to_tensor=True)

  embeddings.append(embedding_dict)

In [20]:
n = 3
n_highest = []
for question_string in question_strings:
    relevances = []
    for paragraph in embeddings:
      relevances.append(util.pytorch_cos_sim(model.encode(question_string, convert_to_tensor=True), paragraph['embedding']))
    n_highest.append(np.argsort(relevances)[::-1][:n])

  result = getattr(asarray(obj), method)(*args, **kwds)


In [21]:
paragraph_for_question = []
for set in n_highest:
    likely_paragraphs = []
    for i in set:
        likely_paragraphs.append(corpus[9]['products'][0]['languages']['en'][i]['paragraphText'])
    paragraph_for_question.append(likely_paragraphs)

In [22]:
answers = []
for i, question_string in enumerate(question_strings):
    results = []

    for paragraph in paragraph_for_question[i]:
        result = nlp(question = question_string, context = paragraph)
        results.append( result)

    results = sorted( results , key=lambda k: k['score'], reverse=True)
    answers.append(results)

In [23]:
for question_string, results in zip(question_strings, answers):
    print("Question: ", question_string)
    for r in results:
      #  if r["score"] > 0.5:
            print("--Answer:", r["answer"])

Question:  Does the machine get hot?
--Answer: off after being used
--Answer: - matic preheat and rinse cycle which cannot be interrupt
--Answer: old and
Question:  Should my children supervise the machine?
--Answer: cappuc- cino machine. We hope you
--Answer: ). Important! • Do not use solvents,
--Answer: instruction concerning use of the appliance in
Question:  Should I submerge the machine in water?
--Answer: default for a hardness value of
--Answer: with a water softener filter (C4
--Answer: 2. Empty
Question:  Is the appliance suited for my office?
--Answer: 1362 should be fitted. This appliance conforms to the Norms EN
--Answer: instruction concerning use of the appliance in
--Answer: plug. if your electricity supply point has only two pin socket outlets, or


In [24]:

from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',return_token_type_ids = True)
model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')

context = ""
for i in range(3):
    context += corpus[9]['products'][0]['languages']['en'][10]["paragraphText"]

questions = [
            "Does the machine get hot?",
            "Should my children supervise the machine?",
            "Should I submerge the machine in water?",
            "Is the appliance suited for my office?"
            ]

question_context_for_batch = []

for question in questions :
    question_context_for_batch.append((question, context))

encoding = tokenizer.batch_encode_plus(question_context_for_batch,pad_to_max_length=True, return_tensors="pt")
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]
start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
print(start_scores)

for index,(start_score,end_score,input_id) in enumerate(zip(start_scores,end_scores,input_ids)):
    max_startscore = torch.argmax(torch.Tensor(start_score))
    max_endscore = torch.argmax(torch.Tensor(end_score))
    ans_tokens = input_ids[index][max_startscore: max_endscore + 1]
    answer_tokens = tokenizer.convert_ids_to_tokens(ans_tokens, skip_special_tokens=True)
    answer_tokens_to_string = tokenizer.convert_tokens_to_string(answer_tokens)
    print ("\nQuestion: ",questions[index])
    print ("Answer: ", answer_tokens_to_string)

start_logits


TypeError: new(): invalid data type 'str'