In [5]:
import requests
import hashlib
from urllib.parse import urljoin
import json
import itertools

In [6]:
# Validation questions that are sent to the question/context server.
questions = [
  "What are vendors of microcontrollers?",
  "Where can I find microcontrollers or CPUs?",
  "What is SVN?",
  "What is the final deliverable in capstone?",
  "Where are the templates for design reports located?",
  "How do I order parts?",
  "How do I resolve an issue?",
  "How do I upload to the repository?",
  "What are the learning goals of capstone?",
]

# Base address of the server and the endpoint that questions are POSTed to.
server_url = "http://45.77.102.63:8000"
ask_endpoint = urljoin(server_url, "ask")

# Reuse server responses memoized by an earlier run; start with an empty
# cache when the memoization file has not been written yet.
try:
  with open("request_memoizations.json", "r") as question_context_f:
    question_context_data = json.load(question_context_f)
except FileNotFoundError:
  question_context_data = {}



In [7]:

def make_qc_request(question, timeout=60):
  """Return the answer/context records the server gives for ``question``.

  Responses are memoized in the module-level ``question_context_data`` dict
  and persisted to ``request_memoizations.json`` so each question is only
  sent to the server once.

  Args:
    question: The question text to POST to ``ask_endpoint``.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout ``requests`` waits indefinitely.

  Returns:
    A list with one dict per entry of the response's ``'answers'`` list,
    with keys ``'question'``, ``'context'`` (the joined
    ``surrounding_context_text`` pieces), ``'answer'`` and ``'answer_start'``.

  Raises:
    requests.HTTPError: If the server answers with an error status.
    requests.Timeout: If the server does not respond within ``timeout``.
  """
  if question not in question_context_data:
    response = requests.post(
      url=ask_endpoint,
      data=json.dumps({'question': question}),
      timeout=timeout,
    )
    # Fail before memoizing: an error payload written to the cache file
    # would otherwise be reused forever instead of being retried.
    response.raise_for_status()
    question_context_data[question] = response.json()
    with open('request_memoizations.json', 'w') as question_context_f:
      json.dump(question_context_data, question_context_f)

  return [
    {
      'question': question,
      'context': ''.join(e['surrounding_context_text']),
      'answer': e['answer_text'],
      # Offset is the length of the first context piece.
      # NOTE(review): presumes the answer text starts right after that
      # piece in the joined context -- confirm against the server schema.
      'answer_start': len(e['surrounding_context_text'][0]),
    }
    for e in question_context_data[question]['answers']
  ]



def get_context_and_answers(question_list):
  """Collect the records of every question into one flat list.

  Each question is passed to ``make_qc_request`` in order, and the lists it
  returns are concatenated in that same order.
  """
  return [
    record
    for single_question in question_list
    for record in make_qc_request(single_question)
  ]

def save_as_dataset(question_context_list):
  """Merge question/context records into ``dataset.json``.

  The file holds ``{"version": ..., "data": [record, ...]}``. Each record is
  keyed by the SHA-384 hex digest of ``question + context``; a record whose
  id is already present is not added again. Existing records with a matching
  id that lack ``answers['answer_start']`` get it backfilled with
  ``context.find(answer)`` for each answer text (``-1`` when not found).
  The records are written back sorted by ``question + context``.

  Args:
    question_context_list: Iterable of dicts with keys ``'question'``,
      ``'context'``, ``'answer'`` and ``'answer_start'``.
  """
  try:
    with open('dataset.json', 'r') as dsfp:
      ds = json.load(dsfp)
  except FileNotFoundError:
    ds = {"version": "0.1.0", "data": []}

  # ``data`` is a flat list of records. The previous code iterated it as a
  # list but appended via ds["data"]["validation"], which raises TypeError.
  records = ds['data']
  title = 'capstone_validation_set'

  for el in question_context_list:
    question = el['question']
    context = el['context']
    answers = {'text': [el['answer']], 'answer_start': [el['answer_start']]}
    record_id = hashlib.sha384((question + context).encode()).hexdigest()

    already_present = False
    for existing in records:
      if existing['id'] == record_id:
        already_present = True
        if 'answer_start' not in existing['answers']:
          existing['answers']['answer_start'] = [
            existing['context'].find(answer_text)
            for answer_text in existing['answers']['text']
          ]

    if not already_present:
      records.append({
        'id': record_id,
        'title': title,
        'context': context,
        'question': question,
        'answers': answers,
      })

  # Sorting once after all insertions gives the same final order as sorting
  # after every insertion.
  records.sort(key=lambda o: o['question'] + o['context'])

  with open('dataset.json', 'w') as dsfp:
    json.dump(ds, dsfp)

def update_answer_start():
  """Backfill missing ``answer_start`` lists in ``dataset.json``.

  For every record under ``'data'`` whose ``'answers'`` has no
  ``'answer_start'`` key, one offset per answer text is stored, computed with
  ``context.find(text)`` (so ``-1`` when the text is not in the context).
  The file is then rewritten. Does nothing when the file does not exist.
  """
  try:
    with open('dataset.json', 'r') as source_fp:
      dataset = json.load(source_fp)
      for record in dataset['data']:
        record_answers = record['answers']
        if 'answer_start' in record_answers:
          continue
        record_answers['answer_start'] = [
          record['context'].find(answer_text)
          for answer_text in record_answers['text']
        ]

    with open('dataset.json', 'w') as sink_fp:
      json.dump(dataset, sink_fp)

  except FileNotFoundError:
    return
  
  

In [8]:
# tmp = get_context_and_answers(questions)
# update_answer_start()

In [215]:
# save_as_dataset(tmp)


In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline
import transformers
import torch

In [10]:
# Read the records stored under the "data" field of ./dataset.json and expose
# them as the "validation" split.
data_set = load_dataset(
    'json',
    data_files={'validation': './dataset.json'},
    field="data",
    split="validation",
)

Using custom data configuration default-89ba9d214aab74c0


Downloading and preparing dataset json/default to /home/miteshkumar/.cache/huggingface/datasets/json/default-89ba9d214aab74c0/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1278.75it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 320.98it/s]

Dataset json downloaded and prepared to /home/miteshkumar/.cache/huggingface/datasets/json/default-89ba9d214aab74c0/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.





In [11]:
data_set

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 12
})

In [12]:
# Tokenizer and extractive-QA model are loaded from the same checkpoint and
# wrapped in a question-answering pipeline used for all predictions below.
tokenizer = AutoTokenizer.from_pretrained(
    'deepset/bert-base-cased-squad2',
    return_token_type_ids=True,
)
model = AutoModelForQuestionAnswering.from_pretrained(
    'deepset/bert-base-cased-squad2'
)
qa_pl = pipeline(
    'question-answering',
    model=model,
    tokenizer=tokenizer,
)
# from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased',return_token_type_ids = True)
# model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased-distilled-squad')


In [13]:
# def get_answer_token_ids(model, encoding):
#     input_ids, attention_mask = encoding['input_ids'], encoding['attention_mask']
#     res = model(torch.tensor([input_ids]),
#         attention_mask=torch.tensor([attention_mask]))
    
#     start_scores = res.start_logits
#     end_scores = res.end_logits
    
#     a_start = torch.argmax(start_scores)
#     a_end = torch.argmax(end_scores) + 1
#     return input_ids[a_start: a_end]

def get_answer_token_ids(question, context):
    """Return the token ids of the pipeline's predicted answer.

    The question and context are passed to ``qa_pl``; the ``'answer'`` text
    of its result is encoded with ``tokenizer.encode`` using its default
    arguments.

    NOTE(review): the demo cell's printed ids start with 101 and end with
    102, which look like special tokens added by ``encode`` -- confirm
    whether they should take part in token-overlap metrics.
    """
    prediction = qa_pl(question=question, context=context)
    predicted_text = prediction['answer']
    return tokenizer.encode(predicted_text)

    

    

In [14]:
# context = "The US has passed the peak on new coronavirus cases, " \
#           "President Donald Trump said and predicted that some states would reopen this month. " \
#           "The US has over 637,000 confirmed Covid-19 cases and over 30,826 deaths, the highest for any country in the world."

# context = context.lower()
# question = "What was President Donald Trump's prediction?"
# question = question.lower()
# Smoke test: run the QA helper on a one-sentence context and print the
# predicted answer both as token ids and as decoded text.
context = "Capstone is a design course."
question = "What is capstone?"

# Joint question/context encoding; not read again in this cell.
encoding = tokenizer.encode_plus(question, context)
answer_token_ids = get_answer_token_ids(question, context)
# Special tokens are dropped before the ids are turned back into text.
answer_tokens = tokenizer.convert_ids_to_tokens(answer_token_ids, skip_special_tokens=True)
print(f'Question: {question} \nAnswer Tokens: {answer_token_ids}')
print(f'Answer: {tokenizer.convert_tokens_to_string(answer_tokens)}')


Question: What is capstone? 
Answer Tokens: [101, 170, 1902, 1736, 102]
Answer: a design course


In [15]:
data_set

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 12
})

In [16]:
def predict_helper(context, question):
    """Build the ``'predictions'`` column value for one example.

    Note the argument order: ``context`` first, then ``question``.
    """
    predicted_ids = get_answer_token_ids(question, context)
    return {'predictions': predicted_ids}

# Add the predicted answer token ids to every example of the dataset.
with_predictions = data_set.map(
    lambda example: predict_helper(example['context'], example['question'])
)

100%|██████████| 12/12 [00:04<00:00,  2.62ex/s]


In [17]:
with_predictions

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'predictions'],
    num_rows: 12
})

In [18]:
data_set

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 12
})

In [19]:
def get_ans_token_ids(answers):
    """Encode the first reference answer text into token ids.

    ``answers`` is the record's ``'answers'`` dict; only the first entry of
    its ``'text'`` list is used.
    """
    first_answer_text = answers['text'][0]
    return tokenizer.encode(first_answer_text)

# Add the reference answer token ids next to the predictions.
with_answers = with_predictions.map(
    lambda example: {'answer_tokens': get_ans_token_ids(example['answers'])}
)

100%|██████████| 12/12 [00:00<00:00, 2007.32ex/s]


In [20]:
with_answers

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'predictions', 'answer_tokens'],
    num_rows: 12
})

In [21]:
from collections import Counter
def f1_accuracy(predicted_tokens, actual_tokens):
    """Token-overlap F1 between a predicted and a reference token sequence.

    Overlap is counted with multiplicity: a token occurring twice in both
    sequences contributes two shared tokens. Precision and recall divide that
    overlap by the total number of predicted and reference tokens.

    Args:
        predicted_tokens: Iterable of hashable tokens produced by the model.
        actual_tokens: Iterable of hashable tokens of the reference answer.

    Returns:
        ``{'f1_accuracy': score}`` where ``score`` is the F1 value in
        ``(0, 1]``, or the sentinel ``-1`` when the sequences share no token.
    """
    predicted_counts = Counter(predicted_tokens)
    actual_counts = Counter(actual_tokens)

    # Counter intersection keeps the minimum count of each common token.
    num_shared = sum((predicted_counts & actual_counts).values())
    if num_shared == 0:
        return {'f1_accuracy': -1}

    # Divide by the total token counts. len() of a Counter is the number of
    # distinct tokens, which let repeated tokens push the ratios above 1.
    precision = num_shared / sum(predicted_counts.values())
    recall = num_shared / sum(actual_counts.values())

    return {'f1_accuracy': 2.0 * ((precision * recall) / (precision + recall))}

# Score every example by comparing predicted and reference token ids.
with_f1 = with_answers.map(
    lambda example: f1_accuracy(example['predictions'], example['answer_tokens'])
)
# with_f1 = with_f1.filter(lambda x : x['f1_accuracy'] >= 0)

def token_ids_to_sentence(token_ids):
    """Decode token ids into a plain string, leaving out special tokens."""
    tokens = tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=True)
    return tokenizer.convert_tokens_to_string(tokens)

# Add human-readable columns: the decoded prediction and the reference text.
with_f1 = with_f1.map(
    lambda example: {'predicted_sentence': token_ids_to_sentence(example['predictions'])}
)
with_f1 = with_f1.map(
    lambda example: {'answer_text': example['answers']['text']}
)

100%|██████████| 12/12 [00:00<00:00, 2685.93ex/s]
100%|██████████| 12/12 [00:00<00:00, 744.57ex/s]
100%|██████████| 12/12 [00:00<00:00, 1246.91ex/s]


In [22]:
# Serialize the scored dataset to CSV text and write it next to the notebook.
accuracy_csv = with_f1.to_pandas().to_csv()
# newline='' disables newline translation on write: the CSV text already
# carries its own line terminators, and translating them again would double
# the carriage returns on platforms whose line separator is '\r\n'.
with open('accuracy_table.csv', 'w', newline='') as accuracy_fp:
    accuracy_fp.write(accuracy_csv)
