In [5]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

In [16]:
# Hyperparameters (module-level; max_span_length and min_score are read by QA_Model.answer)
max_per_slot = 2 # Max number of candidate responses to extract (per iteration); NOTE(review): not referenced in the cells visible here
max_span_length = 20 # Max length of one answer span, measured in tokenizer tokens; longer spans become "[None]"
min_score = 0 # Min score for an answer span (score = max start score + max end score); lower scores become "[None]"

In [17]:
class QA_Model:
    """Extractive question answering over a passage of text.

    Wraps a Hugging Face tokenizer and question-answering model (by default the
    ``distilbert-base-cased-distilled-squad`` checkpoint) and, for each question,
    returns the single highest-scoring answer span or the ``"[None]"`` sentinel.
    """

    # Sentinel returned when no acceptable answer span is found.
    NO_ANSWER = "[None]"

    def __init__(self, model_name="distilbert-base-cased-distilled-squad"):
        """Load the pretrained tokenizer and QA model identified by ``model_name``."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)

    def answer(self, text, questions, span_limit=None, score_floor=None):
        """Answer every question in ``questions`` against ``text``.

        Args:
            text: Passage to search for answers.
            questions: Iterable of question strings.
            span_limit: Maximum accepted span length in tokens. When ``None``
                the module-level ``max_span_length`` hyperparameter is used.
            score_floor: Minimum accepted score (best start score + best end
                score). When ``None`` the module-level ``min_score`` is used.

        Returns:
            A list with one string per question, in input order: the decoded
            answer span, or ``"[None]"`` when the span starts on the first
            token ([CLS]), ends before it starts, is longer than ``span_limit``
            or scores below ``score_floor``.

        Side effects:
            Prints the question, answer and score for every accepted answer.
        """
        # Fall back to the notebook-level hyperparameters when not overridden.
        if span_limit is None:
            span_limit = max_span_length
        if score_floor is None:
            score_floor = min_score

        answerss = []
        for question in questions:
            # Truncate only the passage (second sequence) so an over-long input is
            # cut to the tokenizer's maximum length instead of crashing the model.
            inputs = self.tokenizer(
                question,
                text,
                add_special_tokens=True,
                return_tensors="pt",
                truncation="only_second",
            )
            input_ids = inputs["input_ids"].tolist()[0]

            # Inference only: no need to track gradients.
            with torch.no_grad():
                answer_start_scores, answer_end_scores = self.model(**inputs, return_dict=False)

            # Most likely first token of the answer, and one past the most likely
            # last token (exclusive end, ready for slicing).
            answer_start = int(torch.argmax(answer_start_scores))
            answer_end = int(torch.argmax(answer_end_scores)) + 1
            score = (torch.max(answer_start_scores) + torch.max(answer_end_scores)).item()

            # answer_end is exclusive, so this is the exact number of tokens.
            span_length = answer_end - answer_start

            if answer_start < 1:
                answer = self.NO_ANSWER  # cannot start with CLS
            elif span_length < 1:
                answer = self.NO_ANSWER  # end predicted before start: no valid span
            elif span_length > span_limit:
                answer = self.NO_ANSWER  # cannot be longer than hyperparam
            elif score < score_floor:
                answer = self.NO_ANSWER  # cannot be < hyperparam
            else:
                span_tokens = self.tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
                answer = self.tokenizer.convert_tokens_to_string(span_tokens)
                print(f"Question: {question}")
                print(f"Answer: {answer}")
                print(f"Score: {score}")

            answerss.append(answer)
        return answerss

In [18]:
# Transcript location: a later cell opens `folder + file`.
folder = '../data/'
file = 'event_1054347.txt'

# Questions posed to the QA model; results are tracked per question, in this order.
questions = ["Where did the theft take place?",
             "What was stolen?" ,
             "Whose object was stolen?",
             "When was the item last seen?",
             "When was the item stolen?"]

# Instantiating QA_Model loads the pretrained tokenizer and model (see QA_Model.__init__).
qa = QA_Model()

In [19]:
# One list of distinct answers per question. Each list is seeded with "" so that
# an empty answer string is never recorded as a candidate.
answers = [[""] for _ in questions]
print(answers)

# Read the transcript, one entry per line; the context manager closes the file
# handle as soon as the lines are loaded.
with open(folder + file) as f:
    text = f.readlines()

# Echo the transcript with line indices for reference.
for i, line in enumerate(text):
    print(str(i) + " " + line)

# Re-ask every question on a growing prefix of the transcript (lines 0..i) and
# keep each new distinct answer, in order of first appearance.
for i in range(len(text)):
    print(f"Up to line {i}")
    section = " ".join(text[:i+1])
    answers_i = qa.answer(section, questions)
    for j, candidate in enumerate(answers_i):
        if candidate not in answers[j]:
            answers[j].append(candidate)
print(answers)

[[''], [''], [''], [''], ['']]
0 User "This evening ([DATE]), between #:## [ACRONYM] and #:## [ACRONYM] my bike lights (both front and rear) were stolen off my bike in front of [ORG]. The bike was parked at a sign directly in front of a security camera, and I was wondering if I could view who stole my lights. I also know that a security guard constantly stands in front of the academic building, checking student IDs. Would I be able to contact that security guard regarding my stolen lights? Thank you for your understanding."

1 Admin "Thank you for contacting [ORG]. We can reach out to the officer to see if anything suspicious happened around the bike rack during their shift."

2 Admin "Would you like to file a report at this time?"

3 User "Yes, please. My bike wasn't on the rack around the building; it was near the front of the entrance, locked to a sign. Thank you so much."

4 Admin "Are you currently at [FAC]"

5 User "No, I have returned home."

6 Admin "[ACRONYM], Reports are typi