# Data exploration

In [37]:
from datasets import load_dataset
from transformers import pipeline
from tqdm import tqdm
from transformers import AutoTokenizer
import time
import numpy as np
import pandas as pd

In [2]:
split = "train"
cache_dir = "./data_cache"

dialogue_dataset = load_dataset(
    "doc2dial",
    name="dialogue_domain",  # this is the name of the dataset for the second subtask, dialog generation
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

Reusing dataset doc2dial (./data_cache/doc2dial/dialogue_domain/1.0.1/c15afdf53780a8d6ebea7aec05384432195b356f879aa53a4ee39b740d520642)


In [3]:
document_dataset = load_dataset(
    "doc2dial",
    name="document_domain",  # this is the name of the dataset for the second subtask, dialog generation
    split=split,
    ignore_verifications=True,
    cache_dir=cache_dir,
)

Reusing dataset doc2dial (./data_cache/doc2dial/document_domain/1.0.1/c15afdf53780a8d6ebea7aec05384432195b356f879aa53a4ee39b740d520642)


In [4]:
dialogue_dataset[0]

{'dial_id': '9f44c1539efe6f7e79b02eb1b413aa43',
 'doc_id': 'Top 5 DMV Mistakes and How to Avoid Them#3_0',
 'domain': 'dmv',
 'turns': [{'turn_id': 1,
   'role': 'user',
   'da': 'query_condition',
   'references': [{'sp_id': '4', 'label': 'precondition'}],
   'utterance': 'Hello, I forgot o update my address, can you help me with that?'},
  {'turn_id': 2,
   'role': 'agent',
   'da': 'respond_solution',
   'references': [{'sp_id': '6', 'label': 'solution'},
    {'sp_id': '7', 'label': 'solution'}],
   'utterance': 'hi, you have to report any change of address to DMV within 10 days after moving. You should do this both for the address associated with your license and all the addresses associated with all your vehicles.'},
  {'turn_id': 3,
   'role': 'user',
   'da': 'query_solution',
   'references': [{'sp_id': '56', 'label': 'solution'}],
   'utterance': 'Can I do my DMV transactions online?'},
  {'turn_id': 4,
   'role': 'agent',
   'da': 'respond_solution',
   'references': [{'sp_id

In [5]:
document_dataset[54]

{'domain': 'ssa',
 'doc_id': 'Direct Deposit | Social Security Administration#2_0',
 'title': 'Direct Deposit | Social Security Administration#2',
 'doc_text': "\n\nSet Up or Change Your Direct Deposit of Benefit Payment \nDo you want to set up or change the direct deposit of your benefit payment? We are constantly expandingand improving our online services, including the ability to set up or change your direct depositinformation. If you already receive Social Security or Supplemental Security Income SSI benefitsand you have a bank account, simply log in to or create your personal and secure my Social Security account. \n\nWhat is Direct Deposit? \nDirect deposit is a simple, safe, and secure way to get benefits. If you need us to send your payment to abank or credit union account , have all of the following information ready when you apply. Social Security number Bank routing transit number Account type checking or savings Account number How to Set Up or Change Direct Deposit of Benef

In [6]:
search_domain = 'dmv'
search_doc_id = 'Top 5 DMV Mistakes and How to Avoid Them#3_0'
search_id_sp = ['6', '7']

def text_from_spans(search_domain, search_doc_id, search_id_sp, document_dataset):
    start = time.time()
    total_answer = ''
    for doc in document_dataset:
        if doc['domain'] == search_domain and doc['doc_id'] == search_doc_id:
            for span in doc['spans']:
                if span['id_sp'] in search_id_sp:
                    total_answer+=span['text_sp']
            break
    print(f"Time elapsed: {time.time() - start}")
    return total_answer

text_from_spans(search_domain, search_doc_id, search_id_sp, document_dataset)

Time elapsed: 0.2696061134338379


'you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. '

In [7]:
question_answerer = pipeline("question-answering")

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


In [8]:
context = r"""By statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ."""

In [9]:
result = question_answerer(question="Hello, I forgot o update my address, can you help me with that?", context=context)
print(result)

{'score': 0.11147879809141159, 'start': 137, 'end': 144, 'answer': 'license'}


  fw_args = {k: torch.tensor(v, device=self.device) for (k, v) in fw_args.items()}


## Creating the dataset

Steps:
- [X] Sliding windows from the Document
- [ ] Extract user utterance
- [ ] Extract Dialogue history

### Sliding windows from the Document

In [45]:
train_dict = dict()
train_dict['train_document'] = []
train_dict['train_id_sp'] = []
train_dict['train_user_utterance'] = []
train_dict['train_doc_domain'] = []
train_dict['train_doc_id'] = []
train_dict['train_text_sp'] = []
train_dict['train_dial_id_turn_id'] = []     # necessary for evaluation

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
tokenizer_mode = True

start = time.time()
for idx, dialogue in tqdm(enumerate(dialogue_dataset)):
    if idx == 100:
        break
    dial_id_turn_id = []       # running list of <dial_id>_<turn_id> for evaluation
    sp_id_list = []            # running list of spans per document
    user_utterance_list = []   # running list of user utterances per document
    for turn in dialogue['turns']:
        dial_id_turn_id.append(dialogue['dial_id'] + '_' + str(turn['turn_id']))
        if turn['role'] == 'user':
            # TURN UTTERANCE IS FLATTENED AND ONLY THE [INPUT_IDS] IS STORED
            if tokenizer_mode:
                turn['utterance'] = tokenizer(turn['utterance'], padding=True, truncation=True, return_tensors="pt")['input_ids'].view(-1)
            user_utterance_list.append(turn['utterance'])   # adding user utterance to user_utterance_list
        else:
            references = turn['references']
            ref_sp_id = []
            for ref in references:
                ref_sp_id.append(ref['sp_id'])
            sp_id_list.append(ref_sp_id)          # adding list of sp_ids per dialogue to list of sp_ids per document
    train_dict['train_id_sp'].append(sp_id_list)
    train_dict['train_user_utterance'].append(user_utterance_list)
    train_dict['train_doc_domain'].append(dialogue['domain'])
    train_dict['train_doc_id'].append(dialogue['doc_id'])
    train_dict['train_dial_id_turn_id'].append(dial_id_turn_id)
    for doc in document_dataset:
        if doc['doc_id'] == train_dict['train_doc_id'][-1]:
            if tokenizer_mode:
                doc['doc_text'] = tokenizer(doc['doc_text'], padding=True, truncation=True, return_tensors="pt")['input_ids'].view(-1)
            train_dict['train_document'].append(doc['doc_text'])          # adding the total document text
            text_sp_2 = []
            for train_spans_id in train_dict['train_id_sp'][-1]:    
                text_sp = []
                for span in doc['spans']:
                    if span['id_sp'] in train_spans_id:
                        text_sp.append(span['text_sp'])
                text_sp_2.append(text_sp)
            train_dict['train_text_sp'].append(text_sp_2)
            break
end = time.time()
print(f'Total time: {end-start}')

data = pd.DataFrame(train_dict)

100it [00:18,  5.35it/s]

Total time: 18.686965942382812





Results:

In [34]:
print('User utterances:')
print(train_dict['train_user_utterance'][0])

print('\nID Sp:')
print(train_dict['train_id_sp'][0])

print('\nDoc ID:')
print(train_dict['train_doc_id'][0])

print('\nDoc domain:')
print(train_dict['train_doc_domain'][0])

print('\nTrain text spans:')
print(train_dict['train_text_sp'][0])

print('\nDial_ID Turn_ID:')
print(train_dict['train_dial_id_turn_id'][0])

User utterances:
[tensor([  101,  8667,   117,   146,  9424,   184, 11984,  1139,  4134,   117,
         1169,  1128,  1494,  1143,  1114,  1115,   136,   102]), tensor([  101,  2825,   146,  1202,  1139,   141,  2107,  2559, 14409,  3294,
          136,   102]), tensor([  101,  5749,   117,  1105,  1107,  1692,   146,  5042,  1106,  2498,
         1155,  1104,  1103, 14371,  1834,  1106,  1103,   141,  2107,  2559,
         1701,   117,  1184,  1169,   146,  1202,   136,   102]), tensor([  101, 23330,   117,  1105,  1169,  1128,  1587,  1143,  1254,  1187,
         1431,   146,  2592,  1139,  1207,  4134,   136,   102]), tensor([  101,  2825,  1128,  1587,  1143,  1167,  1164, 15727,  1827,  1105,
         1147,  2616,   136,   102])]

ID Sp:
[['6', '7'], ['56'], ['49', '50', '52'], ['6', '7'], ['41', '43']]

Doc ID:
Top 5 DMV Mistakes and How to Avoid Them#3_0

Doc domain:
dmv

Train text spans:
[['you must report a change of address to DMV within ten days of moving. ', 'That is the 

In [36]:
print('\nDoc text:')
print(train_dict['train_document'][0])


Doc text:
tensor([  101,  2408,   141,  2107,  2559,  5793,  1294,  3253,  3644,  1895,
        12572,  1115,  2612,  1172,  2418,  2645,   117,  1259, 12270,  1114,
         1644,  7742,  1105, 24034,  6094, 21160,  4011,   119,  2279,  1195,
         1267,  5793,  1294,  1292, 12572,  1166,  1105,  1166,  1254,   117,
         1195,  1132, 16141,  1142,  2190,  1104,  1103,  1499,  1421,   141,
         2107,  2559, 12572,  1105,  1293,  1106,  3644,  1172,   119,   122,
          119, 17323,  1916,  1106,  3725,  9216, 24930, 18380,  1650, 14610,
          117,  1128,  1538,  2592,   170,  1849,  1104,  4134,  1106,   141,
         2107,  2559,  1439,  1995,  1552,  1104,  2232,   119,  1337,  1110,
         1103,  1692,  1111,  1103,  4134,  2628,  1114,  1240,  5941,   117,
         1112,  1218,  1112,  1155,  1103, 11869,  2628,  1114,  1296,  4410,
         3686,   117,  1134,  1336, 11271,   119,  1135,  1110,  1136,  6664,
         1106,  1178,   131,  3593,  1240,  1207,  41

## Create a Dataframe out of the train_dictionary

In [38]:
data = pd.DataFrame(train_dict)

In [49]:
np.unique(data.loc[:,'train_doc_id'])

array(['Appeal a TVB ticket conviction#1_0',
       'Commercial vehicle registration fees, vehicle use taxes and supplemental fees#3_0',
       'DIAL-IN search accounts#3_0',
       'New York State Insurance Requirements#3_0',
       'Prepare for your road test#3_0',
       'Refunds and transfer credits for surrendered plates#3_0',
       'Registrations#3_0', 'Renew non-driver ID card#1_0',
       'Surrender (return or turn in) your plates to the DMV#3_0',
       'Top 5 DMV Mistakes and How to Avoid Them#3_0'], dtype=object)

In [12]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

In [13]:
batch = tokenizer(train_dict['train_document'][0], padding=True, truncation=True, return_tensors="pt")

In [28]:
batch_padding = tokenizer(['Hi how are you', 'hi how are you i am ok'], padding=True, truncation=False, return_tensors="pt")

In [25]:
batch_padding

{'input_ids': tensor([[  101,  8790,  1293,  1132,  1128,   102,     0,     0,     0],
        [  101, 20844,  1293,  1132,  1128,   178,  1821, 21534,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [14]:
batch

{'input_ids': tensor([[  101,  2408,   141,  2107,  2559,  5793,  1294,  3253,  3644,  1895,
         12572,  1115,  2612,  1172,  2418,  2645,   117,  1259, 12270,  1114,
          1644,  7742,  1105, 24034,  6094, 21160,  4011,   119,  2279,  1195,
          1267,  5793,  1294,  1292, 12572,  1166,  1105,  1166,  1254,   117,
          1195,  1132, 16141,  1142,  2190,  1104,  1103,  1499,  1421,   141,
          2107,  2559, 12572,  1105,  1293,  1106,  3644,  1172,   119,   122,
           119, 17323,  1916,  1106,  3725,  9216, 24930, 18380,  1650, 14610,
           117,  1128,  1538,  2592,   170,  1849,  1104,  4134,  1106,   141,
          2107,  2559,  1439,  1995,  1552,  1104,  2232,   119,  1337,  1110,
          1103,  1692,  1111,  1103,  4134,  2628,  1114,  1240,  5941,   117,
          1112,  1218,  1112,  1155,  1103, 11869,  2628,  1114,  1296,  4410,
          3686,   117,  1134,  1336, 11271,   119,  1135,  1110,  1136,  6664,
          1106,  1178,   131,  3593,  

In [15]:
tokenizer.decode(batch["input_ids"][0][-2])

'000'

In [16]:
len(batch['input_ids'][0])

512

In [None]:
from datasets import load_metric

metric = load_metric("squad_v2")
print(metric.features) #this shows you what format the metric is expecting

prediction = {'id': <rc dataset is of shape dialid_turnid - this value has to match the answer>,
              'prediction_text': <your prediction>,
              'no_answer_probability': 0.0} #edwin said we can ignore this for task 1
reference = {'id': <see prediction>, 
              'answers': {
                  'text': [list of answer, best to use the ones from the rc dataset],                                       
                  'answer_start': [list of numbers of the answer star char again see rc dataset. ]}
            }

metric.add(prediction=prediction, reference=reference)
final_score = metric.compute()
final_score