# Dialogue and Narrative Coursework - Subtask 1 - New Dataset

In [None]:
## imports
%%capture
!pip install datasets

from datasets import load_dataset, Dataset
import pandas as pd

In [None]:
## A function to load a specific doc2dial dataset

def load_doc2dial_dataset(name='dialogue_domain', split='train', cache_dir="./data_cache"):
  """Load one split of one configuration of the `doc2dial` dataset.

  Args:
    name: configuration name forwarded to `load_dataset` (this notebook uses
      "dialogue_domain", "document_domain" and "doc2dial_rc").
    split: split name forwarded to `load_dataset` (e.g. "train", "validation").
    cache_dir: directory forwarded to `load_dataset` as its cache location.
      Defaults to "./data_cache", the value that used to be hard-coded here.

  Returns:
    Whatever `load_dataset` returns for the requested configuration and split.
  """
  ## NOTE(review): newer `datasets` releases deprecate `ignore_verifications`
  ## in favour of `verification_mode` -- confirm against the installed version.
  return load_dataset(
      "doc2dial",
      name=name,
      split=split,
      ignore_verifications=True,
      cache_dir=cache_dir,
  )


In [None]:
## Dialogue annotations, one dataset per split (train first, then validation).
train_dialogues, val_dialogues = (
    load_doc2dial_dataset(name="dialogue_domain", split=split_name)
    for split_name in ("train", "validation")
)

## Grounding documents; only the "train" split is requested.
documents = load_doc2dial_dataset(name="document_domain", split="train")

## Reading-comprehension samples, one dataset per split.
train_data, val_data = (
    load_doc2dial_dataset(name="doc2dial_rc", split=split_name)
    for split_name in ("train", "validation")
)

Downloading:   0%|          | 0.00/3.03k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

Downloading and preparing dataset doc2dial/dialogue_domain (download: 5.61 MiB, generated: 7.86 MiB, post-processed: Unknown size, total: 13.47 MiB) to ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


Downloading:   0%|          | 0.00/5.88M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.


Reusing dataset doc2dial (./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c)


Downloading and preparing dataset doc2dial/document_domain (download: 5.61 MiB, generated: 195.38 MiB, post-processed: Unknown size, total: 200.99 MiB) to ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.
Downloading and preparing dataset doc2dial/doc2dial_rc (download: 5.61 MiB, generated: 131.12 MiB, post-processed: Unknown size, total: 136.72 MiB) to ./data_cache/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c...


0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset doc2dial downloaded and prepared to ./data_cache/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c. Subsequent calls will reuse this data.


Reusing dataset doc2dial (./data_cache/doc2dial/doc2dial_rc/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c)


In [None]:
## Inspect the first sample of the "doc2dial_rc" training split.
train_data[0]

{'answers': {'answer_start': [346],
  'text': ['you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. ']},
 'context': 'Many DMV customers make easily avoidable mistakes that cause them significant problems, including encounters with law enforcement and impounded vehicles. Because we see customers make these mistakes over and over again , we are issuing this list of the top five DMV mistakes and how to avoid them. \n\n1. Forgetting to Update Address \nBy statute , you must report a change of address to DMV within ten days of moving. That is the case for the address associated with your license, as well as all the addresses associated with each registered vehicle, which may differ. It is not sufficient to only: write your new address on the back of your old license; tell the United States Postal Service; or inform the poli

In [None]:
## Lazily built lookup tables used by `add_spans`, so that the datasets are
## indexed once instead of being filtered again for every single sample.
##   _ROW_POSITIONS:   (id(dataset), field)  -> (dataset, {field value: first row position})
##   _DOC_SPAN_TABLES: (id(dataset), doc_id) -> (dataset, {id_sp: text_sp})
## Every entry keeps a reference to its dataset so that the `id()` used in the
## key cannot be recycled for another object while the entry exists.
_ROW_POSITIONS = {}
_DOC_SPAN_TABLES = {}


def _find_row(dataset, field, value):
  """Return the first row of `dataset` whose `field` equals `value`."""
  cache_key = (id(dataset), field)
  if cache_key not in _ROW_POSITIONS:
    positions = {}
    for pos, seen in enumerate(dataset[field]):
      positions.setdefault(seen, pos)   ## keep the first match, like filter(...)[0]
    _ROW_POSITIONS[cache_key] = (dataset, positions)

  positions = _ROW_POSITIONS[cache_key][1]
  if value not in positions:
    ## IndexError is what `dataset.filter(...)[0]` raised when nothing matched.
    raise IndexError("no row with {}={!r}".format(field, value))
  return dataset[positions[value]]


def _document_spans(doc_id):
  """Return the {id_sp: text_sp} table of document `doc_id` from `documents`."""
  cache_key = (id(documents), doc_id)
  if cache_key not in _DOC_SPAN_TABLES:
    doc = _find_row(documents, 'doc_id', doc_id)
    table = {span['id_sp']: span['text_sp'] for span in doc['spans']}
    _DOC_SPAN_TABLES[cache_key] = (documents, table)
  return _DOC_SPAN_TABLES[cache_key][1]


def add_spans(sample, dialogues):
  """Attach document spans and answer spans to a reading-comprehension sample.

  `sample['id']` is read as "<dial_id>_<turn_id>". The dialogue with that
  `dial_id` is looked up in `dialogues`, and the span ids referenced by its
  turn numbered `turn_id + 1` are collected. The document named by the
  dialogue's `doc_id` is looked up in the module-level `documents` dataset.

  Args:
    sample: dict with an 'id' string and an 'answers' dict. It is modified in
      place: 'spans' and `answers['spans']` are added.
    dialogues: dataset supporting `dialogues['dial_id']` (column access) and
      `dialogues[i]` (row access), whose rows carry 'dial_id', 'doc_id' and
      'turns'.

  Returns:
    The same `sample` object, where `sample['spans']` maps every span id of
    the document to its text and `sample['answers']['spans']` holds only the
    spans referenced by the selected turn (empty if no turn matches).

  Raises:
    IndexError: if no dialogue or no document with the required id exists.
    ValueError: if the id has no '_' separator or a non-numeric turn id.
  """
  ## Split on the last underscore only, so an underscore inside the dialogue
  ## id cannot break the unpacking.
  dial_id, turn_id = sample['id'].rsplit('_', 1)
  target_turn = int(turn_id) + 1

  dial = _find_row(dialogues, 'dial_id', dial_id)

  span_ids = set()   ## span ids referenced by the selected turn
  for tr in dial['turns']:
    if tr['turn_id'] == target_turn:
      span_ids.update(sp['sp_id'] for sp in tr['references'])
      break

  doc_spans = _document_spans(dial['doc_id'])

  ## Hand out a copy so samples never share (and cannot corrupt) the cached table.
  sample['spans'] = dict(doc_spans)
  sample['answers']['spans'] = {
      sp_id: text for sp_id, text in doc_spans.items() if sp_id in span_ids
  }

  return sample

In [None]:
from tqdm import tqdm

## Columns of the enriched training frame, in output order.
train_columns = ['id', 'question', 'context', 'answers', 'spans',  'domain', 'title']

## Collect all enriched samples first and build the frame in one go:
## `DataFrame.append` copies the whole frame on every call (quadratic overall)
## and is no longer available in pandas 2.x.
train_records = [add_spans(ex, train_dialogues) for ex in tqdm(train_data)]
new_train_data = pd.DataFrame(train_records, columns=train_columns)

new_train_data.head()

  0%|          | 0/20431 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 1/20431 [00:02<14:18:17,  2.52s/it]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-97c73a4132fe7d79.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-97c73a4132fe7d79.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-97c73a4132fe7d79.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-d59bf7f143a6d167.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-d59bf7f143a6d167.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 8/20431 [00:04<2:26:10,  2.33it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 11/20431 [00:05<2:34:59,  2.20it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-0d9928b71aad6d66.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-0d9928b71aad6d66.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 16/20431 [00:06<2:07:43,  2.66it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-88c736c556dee58d.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-88c736c556dee58d.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 21/20431 [00:08<1:58:00,  2.88it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-bb0c9cc45a9dbe2a.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-bb0c9cc45a9dbe2a.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 26/20431 [00:10<1:53:28,  3.00it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-e1a5c117085479a4.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-e1a5c117085479a4.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-cb6ce7d3b70aa43e.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 33/20431 [00:11<1:43:29,  3.28it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-cb6ce7d3b70aa43e.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 38/20431 [00:13<1:46:08,  3.20it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-d50de64607dbbbdb.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-d50de64607dbbbdb.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 43/20431 [00:14<1:45:11,  3.23it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-4bf4af5f0a1741d6.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-4bf4af5f0a1741d6.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-65b46e40dcc38779.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 50/20431 [00:16<1:43:13,  3.29it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-65b46e40dcc38779.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 55/20431 [00:17<1:44:48,  3.24it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-15fad31f254fa326.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-15fad31f254fa326.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-2a70339a904351ed.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
  0%|          | 62/20431 [00:19<1:39:29,  3.41it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-2a70339a904351ed.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-fe750ef1cc2bb737.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-5ed2dcbfe5aaa4b1.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-5ed2dcbfe5aaa4b1.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
  0%|          | 69/20431 [00:22<2:05:01,  2.71it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-5ed2dcbfe5aaa4b1.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
  0%|          | 74/20431 [00:23<1:59:10,  2.85it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-b268c1054ff2c194.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-b268c1054ff2c194.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
  0%|          | 79/20431 [00:25<1:53:36,  2.99it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-531224a82895017c.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-531224a82895017c.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-4a2d4c633410ba7c.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
  0%|          | 85/20431 [00:26<1:53:53,  2.98it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-4a2d4c633410ba7c.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-d4ea98d1ead2578d.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-d4ea98d1ead2578d.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
  0%|          | 92/20431 [00:28<1:41:35,  3.34it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-dfd1953ecb7c3a01.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-dfd1953ecb7c3a01.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
  0%|          | 99/20431 [00:30<1:38:30,  3.44it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-6144c8a829f4bc8c.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-6144c8a829f4bc8c.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
  1%|          | 106/20431 [00:31<1:37:19,  3.48it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0

  0%|          | 0/4 [00:00<?, ?ba/s]

Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
  1%|          | 110/20431 [00:33<1:46:34,  3.18it/s]Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-4ce36a6b1d3be1fa.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-4ce36a6b1d3be1fa.arrow
Loading cached processed dataset at ./data_cache/doc2dial/document_domain/1.0.1/765cb4d9af421b599d910080fd61b4a43440c1232693876470ef3245daa5fa4c/cache-8233e1a1196f4098.arrow
Loading cached processed dataset at ./data_cache/doc2dial/dialogue_domain/1.0

  0%|          | 0/4 [00:00<?, ?ba/s]

  1%|          | 114/20431 [00:34<1:43:18,  3.28it/s]


KeyboardInterrupt: ignored

In [None]:
## Write the enriched training frame to a CSV file in the working directory.
## NOTE(review): `to_csv` is called with its defaults here; presumably that
## also writes the row index as an extra column and stores the dict-valued
## columns as text -- confirm that downstream readers expect this.
new_train_data.to_csv('doc2dial_rc_train.csv')
## Alternative to rebuilding the frame: reload a previously saved copy.
# new_train_data = pd.read_csv('doc2dial_rc_train.csv')
new_train_data.head()

In [None]:
## Display the enriched training frame.
new_train_data

In [None]:
## Columns of the enriched validation frame, in output order.
val_columns = ['id', 'question', 'context', 'answers', 'spans',  'domain', 'title']

## Collect all enriched samples first and build the frame in one go:
## `DataFrame.append` copies the whole frame on every call (quadratic overall)
## and is no longer available in pandas 2.x.
val_records = [add_spans(ex, val_dialogues) for ex in tqdm(val_data)]
new_val_data = pd.DataFrame(val_records, columns=val_columns)

new_val_data.head()

In [None]:
## Write the enriched validation frame to a CSV file in the working directory.
## NOTE(review): `to_csv` is called with its defaults here; presumably that
## also writes the row index as an extra column -- confirm downstream readers
## expect this.
new_val_data.to_csv('doc2dial_rc_val.csv')