In [1]:
from datasets import Dataset, load_dataset
import numpy as np

In [2]:
# Vocabulary used to synthesise the "<name> <action> to the <place>" facts.
names = ['Mary', 'John', 'Daniel', 'Sandra']
actions = ['moved', 'went', 'went back', 'journeyed', 'travelled']
places = ['bathroom', 'hallway', 'garden', 'office', 'bedroom', 'kitchen']
choices_dict = {'names': names, 'actions': actions, 'places': places}


class MemoryDataset:
    """Records from the Hugging Face ``tau/scrolls`` dataset, each augmented
    with a synthetic fact, a question about it and the expected answer.

    Every returned record is the underlying row plus three extra keys:
    ``'fact'``, ``'question'`` and ``'answer'`` (see :meth:`generate_qa`).

    All randomness comes from NumPy's global ``np.random`` state, so call
    ``np.random.seed(...)`` beforehand if reproducible samples are needed.

    Args:
        choices_dict: mapping with the keys ``'names'``, ``'actions'`` and
            ``'places'``, each a sequence of strings to draw from.
        num_facts: number of facts/questions generated per record. At most
            one fact per available name is produced, so values larger than
            ``len(choices_dict['names'])`` are silently capped.
        split: split of the loaded dataset to wrap (e.g. ``'train'``).
        dataset: configuration name passed to ``load_dataset('tau/scrolls', ...)``.
        num_samples: if ``None``, the dataset mirrors the underlying split
            index-for-index. Otherwise the dataset reports this length and
            every access returns a *randomly drawn* underlying row.
    """

    def __init__(self, choices_dict=choices_dict, num_facts=1, split='train', dataset='quality', num_samples=None):
        self.choices_dict = choices_dict
        self.dataset_ = load_dataset('tau/scrolls', dataset)[split]
        self.num_facts = num_facts
        self.num_samples = num_samples

    def __getitem__(self, ind):
        """Return one augmented record, or a column-wise batch of them.

        Args:
            ind: a scalar index, a slice, or an iterable of indices.

        Returns:
            For a scalar index: a ``dict`` for that record.
            For a slice or an iterable: a ``dict`` mapping every key to the
            list of per-record values (``{}`` for an empty selection).

        Raises:
            TypeError: if ``ind`` is a string; column access is not supported.
            IndexError: if a scalar integer index is out of range.
        """
        # Iterating a one-character string yields the same string again, so
        # letting strings reach the batch path would recurse without end.
        if isinstance(ind, (str, bytes)):
            raise TypeError(
                f'{type(self).__name__} indices must be integers, slices or '
                f'iterables of integers, not {type(ind).__name__}'
            )

        if isinstance(ind, slice):
            ind = range(*ind.indices(len(self)))

        # Only the iterability probe is guarded: a TypeError raised while
        # building the individual records must surface, not be retried.
        try:
            indices = iter(ind)
        except TypeError:
            return self._get_single(ind)

        items = [self[i] for i in indices]
        if not items:
            return {}
        return {key: [item[key] for item in items] for key in items[0]}

    def _get_single(self, ind):
        """Fetch one underlying row and attach a freshly generated fact/QA."""
        if self.num_samples is not None:
            # The sequence protocol (plain ``for`` loops, ``list(ds)``) relies
            # on IndexError to detect the end, so honour the reported length
            # even though the row itself is drawn at random.
            if isinstance(ind, (int, np.integer)) and not -self.num_samples <= ind < self.num_samples:
                raise IndexError(
                    f'index {ind} is out of range for dataset of size {self.num_samples}'
                )
            ind = np.random.randint(len(self.dataset_))

        # Shallow copy so the extra keys never leak into the backing store
        # if it hands out its own row objects.
        sample = dict(self.dataset_[ind])
        sample['fact'], sample['question'], sample['answer'] = self.generate_qa()
        return sample

    def __len__(self):
        """Length of the underlying split, or ``num_samples`` when it is set."""
        return len(self.dataset_) if self.num_samples is None else self.num_samples

    def generate_qa(self):
        """Generate ``num_facts`` random facts with matching questions/answers.

        Each fact uses a distinct name (names are shuffled, then taken in
        order) together with an independently drawn action and place.

        Returns:
            A ``(facts, questions, answers)`` tuple of strings: facts are
            joined with ``', '`` and terminated by ``'.'``, questions are
            joined with a space, answers with ``', '``.
        """
        # Shuffle a private copy: np.random.shuffle works in place, and the
        # names list is shared with the caller (by default the module-level
        # ``names``), which must not be reordered as a side effect.
        shuffled_names = list(self.choices_dict['names'])
        actions, places = self.choices_dict['actions'], self.choices_dict['places']

        np.random.shuffle(shuffled_names)
        facts, questions, answers = [], [], []
        # zip() caps the number of facts at the number of available names.
        for _, name in zip(range(self.num_facts), shuffled_names):
            action, place = np.random.choice(actions), np.random.choice(places)

            facts.append(f'{name} {action} to the {place}')
            questions.append(f'Where is {name}?')
            answers.append(place)

        facts = ', '.join(facts) + '.'
        questions = ' '.join(questions)
        answers = ', '.join(answers)

        return facts, questions, answers

In [6]:
# Wrap the 'quality' train and validation splits, one synthetic fact per record.
train_dataset = MemoryDataset(
    choices_dict=choices_dict,
    num_facts=1,
    split='train',
    dataset='quality',
)
# Add num_samples=100 here to get a fixed-size, randomly sampled validation set.
valid_dataset = MemoryDataset(
    choices_dict=choices_dict,
    num_facts=1,
    split='validation',
    dataset='quality',
)

Found cached dataset scrolls (/home/bulatov/.cache/huggingface/datasets/tau___scrolls/quality/1.0.0/672021d5d8e1edff998a6ea7a5bff35fdfd0ae243e7cf6a8c88a57a04afb46ac)


  0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset scrolls (/home/bulatov/.cache/huggingface/datasets/tau___scrolls/quality/1.0.0/672021d5d8e1edff998a6ea7a5bff35fdfd0ae243e7cf6a8c88a57a04afb46ac)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# Number of records in each wrapped split, as (train, validation).
tuple(len(split_ds) for split_ds in (train_dataset, valid_dataset))

(2523, 2086)

In [78]:
# Integer index: returns one record dict (the underlying row plus the
# generated 'fact', 'question' and 'answer' entries).
train_dataset[0]

{'id': '52995_I3M5VUMM_1',
 'pid': '52995_I3M5VUMM_1_0',
 'input': 'Why is Si retirement so significant to the Space Exploration Team? \n\n (A) There aren’t enough working people in the world. They won’t be able to find a replacement.\n (B) As one of two remaining spacemen, it would likely mean the defunding and shut down of the Space Exploration Team.\n (C) Training new spacemen is costly and time consuming. They won’t have anyone else ready after him.\n (D) His retirement may inspire others to stop working as well, which would be hugely detrimental as most people don\'t feel the drive to work as is.  \n\n\nSPACEMAN ON A SPREE\n\n\n\n\n   BY MACK REYNOLDS\n\n\n\n\n   Illustrated by Nodel\n\n\n\n\n   What\'s more important—Man\'s conquest\n\n\n   of space, or one spaceman\'s life?\n\n\n\n\n\n\n   I\n\n\n\n\n   They gave him a gold watch. It was meant to be symbolical, of course.\n In the old tradition. It was in the way of an antique, being one of the\n timepieces made generations past

In [79]:
# List of indices: __getitem__ fetches each record and regroups them into a
# dict mapping every key to the list of per-record values.
items = train_dataset[list(range(10))]

In [81]:
# Display the whole batch fetched above.
# NOTE(review): this prints ten full documents into the cell output;
# consider showing a truncated view instead.
items

{'id': ['52995_I3M5VUMM_1',
  '52995_I3M5VUMM_2',
  '52995_I3M5VUMM_3',
  '52995_I3M5VUMM_4',
  '52995_I3M5VUMM_5',
  '52995_I3M5VUMM_6',
  '52995_I3M5VUMM_7',
  '52995_I3M5VUMM_8',
  '63477_65UJ979R_1',
  '63477_65UJ979R_2'],
 'pid': ['52995_I3M5VUMM_1_0',
  '52995_I3M5VUMM_2_0',
  '52995_I3M5VUMM_3_0',
  '52995_I3M5VUMM_4_0',
  '52995_I3M5VUMM_5_0',
  '52995_I3M5VUMM_6_0',
  '52995_I3M5VUMM_7_0',
  '52995_I3M5VUMM_8_0',
  '63477_65UJ979R_1_0',
  '63477_65UJ979R_2_0'],
 'input': ['Why is Si retirement so significant to the Space Exploration Team? \n\n (A) There aren’t enough working people in the world. They won’t be able to find a replacement.\n (B) As one of two remaining spacemen, it would likely mean the defunding and shut down of the Space Exploration Team.\n (C) Training new spacemen is costly and time consuming. They won’t have anyone else ready after him.\n (D) His retirement may inspire others to stop working as well, which would be hugely detrimental as most people don\'t fe